In [63]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [64]:
def clean_dataset(dataset):
    for i in range(dataset.columns.size):
        if (dataset[dataset.columns[i]].isnull().sum() and float(dataset[dataset.columns[i]].isnull().sum()) < dataset.shape[0]/90):
            dataset.dropna(subset=[dataset.columns[i]], inplace=True)

    dataset['Высота потолков, м'].fillna(2.65, inplace=True)
    dataset['Тип этажа'].fillna(
        dataset['Тип этажа'].mode().iloc[0], inplace=True)
    dataset['Состояние ремонта'].fillna('Нет', inplace=True)
    dataset.loc[dataset['Количество комнат']
                == 'Студия', 'Количество комнат'] = 0
    dataset['Дата снятия (продажи)'].fillna(np.nan, inplace=True)
    dataset['Школа\nм'].fillna(
        dataset['Школа\nм'].median(axis=0), inplace=True)
    dataset['Детский сад\nм'].fillna(
        dataset['Детский сад\nм'].median(axis=0), inplace=True)

    return dataset

In [65]:
def create_data(dataset):
    for i in dataset.select_dtypes(include=[object]):
        if (dataset[i].unique().size > 2):
            dataset[i] = dataset[i].astype('category')
    dataset['Количество комнат'] = dataset['Количество комнат'].astype('int64')
    binary_columns = [
        i for i in dataset.columns if dataset[i].dtype.name == 'object']
    categorical_columns = [
        i for i in dataset.columns if dataset[i].dtype.name == 'category']
    numerical_columns = [i for i in dataset.columns if (
        dataset[i].dtype.name != 'category' and dataset[i].dtype.name != 'object')]

    binary_columns = ['Тип рынка', 'Парковка',
                      'Актуальность на дату послед. проверки']
    include_binary_columns = ['Тип рынка', 'Парковка']
    binary_columns = include_binary_columns
    data_binary = dataset[binary_columns]
    data_binary = data_binary.apply(lambda x: pd.factorize(x)[0])

    numerical_columns = ['Общая площадь,\nкв.м', 'Жилая площадь,\nкв.м', 'Площадь кухни,\nкв.м', 'Высота потолков, м', 'Год постройки', 'Этаж', 'Этажность', 'Количество комнат', 'Цена предложения,\nруб.',
                         'Удельная цена, руб./кв.м', 'Колич.  Просмотр.', 'Колич.  Просм. в день', 'Остановка\nм', 'Парк\nм', 'Центр\nкм', 'Станция метро\nм', 'Школа\nм', 'Детский сад\nм']
    include_numerical_columns = ['Общая площадь,\nкв.м', 'Высота потолков, м', 'Этаж',
                                 'Остановка\nм', 'Парк\nм', 'Центр\nкм', 'Станция метро\nм', 'Школа\nм', 'Детский сад\nм']
    numerical_columns = include_numerical_columns
    data_numerical = dataset[numerical_columns]

    categorical_columns = ['Функциональная зона', 'Тип дома', 'Класс', 'Тип этажа', 'Состояние ремонта',
                           'Материал стен', 'Дата создания', 'Дата парсинга', 'Дата послед. проверки', 'Дата снятия (продажи)']
    include_categorical_columns = ['Тип дома',
                                   'Класс', 'Тип этажа', 'Материал стен']
    categorical_columns = include_categorical_columns
    data_categorical = pd.get_dummies(
        dataset[categorical_columns], drop_first=True)

    data = pd.concat((data_binary, data_categorical, data_numerical,
                     dataset['Удельная цена, руб./кв.м']), axis=1)
    return data


### Те же действия что и раньше

In [66]:
df = pd.read_excel('данные_квартиры.xlsx',
                   usecols=lambda x: 'Unnamed' not in x, skiprows=1)
dataset = clean_dataset(df)
data = create_data(dataset)

In [67]:
from sklearn.model_selection import train_test_split

X = data.drop('Удельная цена, руб./кв.м', axis = 1)
y = data['Удельная цена, руб./кв.м']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((2057, 22), (515, 22), (2057,), (515,))

### Линейная регрессия

In [68]:
from sklearn.linear_model import LinearRegression

regressor = LinearRegression()
regressor.fit(X_train, y_train)

LinearRegression()

In [69]:
coeff_linear = pd.DataFrame(regressor.coef_, X_train.columns, columns=['Coefficient'])
coeff_linear

Unnamed: 0,Coefficient
Тип рынка,-3262.053
Парковка,1616.165
Тип дома_Советская до 5 этажей вкл,-11354.03
Тип дома_Современные,24501.51
Тип дома_Старый фонд,-6514.173
Класс_Комфорт,6754.494
Класс_Эконом,3987.665
Класс_Элитный,1.76442e-10
Тип этажа_Последний,-2736.533
Тип этажа_Средний,4592.423


In [70]:
y_pred = regressor.predict(X_test)
df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print(y.shape, y_pred.shape)
df.head(10)

(2572,) (515,)


Unnamed: 0,Actual,Predicted
847,93750,119390.699866
2354,132203,133474.041799
943,104103,100056.269953
620,226788,153801.025562
1817,133829,130037.318544
655,111429,97486.443652
718,155000,155272.797286
1326,150897,113700.625578
399,102686,86537.307508
1830,82048,97813.522747


Считаю скорректированный коэффициент детерминации по этой формуле

In [71]:
def adjR(X, y):
    return 1 - (1-regressor.score(X, y))*(len(y)-1)/(len(y)-X.shape[1]-1)


In [72]:
from sklearn import metrics
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
print('Медиана абсолютной модели:', metrics.mean_absolute_error(y_test, y_pred)/y.mean())
print('Adjusted R-squared:', 1 - (1-regressor.score(X, y))*(len(y)-1)/(len(y)-X.shape[1]-1))

Mean Absolute Error: 21120.396936656536
Mean Squared Error: 954682527.7470396
Root Mean Squared Error: 30897.93727333654
Медиана абсолютной модели: 0.17125993961361366
Adjusted R-squared: 0.3111820360734059


### Полиномиальная регрессия

In [73]:
from sklearn.preprocessing import PolynomialFeatures

poly=PolynomialFeatures(degree=2)

pf_train = poly.fit_transform(X_train)
pf_test = poly.fit_transform(X_test)

X = pf_train
y = y_train

pfregressor = LinearRegression()
pfregressor.fit(pf_train, y_train)

LinearRegression()

In [74]:
y_pred = pfregressor.predict(pf_test)
df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print(y_test.shape, y_pred.shape)
df.head(10)

(515,) (515,)


Unnamed: 0,Actual,Predicted
847,93750,108373.509302
2354,132203,142241.44254
943,104103,104376.601827
620,226788,138320.742056
1817,133829,129305.716628
655,111429,134015.215828
718,155000,163326.864043
1326,150897,107123.20409
399,102686,99516.081356
1830,82048,100961.411273


In [75]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
print('Медиана абсолютной модели:', metrics.mean_absolute_error(y_test, y_pred)/y.mean())
print('Adjusted R-squared:', 1 - (1-pfregressor.score(X, y))*(len(y)-1)/(len(y)-X.shape[1]-1))

Mean Absolute Error: 29761.76646487278
Mean Squared Error: 22302825975.056732
Root Mean Squared Error: 149341.30699527418
Медиана абсолютной модели: 0.24053130198133904
Adjusted R-squared: 0.40508266036540297


In [88]:
pfregressor.coef_

array([-2.29577542e-02,  4.55971509e+04,  5.13496580e+04,  8.65289458e+04,
        1.94515410e+05, -2.21656862e+04, -1.12331186e+02,  1.13977718e+02,
       -4.78810714e-03, -9.81867951e+04,  1.84546717e+04,  6.39254112e+04,
        1.59244042e+05,  8.81259237e+03, -6.26783955e+03, -4.66875941e+05,
        3.72969152e+04,  1.47718920e+02,  6.41735021e+01,  1.72621754e+04,
       -3.11921005e+02, -1.37578059e+03, -1.98743285e+01,  4.55971354e+04,
        9.55596571e+03, -2.43520916e+04, -2.17296258e+04, -3.31968744e-04,
        9.47322470e+01,  4.55040881e+04, -2.01821405e-04,  2.52006105e+04,
        1.88947700e+04,  3.30224800e+04,  1.99518115e+03,  1.05794750e+04,
        4.47263354e+02, -7.66940703e+04,  6.87129972e+02, -1.65914416e+01,
       -2.10957967e+00,  7.85145237e+02, -3.18690885e+00,  1.65964022e+01,
        4.18647084e+01,  5.13496590e+04, -3.87272619e+03, -6.70682276e+03,
       -2.77625434e+04, -5.12181013e+01,  5.14025616e+04, -7.90924915e-07,
        5.89494355e+03,  

In [87]:
coeff_linear = pd.DataFrame(pf_test, columns = poly.get_feature_names_out())
coeff_linear

Unnamed: 0,1,Тип рынка,Парковка,Тип дома_Советская до 5 этажей вкл,Тип дома_Современные,Тип дома_Старый фонд,Класс_Комфорт,Класс_Эконом,Класс_Элитный,Тип этажа_Последний,...,Центр\nкм^2,Центр\nкм Станция метро\nм,Центр\nкм Школа\nм,Центр\nкм Детский сад\nм,Станция метро\nм^2,Станция метро\nм Школа\nм,Станция метро\nм Детский сад\nм,Школа\nм^2,Школа\nм Детский сад\nм,Детский сад\nм^2
0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,49.0,812.0,2331.0,2996.0,13456.0,38628.0,49648.0,110889.0,142524.0,183184.0
1,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,1137.0,173.0,139.0,1292769.0,196701.0,158043.0,29929.0,24047.0,19321.0
2,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,...,144.0,61068.0,1488.0,5904.0,25897921.0,631036.0,2503788.0,15376.0,61008.0,242064.0
3,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,49.0,39284.0,2107.0,1708.0,31494544.0,1689212.0,1369328.0,90601.0,73444.0,59536.0
4,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,16.0,7532.0,2552.0,1768.0,3545689.0,1201354.0,832286.0,407044.0,281996.0,195364.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
510,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,...,225.0,31650.0,3870.0,3795.0,4452100.0,544380.0,533830.0,66564.0,65274.0,64009.0
511,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,25.0,22695.0,2670.0,2830.0,20602521.0,2423826.0,2569074.0,285156.0,302244.0,320356.0
512,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,49.0,37604.0,14455.0,2415.0,28858384.0,11093180.0,1853340.0,4264225.0,712425.0,119025.0
513,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,49.0,39928.0,2107.0,1708.0,32535616.0,1716904.0,1391776.0,90601.0,73444.0,59536.0
