In [1]:
import pandas as pd
import numpy as np

import plotly.express as px
import plotly.graph_objects as go

from typing import List
from scipy import stats

from sklearn.metrics import SCORERS
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate

from collections import defaultdict

from IPython.core.display import display, HTML

In [2]:
flats = pd.read_csv('FLATS.csv')

flats.drop(columns=['Id'], inplace=True)
flats.rename(columns=dict(zip(flats.columns.tolist(), map(str.lower, flats.columns.tolist()))), inplace=True)

print(f'Количество отсутствующих значений: {flats.isna().sum().sum()}')

flats

Количество отсутствующих значений: 0


Unnamed: 0,square_meters,metro_distance,price
0,28,677,7328400
1,54,120,9754600
2,43,1344,6781500
3,33,1003,6784100
4,30,591,7296200
...,...,...,...
95,28,2073,4826700
96,63,2120,6800800
97,33,1411,6517500
98,47,1715,6812100


In [3]:
metric_names = ['r2', 'max_error', 'neg_mean_absolute_error', 'neg_root_mean_squared_error']

In [4]:
def create_and_fit_models(columns_to_be_used: List[List[str]], X: pd.DataFrame, y: pd.DataFrame) -> List[LinearRegression]:
    models = []
    for columns in columns_to_be_used:
        model = LinearRegression()
        model.fit(X[columns], y)
        models.append(model)
    return models

In [5]:
def score_models(models: List[LinearRegression]) -> pd.DataFrame:
    scores_by_feature_names = defaultdict(list)
    for model in models:
        feature_names = model.feature_names_in_
        for metric_name in metric_names:
            scores_by_feature_names[' + '.join(feature_names)].append(SCORERS[metric_name](model, X[feature_names], y))
    return pd.DataFrame(scores_by_feature_names, index=metric_names).T

In [6]:
def display_model_coefs(models: List[LinearRegression]) -> pd.DataFrame:
    for model in models:
        features = model.feature_names_in_
        display(HTML(f'<h3>{" + ".join(features)}</h3>'))
        display(HTML(regression_coef(model, X[features], y).style.format({'coef': '{:.4f}', 'pvalue': '{:e}'}).to_html()))

In [7]:
def perform_cross_validation(columns_to_be_used: List[List[str]], X: pd.DataFrame, y: pd.DataFrame, cv: int = 5):
    val_scores = []
    for column in columns_to_be_used:
        model = LinearRegression()
        val_scores.append(cross_validate(model, X[column], y, scoring=metric_names, cv=cv))

    scores_by_feature_names = defaultdict(list)
    for columns, split_scores in zip(columns_to_be_used, val_scores):
        for metric_name in metric_names:
            split_scores[f'test_{metric_name}'] = np.mean(split_scores[f'test_{metric_name}'])
            scores_by_feature_names[' + '.join(columns)].append(split_scores[f'test_{metric_name}'])

    return pd.DataFrame(scores_by_feature_names, index=metric_names).T

In [8]:
def regression_coef(model, X, y):
    coef = pd.DataFrame(zip(['intercept'] + X.columns.tolist(), [model.intercept_] + model.coef_.tolist()),
                    columns=['predictor', 'coef'])
    
    X1 = np.append(np.ones((len(X),1)), X, axis=1)
    b = np.append(model.intercept_, model.coef_)
    MSE = np.sum((model.predict(X) - y) ** 2, axis=0) / float(X.shape[0] - X.shape[1])
    var_b = MSE * (np.linalg.inv(np.dot(X1.T, X1)).diagonal())
    sd_b = np.sqrt(var_b)
    t = b / sd_b

    coef['pvalue'] = [2 * (1 - stats.t.cdf(np.abs(i), (len(X1) - 1))) for i in t]
    coef.set_index(['predictor'], inplace=True)
    coef.columns.name = coef.index.name
    coef.index.name = None

    return coef

# Модель на основе площади квартиры

In [9]:
fig = px.scatter(flats, x='square_meters', y='price')
fig.update_layout(xaxis_title='Площадь (м²)', yaxis_title='Цена (₽)', title='Зависимость цены квартиры от площади')
fig.show()

Unsupported

In [10]:
poly = PolynomialFeatures(degree=3, include_bias=False)
features = poly.fit_transform(flats[['square_meters']])

X = pd.DataFrame(features, columns=poly.get_feature_names_out())
y = flats['price']

pd.concat([X, y], axis=1)

Unnamed: 0,square_meters,square_meters^2,square_meters^3,price
0,28.0,784.0,21952.0,7328400
1,54.0,2916.0,157464.0,9754600
2,43.0,1849.0,79507.0,6781500
3,33.0,1089.0,35937.0,6784100
4,30.0,900.0,27000.0,7296200
...,...,...,...,...
95,28.0,784.0,21952.0,4826700
96,63.0,3969.0,250047.0,6800800
97,33.0,1089.0,35937.0,6517500
98,47.0,2209.0,103823.0,6812100


In [11]:
fig = px.imshow(pd.concat([X, y], axis=1).corr())
fig.update_layout(title='Корреляция между переменными')
fig.show()

Unsupported

In [12]:
columns_to_be_used = [['square_meters'], ['square_meters^2'], ['square_meters^3']]
models = create_and_fit_models(columns_to_be_used, X, y)

scores = score_models(models)
scores.style.highlight_max()

Unnamed: 0,r2,max_error,neg_mean_absolute_error,neg_root_mean_squared_error
square_meters,0.452221,-2184374.821151,-990261.942495,-1141985.12729
square_meters^2,0.423938,-2214253.339918,-1012542.378197,-1171095.416708
square_meters^3,0.385331,-2360068.377651,-1040034.08751,-1209702.074233


In [13]:
display_model_coefs(models)

predictor,coef,pvalue
intercept,5799067.1327,0.0
square_meters^3,10.1322,4.377831e-12


In [14]:
scores = perform_cross_validation(columns_to_be_used, X, y, cv=3)
scores.style.highlight_max()

Unnamed: 0,r2,max_error,neg_mean_absolute_error,neg_root_mean_squared_error
square_meters,0.387025,-2203037.197201,-1022485.924633,-1179149.370456
square_meters^2,0.35456,-2232469.501254,-1053052.117444,-1210735.043982
square_meters^3,0.310454,-2447245.063804,-1086208.955876,-1251682.080897


In [15]:
area_model = LinearRegression()
area_model.fit(X[['square_meters']], y)

fig = go.Figure()
fig.add_scatter(x=flats['square_meters'], y=flats['price'], mode='markers', name='Исходные данные')
fig.add_scatter(x=X['square_meters'], y=area_model.predict(X[['square_meters']]), name='Модель')
fig.update_layout(xaxis_title='Площадь (м²)', yaxis_title='Цена (₽)', title='Зависимость цены квартиры от площади')
fig.show()

Unsupported

# Модель на основе расстояния до метро

In [16]:
fig = px.scatter(flats, x='metro_distance', y='price')
fig.update_layout(xaxis_title='Расстояние до метро (м)', yaxis_title='Цена (₽)',
                  title='Зависимость цены квартиры от расстояния до метро')
fig.show()

Unsupported

In [17]:
poly = PolynomialFeatures(degree=3, include_bias=False)
features = poly.fit_transform(flats[['metro_distance']])

X = pd.DataFrame(features, columns=poly.get_feature_names_out())
y = flats['price']

pd.concat([X, y], axis=1)

Unnamed: 0,metro_distance,metro_distance^2,metro_distance^3,price
0,677.0,458329.0,3.102887e+08,7328400
1,120.0,14400.0,1.728000e+06,9754600
2,1344.0,1806336.0,2.427716e+09,6781500
3,1003.0,1006009.0,1.009027e+09,6784100
4,591.0,349281.0,2.064251e+08,7296200
...,...,...,...,...
95,2073.0,4297329.0,8.908363e+09,4826700
96,2120.0,4494400.0,9.528128e+09,6800800
97,1411.0,1990921.0,2.809190e+09,6517500
98,1715.0,2941225.0,5.044201e+09,6812100


In [18]:
fig = px.imshow(pd.concat([X, y], axis=1).corr())
fig.update_layout(title='Корреляция между переменными')
fig.show()

Unsupported

In [19]:
columns_to_be_used = [['metro_distance'], ['metro_distance^2'], ['metro_distance^3']]
models = create_and_fit_models(columns_to_be_used, X, y)

scores = score_models(models)
scores.style.highlight_max()

Unnamed: 0,r2,max_error,neg_mean_absolute_error,neg_root_mean_squared_error
metro_distance,0.577598,-1794925.935491,-870605.15841,-1002814.034524
metro_distance^2,0.545542,-1929858.297767,-902276.593923,-1040170.556806
metro_distance^3,0.497292,-2075765.780396,-946335.752795,-1093995.275219


In [20]:
display_model_coefs(models)

predictor,coef,pvalue
intercept,7810197.3189,0.0
metro_distance^3,-0.0002,2.220446e-16


In [21]:
scores = perform_cross_validation(columns_to_be_used, X, y, cv=3)
scores.style.highlight_max()

Unnamed: 0,r2,max_error,neg_mean_absolute_error,neg_root_mean_squared_error
metro_distance,0.556349,-1820263.978298,-878291.06502,-1009740.654692
metro_distance^2,0.523045,-1885019.837535,-904803.022129,-1046705.77634
metro_distance^3,0.471662,-2048985.89289,-946618.61397,-1101412.971701


In [22]:
metro_model = LinearRegression()
metro_model.fit(X[['metro_distance']], y)

fig = go.Figure()
fig.add_scatter(x=flats['metro_distance'], y=flats['price'], mode='markers', name='Исходные данные')
fig.add_scatter(x=X['metro_distance'], y=metro_model.predict(X[['metro_distance']]), name='Модель')
fig.update_layout(xaxis_title='Расстояние до метро (м)', yaxis_title='Цена (₽)',
                  title='Зависимость цены квартиры от расстояния до метро')
fig.show()

Unsupported

# Модель на основе площади квартиры и расстояния до метро

In [23]:
fig = px.scatter_3d(flats, x='square_meters', y='metro_distance', z='price')
fig.update_layout(scene={'xaxis_title': 'Площадь (м²)', 'yaxis_title': 'Расстояние до метро (м)', 'zaxis_title': 'Цена (₽)'},
                  title='Зависимость цены квартиры от площади и расстояния до метро')
fig.show()

Unsupported

In [24]:
poly = PolynomialFeatures(degree=2, include_bias=False)
features = poly.fit_transform(flats[['square_meters', 'metro_distance']])

X = pd.DataFrame(features, columns=poly.get_feature_names_out())
y = flats['price']

pd.concat([X, y], axis=1)

Unnamed: 0,square_meters,metro_distance,square_meters^2,square_meters metro_distance,metro_distance^2,price
0,28.0,677.0,784.0,18956.0,458329.0,7328400
1,54.0,120.0,2916.0,6480.0,14400.0,9754600
2,43.0,1344.0,1849.0,57792.0,1806336.0,6781500
3,33.0,1003.0,1089.0,33099.0,1006009.0,6784100
4,30.0,591.0,900.0,17730.0,349281.0,7296200
...,...,...,...,...,...,...
95,28.0,2073.0,784.0,58044.0,4297329.0,4826700
96,63.0,2120.0,3969.0,133560.0,4494400.0,6800800
97,33.0,1411.0,1089.0,46563.0,1990921.0,6517500
98,47.0,1715.0,2209.0,80605.0,2941225.0,6812100


In [25]:
fig = px.imshow(pd.concat([X, y], axis=1).corr())
fig.update_layout(title='Корреляция между переменными')

Unsupported

In [26]:
columns_to_be_used = [['square_meters', 'metro_distance'],
                      ['square_meters', 'metro_distance', 'square_meters metro_distance'],
                      ['square_meters^2', 'metro_distance^2'],
                      ['square_meters^2', 'metro_distance^2', 'square_meters metro_distance']]
models = create_and_fit_models(columns_to_be_used, X, y)

scores = score_models(models)
scores.style.highlight_max()

Unnamed: 0,r2,max_error,neg_mean_absolute_error,neg_root_mean_squared_error
square_meters + metro_distance,0.977458,-468912.499566,-188069.835656,-231660.844156
square_meters + metro_distance + square_meters metro_distance,0.97748,-477542.734832,-187830.203066,-231550.11767
square_meters^2 + metro_distance^2,0.942989,-981051.723072,-305514.830113,-368415.90121
square_meters^2 + metro_distance^2 + square_meters metro_distance,0.947349,-965513.603033,-287539.055507,-354047.900904


In [27]:
display_model_coefs(models)

predictor,coef,pvalue
intercept,6648850.4674,0.0
square_meters^2,815.4056,0.0
metro_distance^2,-0.4958,0.0
square_meters metro_distance,-6.7275,0.005568906


In [28]:
scores = perform_cross_validation(columns_to_be_used, X, y, cv=3)
scores.style.highlight_max()

Unnamed: 0,r2,max_error,neg_mean_absolute_error,neg_root_mean_squared_error
square_meters + metro_distance,0.975159,-486071.148972,-198798.078751,-239727.152503
square_meters + metro_distance + square_meters metro_distance,0.974325,-510081.774306,-201892.737591,-243618.326519
square_meters^2 + metro_distance^2,0.936035,-823993.014414,-315148.250434,-379047.825932
square_meters^2 + metro_distance^2 + square_meters metro_distance,0.938875,-732715.320577,-307162.295228,-371181.383373


In [29]:
area_metro_model = LinearRegression()
area_metro_model.fit(X[['square_meters', 'metro_distance']], y)

fig = go.Figure()
fig.add_scatter3d(x=flats['square_meters'], y=flats['metro_distance'], z=flats['price'], mode='markers',
                  name='Исходные данные')
fig.add_mesh3d(x=X['square_meters'], y=X['metro_distance'],
               z=area_metro_model.predict(X[['square_meters', 'metro_distance']]), name='Модель')
fig.update_layout(
    scene={'xaxis_title': 'Площадь (м²)', 'yaxis_title': 'Расстояние до метро (м)', 'zaxis_title': 'Цена (₽)'},
    title='Зависимость цены квартиры от площади и расстояния до метро', showlegend=True)
fig.show()

Unsupported

# Результат

In [30]:
poly = PolynomialFeatures(degree=2, include_bias=False)
features = poly.fit_transform(flats[['square_meters', 'metro_distance']])

X = pd.DataFrame(features, columns=poly.get_feature_names_out())
y = flats['price']

In [31]:
columns_to_be_used = [['square_meters'], ['metro_distance'], ['square_meters', 'metro_distance']]
models = create_and_fit_models(columns_to_be_used, X, y)

scores = score_models(models)
scores.style.highlight_max()

Unnamed: 0,r2,max_error,neg_mean_absolute_error,neg_root_mean_squared_error
square_meters,0.452221,-2184374.821151,-990261.942495,-1141985.12729
metro_distance,0.577598,-1794925.935491,-870605.15841,-1002814.034524
square_meters + metro_distance,0.977458,-468912.499566,-188069.835656,-231660.844156


In [32]:
display_model_coefs(models)

predictor,coef,pvalue
intercept,6496840.1066,0.0
square_meters,58525.5431,0.0
metro_distance,-1603.4599,0.0


In [33]:
scores = perform_cross_validation(columns_to_be_used, X, y, cv=3)
scores.style.highlight_max()

Unnamed: 0,r2,max_error,neg_mean_absolute_error,neg_root_mean_squared_error
square_meters,0.387025,-2203037.197201,-1022485.924633,-1179149.370456
metro_distance,0.556349,-1820263.978298,-878291.06502,-1009740.654692
square_meters + metro_distance,0.975159,-486071.148972,-198798.078751,-239727.152503
