In [1]:
import numpy as np
import pandas as pd

In [2]:
data = pd.read_csv('cleaned_data.csv', sep='=')

In [3]:
data.head()

Unnamed: 0,district,total_area,living_area,kitchen_area,description,price,rooms,level,first,last,max_level
0,Правобережный,42.1,27.0,6.0,"Район с развитой инфраструктурой, рядом ""Парк ...",3050.0,2,4,0,0,5
1,Орджоникидзевский,49.2,25.0,13.0,"Продам квартиру в отличном состоянии, лоджия з...",3800.0,1,10,0,1,10
2,Правобережный,22.0,12.0,6.0,"Продаётся однокомнатная квартира ""малосемейка""...",1800.0,1,4,0,0,9
3,Ленинский,30.0,18.2,5.6,Продам 1 комнатную квартиру в Ленинском районе...,2200.0,1,5,0,1,5
4,Правобережный,53.0,35.0,9.0,id:30409. Продается двухкомнатная квартира у...,4100.0,2,1,1,0,9


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 364 entries, 0 to 363
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   district      364 non-null    object 
 1   total_area    364 non-null    float64
 2   living_area   364 non-null    float64
 3   kitchen_area  364 non-null    float64
 4   description   357 non-null    object 
 5   price         364 non-null    float64
 6   rooms         364 non-null    int64  
 7   level         364 non-null    int64  
 8   first         364 non-null    int64  
 9   last          364 non-null    int64  
 10  max_level     364 non-null    int64  
dtypes: float64(4), int64(5), object(2)
memory usage: 31.4+ KB


In [5]:
data.groupby('district').nunique()

Unnamed: 0_level_0,total_area,living_area,kitchen_area,description,price,rooms,level,first,last,max_level
district,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Ленинский,77,46,22,101,76,4,9,2,2,9
Ленинский (левый берег),4,4,3,4,4,4,4,2,2,3
Орджоникидзевский,94,60,26,158,105,4,12,2,2,12
Орджоникидзевский (левый берег),18,13,9,21,19,5,4,2,2,5
Правобережный,59,38,15,71,58,4,11,2,2,11


In [6]:
data.describe()

Unnamed: 0,total_area,living_area,kitchen_area,price,rooms,level,first,last,max_level
count,364.0,364.0,364.0,364.0,364.0,364.0,364.0,364.0,364.0
mean,55.167967,35.475824,8.711786,3791.137363,2.228022,3.953297,0.203297,0.244505,6.60989
std,17.542958,13.456415,3.325647,1537.192741,0.849687,2.64742,0.403005,0.430385,3.299806
min,21.8,9.0,2.0,450.0,1.0,1.0,0.0,0.0,1.0
25%,43.03,27.7,6.0,2750.0,2.0,2.0,0.0,0.0,4.0
50%,53.15,33.8,8.0,3555.0,2.0,4.0,0.0,0.0,5.0
75%,65.125,45.0,9.0,4592.5,3.0,5.0,0.0,0.0,9.0
max,114.0,83.0,25.0,9500.0,5.0,14.0,1.0,1.0,17.0


# Построение моделей

Для построения моделей отбрасываем признаки описания и общей площади квартиры.
Данные разбиваем на 3 части: train, val и test.

In [7]:
from sklearn.model_selection import train_test_split
y = data['price']
X = data.drop(['price', 'description', 'total_area'], axis=1)
train_x, test_val_x, train_y, test_val_y = train_test_split(X, y,
                                                    test_size=0.2,
                                                    random_state=42,
                                                    stratify=data['district'])
val_x, test_x, val_y, test_y = train_test_split(test_val_x, test_val_y,
                                                    test_size=0.5,
                                                    random_state=42)

In [8]:
cat_columns = ['rooms', 'district']

In [9]:
from sklearn.preprocessing import OneHotEncoder

coder_rooms = OneHotEncoder(sparse_output=False)
coder_rooms.fit(np.array(train_x['rooms']).reshape(-1, 1))

train_x = pd.concat([train_x, pd.DataFrame(coder_rooms.transform(np.array(train_x['rooms']).reshape(-1, 1)),
                                           columns=coder_rooms.get_feature_names_out(['rooms']),
                                          index=train_x.index)],
                     axis=1)
train_x = train_x.drop(['rooms'], axis=1)

test_x = pd.concat([test_x, pd.DataFrame(coder_rooms.transform(np.array(test_x['rooms']).reshape(-1, 1)),
                                        columns=coder_rooms.get_feature_names_out(['rooms']),
                                        index=test_x.index)],
                     axis=1)
test_x = test_x.drop(['rooms'], axis=1)

val_x = pd.concat([val_x, pd.DataFrame(coder_rooms.transform(np.array(val_x['rooms']).reshape(-1, 1)),
                                        columns=coder_rooms.get_feature_names_out(['rooms']),
                                        index=val_x.index)],
                     axis=1)
val_x = val_x.drop(['rooms'], axis=1)
# dump(coder_rooms, 'rooms_coder.joblib')

In [10]:
coder_district = OneHotEncoder(sparse_output=False)
coder_district.fit(np.array(train_x['district']).reshape(-1, 1))

train_x = pd.concat([train_x, pd.DataFrame(coder_district.transform(np.array(train_x['district']).reshape(-1, 1)),
                                           columns=coder_district.get_feature_names_out(['district']),
                                          index=train_x.index)],
                     axis=1)
train_x = train_x.drop(['district'], axis=1)

test_x = pd.concat([test_x, pd.DataFrame(coder_district.transform(np.array(test_x['district']).reshape(-1, 1)),
                                        columns=coder_district.get_feature_names_out(['district']),
                                        index=test_x.index)],
                     axis=1)
test_x = test_x.drop(['district'], axis=1)

val_x = pd.concat([val_x, pd.DataFrame(coder_district.transform(np.array(val_x['district']).reshape(-1, 1)),
                                        columns=coder_district.get_feature_names_out(['district']),
                                        index=val_x.index)],
                     axis=1)
val_x = val_x.drop(['district'], axis=1)
# dump(coder_district, 'district_coder.joblib')

In [11]:
train_x

Unnamed: 0,living_area,kitchen_area,level,first,last,max_level,rooms_1,rooms_2,rooms_3,rooms_4,rooms_5,district_Ленинский,district_Ленинский (левый берег),district_Орджоникидзевский,district_Орджоникидзевский (левый берег),district_Правобережный
193,17.0,6.0,6,0,0,9,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
156,45.0,6.0,5,0,1,5,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
55,41.0,6.2,2,0,0,5,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
231,41.3,6.3,2,0,0,5,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
238,55.0,13.0,6,0,0,10,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
292,60.0,12.0,2,0,0,16,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
221,43.0,8.0,5,0,1,5,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
105,21.0,6.0,3,0,0,10,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
135,20.0,9.0,8,0,0,10,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [12]:
data.describe()

Unnamed: 0,total_area,living_area,kitchen_area,price,rooms,level,first,last,max_level
count,364.0,364.0,364.0,364.0,364.0,364.0,364.0,364.0,364.0
mean,55.167967,35.475824,8.711786,3791.137363,2.228022,3.953297,0.203297,0.244505,6.60989
std,17.542958,13.456415,3.325647,1537.192741,0.849687,2.64742,0.403005,0.430385,3.299806
min,21.8,9.0,2.0,450.0,1.0,1.0,0.0,0.0,1.0
25%,43.03,27.7,6.0,2750.0,2.0,2.0,0.0,0.0,4.0
50%,53.15,33.8,8.0,3555.0,2.0,4.0,0.0,0.0,5.0
75%,65.125,45.0,9.0,4592.5,3.0,5.0,0.0,0.0,9.0
max,114.0,83.0,25.0,9500.0,5.0,14.0,1.0,1.0,17.0


Шкалируем данные.

In [13]:
from sklearn.preprocessing import StandardScaler
from joblib import dump, load

def SS_scaling_initial(data, file_name='SS_scaler_1.joblib'):
    """Traines scaler, transforms data and saves scaler"""
    scaler = StandardScaler()
    scaler.fit(data)
    dump(scaler, file_name) 
    return pd.DataFrame(scaler.transform(data), columns = data.columns)


def SS_scale(data, file_name='SS_scaler_1.joblib'):
    """Loads scaler and applies it to data"""
    scaler = load(file_name)
    return pd.DataFrame(scaler.transform(data), columns = data.columns)

In [14]:
train_x_sc = SS_scaling_initial(train_x)

In [15]:
test_x_sc = SS_scale(test_x)
val_x_sc = SS_scale(val_x)
test_x_sc.head()

Unnamed: 0,living_area,kitchen_area,level,first,last,max_level,rooms_1,rooms_2,rooms_3,rooms_4,rooms_5,district_Ленинский,district_Ленинский (левый берег),district_Орджоникидзевский,district_Орджоникидзевский (левый берег),district_Правобережный
0,0.963573,-0.362188,-0.340875,-0.520329,1.776835,-1.061366,-0.530979,-0.831782,1.45912,-0.216247,-0.083189,1.59649,-0.102062,-0.886158,-0.264297,-0.504292
1,-1.268494,-0.838664,-1.098086,1.921861,-0.562798,-0.46454,1.883315,-0.831782,-0.685344,-0.216247,-0.083189,1.59649,-0.102062,-0.886158,-0.264297,-0.504292
2,-1.714908,-1.156316,-0.71948,-0.520329,-0.562798,-0.46454,1.883315,-0.831782,-0.685344,-0.216247,-0.083189,-0.626374,-0.102062,1.128467,-0.264297,-0.504292
3,-0.226863,-0.203362,-0.71948,-0.520329,-0.562798,-1.061366,-0.530979,1.202239,-0.685344,-0.216247,-0.083189,1.59649,-0.102062,-0.886158,-0.264297,-0.504292
4,0.264192,-0.838664,-1.098086,1.921861,-0.562798,-1.359779,-0.530979,-0.831782,1.45912,-0.216247,-0.083189,-0.626374,-0.102062,1.128467,-0.264297,-0.504292


In [16]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error

In [17]:
def report(test_y, prediction):
    print('MSE: ', round(mean_squared_error(test_y, prediction), 2))
    print('MAE: ', round(mean_absolute_error(test_y, prediction), 2))
    print('MAPE: ', round(mean_absolute_percentage_error(test_y, prediction)*100, 2))

## Линейная регрессия

In [18]:
from sklearn.linear_model import LinearRegression

model = LinearRegression().fit(train_x_sc, train_y)
prediction = model.predict(test_x_sc)
report(test_y, prediction)

MSE:  543019.0
MAE:  508.56
MAPE:  26.67


## Метод опорных векторов

Сначала "из коробки".

In [19]:
from sklearn.svm import SVR

model = SVR().fit(train_x_sc, train_y)
prediction = model.predict(test_x_sc)
report(test_y, prediction)

MSE:  2224278.93
MAE:  1214.29
MAPE:  44.52


Теперь подберем оптимальные параметры на валидационной выборке и проверим результат на тесте.

In [20]:
import optuna

  from .autonotebook import tqdm as notebook_tqdm


In [21]:
def objective_SVR(trial: optuna.Trial):
    coef0 = trial.suggest_float('coef0', 0.0, 1.5, step=0.1)
    C = trial.suggest_float('C', 0.01, 5.0, step=0.5)
    # kernel = trial.suggest_categorical('kernel', ['poly', 'rbf', 'sigmoid'])
    degree = trial.suggest_int('degree', 2, 8)
    
    model = SVR(C=C,
                kernel='poly',
                degree=degree,
                gamma='auto',
                coef0=coef0)

    model.fit(train_x, train_y)
    prediction = model.predict(val_x)
    mse = mean_squared_error(val_y, prediction)
    mape = mean_absolute_percentage_error(val_y, prediction)

    return mape, mse

In [22]:
study_SVR = optuna.create_study(directions=["minimize", "minimize"])
study_SVR.optimize(objective_SVR, n_trials=100, n_jobs=4, gc_after_trial=True)

[I 2024-01-06 07:28:45,044] A new study created in memory with name: no-name-d8e5f0b7-6ca5-4ef9-894a-ff588675a346
[I 2024-01-06 07:28:45,086] Trial 2 finished with values: [0.14282869962531214, 682199.8241842937] and parameters: {'coef0': 0.8, 'C': 3.51, 'degree': 2}. 
[I 2024-01-06 07:28:45,153] Trial 4 finished with values: [0.15807324239303883, 804179.1592130791] and parameters: {'coef0': 0.5, 'C': 1.51, 'degree': 2}. 
[I 2024-01-06 08:50:53,534] Trial 3 finished with values: [10.498885313107362, 50556873737.65551] and parameters: {'coef0': 1.2000000000000002, 'C': 1.51, 'degree': 5}. 
[I 2024-01-06 12:45:24,340] Trial 0 finished with values: [7.4068324011188595, 8707588901.734774] and parameters: {'coef0': 1.5, 'C': 1.01, 'degree': 5}. 
[I 2024-01-06 12:48:19,324] Trial 7 finished with values: [0.3576720087423377, 12238877.41827811] and parameters: {'coef0': 1.0, 'C': 0.51, 'degree': 4}. 


KeyboardInterrupt: 

In [23]:
study_SVR.best_trials

[FrozenTrial(number=2, state=TrialState.COMPLETE, values=[0.14282869962531214, 682199.8241842937], datetime_start=datetime.datetime(2024, 1, 6, 7, 28, 45, 49680), datetime_complete=datetime.datetime(2024, 1, 6, 7, 28, 45, 86593), params={'coef0': 0.8, 'C': 3.51, 'degree': 2}, user_attrs={}, system_attrs={'nsga2:generation': 0}, intermediate_values={}, distributions={'coef0': FloatDistribution(high=1.5, log=False, low=0.0, step=0.1), 'C': FloatDistribution(high=4.51, log=False, low=0.01, step=0.5), 'degree': IntDistribution(high=8, log=False, low=2, step=1)}, trial_id=2, value=None)]

In [49]:
model = SVR(C=3.51,
            kernel='poly',
            gamma='auto',
            degree=2,
            coef0=0.8).fit(train_x_sc, train_y)
prediction = model.predict(test_x_sc)
report(test_y, prediction)

MSE:  1851270.63
MAE:  1094.79
MAPE:  41.61


# CatBoostRegressor

Здесь используем модель "из коробки". Она сама определяет оптимальные параметры.

In [23]:
from catboost import CatBoostRegressor

model = CatBoostRegressor(logging_level='Silent').fit(train_x_sc, train_y)
prediction = model.predict(test_x_sc)
report(test_y, prediction)

MSE:  458585.83
MAE:  450.45
MAPE:  22.19


In [24]:
features_results = model.get_feature_importance()
columns = train_x_sc.columns
for index in range(len(features_results)):
    print(columns[index], ':', features_results[index].round(3))

living_area : 35.516
kitchen_area : 20.253
level : 6.339
first : 1.008
last : 4.494
max_level : 10.87
rooms_1 : 3.186
rooms_2 : 2.804
rooms_3 : 1.641
rooms_4 : 0.589
rooms_5 : 0.049
district_Ленинский : 2.265
district_Ленинский (левый берег) : 0.089
district_Орджоникидзевский : 2.945
district_Орджоникидзевский (левый берег) : 4.975
district_Правобережный : 2.976


# Случайный лес

In [25]:
from sklearn.ensemble import RandomForestRegressor

In [26]:
model = RandomForestRegressor(random_state=42).fit(train_x_sc, train_y)
prediction = model.predict(test_x_sc)
report(test_y, prediction)

MSE:  473781.81
MAE:  489.43
MAPE:  21.81


Попробуем подобрать параметры.

In [49]:
def objective_forest(trial: optuna.Trial):
    n_estimators = trial.suggest_int('n_estimators', 10, 1000, step=5)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 5)
    criterion = trial.suggest_categorical('criterion', ['squared_error', 'absolute_error',
                                                        'friedman_mse', 'poisson'])
    max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2',
                                                              1.0, 0.5])

    model = RandomForestRegressor(n_estimators=n_estimators, criterion=criterion,
                                  min_samples_split=min_samples_split, max_features=max_features,
                                  random_state=42)


    model.fit(train_x, train_y)
    prediction = model.predict(val_x)
    mse = mean_squared_error(val_y, prediction)
    mape = mean_absolute_percentage_error(val_y, prediction)

    return mape, mse

In [50]:
study_forest = optuna.create_study(directions=["minimize", "minimize"])
study_forest.optimize(objective_forest, n_trials=1000, n_jobs=8, gc_after_trial=True)

[I 2024-01-07 01:49:36,933] A new study created in memory with name: no-name-2dd6df7a-105f-4b55-a101-e319bf8d3ada
[I 2024-01-07 01:49:39,361] Trial 4 finished with values: [0.11075879639083058, 409553.3603113961] and parameters: {'n_estimators': 300, 'min_samples_split': 2, 'criterion': 'squared_error', 'max_features': 0.5}. 
[I 2024-01-07 01:49:40,177] Trial 2 finished with values: [0.12156816637514771, 532736.4884710037] and parameters: {'n_estimators': 580, 'min_samples_split': 3, 'criterion': 'squared_error', 'max_features': 'sqrt'}. 
[I 2024-01-07 01:49:40,929] Trial 0 finished with values: [0.11927488485190667, 438474.09182120516] and parameters: {'n_estimators': 670, 'min_samples_split': 3, 'criterion': 'squared_error', 'max_features': 0.5}. 
[I 2024-01-07 01:49:41,479] Trial 5 finished with values: [0.11233664000211092, 426392.84879098175] and parameters: {'n_estimators': 690, 'min_samples_split': 2, 'criterion': 'squared_error', 'max_features': 0.5}. 
[I 2024-01-07 01:49:42,07

In [51]:
study_forest.best_trials

[FrozenTrial(number=669, state=TrialState.COMPLETE, values=[0.115078818668294, 396990.0742677184], datetime_start=datetime.datetime(2024, 1, 7, 1, 55, 32, 42515), datetime_complete=datetime.datetime(2024, 1, 7, 1, 55, 32, 300500), params={'n_estimators': 40, 'min_samples_split': 4, 'criterion': 'friedman_mse', 'max_features': 0.5}, user_attrs={}, system_attrs={'nsga2:generation': 11}, intermediate_values={}, distributions={'n_estimators': IntDistribution(high=1000, log=False, low=10, step=5), 'min_samples_split': IntDistribution(high=5, log=False, low=2, step=1), 'criterion': CategoricalDistribution(choices=('squared_error', 'absolute_error', 'friedman_mse', 'poisson')), 'max_features': CategoricalDistribution(choices=('sqrt', 'log2', 1.0, 0.5))}, trial_id=669, value=None),
 FrozenTrial(number=678, state=TrialState.COMPLETE, values=[0.11587176218505564, 396157.7077080608], datetime_start=datetime.datetime(2024, 1, 7, 1, 55, 34, 686945), datetime_complete=datetime.datetime(2024, 1, 7, 1

In [27]:
model = RandomForestRegressor(n_estimators=195, min_samples_split=2,criterion='squared_error', max_features=0.5,
                             random_state=42).fit(train_x_sc, train_y)
prediction = model.predict(test_x_sc)
report(test_y, prediction)

MSE:  476920.29
MAE:  456.5
MAPE:  22.0


In [29]:
features_results = model.feature_importances_
columns = train_x_sc.columns
for index in range(len(features_results)):
    print(columns[index], ':', features_results[index].round(3))

living_area : 0.393
kitchen_area : 0.22
level : 0.047
first : 0.01
last : 0.016
max_level : 0.097
rooms_1 : 0.053
rooms_2 : 0.021
rooms_3 : 0.05
rooms_4 : 0.009
rooms_5 : 0.001
district_Ленинский : 0.016
district_Ленинский (левый берег) : 0.003
district_Орджоникидзевский : 0.013
district_Орджоникидзевский (левый берег) : 0.037
district_Правобережный : 0.014


Возьмем лучшую модель и проверим на нескольких объявлениях с Avito для Магнитогорска.

In [30]:
best_model = RandomForestRegressor(random_state=42).fit(train_x_sc, train_y)

In [40]:
test_tasks = {'living_area': [17.0, 31.0, 44.0],
              'kitchen_area': [6.0, 6.0, 9.0],
              'level': [6, 5, 9],
              'first': [0, 0, 0],
              'last': [0, 1, 1],
              'max_level': [9, 5, 9],
              'rooms': [1, 2, 3],
              'district': ['Правобережный', 'Правобережный', 'Орджоникидзевский'],
              'price': [1850, 2950, 4900]
             }
                  
independent_test = pd.DataFrame(test_tasks)
independent_test_true_price = independent_test['price']
independent_test = independent_test.drop(['price'], axis=1)

In [41]:
independent_test

Unnamed: 0,living_area,kitchen_area,level,first,last,max_level,rooms,district
0,17.0,6.0,6,0,0,9,1,Правобережный
1,31.0,6.0,5,0,1,5,2,Правобережный
2,44.0,9.0,9,0,1,9,3,Орджоникидзевский


In [42]:
independent_test = pd.concat([independent_test, pd.DataFrame(coder_rooms.transform(np.array(independent_test['rooms']).reshape(-1, 1)),
                                                             columns=coder_rooms.get_feature_names_out(['rooms']),
                                                             index=independent_test.index)],
                             axis=1)
independent_test = independent_test.drop(['rooms'], axis=1)

independent_test = pd.concat([independent_test, pd.DataFrame(coder_district.transform(np.array(independent_test['district']).reshape(-1, 1)),
                                                             columns=coder_district.get_feature_names_out(['district']),
                                                             index=independent_test.index)],
                             axis=1)
independent_test = independent_test.drop(['district'], axis=1)

independent_test = SS_scale(independent_test)

In [46]:
independent_prediction = best_model.predict(independent_test)

In [47]:
independent_prediction

array([1958.2, 2898. , 4201.4])

In [48]:
report(independent_test_true_price, independent_prediction)

MSE:  167484.4
MAE:  286.27
MAPE:  7.29


Модель опирается на очень грубые свойства объекта недвижимости и при этом дает неплохую точность. 

In [127]:
# dump(model, 'random_forest_model.joblib') 