In [69]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

In [70]:
train = pd.read_csv("train.csv", index_col='id')
print(train.shape)
train.head(2)

(100000, 24)


Unnamed: 0_level_0,date,street_id,build_tech,floor,area,rooms,balcon,metro_dist,g_lift,n_photos,...,kw5,kw6,kw7,kw8,kw9,kw10,kw11,kw12,kw13,price
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,2011-1,560,,2,59,3,0,30.0,1.0,5,...,0,0,0,0,0,0,0,0,0,4510000
1,2011-1,667,,10,50,2,1,25.0,,1,...,0,0,0,0,0,0,0,0,0,13231000


In [71]:
test = pd.read_csv("test.csv", index_col='id')
print(test.shape)
test.head(2)

(100000, 23)


Unnamed: 0_level_0,date,street_id,build_tech,floor,area,rooms,balcon,metro_dist,g_lift,n_photos,...,kw4,kw5,kw6,kw7,kw8,kw9,kw10,kw11,kw12,kw13
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100000,2012-3,459,,1,60,3,1,30.0,0.0,4,...,0,0,0,0,0,0,0,0,0,0
100001,2012-3,344,1.0,10,52,2,1,,,4,...,0,0,0,0,0,0,0,0,0,0


**Поиск и заполнение пропущенных значений**

In [72]:
def calculate_missing_values(df):
    total_rows = df.shape[0]
    for column in df.columns:
        missing_values = df[column].isnull().sum()
        if missing_values > 0:
            missing_percentage = (missing_values / total_rows) * 100
            print(f"Столбец '{column}' имеет {missing_percentage:.2f}% пропущенных значений, тип данных: '{df[column].dtype}'")

calculate_missing_values(train)

Столбец 'build_tech' имеет 29.73% пропущенных значений, тип данных: 'float64'
Столбец 'metro_dist' имеет 4.90% пропущенных значений, тип данных: 'float64'
Столбец 'g_lift' имеет 29.99% пропущенных значений, тип данных: 'float64'


In [73]:
#Округление значения до ближайшего значения из unique_metro_dists
def round_to_nearest(value, unique_metro_dists):
    nearest_value = min(unique_metro_dists, key=lambda x: abs(x - value))
    return nearest_value

In [74]:
def preprocessing(df):
    df['metro_dist'].fillna(df.groupby('street_id')['metro_dist'].transform('mean'), inplace=True)
    df['metro_dist'].fillna(df['metro_dist'].mean(), inplace=True)
    df['metro_dist'] = df['metro_dist'].apply(round_to_nearest, args=(df['metro_dist'].unique(),))
    
    df['build_tech'].fillna(df.groupby('street_id')['build_tech'].agg(lambda x: x.value_counts().index[0]), inplace=True)
    df['build_tech'].fillna(df['build_tech'].agg(lambda x: x.value_counts().index[0]), inplace=True)


    df['g_lift'].fillna(df.groupby('street_id')['g_lift'].agg(lambda x: x.value_counts().index[0]), inplace=True)
    df['g_lift'].fillna(df['g_lift'].agg(lambda x: x.value_counts().index[0]), inplace=True)
    
    df.drop(['date'], axis=1, inplace=True)
    
    return df

**Даты в train и test не совпадают, видно, что данные были разделены именно по ним, можно их удалить**

In [75]:
train['date'].unique()

array(['2011-1', '2011-10', '2011-11', '2011-12', '2011-2', '2011-3',
       '2011-4', '2011-5', '2011-6', '2011-7', '2011-8', '2011-9',
       '2012-1', '2012-10', '2012-11', '2012-12', '2012-2', '2012-3'],
      dtype=object)

In [76]:
test['date'].unique()

array(['2012-3', '2012-4', '2012-5', '2012-6', '2012-7', '2012-8',
       '2012-9', '2013-1', '2013-10', '2013-11', '2013-12', '2013-2',
       '2013-3', '2013-4', '2013-5', '2013-6', '2013-7', '2013-8',
       '2013-9'], dtype=object)

In [77]:
train = preprocessing(train)
calculate_missing_values(train)

In [78]:
test = preprocessing(test)
calculate_missing_values(test)

In [79]:
train.head()

Unnamed: 0_level_0,street_id,build_tech,floor,area,rooms,balcon,metro_dist,g_lift,n_photos,kw1,...,kw5,kw6,kw7,kw8,kw9,kw10,kw11,kw12,kw13,price
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,560,0.0,2,59,3,0,30.0,1.0,5,0,...,0,0,0,0,0,0,0,0,0,4510000
1,667,0.0,10,50,2,1,25.0,1.0,1,0,...,0,0,0,0,0,0,0,0,0,13231000
2,90,0.0,1,48,2,0,25.0,0.0,1,0,...,0,0,0,0,0,0,0,0,0,2008000
3,94,1.0,3,62,3,1,30.0,1.0,3,0,...,0,0,0,0,0,0,0,0,0,12680000
4,232,0.0,3,60,3,0,25.0,1.0,3,0,...,0,0,0,0,0,0,0,0,0,3335000


**Подготовка данных к обучению и тестированию**

In [80]:
y = train['price']
X = train.drop(['price'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=13)

**Градиентный бустинг**

In [81]:
boosting_model = CatBoostRegressor(iterations=3000,
                           loss_function='MAE',  
                           eval_metric='MAE',  
                           logging_level='Silent',
                           early_stopping_rounds=1000)
grid = {
    'learning_rate': [0.15, 0.12, 0.25],
    'depth': [4, 6, 8, 10, 12]
}
grid_search_result = boosting_model.grid_search(grid, 
                                               X=X_train, 
                                               y=y_train,
                                               cv=5,
                                               train_size=0.8,
                                               plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	loss: 1455327.6012050	best: 1455327.6012050 (0)	total: 18.2s	remaining: 4m 15s
1:	loss: 1474313.8205987	best: 1455327.6012050 (0)	total: 36.3s	remaining: 3m 56s
2:	loss: 1436126.4163874	best: 1436126.4163874 (2)	total: 55s	remaining: 3m 40s
3:	loss: 1406069.3441049	best: 1406069.3441049 (3)	total: 1m 16s	remaining: 3m 30s
4:	loss: 1416398.9601654	best: 1406069.3441049 (3)	total: 1m 39s	remaining: 3m 18s
5:	loss: 1410936.0342794	best: 1406069.3441049 (3)	total: 2m 1s	remaining: 3m 2s
6:	loss: 1412374.3300485	best: 1406069.3441049 (3)	total: 2m 30s	remaining: 2m 52s
7:	loss: 1417551.9161642	best: 1406069.3441049 (3)	total: 2m 59s	remaining: 2m 37s
8:	loss: 1424629.1688405	best: 1406069.3441049 (3)	total: 3m 28s	remaining: 2m 19s
9:	loss: 1468910.5488835	best: 1406069.3441049 (3)	total: 4m 20s	remaining: 2m 10s
10:	loss: 1466664.5024005	best: 1406069.3441049 (3)	total: 5m 12s	remaining: 1m 53s
11:	loss: 1493676.9982952	best: 1406069.3441049 (3)	total: 6m 2s	remaining: 1m 30s
12:	loss: 

In [82]:
print("Best model parameters: " + str(grid_search_result['params'])) 
print("Model test score: " + str(round(mean_absolute_error(boosting_model.predict(X_test), y_test))))
print("Competition score: " + str(1 / (1 + mean_absolute_error(boosting_model.predict(X_test), y_test))))

Best model parameters: {'depth': 6, 'learning_rate': 0.15}
Model test score: 1369819
Competition score: 7.300231686292843e-07


In [92]:
#1374977

In [95]:
best_model = CatBoostRegressor(iterations=3000,
                               loss_function='MAE',  
                               eval_metric='MAE',  
                               logging_level='Silent',
                               depth=6,
                               learning_rate=0.15,
                               early_stopping_rounds=1000)
best_model.fit(X, y)

<catboost.core.CatBoostRegressor at 0x1d4063d7d00>

**Дерево решений**

In [39]:
tree_model = DecisionTreeRegressor(criterion="squared_error", max_depth=10)
tree_model.fit(X_train, y_train)

DecisionTreeRegressor(max_depth=10)

In [40]:
print("Model test score: " + str(round(mean_absolute_error(tree_model.predict(X_test), y_test))))

Model test score: 1819201


**Случайный лес**

In [105]:
parameters = {'n_estimators': [3000],
             'max_depth': [10, 15, 20]}
clf = RandomForestRegressor(criterion="squared_error")
gs_clf = GridSearchCV(clf, parameters, cv=5, n_jobs=-1)
gs_clf.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid={'max_depth': [10, 15, 20], 'n_estimators': [3000]})

In [106]:
print("Best model parameters: " + str(gs_clf.best_params_)) 
print("Model test score: " + str(round(mean_absolute_error(gs_clf.best_estimator_.predict(X_test), y_test))))
print("Competition score: " + str(1 / (1 + mean_absolute_error(gs_clf.best_estimator_.predict(X_test), y_test))))

Best model parameters: {'max_depth': 20, 'n_estimators': 3000}
Model test score: 1618756
Competition score: 6.177579311737805e-07


In [None]:
#'max_depth': 30, 'n_estimators': 1500
#1575394

In [100]:
best_rf_model = RandomForestRegressor(criterion="squared_error", max_depth=30, n_estimators=1500)
best_rf_model.fit(X, y)

RandomForestRegressor(max_depth=30, n_estimators=1500)

**Обработка test и предсказание результатов**

In [101]:
y_pred = best_rf_model.predict(test)

In [102]:
submission = pd.DataFrame({'id':test.index, 'price':y_pred})
submission.to_csv('submission.csv', index=False)
submission.head()

Unnamed: 0,id,price
0,100000,5223500.0
1,100001,5315222.0
2,100002,2123831.0
3,100003,4431782.0
4,100004,6456958.0
