In [110]:
import pandas as pd

In [112]:
X_full = pd.read_csv('msk_test.csv')
X_full.shape

(1472, 29)

In [None]:
X = X_full.drop(columns={'price', 'address'}, axis=1)

In [None]:
y = X_full['price']

In [None]:
X.shape

(5978, 24)

In [None]:
X.describe()

Unnamed: 0,total_area,kitchen_area,living_area,rooms_count,floor,floors_number,build_date,isСomplete,complitation_year,balcony,longitude,latitude,passenger_elevator,cargo_elevator,is_apartments,is_auction
count,5978.0,3702.0,3834.0,5248.0,5978.0,5978.0,1497.0,3797.0,3869.0,1405.0,5978.0,5978.0,3254.0,2693.0,5040.0,5978.0
mean,66.546173,14.366667,33.818258,2.102515,7.840248,16.875878,1999.291917,0.134317,2024.574309,0.8,50.76008,56.604465,2.464352,2.580765,0.063492,0.238207
std,59.392769,10.142335,33.068501,1.043265,7.354915,9.565322,31.715727,0.341037,1.351384,0.69839,17.221261,1.593255,3.030908,3.26775,0.24387,0.426022
min,12.0,1.0,2.1,1.0,-2.0,1.0,1770.0,0.0,2013.0,0.0,29.514256,54.856743,0.0,0.0,0.0,0.0
25%,38.615,8.0,16.0,1.0,3.0,9.0,1985.0,0.0,2024.0,0.0,37.565079,55.731537,1.0,1.0,0.0,0.0
50%,53.185,12.7,26.6,2.0,5.0,17.0,2013.0,0.0,2025.0,1.0,44.075142,56.219562,2.0,1.0,0.0,0.0
75%,73.8525,17.9,39.9,3.0,11.0,24.0,2020.0,0.0,2025.0,1.0,60.603778,56.819113,3.0,3.0,0.0,0.0
max,1285.8,168.7,700.0,6.0,82.0,95.0,2026.0,1.0,2028.0,4.0,83.123521,60.194665,30.0,30.0,1.0,1.0


In [None]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
categorial_cols = [cname for cname in X_train.columns if
                    X_train[cname].nunique() < 10 and 
                    X_train[cname].dtype == "object"]

In [None]:
numerical_cols = [cname for cname in X_train.columns if 
                X_train[cname].dtype in ['int64', 'float64']]

In [None]:
my_cols = categorial_cols + numerical_cols
X_train = X_train[my_cols].copy()
X_valid = X_val[my_cols].copy()

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder


In [None]:
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy="median")),
])

In [None]:
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [None]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorial_cols)
    ])

In [None]:
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(n_jobs=-1, random_state=42, verbose=1)

In [None]:
clf_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])

In [None]:
from sklearn.model_selection import GridSearchCV
params = {'model__n_estimators' : list(range(100, 501, 50)),
          'model__max_depth' :list(range(1, 8, 2)),
          }
best_clf = GridSearchCV(clf_pipeline, params, cv=5, verbose=1)

In [None]:
best_clf.fit(X_train, y_train)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.2s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.0s
[Parall

In [None]:
predicitions = best_clf.predict(X_valid)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.0s finished


In [None]:
best_clf

In [None]:
from sklearn.metrics import r2_score, mean_absolute_percentage_error, mean_squared_error, mean_absolute_error
print('MAE:', mean_absolute_error(y_val, predicitions))
print('R^2:', r2_score(y_val, predicitions))
print('MAPE:', mean_absolute_percentage_error(y_val, predicitions))
print('RMSE:', (mean_squared_error(y_val, predicitions)) ** (1/2))


MAE: 10420449.648660738
R^2: 0.6475610703818333
MAPE: 0.30855819284640185
RMSE: 56607452.964881115
