In [34]:
import argparse
import logging
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error as mae, mean_absolute_percentage_error as mape, make_scorer
from joblib import dump
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression, Ridge, Lasso, HuberRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error as mae, mean_absolute_percentage_error as mape

import warnings
warnings.filterwarnings('ignore')

In [35]:
train = pd.read_csv('../data/proc/train.csv')
test = pd.read_csv('../data/proc/val.csv')
train.head()

Unnamed: 0,floor,floors_count,rooms_count,total_meters,price,top_bottom_floor,county_short_ВАО,county_short_ЗАО,county_short_ЗелАО,county_short_САО,county_short_СВАО,county_short_СЗАО,county_short_ТАО,county_short_ЦАО,county_short_ЮАО,county_short_ЮВАО,county_short_ЮЗАО,object_type_secondary
0,19,24,2,70.0,42000000.0,0,0,0,0,1,0,0,0,0,0,0,0,1
1,3,14,3,68.7,38500000.0,0,0,1,0,0,0,0,0,0,0,0,0,1
2,13,44,1,34.3,16600000.0,0,0,0,0,0,0,1,0,0,0,0,0,0
3,6,17,3,72.0,18900000.0,0,0,0,0,0,1,0,0,0,0,0,0,1
4,3,5,1,20.3,5999999.0,0,0,0,0,0,1,0,0,0,0,0,0,1


In [36]:
X = train.drop(columns='price')
y = np.log(train.price)

X_test = test.drop(columns='price')
y_test = test.price

# Классическая линейная регрессия с ln(y)

In [37]:
lm = LinearRegression().fit(X, y)
y_fitted = np.exp(lm.predict(X))
y_predicted = np.exp(lm.predict(X_test))

print('R2 on train: %.3f'%lm.score(X, y), '\n')

print('MAE on train: %.0f '% mae(y, y_fitted))
print('MAE on test: %.0f'% mae(y_test, y_predicted), '\n')

print('MAPE on train: %.3f '% mape(np.exp(y), y_fitted))
print('MAPE on test: %.3f '% mape(y_test, y_predicted))

R2 on train: 0.809 

MAE on train: 41500174 
MAE on test: 16443532 

MAPE on train: 0.327 
MAPE on test: 0.306 


# Pipelines with Scaler, dimentionaly reduction and regularization

In [8]:
# Pipeline with dimentionality reduction, target metric - R-square

pipeline = Pipeline([
('scaler', StandardScaler()),
('pca', PCA()),
('regressor', Lasso())
])

param_grid = {'scaler': [StandardScaler(), MinMaxScaler(), RobustScaler()],
    'pca': [PCA()],         
    'pca__n_components': range(1, 6),
    'regressor': [Lasso(), Ridge(), HuberRegressor()],
    'regressor__alpha': [0, 0.01, 0.1, 1.0, 10, 100]
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='r2', verbose=1, n_jobs=10)
%time grid_search.fit(X, y)

print("Best params :", grid_search.best_params_)
print("Best R2 %.3f"% grid_search.best_score_)

Fitting 5 folds for each of 270 candidates, totalling 1350 fits
CPU times: total: 5.42 s
Wall time: 20.3 s
Best params : {'pca': PCA(n_components=5), 'pca__n_components': 5, 'regressor': Ridge(alpha=0.1), 'regressor__alpha': 0.1, 'scaler': StandardScaler()}
Best R2 0.758


In [11]:
# Pipeline with dimentionality reduction, target metric - MAE

pipeline = Pipeline([
('scaler', StandardScaler()),
('pca', PCA()),
('regressor', Lasso())
])

param_grid = {'scaler': [StandardScaler(), MinMaxScaler(), RobustScaler()],
    'pca': [PCA()],         
    'pca__n_components': range(1, 6),
    'regressor': [Lasso(), Ridge(), HuberRegressor()],
    'regressor__alpha': [0, 0.01, 0.1, 1.0, 10, 100]
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_absolute_error', verbose=1, n_jobs=10)
%time grid_search.fit(X, y)

print("Best params :", grid_search.best_params_)
print("Best MAE %.3f"% -grid_search.best_score_)

Fitting 5 folds for each of 270 candidates, totalling 1350 fits
CPU times: total: 6.3 s
Wall time: 20.6 s
Best params : {'pca': PCA(n_components=5), 'pca__n_components': 5, 'regressor': HuberRegressor(alpha=0.01), 'regressor__alpha': 0.01, 'scaler': RobustScaler()}
Best MAE 0.334


In [10]:
# Pipeline with dimentionality reduction, target metric - MAE

pipeline = Pipeline([
('scaler', StandardScaler()),
('pca', PCA()),
('regressor', Lasso())
])

param_grid = {'scaler': [StandardScaler(), MinMaxScaler(), RobustScaler()],
    'pca': [PCA()],         
    'pca__n_components': range(1, 6),
    'regressor': [Lasso(), Ridge(), HuberRegressor()],
    'regressor__alpha': [0, 0.01, 0.1, 1.0, 10, 100]
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_absolute_percentage_error', verbose=1, n_jobs=10)
%time grid_search.fit(X, y)

print("Best params :", grid_search.best_params_)
print("Best MAPE %.3f"% -grid_search.best_score_)

Fitting 5 folds for each of 270 candidates, totalling 1350 fits
CPU times: total: 4.95 s
Wall time: 20.7 s
Best params : {'pca': PCA(n_components=5), 'pca__n_components': 5, 'regressor': HuberRegressor(alpha=0.01), 'regressor__alpha': 0.01, 'scaler': RobustScaler()}
Best MAPE 0.019


In [13]:
pipeline = Pipeline([
('scaler', grid_search.best_params_['scaler']),
('pca', grid_search.best_params_['pca']),
('regressor', grid_search.best_params_['regressor'])
])

pipeline.fit(X, y)
print('Pipeline R-square: %.3f'%pipeline.score(X, y))
fitted = pipeline.predict(X)
print('Pipeline MAE: %.0f'%mae(np.exp(y), np.exp(fitted)))
print('Pipeline MAPE: %.4f'%mape(np.exp(y), np.exp(fitted)))

Pipeline R-square: 0.749
Pipeline MAE: 20417572
Pipeline MAPE: 0.3793


In [14]:
predicted = pipeline.predict(X_test)
print('Pipeline test MAE: %.0f'%mae(y_test, np.exp(predicted)))
print('Pipeline test MAPE: %.4f'%mape(y_test, np.exp(predicted)))

Pipeline test MAE: 20092720
Pipeline test MAPE: 0.3552


# Pipelines with Scaler and Regularization

In [15]:
# Pipeline with dimentionality reduction, target metric - R-square

pipeline = Pipeline([
('scaler', StandardScaler()),
('regressor', Lasso())
])

param_grid = {'scaler': [StandardScaler(), MinMaxScaler(), RobustScaler()],
    'regressor': [Lasso(), Ridge(), HuberRegressor()],
    'regressor__alpha': [0, 0.01, 0.1, 1.0, 10, 100]
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='r2', verbose=1, n_jobs=10)
%time grid_search.fit(X, y)

print("Best params :", grid_search.best_params_)
print("Best R2 %.3f"% grid_search.best_score_)

Fitting 5 folds for each of 54 candidates, totalling 270 fits
CPU times: total: 1.23 s
Wall time: 5.79 s
Best params : {'regressor': Ridge(alpha=10), 'regressor__alpha': 10, 'scaler': StandardScaler()}
Best R2 0.808


In [16]:
# Pipeline with dimentionality reduction, target metric - MAE

pipeline = Pipeline([
('scaler', StandardScaler()),
('regressor', Lasso())
])

param_grid = {'scaler': [StandardScaler(), MinMaxScaler(), RobustScaler()],
    'regressor': [Lasso(), Ridge(), HuberRegressor()],
    'regressor__alpha': [0, 0.01, 0.1, 1.0, 10, 100]
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_absolute_error', verbose=1, n_jobs=10)
%time grid_search.fit(X, y)

print("Best params :", grid_search.best_params_)
print("Best MAE %.3f"% -grid_search.best_score_)

Fitting 5 folds for each of 54 candidates, totalling 270 fits
CPU times: total: 1.25 s
Wall time: 6.24 s
Best params : {'regressor': HuberRegressor(alpha=1.0), 'regressor__alpha': 1.0, 'scaler': RobustScaler()}
Best MAE 0.300


In [17]:
# Pipeline with dimentionality reduction, target metric - MAE

pipeline = Pipeline([
('scaler', StandardScaler()),
('regressor', Lasso())
])

param_grid = {'scaler': [StandardScaler(), MinMaxScaler(), RobustScaler()],
    'regressor': [Lasso(), Ridge(), HuberRegressor()],
    'regressor__alpha': [0, 0.01, 0.1, 1.0, 10, 100]
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_absolute_percentage_error', verbose=1, n_jobs=10)
%time grid_search.fit(X, y)

print("Best params :", grid_search.best_params_)
print("Best MAPE %.3f"% -grid_search.best_score_)

Fitting 5 folds for each of 54 candidates, totalling 270 fits
CPU times: total: 1.12 s
Wall time: 6.1 s
Best params : {'regressor': HuberRegressor(alpha=1.0), 'regressor__alpha': 1.0, 'scaler': RobustScaler()}
Best MAPE 0.017


In [18]:
pipeline = Pipeline([
('scaler', grid_search.best_params_['scaler']),
('regressor', grid_search.best_params_['regressor'])
])

pipeline.fit(X, y)
print('Pipeline R-square: %.3f'%pipeline.score(X, y))
fitted = pipeline.predict(X)
print('Pipeline MAE: %.0f'%mae(np.exp(y), np.exp(fitted)))
print('Pipeline MAPE: %.4f'%mape(np.exp(y), np.exp(fitted)))

Pipeline R-square: 0.804
Pipeline MAE: 16798705
Pipeline MAPE: 0.3339


In [19]:
predicted = pipeline.predict(X_test)
print('Pipeline test MAE: %.0f'%mae(y_test, np.exp(predicted)))
print('Pipeline test MAPE: %.4f'%mape(y_test, np.exp(predicted)))

Pipeline test MAE: 16925166
Pipeline test MAPE: 0.3100


# Random Forest

In [40]:
rf = RandomForestRegressor(random_state=0).fit(X, y)
y_fitted = np.exp(rf.predict(X))
y_predicted = np.exp(rf.predict(X_test))

print('R2 on train: %.3f'%rf.score(X, y), '\n')

print('MAE on train: %.0f '% mae(np.exp(y), y_fitted))
print('MAE on test: %.0f'% mae(y_test, y_predicted), '\n')

print('MAPE on train: %.3f '% mape(np.exp(y), y_fitted))
print('MAPE on test: %.3f '% mape(y_test, y_predicted))

R2 on train: 0.986 

MAE on train: 3785488 
MAE on test: 10131021 

MAPE on train: 0.075 
MAPE on test: 0.201 


In [42]:
estimator = RandomForestRegressor(random_state=0, n_jobs=-1)
param_grid = {'max_depth':  range(7, 10),
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10]}

grid_search = GridSearchCV(estimator, param_grid,cv=5, scoring='neg_mean_absolute_error', verbose=1, n_jobs=10)
%time grid_search.fit(X, y)

print("Best params :", grid_search.best_params_)
print("Best MAE %.3f"% -grid_search.best_score_)

Fitting 5 folds for each of 54 candidates, totalling 270 fits
CPU times: total: 3.8 s
Wall time: 1min 33s
Best params : {'max_depth': 9, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 2}
Best MAE 0.231


In [43]:
rf = RandomForestRegressor(random_state=0, n_jobs=-1, **grid_search.best_params_).fit(X, y)

y_fitted = np.exp(rf.predict(X))
y_predicted = np.exp(rf.predict(X_test))

print('R2 on train: %.3f'%rf.score(X, y), '\n')

print('MAE on train: %.0f '% mae(np.exp(y), y_fitted))
print('MAE on test: %.0f'% mae(y_test, y_predicted), '\n')

print('MAPE on train: %.3f '% mape(np.exp(y), y_fitted))
print('MAPE on test: %.3f '% mape(y_test, y_predicted))

R2 on train: 0.923 

MAE on train: 8759078 
MAE on test: 12001613 

MAPE on train: 0.199 
MAPE on test: 0.229 
