In [1]:
import argparse
import logging
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error as mae, mean_absolute_percentage_error as mape, make_scorer
from sklearn.model_selection import cross_val_score
from joblib import dump
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression, Ridge, Lasso, HuberRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error as mae, mean_absolute_percentage_error as mape

from lightgbm import LGBMRegressor
import optuna

import warnings
warnings.filterwarnings('ignore')



In [2]:
train = pd.read_csv('../data/proc/train.csv')
test = pd.read_csv('../data/proc/val.csv')
train.head()

Unnamed: 0,floor,floors_count,rooms_count,total_meters,price,top_bottom_floor,county_short_ВАО,county_short_ЗАО,county_short_ЗелАО,county_short_САО,county_short_СВАО,county_short_СЗАО,county_short_ТАО,county_short_ЦАО,county_short_ЮАО,county_short_ЮВАО,county_short_ЮЗАО,object_type_secondary
0,19,24,2,70.0,42000000.0,0,0,0,0,1,0,0,0,0,0,0,0,1
1,3,14,3,68.7,38500000.0,0,0,1,0,0,0,0,0,0,0,0,0,1
2,13,44,1,34.3,16600000.0,0,0,0,0,0,0,1,0,0,0,0,0,0
3,6,17,3,72.0,18900000.0,0,0,0,0,0,1,0,0,0,0,0,0,1
4,3,5,1,20.3,5999999.0,0,0,0,0,0,1,0,0,0,0,0,0,1


In [3]:
X = train.drop(columns='price')
y = np.log(train.price)

X_test = test.drop(columns='price')
y_test = test.price

# Классическая линейная регрессия с ln(y)

In [7]:
lm = LinearRegression().fit(X, y)
y_fitted = np.exp(lm.predict(X))
y_predicted = np.exp(lm.predict(X_test))

print('R2 on train: %.3f'%lm.score(X, y), '\n')

print('MAE on train: {:,.0f}'.format(mae(np.exp(y), y_fitted)))
print('MAE on test: {:,.0f}'.format(mae(y_test, y_predicted), '\n'))

print('MAPE on train: %.3f '% mape(np.exp(y), y_fitted))
print('MAPE on test: %.3f '% mape(y_test, y_predicted))

R2 on train: 0.809 

MAE on train: 15,585,627
MAE on test: 16,443,532
MAPE on train: 0.327 
MAPE on test: 0.306 


# Pipelines with Scaler, dimentionaly reduction and regularization

In [8]:
# Pipeline with dimentionality reduction, target metric - R-square

pipeline = Pipeline([
('scaler', StandardScaler()),
('pca', PCA()),
('regressor', Lasso())
])

param_grid = {'scaler': [StandardScaler(), MinMaxScaler(), RobustScaler()],
    'pca': [PCA()],         
    'pca__n_components': range(1, 6),
    'regressor': [Lasso(), Ridge(), HuberRegressor()],
    'regressor__alpha': [0, 0.01, 0.1, 1.0, 10, 100]
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='r2', verbose=1, n_jobs=10)
%time grid_search.fit(X, y)

print("Best params :", grid_search.best_params_)
print("Best R2 %.3f"% grid_search.best_score_)

Fitting 5 folds for each of 270 candidates, totalling 1350 fits
CPU times: total: 2.64 s
Wall time: 16.8 s
Best params : {'pca': PCA(n_components=3), 'pca__n_components': 3, 'regressor': Ridge(alpha=0.01), 'regressor__alpha': 0.01, 'scaler': StandardScaler()}
Best R2 0.759


In [9]:
# Pipeline with dimentionality reduction, target metric - MAE

pipeline = Pipeline([
('scaler', StandardScaler()),
('pca', PCA()),
('regressor', Lasso())
])

param_grid = {'scaler': [StandardScaler(), MinMaxScaler(), RobustScaler()],
    'pca': [PCA()],         
    'pca__n_components': range(1, 6),
    'regressor': [Lasso(), Ridge(), HuberRegressor()],
    'regressor__alpha': [0, 0.01, 0.1, 1.0, 10, 100]
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_absolute_error', verbose=1, n_jobs=10)
%time grid_search.fit(X, y)

print("Best params :", grid_search.best_params_)
print("Best MAE %.3f"% -grid_search.best_score_)

Fitting 5 folds for each of 270 candidates, totalling 1350 fits
CPU times: total: 4.75 s
Wall time: 20.7 s
Best params : {'pca': PCA(n_components=5), 'pca__n_components': 5, 'regressor': HuberRegressor(alpha=0.01), 'regressor__alpha': 0.01, 'scaler': RobustScaler()}
Best MAE 0.334


In [10]:
# Pipeline with dimentionality reduction, target metric - MAE

pipeline = Pipeline([
('scaler', StandardScaler()),
('pca', PCA()),
('regressor', Lasso())
])

param_grid = {'scaler': [StandardScaler(), MinMaxScaler(), RobustScaler()],
    'pca': [PCA()],         
    'pca__n_components': range(1, 6),
    'regressor': [Lasso(), Ridge(), HuberRegressor()],
    'regressor__alpha': [0, 0.01, 0.1, 1.0, 10, 100]
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_absolute_percentage_error', verbose=1, n_jobs=10)
%time grid_search.fit(X, y)

print("Best params :", grid_search.best_params_)
print("Best MAPE %.3f"% -grid_search.best_score_)

Fitting 5 folds for each of 270 candidates, totalling 1350 fits
CPU times: total: 5.33 s
Wall time: 20.5 s
Best params : {'pca': PCA(n_components=5), 'pca__n_components': 5, 'regressor': HuberRegressor(alpha=0.01), 'regressor__alpha': 0.01, 'scaler': RobustScaler()}
Best MAPE 0.019


In [12]:
pipeline = Pipeline([
('scaler', grid_search.best_params_['scaler']),
('pca', grid_search.best_params_['pca']),
('regressor', grid_search.best_params_['regressor'])
])

pipeline.fit(X, y)
print('Pipeline R-square: %.3f'%pipeline.score(X, y))
fitted = pipeline.predict(X)
print('Pipeline MAE: {:,.0f}'.format(mae(np.exp(y), np.exp(fitted))))
print('MAE on train: {:,.0f}'.format(mae(np.exp(y), y_fitted)))
print('Pipeline MAPE: %.4f'%mape(np.exp(y), np.exp(fitted)))

Pipeline R-square: 0.749
Pipeline MAE: 20,417,572
MAE on train: 15,585,627
Pipeline MAPE: 0.3793


In [13]:
predicted = pipeline.predict(X_test)
print('Pipeline test MAE: {:,.0f}'.format(mae(y_test, np.exp(predicted))))
print('Pipeline test MAPE: %.4f'%mape(y_test, np.exp(predicted)))

Pipeline test MAE: 20,092,720
Pipeline test MAPE: 0.3552


# Pipelines with Scaler and Regularization

In [14]:
# Pipeline with dimentionality reduction, target metric - R-square

pipeline = Pipeline([
('scaler', StandardScaler()),
('regressor', Lasso())
])

param_grid = {'scaler': [StandardScaler(), MinMaxScaler(), RobustScaler()],
    'regressor': [Lasso(), Ridge(), HuberRegressor()],
    'regressor__alpha': [0, 0.01, 0.1, 1.0, 10, 100]
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='r2', verbose=1, n_jobs=10)
%time grid_search.fit(X, y)

print("Best params :", grid_search.best_params_)
print("Best R2 %.3f"% grid_search.best_score_)

Fitting 5 folds for each of 54 candidates, totalling 270 fits
CPU times: total: 859 ms
Wall time: 5.6 s
Best params : {'regressor': Ridge(alpha=10), 'regressor__alpha': 10, 'scaler': StandardScaler()}
Best R2 0.808


In [15]:
# Pipeline with dimentionality reduction, target metric - MAE

pipeline = Pipeline([
('scaler', StandardScaler()),
('regressor', Lasso())
])

param_grid = {'scaler': [StandardScaler(), MinMaxScaler(), RobustScaler()],
    'regressor': [Lasso(), Ridge(), HuberRegressor()],
    'regressor__alpha': [0, 0.01, 0.1, 1.0, 10, 100]
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_absolute_error', verbose=1, n_jobs=10)
%time grid_search.fit(X, y)

print("Best params :", grid_search.best_params_)
print("Best MAE %.3f"% -grid_search.best_score_)

Fitting 5 folds for each of 54 candidates, totalling 270 fits
CPU times: total: 797 ms
Wall time: 5.88 s
Best params : {'regressor': HuberRegressor(alpha=1.0), 'regressor__alpha': 1.0, 'scaler': RobustScaler()}
Best MAE 0.300


In [16]:
# Pipeline with dimentionality reduction, target metric - MAE

pipeline = Pipeline([
('scaler', StandardScaler()),
('regressor', Lasso())
])

param_grid = {'scaler': [StandardScaler(), MinMaxScaler(), RobustScaler()],
    'regressor': [Lasso(), Ridge(), HuberRegressor()],
    'regressor__alpha': [0, 0.01, 0.1, 1.0, 10, 100]
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_absolute_percentage_error', verbose=1, n_jobs=10)
%time grid_search.fit(X, y)

print("Best params :", grid_search.best_params_)
print("Best MAPE %.3f"% -grid_search.best_score_)

Fitting 5 folds for each of 54 candidates, totalling 270 fits
CPU times: total: 828 ms
Wall time: 6.01 s
Best params : {'regressor': HuberRegressor(alpha=1.0), 'regressor__alpha': 1.0, 'scaler': RobustScaler()}
Best MAPE 0.017


In [17]:
pipeline = Pipeline([
('scaler', grid_search.best_params_['scaler']),
('regressor', grid_search.best_params_['regressor'])
])

pipeline.fit(X, y)
print('Pipeline R-square: %.3f'%pipeline.score(X, y))
fitted = pipeline.predict(X)
print('Pipeline MAE {:,.0f}'.format(mae(np.exp(y), np.exp(fitted))))
print('Pipeline MAPE: %.4f'%mape(np.exp(y), np.exp(fitted)))

Pipeline R-square: 0.804
Pipeline MAE 16,798,705
Pipeline MAPE: 0.3339


In [18]:
predicted = pipeline.predict(X_test)
print('Pipeline test MAE: {:,.0f}'.format(mae(y_test, np.exp(predicted))))
print('Pipeline test MAPE: %.4f'%mape(y_test, np.exp(predicted)))

Pipeline test MAE: 16,925,166
Pipeline test MAPE: 0.3100


# Random Forest

In [19]:
rf = RandomForestRegressor(random_state=0).fit(X, y)
y_fitted = np.exp(rf.predict(X))
y_predicted = np.exp(rf.predict(X_test))

print('R2 on train: %.3f'%rf.score(X, y), '\n')

print('MAE on train: {:,.0f}'.format( mae(np.exp(y), y_fitted)))
print('MAE on test: {:,.0f}'.format(mae(y_test, y_predicted), '\n'))

print('MAPE on train: %.3f '% mape(np.exp(y), y_fitted))
print('MAPE on test: %.3f '% mape(y_test, y_predicted))

R2 on train: 0.986 

MAE on train: 3,785,488
MAE on test: 10,131,021
MAPE on train: 0.075 
MAPE on test: 0.201 


In [20]:
estimator = RandomForestRegressor(random_state=0, n_jobs=-1)
param_grid = {'max_depth':  range(7, 10),
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10]}

grid_search = GridSearchCV(estimator, param_grid,cv=5, scoring='neg_mean_absolute_error', verbose=1, n_jobs=10)
%time grid_search.fit(X, y)

print("Best params :", grid_search.best_params_)
print("Best MAE %.3f"% -grid_search.best_score_)

Fitting 5 folds for each of 54 candidates, totalling 270 fits
CPU times: total: 3.56 s
Wall time: 1min 26s
Best params : {'max_depth': 9, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 2}
Best MAE 0.231


In [21]:
rf = RandomForestRegressor(random_state=0, n_jobs=-1, **grid_search.best_params_).fit(X, y)

y_fitted = np.exp(rf.predict(X))
y_predicted = np.exp(rf.predict(X_test))

print('R2 on train: %.3f'%rf.score(X, y), '\n')

print('MAE on train: {:,.0f}'.format(mae(np.exp(y), y_fitted)))
print('MAE on test: {:,.0f}'.format(mae(y_test, y_predicted), '\n'))

print('MAPE on train: %.3f '% mape(np.exp(y), y_fitted))
print('MAPE on test: %.3f '% mape(y_test, y_predicted))

R2 on train: 0.923 

MAE on train: 8,759,078
MAE on test: 12,001,613
MAPE on train: 0.199 
MAPE on test: 0.229 


# LGBM

In [22]:
lgbm = LGBMRegressor(random_state=0, force_row_wise=True).fit(X, y)
y_fitted = np.exp(lgbm.predict(X))
y_predicted = np.exp(lgbm.predict(X_test))

print('R2 on train: %.3f'%lgbm.score(X, y), '\n')

print('MAE on train: {:,.0f}'.format(mae(np.exp(y), y_fitted)))
print('MAE on test: {:,.0f}'.format(mae(y_test, y_predicted), '\n'))

print('MAPE on train: %.3f '% mape(np.exp(y), y_fitted))
print('MAPE on test: %.3f '% mape(y_test, y_predicted))

  File "C:\Users\shuva\anaconda3\lib\site-packages\joblib\externals\loky\backend\context.py", line 245, in _count_physical_cores
    raise ValueError(


[LightGBM] [Info] Total Bins 424
[LightGBM] [Info] Number of data points in the train set: 6780, number of used features: 16
[LightGBM] [Info] Start training from score 17.102996
R2 on train: 0.923 

MAE on train: 9,468,360
MAE on test: 11,463,176
MAPE on train: 0.197 
MAPE on test: 0.219 


In [5]:
def objective(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 300), 
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        'learning_rate': trial.suggest_float('learning_rate', 0.05, 0.15), 
        'num_leaves': trial.suggest_int("num_leaves", 20, 100),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.01, 100.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.01, 100.0)
    }

    model = LGBMRegressor(**params, random_state=0, n_jobs=-1, force_row_wise=True)
    return cross_val_score(model, X, y, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1).mean()

In [6]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50, show_progress_bar=True, n_jobs=-1)

[I 2024-05-23 22:11:08,393] A new study created in memory with name: no-name-4d3e51ae-a232-40ff-8054-d34af1f14b42


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2024-05-23 22:11:12,912] Trial 5 finished with value: -0.2890180566012582 and parameters: {'n_estimators': 194, 'max_depth': 4, 'learning_rate': 0.14031861013286173, 'num_leaves': 87, 'reg_alpha': 82.58952076855365, 'reg_lambda': 75.06077793797382}. Best is trial 5 with value: -0.2890180566012582.
[I 2024-05-23 22:11:12,952] Trial 7 finished with value: -0.2843014432630995 and parameters: {'n_estimators': 242, 'max_depth': 12, 'learning_rate': 0.1429547235533014, 'num_leaves': 31, 'reg_alpha': 74.10991137443285, 'reg_lambda': 46.73747771401997}. Best is trial 7 with value: -0.2843014432630995.
[I 2024-05-23 22:11:13,356] Trial 3 finished with value: -0.28776479188278387 and parameters: {'n_estimators': 152, 'max_depth': 4, 'learning_rate': 0.11909600155782436, 'num_leaves': 31, 'reg_alpha': 79.5558971587826, 'reg_lambda': 39.84341523882324}. Best is trial 7 with value: -0.2843014432630995.
[I 2024-05-23 22:11:13,836] Trial 1 finished with value: -0.24974025492738447 and parameters: 

In [7]:
print('Best hyperparameters:', study.best_params)
print('Best MAE: %.0f'% -study.best_value)

Best hyperparameters: {'n_estimators': 269, 'max_depth': 9, 'learning_rate': 0.05096330723976655, 'num_leaves': 98, 'reg_alpha': 0.17995860538849384, 'reg_lambda': 30.29993490458629}
Best MAE: 0


In [8]:
lgbm = LGBMRegressor(random_state=0, n_jobs=-1, **study.best_params).fit(X, y)

y_fitted = np.exp(lgbm.predict(X))
y_predicted = np.exp(lgbm.predict(X_test))

print('R2 on train: %.3f'%lgbm.score(X, y), '\n')

print('MAE on train: %.0f '% mae(np.exp(y), y_fitted))
print('MAE on test: %.0f'% mae(y_test, y_predicted), '\n')

print('MAPE on train: %.3f '% mape(np.exp(y), y_fitted))
print('MAPE on test: %.3f '% mape(y_test, y_predicted))

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000222 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 424
[LightGBM] [Info] Number of data points in the train set: 6780, number of used features: 16
[LightGBM] [Info] Start training from score 17.102996
R2 on train: 0.924 

MAE on train: 9430928 
MAE on test: 11215844 

MAPE on train: 0.195 
MAPE on test: 0.217 
