In [1]:
import argparse
import logging
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error as mae, mean_absolute_percentage_error as mape, make_scorer
from sklearn.model_selection import cross_val_score
from joblib import dump
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression, Ridge, Lasso, HuberRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error as mae, mean_absolute_percentage_error as mape

from lightgbm import LGBMRegressor
import optuna

import warnings
warnings.filterwarnings('ignore')



In [2]:
train = pd.read_csv('../data/proc/train.csv')
test = pd.read_csv('../data/proc/val.csv')
train.head()

Unnamed: 0,floor,floors_count,rooms_count,total_meters,price,top_bottom_floor,county_short_ВАО,county_short_ЗАО,county_short_ЗелАО,county_short_САО,county_short_СВАО,county_short_СЗАО,county_short_ТАО,county_short_ЦАО,county_short_ЮАО,county_short_ЮВАО,county_short_ЮЗАО,object_type_secondary,elite_estate
0,2,9,2,46.0,10500000.0,0,0,0,0,0,0,0,0,0,0,1,0,1,0
1,16,30,1,33.4,13067704.0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,6,7,2,94.0,130000000.0,0,0,0,0,0,0,0,0,1,0,0,0,1,1
3,2,31,3,82.9,29900000.0,0,0,1,0,0,0,0,0,0,0,0,0,1,0
4,10,25,2,56.0,33000000.0,0,0,0,0,0,0,1,0,0,0,0,0,1,1


In [3]:
X = train.drop(columns='price')
y = np.log(train.price)

X_test = test.drop(columns='price')
y_test = test.price

# Классическая линейная регрессия с ln(y)

In [5]:
lm = LinearRegression().fit(X, y)
y_fitted = np.exp(lm.predict(X))
y_predicted = np.exp(lm.predict(X_test))

print('R2 on train: %.3f'%lm.score(X, y), '\n')

print('MAE on train: {:,.0f}'.format(mae(np.exp(y), y_fitted)))
print('MAE on test: {:,.0f}'.format(mae(y_test, y_predicted), '\n'))

print('MAPE on train: %.3f '% mape(np.exp(y), y_fitted))
print('MAPE on test: %.3f '% mape(y_test, y_predicted))

R2 on train: 0.537 

MAE on train: 3,330,942
MAE on test: 3,243,598
MAPE on train: 0.219 
MAPE on test: 0.213 


# Pipelines with Scaler, dimentionaly reduction and regularization

In [6]:
# Pipeline with dimentionality reduction, target metric - R-square

pipeline = Pipeline([
('scaler', StandardScaler()),
('pca', PCA()),
('regressor', Lasso())
])

param_grid = {'scaler': [StandardScaler(), MinMaxScaler(), RobustScaler()],
    'pca': [PCA()],         
    'pca__n_components': range(1, 6),
    'regressor': [Lasso(), Ridge(), HuberRegressor()],
    'regressor__alpha': [0, 0.01, 0.1, 1.0, 10, 100]
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='r2', verbose=1, n_jobs=10)
%time grid_search.fit(X, y)

print("Best params :", grid_search.best_params_)
print("Best R2 %.3f"% grid_search.best_score_)

Fitting 5 folds for each of 270 candidates, totalling 1350 fits
CPU times: total: 2.66 s
Wall time: 13.5 s
Best params : {'pca': PCA(n_components=5), 'pca__n_components': 5, 'regressor': Ridge(alpha=10), 'regressor__alpha': 10, 'scaler': RobustScaler()}
Best R2 0.475


In [7]:
# Pipeline with dimentionality reduction, target metric - MAE

pipeline = Pipeline([
('scaler', StandardScaler()),
('pca', PCA()),
('regressor', Lasso())
])

param_grid = {'scaler': [StandardScaler(), MinMaxScaler(), RobustScaler()],
    'pca': [PCA()],         
    'pca__n_components': range(1, 6),
    'regressor': [Lasso(), Ridge(), HuberRegressor()],
    'regressor__alpha': [0, 0.01, 0.1, 1.0, 10, 100]
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_absolute_error', verbose=1, n_jobs=10)
%time grid_search.fit(X, y)

print("Best params :", grid_search.best_params_)
print("Best MAE %.3f"% -grid_search.best_score_)

Fitting 5 folds for each of 270 candidates, totalling 1350 fits
CPU times: total: 6.42 s
Wall time: 19.3 s
Best params : {'pca': PCA(n_components=5), 'pca__n_components': 5, 'regressor': HuberRegressor(alpha=0.1), 'regressor__alpha': 0.1, 'scaler': RobustScaler()}
Best MAE 0.225


In [8]:
# Pipeline with dimentionality reduction, target metric - MAE

pipeline = Pipeline([
('scaler', StandardScaler()),
('pca', PCA()),
('regressor', Lasso())
])

param_grid = {'scaler': [StandardScaler(), MinMaxScaler(), RobustScaler()],
    'pca': [PCA()],         
    'pca__n_components': range(1, 6),
    'regressor': [Lasso(), Ridge(), HuberRegressor()],
    'regressor__alpha': [0, 0.01, 0.1, 1.0, 10, 100]
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_absolute_percentage_error', verbose=1, n_jobs=10)
%time grid_search.fit(X, y)

print("Best params :", grid_search.best_params_)
print("Best MAPE %.3f"% -grid_search.best_score_)

Fitting 5 folds for each of 270 candidates, totalling 1350 fits
CPU times: total: 5.38 s
Wall time: 19.1 s
Best params : {'pca': PCA(n_components=5), 'pca__n_components': 5, 'regressor': HuberRegressor(alpha=0), 'regressor__alpha': 0, 'scaler': RobustScaler()}
Best MAPE 0.014


In [9]:
pipeline = Pipeline([
('scaler', grid_search.best_params_['scaler']),
('pca', grid_search.best_params_['pca']),
('regressor', grid_search.best_params_['regressor'])
])

pipeline.fit(X, y)
print('Pipeline R-square: %.3f'%pipeline.score(X, y))
fitted = pipeline.predict(X)
print('MAE on train: {:,.0f}'.format(mae(np.exp(y), y_fitted)))
print('Pipeline MAPE: %.4f'%mape(np.exp(y), np.exp(fitted)))

Pipeline R-square: 0.476
MAE on train: 3,330,942
Pipeline MAPE: 0.2301


In [10]:
predicted = pipeline.predict(X_test)
print('Pipeline test MAE: {:,.0f}'.format(mae(y_test, np.exp(predicted))))
print('Pipeline test MAPE: %.4f'%mape(y_test, np.exp(predicted)))

Pipeline test MAE: 3,434,499
Pipeline test MAPE: 0.2222


# Pipelines with Scaler and Regularization

In [10]:
# Pipeline with dimentionality reduction, target metric - R-square

pipeline = Pipeline([
('scaler', StandardScaler()),
('regressor', Lasso())
])

param_grid = {'scaler': [StandardScaler(), MinMaxScaler(), RobustScaler()],
    'regressor': [Lasso(), Ridge(), HuberRegressor()],
    'regressor__alpha': [0, 0.01, 0.1, 1.0, 10, 100]
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='r2', verbose=1, n_jobs=10)
%time grid_search.fit(X, y)

print("Best params :", grid_search.best_params_)
print("Best R2 %.3f"% grid_search.best_score_)

Fitting 5 folds for each of 54 candidates, totalling 270 fits
CPU times: total: 1.86 s
Wall time: 6.46 s
Best params : {'regressor': Ridge(alpha=10), 'regressor__alpha': 10, 'scaler': StandardScaler()}
Best R2 0.797


In [11]:
# Pipeline with dimentionality reduction, target metric - MAE

pipeline = Pipeline([
('scaler', StandardScaler()),
('regressor', Lasso())
])

param_grid = {'scaler': [StandardScaler(), MinMaxScaler(), RobustScaler()],
    'regressor': [Lasso(), Ridge(), HuberRegressor()],
    'regressor__alpha': [0, 0.01, 0.1, 1.0, 10, 100]
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_absolute_error', verbose=1, n_jobs=10)
%time grid_search.fit(X, y)

print("Best params :", grid_search.best_params_)
print("Best MAE %.3f"% -grid_search.best_score_)

Fitting 5 folds for each of 54 candidates, totalling 270 fits
CPU times: total: 2.27 s
Wall time: 6.88 s
Best params : {'regressor': HuberRegressor(alpha=1.0), 'regressor__alpha': 1.0, 'scaler': RobustScaler()}
Best MAE 0.298


In [12]:
# Pipeline with dimentionality reduction, target metric - MAE

pipeline = Pipeline([
('scaler', StandardScaler()),
('regressor', Lasso())
])

param_grid = {'scaler': [StandardScaler(), MinMaxScaler(), RobustScaler()],
    'regressor': [Lasso(), Ridge(), HuberRegressor()],
    'regressor__alpha': [0, 0.01, 0.1, 1.0, 10, 100]
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_absolute_percentage_error', verbose=1, n_jobs=10)
%time grid_search.fit(X, y)

print("Best params :", grid_search.best_params_)
print("Best MAPE %.3f"% -grid_search.best_score_)

Fitting 5 folds for each of 54 candidates, totalling 270 fits
CPU times: total: 1.59 s
Wall time: 6.57 s
Best params : {'regressor': HuberRegressor(alpha=1.0), 'regressor__alpha': 1.0, 'scaler': RobustScaler()}
Best MAPE 0.017


In [13]:
pipeline = Pipeline([
('scaler', grid_search.best_params_['scaler']),
('regressor', grid_search.best_params_['regressor'])
])

pipeline.fit(X, y)
print('Pipeline R-square: %.3f'%pipeline.score(X, y))
fitted = pipeline.predict(X)
print('Pipeline MAE {:,.0f}'.format(mae(np.exp(y), np.exp(fitted))))
print('Pipeline MAPE: %.4f'%mape(np.exp(y), np.exp(fitted)))

Pipeline R-square: 0.789
Pipeline MAE 15,850,483
Pipeline MAPE: 0.3327


In [14]:
predicted = pipeline.predict(X_test)
print('Pipeline test MAE: {:,.0f}'.format(mae(y_test, np.exp(predicted))))
print('Pipeline test MAPE: %.4f'%mape(y_test, np.exp(predicted)))

Pipeline test MAE: 16,721,307
Pipeline test MAPE: 0.3243


# Random Forest

In [11]:
rf = RandomForestRegressor(random_state=0).fit(X, y)
y_fitted = np.exp(rf.predict(X))
y_predicted = np.exp(rf.predict(X_test))

print('R2 on train: %.3f'%rf.score(X, y), '\n')

print('MAE on train: {:,.0f}'.format( mae(np.exp(y), y_fitted)))
print('MAE on test: {:,.0f}'.format(mae(y_test, y_predicted), '\n'))

print('MAPE on train: %.3f '% mape(np.exp(y), y_fitted))
print('MAPE on test: %.3f '% mape(y_test, y_predicted))

R2 on train: 0.963 

MAE on train: 899,936
MAE on test: 2,375,257
MAPE on train: 0.056 
MAPE on test: 0.150 


In [12]:
estimator = RandomForestRegressor(random_state=0, n_jobs=-1)
param_grid = {'max_depth':  range(7, 10),
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10]}

grid_search = GridSearchCV(estimator, param_grid,cv=5, scoring='neg_mean_absolute_error', verbose=1, n_jobs=10)
%time grid_search.fit(X, y)

print("Best params :", grid_search.best_params_)
print("Best MAE %.3f"% -grid_search.best_score_)

Fitting 5 folds for each of 54 candidates, totalling 270 fits
CPU times: total: 3.22 s
Wall time: 1min 17s
Best params : {'max_depth': 9, 'max_features': 'auto', 'min_samples_leaf': 2, 'min_samples_split': 2}
Best MAE 0.173


In [13]:
rf = RandomForestRegressor(random_state=0, n_jobs=-1, **grid_search.best_params_).fit(X, y)

y_fitted = np.exp(rf.predict(X))
y_predicted = np.exp(rf.predict(X_test))

print('R2 on train: %.3f'%rf.score(X, y), '\n')

print('MAE on train: {:,.0f}'.format(mae(np.exp(y), y_fitted)))
print('MAE on test: {:,.0f}'.format(mae(y_test, y_predicted), '\n'))

print('MAPE on train: %.3f '% mape(np.exp(y), y_fitted))
print('MAPE on test: %.3f '% mape(y_test, y_predicted))

R2 on train: 0.774 

MAE on train: 2,356,800
MAE on test: 2,685,352
MAPE on train: 0.150 
MAPE on test: 0.171 


# LGBM

In [14]:
lgbm = LGBMRegressor(random_state=0, force_row_wise=True).fit(X, y)
y_fitted = np.exp(lgbm.predict(X))
y_predicted = np.exp(lgbm.predict(X_test))

print('R2 on train: %.3f'%lgbm.score(X, y), '\n')

print('MAE on train: {:,.0f}'.format(mae(np.exp(y), y_fitted)))
print('MAE on test: {:,.0f}'.format(mae(y_test, y_predicted), '\n'))

print('MAPE on train: %.3f '% mape(np.exp(y), y_fitted))
print('MAPE on test: %.3f '% mape(y_test, y_predicted))

  File "C:\Users\shuva\anaconda3\lib\site-packages\joblib\externals\loky\backend\context.py", line 245, in _count_physical_cores
    raise ValueError(


[LightGBM] [Info] Total Bins 393
[LightGBM] [Info] Number of data points in the train set: 5236, number of used features: 16
[LightGBM] [Info] Start training from score 16.538973
R2 on train: 0.807 

MAE on train: 2,176,704
MAE on test: 2,523,572
MAPE on train: 0.137 
MAPE on test: 0.160 


In [15]:
def objective(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 100), 
        "max_depth": trial.suggest_int("max_depth", 2, 5),
        'learning_rate': trial.suggest_float('learning_rate', 0.05, 0.25), 
        'num_leaves': trial.suggest_int("num_leaves", 10, 20),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.001, 100.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.001, 100.0)
    }

    model = LGBMRegressor(**params, random_state=0, n_jobs=-1, force_row_wise=True)
    return cross_val_score(model, X, y, cv=9, scoring='neg_mean_absolute_error', n_jobs=-1).mean()

In [16]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50, show_progress_bar=True, n_jobs=-1)

[I 2024-05-27 12:38:31,529] A new study created in memory with name: no-name-3260e561-2aaa-4cb4-8cdc-1835f3885de8


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2024-05-27 12:38:37,849] Trial 7 finished with value: -0.2255387332051293 and parameters: {'n_estimators': 71, 'max_depth': 2, 'learning_rate': 0.20572384569087449, 'num_leaves': 20, 'reg_alpha': 72.0256200805694, 'reg_lambda': 1.6853727528921925}. Best is trial 7 with value: -0.2255387332051293.
[I 2024-05-27 12:38:38,529] Trial 3 finished with value: -0.18605792403053303 and parameters: {'n_estimators': 73, 'max_depth': 3, 'learning_rate': 0.20962114783430003, 'num_leaves': 11, 'reg_alpha': 10.71384441265338, 'reg_lambda': 76.20350900102852}. Best is trial 3 with value: -0.18605792403053303.
[I 2024-05-27 12:38:38,682] Trial 0 finished with value: -0.1852366286472938 and parameters: {'n_estimators': 81, 'max_depth': 5, 'learning_rate': 0.20386340847558576, 'num_leaves': 14, 'reg_alpha': 12.397235554170663, 'reg_lambda': 23.398761440112164}. Best is trial 0 with value: -0.1852366286472938.
[I 2024-05-27 12:38:38,864] Trial 1 finished with value: -0.19871728469460603 and parameters:

In [17]:
print('Best hyperparameters:', study.best_params)
print('Best MAE: %.3f'% -study.best_value)

Best hyperparameters: {'n_estimators': 86, 'max_depth': 5, 'learning_rate': 0.24681488010947256, 'num_leaves': 14, 'reg_alpha': 1.0374405840419012, 'reg_lambda': 28.361258497509773}
Best MAE: 0.166


In [19]:
lgbm = LGBMRegressor(random_state=0, n_jobs=-1, **study.best_params).fit(X, y)

y_fitted = np.exp(lgbm.predict(X))
y_predicted = np.exp(lgbm.predict(X_test))

print('R2 on train: %.3f'%lgbm.score(X, y), '\n')

print('MAE on train: {:,.0f}'.format(mae(np.exp(y), y_fitted)))
print('MAE on test: {:,.0f}'.format(mae(y_test, y_predicted), '\n'))

print('MAPE on train: %.3f '% mape(np.exp(y), y_fitted))
print('MAPE on test: %.3f '% mape(y_test, y_predicted))

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000543 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 393
[LightGBM] [Info] Number of data points in the train set: 5236, number of used features: 16
[LightGBM] [Info] Start training from score 16.538973
R2 on train: 0.758 

MAE on train: 2,452,583
MAE on test: 2,649,701
MAPE on train: 0.155 
MAPE on test: 0.169 


# KNN

In [30]:
from sklearn.neighbors import KNeighborsRegressor as knn

In [31]:
rf = knn().fit(X, y)
y_fitted = np.exp(rf.predict(X))
y_predicted = np.exp(rf.predict(X_test))

print('R2 on train: %.3f'%rf.score(X, y), '\n')

print('MAE on train: {:,.0f}'.format( mae(np.exp(y), y_fitted)))
print('MAE on test: {:,.0f}'.format(mae(y_test, y_predicted), '\n'))

print('MAPE on train: %.3f '% mape(np.exp(y), y_fitted))
print('MAPE on test: %.3f '% mape(y_test, y_predicted))

R2 on train: 0.890 

MAE on train: 9,540,852
MAE on test: 13,422,751
MAPE on train: 0.219 
MAPE on test: 0.279 


In [32]:
estimator = knn(n_jobs=-1)
param_grid = {'algorithm':  ['auto', 'ball_tree', 'kd_tree', 'brute'],
 'weights': ['uniform', 'distance'],
 'n_neighbors': [2, 3],
 'leaf_size': [20,30,40,50, 60,70],
    'p': [1,2]}

grid_search = GridSearchCV(estimator, param_grid,cv=9, scoring='neg_mean_absolute_error', verbose=1, n_jobs=-1)
%time grid_search.fit(X, y)

Fitting 9 folds for each of 192 candidates, totalling 1728 fits
CPU times: total: 1.27 s
Wall time: 31.4 s


In [33]:
print("Best params :", grid_search.best_params_)
print("Best MAE %.3f"% -grid_search.best_score_)

Best params : {'algorithm': 'auto', 'leaf_size': 20, 'n_neighbors': 3, 'p': 1, 'weights': 'distance'}
Best MAE 0.205


In [34]:
knn = knn(n_jobs=-1, **grid_search.best_params_).fit(X, y)

y_fitted = np.exp(knn.predict(X))
y_predicted = np.exp(knn.predict(X_test))

print('R2 on train: %.3f'%knn.score(X, y), '\n')

print('MAE on train: {:,.0f}'.format(mae(np.exp(y), y_fitted)))
print('MAE on test: {:,.0f}'.format(mae(y_test, y_predicted), '\n'))

print('MAPE on train: %.3f '% mape(np.exp(y), y_fitted))
print('MAPE on test: %.3f '% mape(y_test, y_predicted))

R2 on train: 0.998 

MAE on train: 416,845
MAE on test: 9,969,692
MAPE on train: 0.006 
MAPE on test: 0.217 
