In [105]:
# Data analysis imports
import pandas as pd
import numpy as np

# Machine learning imports
import xgboost as xgb
import sklearn
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, make_scorer

# Hyperopt imports
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, rand

# Plotting imports
import matplotlib.pyplot as plt
import seaborn as sns

In [106]:
df = pd.read_pickle('files/preprocessed.pkl')
df = df.rename(columns={"price.mainValue": "Price"}).drop('Unnamed: 0.1', axis=1)

In [107]:
SEED = 314
TEST_SIZE = 0.2
MAX_EVALS = 10

In [108]:
gs_hp_grid = {'max_depth':[4, 6, 8, 10],
              'n_estimators': [10, 15, 20, 25],
              'learning_rate': [0.2, 0.4, 0.6, 0.8],
              'gamma': [0.2, 0.4, 0.6, 0.8]
}

hyperopt_hp_grid = {'n_estimators' : hp.quniform('n_estimators', 10, 1000, 1),
             'learning_rate' : hp.loguniform('learning_rate', 0.001, 0.1),
             'max_depth' : hp.quniform('max_depth', 3, 15, 1),
             'gamma': hp.loguniform('gamma', 0.01, 1)}

In [109]:
mse_scorer = make_scorer(mean_squared_error)


In [110]:

def compute_rmse(model, features, targets):
    prediction = model.predict(features)
    rmse = np.sqrt(mean_squared_error(targets, prediction))
    return rmse

def train_grid_search(cv_parameters, features, targets):
    xgb_regressor = xgb.XGBRegressor()
    grid_search = GridSearchCV(xgb_regressor, cv_parameters, cv=5,
                               verbose=1,
                               n_jobs=4, scoring=mse_scorer)
    grid_search.fit(features, targets)
    return grid_search

In [111]:
def transform_params(params):
    params["gamma"] = np.log(params["gamma"])
    params["learning_rate"] = np.log(params["learning_rate"])
    params["n_estimators"] = int(params["n_estimators"])
    params["max_depth"] = int(params["max_depth"])
    return params

In [112]:
def loss(params):
    params = transform_params(params)
    xgb_regressor = xgb.XGBRegressor(silent=False, **params)
    cv_mse = cross_val_score(xgb_regressor, X_train, y_train,
                          cv=5, verbose=0, n_jobs=4,
                          scoring=mse_scorer)
    rmse = np.sqrt(cv_mse.mean())
    return {'loss': rmse,
            'status': STATUS_OK}
def optimize(trials, space):
    best = fmin(loss, space, algo=tpe.suggest,
                trials=trials,
                max_evals=MAX_EVALS)
    return best
def random_optimize(trials, space):
    best = fmin(loss, space, algo=rand.suggest,
                trials=trials,
                max_evals=MAX_EVALS)
    return best

In [113]:
def get_model_results(hyperparameters):
    xgb_regressor = xgb.XGBRegressor(**hyperparameters)
    mse_cv_scores = cross_val_score(xgb_regressor, X_train, y_train,
                                cv=5, verbose=0,
                                n_jobs=4, scoring=mse_scorer)
    rmse_cv_scores = np.sqrt(mse_cv_scores)
    xgb_regressor.fit(X_train, y_train)
    train_rmse = compute_rmse(xgb_regressor, X_train, y_train)
    test_rmse = compute_rmse(xgb_regressor, X_test, y_test)
    return {'optimal_hyperparameters': hyperparameters,
            'train_rmse': train_rmse,
            'mean_cv_rmse' : np.sqrt(mse_cv_scores.mean()),
            'std_cv_rmse':  mse_cv_scores.std() / float(np.sqrt(len(mse_cv_scores))),
            'test_rmse': test_rmse}

In [114]:
# df.columns = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in df.columns]
from sklearn.model_selection import train_test_split
features =  list(df.drop(columns=['Price','id'],axis=1))
X = df[features]
y = df['Price']


In [115]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [116]:
grid_search = train_grid_search(gs_hp_grid, X_train, y_train)


Fitting 5 folds for each of 256 candidates, totalling 1280 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   11.8s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:  1.1min
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:  2.8min
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:  5.3min
[Parallel(n_jobs=4)]: Done 1242 tasks      | elapsed:  8.2min
[Parallel(n_jobs=4)]: Done 1280 out of 1280 | elapsed:  8.4min finished


In [117]:
trials = Trials()
hyperopt_optimal_hp = optimize(trials, hyperopt_hp_grid)

100%|██████████| 10/10 [04:17<00:00, 25.73s/trial, best loss: 118063.81710633376]


In [118]:
hyperopt_optimal_hp = transform_params(hyperopt_optimal_hp)


In [119]:
def get_results_df():
    optimization_methods = ['grid_search', 'hyperopt_tpe']
    optimal_hyperparameters= [grid_search.best_params_,
                              hyperopt_optimal_hp]
    results = [get_model_results(optimal_hp) for optimal_hp in optimal_hyperparameters]
    return (pd.DataFrame(results)
              .assign(opt_method=lambda df: pd.Series(optimization_methods))
              .loc[:,
                   ['optimal_hyperparameters', 'test_rmse',
                    'mean_cv_rmse', 'std_cv_rmse',
                    'train_rmse', 'opt_method']])

In [120]:
results_df = get_results_df()


In [126]:
results_df['optimal_hyperparameters'][0]

{'gamma': 0.2, 'learning_rate': 0.8, 'max_depth': 6, 'n_estimators': 25}

In [122]:
# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()
#
# scaler.fit(X_train)
# X_train = scaler.transform(X_train)
# X_test = scaler.transform(X_test)

In [123]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import cross_val_score


models = [
          DecisionTreeRegressor(criterion='mse',max_depth=11),
          GradientBoostingRegressor(n_estimators=200,max_depth=12, verbose=0),
            RandomForestRegressor(min_samples_leaf =1, n_estimators=100,criterion='mse',max_depth=20,verbose=0),
            xgb.XGBRegressor(colsample_bytree=0.2, gamma=0.0,
                             learning_rate=0.05, max_depth=6,
                             min_child_weight=1.5, n_estimators=7200,
                             reg_alpha=0.9, reg_lambda=0.6,
                             subsample=0.2),
            lgb.LGBMRegressor(objective='regression',num_leaves=5,
                              learning_rate=0.05, n_estimators=720,
                              max_bin = 55, bagging_fraction = 0.8,
                              bagging_freq = 5, feature_fraction = 0.2319,
                              min_data_in_leaf =6, min_sum_hessian_in_leaf = 11)
        ]


learning_mods = pd.DataFrame()
temp = {}

In [124]:
from sklearn.feature_selection import SelectFromModel

#run through models
for model in models:
    start = time.time()

    sfm = SelectFromModel(model, threshold=0.5)
    sfm.fit(X_train,y_train)
    Xtrain = sfm.transform(X_train)
    Xtest   = sfm.transform(X_test)

    print(model)
    m = str(model)
    model.fit(X_train, y_train)
    scores = cross_val_score(model, X_test, y_test, cv=5,scoring='r2')
    print('score on training',model.score(X_train, y_train))
    mean, std = scores.mean(), scores.std()
    print("r2 score: %0.2f (+/- %0.2f)" % (mean,std * 2),f'\nTook '
                                    f'{time.time() - start :.2f} 'f'seconds\n')



DecisionTreeRegressor(max_depth=11)
score on training 0.9830438469004884
r2 score: -0.30 (+/- 2.77) 
Took 0.33 seconds



KeyboardInterrupt: 