In [59]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer
from sklearn.svm import SVR
from sklearn import metrics as met
from sklearn.ensemble import RandomForestRegressor

import numpy as np
import pandas as pd

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [60]:
# Selecting the DataSource
dataSource = r"C:\Users\soube\OneDrive\Desktop\Hammudi\Bachelorarbeit\Repository\AP-rent-determination\students_data\cleaned_data_with_IQR_removal.csv"

# Selecting columns to drop out of featureList and creating LabelList
featureDropList = ["_id", "observationDate", "state", "city", "AP_community", "community_id","postcode", "base_rent", "qm2_rent", "DE_qm2_rent"]
LabelList = ["qm2_rent"]

# Create DataFrame from DataSource
df = pd.read_csv(dataSource)
df = df[df["state"] == "Bremen"]

# Create feature and label lists
y = df[LabelList]
X = df.drop(featureDropList, axis = 1)
feature_list = list(X.columns)

y = np.array(y)
X = np.array(X)

# Train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [61]:
def mean_absolute_error(X, y):
    subtracted = list()
    for item1, item2 in zip(X, y):
        item = abs(item1 - item2)
        subtracted.append(item)
    mae = sum(subtracted) / len(subtracted)
    return mae

def model_10_score(model, X_test, y_test):
    predictions = model.predict(X_test)
    
    assert len(predictions) == len(y_test), 'Length of predictions is not len y_test'
    # Calculate relative prediction errors
    errors = [100 * (abs(predictions[i] - y_test[i])/ y_test[i]) for i in range((len(predictions)))]
    count_good_predictions = sum(1 for i in errors if i <= 10)
    good_predictions = round(np.mean(100 * (count_good_predictions / len(errors))), 2)
    return good_predictions

max_10_error = make_scorer(model_10_score, greater_is_better = False)

def evaluate_model(model, X_test, y_test):
    predictions = model.predict(X_test)
    errors = [100 * (abs(predictions[i] - y_test[i])/ y_test[i]) for i in range(min(len(predictions), len(y_test)))]
    count_good_predictions = sum(1 for i in errors if i <= 10)
    good_predictions = round(np.mean(100 * (count_good_predictions / len(errors))), 2)
    print('Percentage of predictions with less than 10 % deviation: ', good_predictions, '%.')

## Random Forest ##

In [57]:
from sklearn.ensemble import RandomForestRegressor

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start= 10, stop = 100, num = 5)]
# Number of features to consider at every split
max_features = [1.0, 50]
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 100, num = 5)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Random state
random_state = [0]

# Create the parameter grid based on the results of random search 
param_grid =    {
                'n_estimators': n_estimators,
                
                'random_state' : random_state
                }

# Create a based model
rf = RandomForestRegressor()

# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                           scoring = max_10_error, cv = 3, 
                           n_jobs = -1, verbose = 2)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 5 candidates, totalling 15 fits


  self.best_estimator_.fit(X, y, **fit_params)


In [58]:
grid_search.best_params_

{'n_estimators': 10, 'random_state': 0}

In [6]:
# Number of trees in random forest
n_estimators = [50, 60, 100, 200]
# Number of features to consider at every split
max_features = [50, 75, 150]
# Maximum number of levels in tree
max_depth = [32, 50, 90]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 6]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 3]
# Method of selecting samples for training each tree
bootstrap = [False]
# Random state
random_state = [0]

# Create the parameter grid based on the results of random search 
param_grid =    {
                'n_estimators': n_estimators,
                'max_features': max_features,
                'max_depth': max_depth,
                'min_samples_split': min_samples_split,
                'min_samples_leaf': min_samples_leaf,
                'bootstrap': bootstrap,
                'random_state' : random_state
                }

# Create a based model
rf = RandomForestRegressor()

# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                           scoring = max_10_error, cv = 3, 
                           n_jobs = -1, verbose = 2)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 432 candidates, totalling 1296 fits


KeyboardInterrupt: 

In [26]:
grid_search.best_params_

{'n_estimators': 10, 'random_state': 0}

In [41]:
grid_search_all_fits = pd.DataFrame(grid_search.cv_results_)
grid_search_all_fits = grid_search_all_fits.sort_values(by = ["rank_test_score"])
grid_search_all_fits

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_random_state,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
2,1.267513,0.014708,0.00869,0.000928,55,0,"{'n_estimators': 55, 'random_state': 0}",-57.54,-58.95,-55.63,-57.373333,1.360498,1
1,0.802743,0.023851,0.006668,0.000943,32,0,"{'n_estimators': 32, 'random_state': 0}",-57.19,-60.35,-55.63,-57.723333,1.963489,2
0,0.288849,0.019442,0.004668,0.000473,10,0,"{'n_estimators': 10, 'random_state': 0}",-56.84,-62.11,-54.23,-57.726667,3.277523,3
3,1.73526,0.032132,0.008693,0.00049,77,0,"{'n_estimators': 77, 'random_state': 0}",-59.65,-60.35,-58.1,-59.366667,0.940154,4
4,2.074341,0.037673,0.009673,0.000475,100,0,"{'n_estimators': 100, 'random_state': 0}",-59.3,-61.75,-58.1,-59.716667,1.518954,5


In [16]:
# Evaluating grid searched model after Hyperparameter Tuning
best_grid = grid_search.best_estimator_
evaluate_model(best_grid, X_test, y_test)

Percentage of predictions with less than 10 % deviation:  58.41 %.


In [52]:
# Evaluating base model without Hyperparameter Tuning
base_model = RandomForestRegressor(random_state = 0, n_estimators = 32)
base_model.fit(X_train, y_train)
evaluate_model(base_model, X_test, y_test)

  base_model.fit(X_train, y_train)


Percentage of predictions with less than 10 % deviation:  65.89 %.


## XG Boost ##

In [7]:
from xgboost import XGBRegressor


# Create the parameter grid based on the results of random search 
param_grid = {
            'eta' : [0.01, 0.1, 0.3],
            'gamma': [0, 5, 80],
            'max_depth': [3, 6, 10],
            'min_child_weight': [1, 5, 23],
            'subsample': [0.6, 0.8, 1.0],
            'colsample_bytree': [0.6, 0.8, 1.0],
            'reg_alpha' : [0, 0.3, 0.8],
            'reg_lambda' : [0.3, 0.8, 1],
            'n_estimators' : [100],
            'random_state' : [0],
             }


# Create a based model
xgb = XGBRegressor()

# Instantiate the grid search model
grid_search = GridSearchCV(estimator = xgb, param_grid = param_grid, 
                           scoring = max_10_error, cv = 3, 
                           n_jobs = -1, verbose = 2)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 6561 candidates, totalling 19683 fits


In [8]:
grid_search.best_params_

{'colsample_bytree': 0.6,
 'eta': 0.1,
 'gamma': 0,
 'max_depth': 10,
 'min_child_weight': 1,
 'n_estimators': 100,
 'random_state': 0,
 'reg_alpha': 0.8,
 'reg_lambda': 1,
 'subsample': 0.6}

In [None]:
grid_search_all_fits = pd.DataFrame(grid_search.cv_results_)
grid_search_all_fits = grid_search_all_fits.sort_values(by = ["rank_test_score"])
grid_search_all_fits

In [10]:
# Evaluating grid searched model after Hyperparameter Tuning
best_grid = grid_search.best_estimator_
evaluate_model(best_grid, X_test, y_test)

Percentage of predictions with less than 10 % deviation:  58.9 %.


In [11]:
# Evaluating base model without Hyperparameter Tuning
base_model = XGBRegressor(random_state = 0)
base_model.fit(X_train, y_train)
evaluate_model(base_model, X_test, y_test)

Percentage of predictions with less than 10 % deviation:  53.42 %.


## Hist Gradient ##

In [42]:
from sklearn.ensemble import HistGradientBoostingRegressor

learning_rate = [0.01, 0.1, 0.3]
max_iter = [50, 100, 200]
max_leaf_nodes = [31, 50, 100]
max_depth = [None]
min_samples_leaf = [20, 50, 100]
l2_regularization = [0, 0.5, 1]
random_state = [0]


# Create the parameter grid based on the results of random search 
param_grid = {
                    'learning_rate': learning_rate,
                    'max_iter': max_iter,
                    'max_leaf_nodes': max_leaf_nodes,
                    'max_depth': max_depth,
                    'min_samples_leaf': min_samples_leaf,
                    'l2_regularization': l2_regularization,
                    'random_state' : random_state
}

# Create a based model
hgbr = HistGradientBoostingRegressor(random_state=0)

# Instantiate the grid search model
grid_search = GridSearchCV(estimator = hgbr, param_grid = param_grid, 
                           scoring = 'neg_mean_absolute_error', cv = 3, 
                           n_jobs = -1, verbose = 2)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 243 candidates, totalling 729 fits


  y = column_or_1d(y, warn=True)


In [43]:
grid_search.best_params_

{'l2_regularization': 0.5,
 'learning_rate': 0.1,
 'max_depth': None,
 'max_iter': 200,
 'max_leaf_nodes': 31,
 'min_samples_leaf': 20,
 'random_state': 0}

In [None]:
grid_search_all_fits = pd.DataFrame(grid_search.cv_results_)
grid_search_all_fits = grid_search_all_fits.sort_values(by = ["rank_test_score"])
grid_search_all_fits

In [45]:
# Evaluating grid searched model after Hyperparameter Tuning
best_grid = grid_search.best_estimator_
evaluate_model(best_grid, X_test, y_test)

Percentage of predictions with less than 10 % deviation:  65.89 %.


In [46]:
# Evaluating base model without Hyperparameter Tuning
base_model = HistGradientBoostingRegressor(random_state = 0)
base_model.fit(X_train, y_train)
evaluate_model(base_model, X_test, y_test)

  y = column_or_1d(y, warn=True)


Percentage of predictions with less than 10 % deviation:  66.82 %.


## LGBM ##

In [62]:
import lightgbm as lgb

# Number of decision leaves in a single tree
num_leaves = [31, 64, 1024]
# Maximum number of levels in tree
max_depth = [-1 , 3, 6]
# Specifies the minimum number of observations that fit the decision criteria in a leaf
min_data_in_leaf = [20, 80, 300]
# Max number of bins that feature values will be bucketed in
max_bin = [255, 300, 450]
# L1 regularization
lambda_l1 = [int(x) for x in np.linspace(start= 0, stop = 100, num = 5)]
# L2 regularization
lambda_l2 = [int(x) for x in np.linspace(start= 0, stop = 100, num = 11)]
#
min_gain_to_split = [0, 8, 13]
#
n_estimators = [int(x) for x in np.linspace(start= 0, stop = 100, num = 5)]
#
learning_rate = [0.01, 0.1, 0.3]
#
random_state =  [0]


# Create the parameter grid based on the results of random search 
param_grid = {
              "num_leaves" : num_leaves,
              "max_depth" : max_depth,
              "min_data_in_leaf" : min_data_in_leaf,
              "max_bin" : max_bin,
              "lambda_l1" : lambda_l1,
              
              "min_gain_to_split" : min_gain_to_split,
              "n_estimators" : n_estimators,
              "learning_rate" : learning_rate,
              "random_state" : random_state
             }

# Create a based model
lgbm = lgb.LGBMRegressor()

# Instantiate the grid search model
grid_search = GridSearchCV(estimator = lgbm, param_grid = param_grid, 
                           scoring = max_10_error, cv = 3, 
                           n_jobs = -1, verbose = 2)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 18225 candidates, totalling 54675 fits


KeyboardInterrupt: 

In [38]:
grid_search.best_params_

{'lambda_l1': 0,
 'learning_rate': 0.3,
 'max_bin': 300,
 'max_depth': -1,
 'min_data_in_leaf': 20,
 'min_gain_to_split': 0,
 'n_estimators': 50,
 'num_leaves': 31,
 'random_state': 0}

In [None]:
grid_search_all_fits = pd.DataFrame(grid_search.cv_results_)
grid_search_all_fits = grid_search_all_fits.sort_values(by = ["rank_test_score"])
grid_search_all_fits

In [39]:
# Evaluating grid searched model after Hyperparameter Tuning
best_grid = grid_search.best_estimator_
evaluate_model(best_grid, X_test, y_test)

Percentage of predictions with less than 10 % deviation:  62.15 %.


In [41]:
# Evaluating base model without Hyperparameter Tuning
base_model = lgb.LGBMRegressor(random_state = 0)
base_model.fit(X_train, y_train)
evaluate_model(base_model, X_test, y_test)

  y = column_or_1d(y, warn=True)


Percentage of predictions with less than 10 % deviation:  64.95 %.
