In [36]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer
from sklearn.svm import SVR
from sklearn import metrics as met
from sklearn.ensemble import RandomForestRegressor

import numpy as np
import pandas as pd

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Selecting the DataSource
dataSource = r"C:\Users\soube\OneDrive\Desktop\Hammudi\Bachelorarbeit\Repository\AP-rent-determination\students_data\cleaned_data_conf_with_IQR_removal.csv"

# Selecting columns to drop out of featureList and creating LabelList
featureDropList = ["_id", "observationDate", "state", "city", "AP_community", "community_id","postcode", "base_rent", "qm2_rent", "DE_qm2_rent"]
LabelList = ["qm2_rent"]

# Create DataFrame from DataSource
df = pd.read_csv(dataSource)
df = df[df["state"] == "Sachsen-Anhalt"]

# Create feature and label lists
y = df[LabelList]
X = df.drop(featureDropList, axis = 1)
feature_list = list(X.columns)

y = np.array(y)
X = np.array(X)

# Train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)


def mean_absolute_error(X, y):
    subtracted = list()
    for item1, item2 in zip(X, y):
        item = abs(item1 - item2)
        subtracted.append(item)
    mae = sum(subtracted) / len(subtracted)
    return mae

def model_10_score(predictions, y_test):

    assert len(predictions) == len(y_test), 'Length of predictions is not len y_test'
    errors = [100 * (abs(predictions[i] - y_test[i])/ y_test[i]) for i in range((len(predictions)))]
    count_good_predictions = sum(1 for i in errors if i <= 10)
    good_predictions = round(np.mean(100 * (count_good_predictions / len(errors))), 2)
    return good_predictions

max_10_error = make_scorer(model_10_score, greater_is_better = True)

def evaluate_model(model, X_test, y_test):
    predictions = model.predict(X_test)
    errors = [100 * (abs(predictions[i] - y_test[i])/ y_test[i]) for i in range(min(len(predictions), len(y_test)))]
    count_good_predictions = sum(1 for i in errors if i <= 10)
    good_predictions = round(np.mean(100 * (count_good_predictions / len(errors))), 2)
    print('Percentage of predictions with less than 10 % deviation: ', good_predictions, '%.')

In [42]:
# Number of trees in random forest
n_estimators = [100, 200]
# Number of features to consider at every split
max_features = [1, 50, 75]
# Maximum number of levels in tree
max_depth = [32, 50, 90]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 6]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 3]
# Method of selecting samples for training each tree
bootstrap = [False, True]
# Random state
random_state = [0]

# Create the parameter grid based on the results of random search 
param_grid =    {
                'n_estimators': n_estimators,
                'random_state' : random_state
                }

# Create a based model
rf = RandomForestRegressor()

# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                           scoring = 'neg_mean_absolute_percentage_error', cv = 3, 
                           n_jobs = -1, verbose = 10)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)


Fitting 3 folds for each of 2 candidates, totalling 6 fits


  self.best_estimator_.fit(X, y, **fit_params)


In [43]:
# Evaluating grid searched model after Hyperparameter Tuning
best_grid = grid_search.best_estimator_
evaluate_model(best_grid, X_test, y_test)

Percentage of predictions with less than 10 % deviation:  79.63 %.


In [44]:
grid_search.best_params_

{'n_estimators': 200, 'random_state': 0}

In [45]:
grid_search_all_fits = pd.DataFrame(grid_search.cv_results_)
grid_search_all_fits = grid_search_all_fits.sort_values(by = ["rank_test_score"])
grid_search_all_fits

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_random_state,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
1,40.999508,1.158667,0.074685,0.0033,200,0,"{'n_estimators': 200, 'random_state': 0}",-0.068123,-0.072419,-0.078622,-0.073055,0.00431,1
0,20.067281,0.410974,0.043598,0.003985,100,0,"{'n_estimators': 100, 'random_state': 0}",-0.068527,-0.073223,-0.079301,-0.073683,0.004411,2


In [46]:
# Evaluating base model without Hyperparameter Tuning
base_model = RandomForestRegressor(random_state = 0)
base_model.fit(X_train, y_train)
evaluate_model(base_model, X_test, y_test)

  base_model.fit(X_train, y_train)


Percentage of predictions with less than 10 % deviation:  80.19 %.
