In [1]:
# Imports
import sys
import pprint
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
import lightgbm as lgb
from sklearn.ensemble import HistGradientBoostingRegressor

import os
import sys
module_path = os.path.abspath(os.path.join(r'C:\Users\soube\OneDrive\Desktop\Hammudi\Bachelorarbeit\Repository\AP-rent-determination\tree_models.py'))
if module_path not in sys.path:
    sys.path.append(module_path)

from tree_model_functions import *

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)


# Selecting the DataSource
dataSource = r"C:\Users\soube\OneDrive\Desktop\Hammudi\Bachelorarbeit\Repository\AP-rent-determination\students_data\cleaned_data_conf_with_IQR_removal.csv"

# Selecting columns to drop out of featureList and creating LabelList
featureDropList = ["_id", "observationDate", "city", "AP_community", "community_id", "base_rent", "qm2_rent", "DE_qm2_rent"]
LabelList = ["qm2_rent"]

# Create DataFrame from DataSource
try: 
    dataframe = import_data(dataSource)
except:
    dataframe = pd.read_csv(dataSource)
    

#dataframe.drop(dataframe.filter(regex = "second"), axis = 1, inplace = True)
#dataframe.drop(dataframe.filter(regex = "third"), axis = 1, inplace = True)

Memory usage of dataframe is 79.16 MB
Memory usage after optimization is: 27.23 MB
Decreased by 65.6%


In [2]:
# Remove all hyphers from states
dataframe['state'] = dataframe['state'].astype(str).apply(lambda x: x.replace('-', '')).astype(str)

In [3]:
# Split Bayern to Nordbayern and Südbayern
dataframe.loc[(dataframe['state'] == 'Bayern') & (dataframe['postcode'] > 89999), 'state'] = "BayernNORD"
dataframe.loc[(dataframe['state'] == 'Bayern') & (dataframe['postcode'] < 90000), 'state'] = "BayernSÜD"

In [4]:
for category in ["postcode"]:
    dataframe[category] = dataframe[category].astype("category")
    dataframe[category] = dataframe[category].cat.codes

In [5]:
# Creating test and trainset like this, that every state is represented 80/20 in these sets
# If train_test_split without looping throug the states first, not all states would have 80/20 representation

# Create list of unique states
states = dataframe["state"].unique()

# Create list for model scores
train_set = pd.DataFrame()
test_set = pd.DataFrame()

for state in states:
    df = dataframe
    df = df[df["state"]  == state]

    # Create feature and label lists
    y = df[LabelList]
    X = df.drop(featureDropList, axis = 1)
    feature_list = list(X.columns)

    
    # Train test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

    trainset = pd.concat([X_train, y_train["qm2_rent"]], axis = 1)
    testset = pd.concat([X_test, y_test["qm2_rent"]], axis = 1)

    train_set = pd.concat([train_set, trainset], axis = 0)
    test_set = pd.concat([test_set, testset], axis = 0)

X_train_all = train_set.drop("qm2_rent", axis = 1)
X_test_all = test_set.drop("qm2_rent", axis = 1)
y_train_all = test_set[["qm2_rent", "state"]]
y_test_all = test_set[["qm2_rent", "state"]]

In [6]:
# Creating dataframes with collapsible states and regions for training the models

# Westen Combinations
Westen_1 = train_set[train_set.state.isin(['Saarland', 'RheinlandPfalz'])]
Westen_1.name = 'Westen_1'

Westen_2 = train_set[train_set.state.isin(['Saarland', 'RheinlandPfalz', 'NordrheinWestfalen'])]
Westen_2.name = 'Westen_2'

Westen_3 = train_set[train_set.state.isin(['Saarland', 'RheinlandPfalz', 'NordrheinWestfalen', 'Hessen'])]
Westen_3.name = 'Westen_3'

Westen_4 = train_set[train_set.state.isin(['Hessen', 'RheinlandPfalz'])]
Westen_4.name = 'Westen_4'

Westen_5 = train_set[train_set.state.isin(['NordrheinWestfalen', 'RheinlandPfalz'])]
Westen_5.name = 'Westen_5'

Westen_6 = train_set[train_set.state.isin(['Saarland', 'RheinlandPfalz', 'Hessen'])]
Westen_6.name = 'Westen_6'

Westen_7 = train_set[train_set.state.isin(['NordrheinWestfalen', 'Hessen'])]
Westen_7.name = 'Westen_7'

Westen_8 = train_set[train_set.state.isin(['NordrheinWestfalen', 'RheinlandPfalz', 'Hessen'])]
Westen_8.name = 'Westen_8'

Westen_9 = train_set[train_set.state.isin(['NordrheinWestfalen', 'Niedersachsen'])]
Westen_9.name = 'Westen_9'

Westen_10 = train_set[train_set.state.isin(['NordrheinWestfalen', 'Niedersachsen', 'Bremen'])]
Westen_10.name = 'Westen_10'


# Osten Combinations
Osten_1 = train_set[train_set.state.isin(['Sachsen', 'SachsenAnhalt', 'Brandenburg', 'Berlin', 'MecklenburgVorpommern', 'Thüringen'])]
Osten_1.name = 'Osten_1'

Osten_2 = train_set[train_set.state.isin(['Sachsen', 'SachsenAnhalt', 'Brandenburg', 'Berlin', 'MecklenburgVorpommern'])]
Osten_2.name = 'Osten_2'

Osten_3 = train_set[train_set.state.isin(['Sachsen', 'SachsenAnhalt', 'Brandenburg', 'Berlin'])]
Osten_3.name = 'Osten_3'

Osten_4 = train_set[train_set.state.isin(['Sachsen', 'SachsenAnhalt', 'Brandenburg'])]
Osten_4.name = 'Osten_4'

Osten_5 = train_set[train_set.state.isin(['Sachsen', 'SachsenAnhalt'])]
Osten_5.name = 'Osten_5'

Osten_6 = train_set[train_set.state.isin(['Sachsen', 'SachsenAnhalt', 'Brandenburg', 'MecklenburgVorpommern', 'Thüringen'])]
Osten_6.name = 'Osten_6'

Osten_7 = train_set[train_set.state.isin(['Sachsen', 'SachsenAnhalt', 'Brandenburg', 'MecklenburgVorpommern'])]
Osten_7.name = 'Osten_7'

Osten_8 = train_set[train_set.state.isin(['Sachsen', 'SachsenAnhalt', 'Brandenburg'])]
Osten_8.name = 'Osten_8'

Osten_9 = train_set[train_set.state.isin(['Sachsen', 'SachsenAnhalt', 'Thüringen'])]
Osten_9.name = 'Osten_9'

Osten_10 = train_set[train_set.state.isin(['Sachsen', 'SachsenAnhalt', 'Brandenburg', 'Thüringen'])]
Osten_10.name = 'Osten_10'

Osten_11 = train_set[train_set.state.isin(['SachsenAnhalt', 'Thüringen'])]
Osten_11.name = 'Osten_11'

Osten_12 = train_set[train_set.state.isin(['Brandenburg', 'Berlin'])]
Osten_12.name = 'Osten_12'

Osten_13 = train_set[train_set.state.isin(['Brandenburg', 'Mecklenburg-Vorpommern'])]
Osten_13.name = 'Osten_13'

Osten_14 = train_set[train_set.state.isin(['Sachsen', 'SachsenAnhalt', 'Brandenburg', 'Berlin', 'Türingen'])]
Osten_14.name = 'Osten_14'



# Nord Deutschland Combinations
Norden_1 = train_set[train_set.state.isin(['SchleswigHolstein', 'Hamburg', 'Niedersachsen', 'Bremen', 'MecklenburgVorpommern'])]
Norden_1.name = 'Norden_1'

Norden_2 = train_set[train_set.state.isin(['SchleswigHolstein', 'Hamburg', 'Niedersachsen', 'Bremen'])]
Norden_2.name = 'Norden_2'

Norden_3 = train_set[train_set.state.isin(['SchleswigHolstein', 'Hamburg'])]
Norden_3.name = 'Norden_3'

Norden_4 = train_set[train_set.state.isin(['SchleswigHolstein', 'Hamburg', 'Niedersachsen'])]
Norden_4.name = 'Norden_4'

Norden_5 = train_set[train_set.state.isin(['SchleswigHolstein', 'Hamburg', 'Bremen'])]
Norden_5.name = 'Norden_5'

Norden_6 = train_set[train_set.state.isin(['SchleswigHolstein', 'Niedersachsen'])]
Norden_6.name = 'Norden_6'

Norden_7 = train_set[train_set.state.isin(['SchleswigHolstein', 'Niedersachsen', 'Bremen'])]
Norden_7.name = 'Norden_7'

Norden_8 = train_set[train_set.state.isin(['SchleswigHolstein', 'Bremen'])]
Norden_8.name = 'Norden_8'

Norden_9 = train_set[train_set.state.isin(['Hamburg', 'Niedersachsen', 'Bremen'])]
Norden_9.name = 'Norden_9'

Norden_10 = train_set[train_set.state.isin(['Hamburg', 'Niedersachsen'])]
Norden_10.name = 'Norden_10'

Norden_11 = train_set[train_set.state.isin(['Hamburg', 'Bremen'])]
Norden_11.name = 'Norden_11'

Norden_12 = train_set[train_set.state.isin(['Niedersachsen', 'Bremen'])]
Norden_12.name = 'Norden_12'



# Süden Combinations
Süden_1 = train_set[train_set.state.isin(['BayernNORD', 'BadenWürttemberg'])]
Süden_1.name = 'Süden_1'

Süden_2 = train_set[train_set.state.isin(['BayernSÜD', 'BadenWürttemberg'])]
Süden_2.name = 'Süden_2'

Süden_3 = train_set[train_set.state.isin(['BayernNORD', 'BayernSÜD'])]
Süden_3.name = 'Süden_3'


# Zentrum Combinations
Zentrum_1 = train_set[train_set.state.isin(['Hessen', 'Thüringen'])]
Zentrum_1.name = 'Zentrum_1'

Zentrum_2 = train_set[train_set.state.isin(['Hessen', 'Thüringen', 'BayernNORD'])]
Zentrum_2.name = 'Zentrum_2'

Zentrum_3 = train_set[train_set.state.isin(['BayernNORD', 'Thüringen'])]
Zentrum_3.name = 'Zentrum_3'

Zentrum_4 = train_set[train_set.state.isin(['Hessen', 'BayernNORD'])]
Zentrum_4.name = 'Zentrum_4'



# Always check if every df is in list !!!!

# Create lists for all combination groups
Norden_dfs = [Norden_1, Norden_2, Norden_3, Norden_4, Norden_5, Norden_6, Norden_7, Norden_8, Norden_9, Norden_10, Norden_11, Norden_12]
Osten_dfs = [Osten_1, Osten_2, Osten_3, Osten_4, Osten_5, Osten_6, Osten_7, Osten_8, Osten_9, Osten_10, Osten_11, Osten_12, Osten_13, Osten_14]
Westen_dfs = [Westen_1, Westen_2, Westen_3, Westen_4, Westen_5, Westen_6, Westen_7, Westen_8, Westen_9, Westen_10]
Süden_dfs = [Süden_1, Süden_2, Süden_3]
Zentrum_dfs = [Zentrum_1, Zentrum_2, Zentrum_3, Zentrum_4]

input_dfs_list = Norden_dfs + Osten_dfs + Westen_dfs + Süden_dfs + Zentrum_dfs

In [7]:
# Create Validation Test sets for every state

validation_sets = {}
for state in states:
    validation_sets[f'{state}_Validation_X_TestSet'] = X_test_all.loc[X_test_all["state"] == state].drop("state", axis = 1)
    validation_sets[f'{state}_Validation_y_TestSet'] = y_test_all.loc[y_test_all["state"] == state].drop("state", axis = 1)['qm2_rent'].tolist()

In [8]:
# Create validation_test_set_lists (two with keys, two with values)

validation_y_TestSets_keys = []
validation_y_TestSets_values = []

validation_X_TestSets_keys = []
validation_X_TestSets_values = []

i = 0
for key, value in validation_sets.items():
    if i % 2:
        validation_y_TestSets_keys.append(key)
        validation_y_TestSets_values.append(value)
    else:
        validation_X_TestSets_keys.append(key)
        validation_X_TestSets_values.append(value)

    i = i + 1 

In [9]:
# Create list of dataframes that are used for training the model

dataframes = {}
for state in states:
    dataframes[f'{state}'] = train_set.loc[train_set["state"] == state]

In [10]:
for df in input_dfs_list:
    dataframes[df.name] = df
    print(df.name)

Norden_1
Norden_2
Norden_3
Norden_4
Norden_5
Norden_6
Norden_7
Norden_8
Norden_9
Norden_10
Norden_11
Norden_12
Osten_1
Osten_2
Osten_3
Osten_4
Osten_5
Osten_6
Osten_7
Osten_8
Osten_9
Osten_10
Osten_11
Osten_12
Osten_13
Osten_14
Westen_1
Westen_2
Westen_3
Westen_4
Westen_5
Westen_6
Westen_7
Westen_8
Westen_9
Westen_10
Süden_1
Süden_2
Süden_3
Zentrum_1
Zentrum_2
Zentrum_3
Zentrum_4


In [11]:
# Create dataframes_lists (one with keys, one with values)

dataframes_keys = []
dataframes_values = []

for key, value in dataframes.items():
    dataframes_keys.append(key)
    dataframes_values.append(value)

In [12]:
#LÖSCHNNNNNNNNNNN

"""states = ["BayernNORD", "BayernSÜD"]
dfs = [Süden_3]
dataframes = []
dataframes = dfs
validation_X_TestSets = [BayernNORD_Validation_X_TestSet, BayernSÜD_Validation_X_TestSet]
validation_y_TestSets = [BayernNORD_Validation_y_TestSet, BayernSÜD_Validation_y_TestSet]"""

'states = ["BayernNORD", "BayernSÜD"]\ndfs = [Süden_3]\ndataframes = []\ndataframes = dfs\nvalidation_X_TestSets = [BayernNORD_Validation_X_TestSet, BayernSÜD_Validation_X_TestSet]\nvalidation_y_TestSets = [BayernNORD_Validation_y_TestSet, BayernSÜD_Validation_y_TestSet]'

In [13]:
# Create dict with best_score and best_model_list for every state

best_scores = {}
best_models = {}
for state in states:
    best_scores[f'{state}_best_score'] = 0
    best_models[f'{state}_best_model'] = []

# Create best_score_lists (one with keys, one with values) (For later tracking of best score per state)

best_scores_keys = []
best_scores_values = []

for key, value in best_scores.items():
    best_scores_keys.append(key)
    best_scores_values.append(value)


# Create best_model_lists (one with keys, one with values) (For later tracking of best model per state)

best_models_keys = []
best_models_values = []

for key, value in best_models.items():
    best_models_keys.append(key)
    best_models_values.append(value)

In [19]:

# Loop throug all states to train them seperately

# Create list for model scores
state_prediction_score = []

# Clear the scores_file
open(r'C:\Users\soube\OneDrive\Desktop\Hammudi\Bachelorarbeit\Repository\AP-rent-determination\tree_models\randomForest_scores.txt', 'w').close()

for df in dataframes_values:
    df = df

    # Create feature and label lists
    y_train = df[LabelList]
    X_train = df.drop(["qm2_rent", "state"], axis = 1)
    feature_list = list(X_train.columns)

    #y = np.array(y)
    #X = np.array(X)

    # Instantiate model
    rf = XGBRegressor(colsample_bytree = 0.6, eta = 0.1, gamma = 0, max_depth = 10, min_child_weight = 1, n_estimators = 100, random_state = 0, reg_alpha = 0.8, reg_lambda = 1, subsample = 0.6)

    # Train the model on training data
    rf.fit(X_train, y_train.values.ravel())
    
    i = 0
    

    for X, y in zip(validation_X_TestSets_values, validation_y_TestSets_values):
        
        # Use the Regressors's predict method on the test data
        predictions = rf.predict(X)
        
        assert len(predictions) == len(y), 'Length of predictions is not len y_test'
        # Calculate relative prediction errors
        errors = [100 * (abs(predictions[i] - y[i])/ y[i]) for i in range((len(predictions)))]

        # Count of predictions that are at least 10% accurate
        count_good_predictions = sum(1 for i in errors if i <= 10)

        # Proportion of good predictions for the Testset
        good_predictions = round(np.mean(100 * (count_good_predictions / len(errors))), 2)

        state_prediction_score.append(["Prediction on dataframe: " ,df["state"].unique().tolist(), "Evaluating with Dataframe: ", states[i] ,good_predictions, df.shape[0]])
        

        # Compare performance of every state with every model to get best model for every state
        for state in states:
            if state == states[i]:
                if good_predictions > best_scores_values[i]:
                    best_scores_values[i] = good_predictions
                    best_models_values[i] = ["Prediction on dataframe: " ,df["state"].unique().tolist(), "Evaluating with Dataframe: ", states[i], "Prediction score on test data: ", good_predictions, "Number of rows of training data: ", df.shape[0]]


        
        




        # Write all scores to a file
        with open(r'C:\Users\soube\OneDrive\Desktop\Hammudi\Bachelorarbeit\Repository\AP-rent-determination\tree_models\randomForest_scores.txt', 'a') as f:
            
            f.write("The model got trained on:")
            f.write('\n')
            dataframe_name = repr(df["state"].unique())
            f.write(dataframe_name)
            f.write('\n')
            f.write('\n')
            f.write("The model got evaluated with:")
            f.write('\n')
            state_name = repr(states[i])
            f.write(state_name)
            f.write('\n')
            f.write('\n')
            f.write("Model score:")
            f.write('\n')
            good_predictions = repr(good_predictions)
            f.write(good_predictions)
            f.write('\n')
            f.write('\n')
            f.write("Train data shape:")
            f.write('\n')
            train_data_shape = repr(df.shape[0])
            f.write(train_data_shape)
            f.write('\n')
            f.write('\n')
            f.write('\n')
            f.write('\n')
        
        # Add 1 to get next state
        i = i + 1


In [20]:
# Calculate weighted overall model performance

model_performance = []

i = 0
for state in states:
        model_performance.append((f'{state}_best_model', best_models_values[i], len(dataframe.loc[dataframe["state"] == state])))
        i += 1

In [21]:
# Convert state_prediction_score list into DataFrame
model_performance_df = pd.DataFrame(model_performance, columns = ["model", "score", "inserates"])

# Weighted prediction score

number_of_inserates = model_performance_df["inserates"].sum()

model_performance_df["weighted_score"] = 10
for i in range(0, len(model_performance_df)):
    model_performance_df["weighted_score"][i] = model_performance_df["score"][i][5] * model_performance_df["inserates"][i]

final_prediction_score = model_performance_df["weighted_score"].sum() / number_of_inserates

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  model_performance_df["weighted_score"][i] = model_performance_df["score"][i][5] * model_performance_df["inserates"][i]


In [22]:
# Write best performing model for every state to file
with open(r'C:\Users\soube\OneDrive\Desktop\Hammudi\Bachelorarbeit\Repository\AP-rent-determination\tree_models\randomForest_best_scores.txt', 'w') as f:
    
    f.write("Model performance:")
    f.write('\n')
    modelperformance = repr(final_prediction_score)
    f.write(modelperformance)
    f.write('\n')
    f.write('\n') 

    i = 0

    for state in states:       
        f.write(f'{state}:')
        f.write('\n')

        for i in range(i, i + 1):
            best_model = repr(model_performance[i])
            f.write(best_model)
            f.write('\n')
            f.write('\n')

            i += 1

In [18]:
from pprint import pprint

pprint(state_prediction_score)

[['Prediction on dataframe: ',
  ['SchleswigHolstein'],
  'Evaluating with Dataframe: ',
  'SchleswigHolstein',
  72.34,
  1688],
 ['Prediction on dataframe: ',
  ['SchleswigHolstein'],
  'Evaluating with Dataframe: ',
  'Bremen',
  21.71,
  1688],
 ['Prediction on dataframe: ',
  ['SchleswigHolstein'],
  'Evaluating with Dataframe: ',
  'Hessen',
  28.05,
  1688],
 ['Prediction on dataframe: ',
  ['SchleswigHolstein'],
  'Evaluating with Dataframe: ',
  'Hamburg',
  21.26,
  1688],
 ['Prediction on dataframe: ',
  ['SchleswigHolstein'],
  'Evaluating with Dataframe: ',
  'Thüringen',
  18.07,
  1688],
 ['Prediction on dataframe: ',
  ['SchleswigHolstein'],
  'Evaluating with Dataframe: ',
  'NordrheinWestfalen',
  26.47,
  1688],
 ['Prediction on dataframe: ',
  ['SchleswigHolstein'],
  'Evaluating with Dataframe: ',
  'MecklenburgVorpommern',
  13.92,
  1688],
 ['Prediction on dataframe: ',
  ['SchleswigHolstein'],
  'Evaluating with Dataframe: ',
  'BayernNORD',
  31.18,
  1688],
 [