In [1]:
# Imports
import sys
import pprint
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
import lightgbm as lgb
from sklearn.ensemble import HistGradientBoostingRegressor

import os
import sys
# Make the project's helper modules importable.
# sys.path entries have to be directories: the original appended the path of
# the file tree_models.py itself, which cannot be searched for modules.
# NOTE(review): assumes tree_model_functions.py sits in the same directory as
# tree_models.py — confirm.
module_path = os.path.dirname(os.path.abspath(r'C:\Users\soube\OneDrive\Desktop\Hammudi\Bachelorarbeit\Repository\AP-rent-determination\tree_models.py'))
if module_path not in sys.path:
    sys.path.append(module_path)

from tree_model_functions import *

# Display frames without truncating rows or columns
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)


# Selecting the DataSource
dataSource = r"C:\Users\soube\OneDrive\Desktop\Hammudi\Bachelorarbeit\Repository\AP-rent-determination\students_data\cleaned_data_conf_with_IQR_removal.csv"

# Columns that are dropped from the feature matrix (the label column qm2_rent
# is one of them) and the label column itself
featureDropList = ["_id", "observationDate", "city", "AP_community", "community_id", "base_rent", "qm2_rent", "DE_qm2_rent"]
LabelList = ["qm2_rent"]

# Create DataFrame from DataSource.
# Prefer the project loader; fall back to plain pandas when it is unavailable
# or fails. Only Exception is caught so Ctrl-C / kernel interrupts still
# propagate, and the reason for the fallback is reported instead of hidden.
try:
    dataframe = import_data(dataSource)
except Exception as load_error:
    print(f"import_data failed ({load_error!r}); falling back to pd.read_csv")
    dataframe = pd.read_csv(dataSource)


#dataframe.drop(dataframe.filter(regex = "second"), axis = 1, inplace = True)
#dataframe.drop(dataframe.filter(regex = "third"), axis = 1, inplace = True)

Memory usage of dataframe is 79.16 MB
Memory usage after optimization is: 27.23 MB
Decreased by 65.6%


In [2]:
# Remove all hyphens from the state names
dataframe['state'] = (
    dataframe['state']
    .astype(str)
    .str.replace('-', '', regex=False)
    .astype(str)
)

In [3]:
# Split Bayern into two pseudo-states by postcode:
# postcodes above 89999 become "BayernNORD", postcodes below 90000 "BayernSÜD".
# The second mask is computed after the first assignment, exactly as before.
bayern_north_rows = (dataframe['state'] == 'Bayern') & (dataframe['postcode'] > 89999)
dataframe.loc[bayern_north_rows, 'state'] = "BayernNORD"

bayern_south_rows = (dataframe['state'] == 'Bayern') & (dataframe['postcode'] < 90000)
dataframe.loc[bayern_south_rows, 'state'] = "BayernSÜD"

In [4]:
# Replace every listed column by the integer codes of its categories
for category in ["postcode"]:
    dataframe[category] = dataframe[category].astype("category").cat.codes

In [5]:
# Build the train and test sets state by state, so that every state
# contributes 80 % of its rows to the training set and 20 % to the test set.
# One train_test_split over the whole frame would not guarantee that ratio
# for each individual state.

# Create list of unique states
states = dataframe["state"].unique()

# Per-state partitions; they are concatenated once after the loop instead of
# growing a DataFrame inside it
train_parts = []
test_parts = []

for state in states:
    df = dataframe[dataframe["state"] == state]

    # Create feature and label lists
    y = df[LabelList]
    X = df.drop(featureDropList, axis = 1)
    feature_list = list(X.columns)

    # Train test split (fixed random_state keeps the partition reproducible)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

    # Put the label column back next to its features
    train_parts.append(pd.concat([X_train, y_train["qm2_rent"]], axis = 1))
    test_parts.append(pd.concat([X_test, y_test["qm2_rent"]], axis = 1))

train_set = pd.concat(train_parts, axis = 0)
test_set = pd.concat(test_parts, axis = 0)

X_train_all = train_set.drop("qm2_rent", axis = 1)
X_test_all = test_set.drop("qm2_rent", axis = 1)
# The training labels come from train_set; the original took them from
# test_set, which made y_train_all a copy of y_test_all.
y_train_all = train_set[["qm2_rent", "state"]]
y_test_all = test_set[["qm2_rent", "state"]]

In [6]:
states

array(['SchleswigHolstein', 'Bremen', 'Hessen', 'Hamburg', 'Thüringen',
       'NordrheinWestfalen', 'MecklenburgVorpommern', 'BayernNORD',
       'Sachsen', 'BayernSÜD', 'Brandenburg', 'Berlin', 'Saarland',
       'SachsenAnhalt', 'BadenWürttemberg', 'Niedersachsen',
       'RheinlandPfalz'], dtype=object)

In [7]:
# Test data of every state, stored under the keys
# "<state>_Validation_X_TestSet" (features without the "state" column) and
# "<state>_Validation_y_TestSet" (rents as a plain list)
validation_sets = {}
for state in states:
    state_features = X_test_all.loc[X_test_all["state"] == state]
    state_labels = y_test_all.loc[y_test_all["state"] == state]

    validation_sets[f'{state}_Validation_X_TestSet'] = state_features.drop("state", axis = 1)
    validation_sets[f'{state}_Validation_y_TestSet'] = state_labels['qm2_rent'].tolist()

In [8]:
# Creating Validation Test sets for every state

def _validation_split(state_name):
    """Return the test data of one state.

    Returns a pair: the rows of X_test_all for that state without the
    "state" column, and the matching qm2_rent values of y_test_all as a list.
    """
    features = X_test_all.loc[X_test_all["state"] == state_name].drop("state", axis = 1)
    rents = y_test_all.loc[y_test_all["state"] == state_name]['qm2_rent'].tolist()
    return features, rents


SchleswigHolstein_Validation_X_TestSet, SchleswigHolstein_Validation_y_TestSet = _validation_split("SchleswigHolstein")
Bremen_Validation_X_TestSet, Bremen_Validation_y_TestSet = _validation_split("Bremen")
Hessen_Validation_X_TestSet, Hessen_Validation_y_TestSet = _validation_split("Hessen")
Hamburg_Validation_X_TestSet, Hamburg_Validation_y_TestSet = _validation_split("Hamburg")
Thüringen_Validation_X_TestSet, Thüringen_Validation_y_TestSet = _validation_split("Thüringen")
NordrheinWestfalen_Validation_X_TestSet, NordrheinWestfalen_Validation_y_TestSet = _validation_split("NordrheinWestfalen")
MecklenburgVorpommern_Validation_X_TestSet, MecklenburgVorpommern_Validation_y_TestSet = _validation_split("MecklenburgVorpommern")
BayernNORD_Validation_X_TestSet, BayernNORD_Validation_y_TestSet = _validation_split("BayernNORD")
BayernSÜD_Validation_X_TestSet, BayernSÜD_Validation_y_TestSet = _validation_split("BayernSÜD")
Sachsen_Validation_X_TestSet, Sachsen_Validation_y_TestSet = _validation_split("Sachsen")
Brandenburg_Validation_X_TestSet, Brandenburg_Validation_y_TestSet = _validation_split("Brandenburg")
Berlin_Validation_X_TestSet, Berlin_Validation_y_TestSet = _validation_split("Berlin")
Saarland_Validation_X_TestSet, Saarland_Validation_y_TestSet = _validation_split("Saarland")
SachsenAnhalt_Validation_X_TestSet, SachsenAnhalt_Validation_y_TestSet = _validation_split("SachsenAnhalt")
BadenWürttemberg_Validation_X_TestSet, BadenWürttemberg_Validation_y_TestSet = _validation_split("BadenWürttemberg")
Niedersachsen_Validation_X_TestSet, Niedersachsen_Validation_y_TestSet = _validation_split("Niedersachsen")
RheinlandPfalz_Validation_X_TestSet, RheinlandPfalz_Validation_y_TestSet = _validation_split("RheinlandPfalz")


In [9]:
# The evaluation loop walks these lists in lockstep with `states`, so their
# order has to match it exactly. They are therefore derived from `states`
# (via the validation_sets dictionary built earlier) instead of being typed
# out by hand, which keeps them aligned even if the state order in the data
# changes.
validation_X_TestSets = [validation_sets[f'{state}_Validation_X_TestSet'] for state in states]
validation_y_TestSets = [validation_sets[f'{state}_Validation_y_TestSet'] for state in states]

In [10]:
states

array(['SchleswigHolstein', 'Bremen', 'Hessen', 'Hamburg', 'Thüringen',
       'NordrheinWestfalen', 'MecklenburgVorpommern', 'BayernNORD',
       'Sachsen', 'BayernSÜD', 'Brandenburg', 'Berlin', 'Saarland',
       'SachsenAnhalt', 'BadenWürttemberg', 'Niedersachsen',
       'RheinlandPfalz'], dtype=object)

In [11]:
# Creating dataframes for every state

def _training_rows(state_name):
    """Return the rows of train_set whose "state" column equals state_name."""
    return train_set.loc[train_set["state"] == state_name]


# No Combinations
Saarland = _training_rows("Saarland")
RheinlandPfalz = _training_rows("RheinlandPfalz")
BayernNORD = _training_rows("BayernNORD")
BayernSÜD = _training_rows("BayernSÜD")
Sachsen = _training_rows("Sachsen")
SachsenAnhalt = _training_rows("SachsenAnhalt")
SchleswigHolstein = _training_rows("SchleswigHolstein")
Bremen = _training_rows("Bremen")
Hessen = _training_rows("Hessen")
Hamburg = _training_rows("Hamburg")
Thüringen = _training_rows("Thüringen")
NordrheinWestfalen = _training_rows("NordrheinWestfalen")
MecklenburgVorpommern = _training_rows("MecklenburgVorpommern")
Brandenburg = _training_rows("Brandenburg")
Berlin = _training_rows("Berlin")
BadenWürttemberg = _training_rows("BadenWürttemberg")
Niedersachsen = _training_rows("Niedersachsen")


In [12]:
# Creating dataframes with collapsible states and regions for training the models

# State names that actually occur in the training data; used to reject typos
_known_states = set(train_set["state"].unique())


def _combine_states(*state_names):
    """Return the training rows of all given states as one frame.

    Raises:
        ValueError: if a name does not occur in train_set["state"]. A
            misspelled name would otherwise just match no rows and silently
            shrink the region.
    """
    unknown = sorted(set(state_names) - _known_states)
    if unknown:
        raise ValueError(f"Unknown state name(s) in region definition: {unknown}")
    return train_set[train_set["state"].isin(list(state_names))]


# Westen Combinations
Westen_1 = _combine_states('Saarland', 'RheinlandPfalz')
Westen_2 = _combine_states('Saarland', 'RheinlandPfalz', 'NordrheinWestfalen')
Westen_3 = _combine_states('Saarland', 'RheinlandPfalz', 'NordrheinWestfalen', 'Hessen')
Westen_4 = _combine_states('Hessen', 'RheinlandPfalz')
Westen_5 = _combine_states('NordrheinWestfalen', 'RheinlandPfalz')
Westen_6 = _combine_states('Saarland', 'RheinlandPfalz', 'Hessen')
Westen_7 = _combine_states('NordrheinWestfalen', 'Hessen')
Westen_8 = _combine_states('NordrheinWestfalen', 'RheinlandPfalz', 'Hessen')
Westen_9 = _combine_states('NordrheinWestfalen', 'Niedersachsen')
Westen_10 = _combine_states('NordrheinWestfalen', 'Niedersachsen', 'Bremen')

# Osten Combinations
Osten_1 = _combine_states('Sachsen', 'SachsenAnhalt', 'Brandenburg', 'Berlin', 'MecklenburgVorpommern', 'Thüringen')
Osten_2 = _combine_states('Sachsen', 'SachsenAnhalt', 'Brandenburg', 'Berlin', 'MecklenburgVorpommern')
Osten_3 = _combine_states('Sachsen', 'SachsenAnhalt', 'Brandenburg', 'Berlin')
Osten_4 = _combine_states('Sachsen', 'SachsenAnhalt', 'Brandenburg')
Osten_5 = _combine_states('Sachsen', 'SachsenAnhalt')
Osten_6 = _combine_states('Sachsen', 'SachsenAnhalt', 'Brandenburg', 'MecklenburgVorpommern', 'Thüringen')
Osten_7 = _combine_states('Sachsen', 'SachsenAnhalt', 'Brandenburg', 'MecklenburgVorpommern')
# NOTE(review): Osten_8 lists the same states as Osten_4 — confirm whether a
# different combination was intended.
Osten_8 = _combine_states('Sachsen', 'SachsenAnhalt', 'Brandenburg')
Osten_9 = _combine_states('Sachsen', 'SachsenAnhalt', 'Thüringen')
Osten_10 = _combine_states('Sachsen', 'SachsenAnhalt', 'Brandenburg', 'Thüringen')
Osten_11 = _combine_states('SachsenAnhalt', 'Thüringen')
Osten_12 = _combine_states('Brandenburg', 'Berlin')
# Hyphens were stripped from the state names earlier, so the hyphenated
# spelling 'Mecklenburg-Vorpommern' never matched any row.
Osten_13 = _combine_states('Brandenburg', 'MecklenburgVorpommern')
# 'Türingen' was a typo for 'Thüringen' and never matched any row.
Osten_14 = _combine_states('Sachsen', 'SachsenAnhalt', 'Brandenburg', 'Berlin', 'Thüringen')


# Nord Deutschland Combinations
Norden_1 = _combine_states('SchleswigHolstein', 'Hamburg', 'Niedersachsen', 'Bremen', 'MecklenburgVorpommern')
Norden_2 = _combine_states('SchleswigHolstein', 'Hamburg', 'Niedersachsen', 'Bremen')
Norden_3 = _combine_states('SchleswigHolstein', 'Hamburg')
Norden_4 = _combine_states('SchleswigHolstein', 'Hamburg', 'Niedersachsen')
Norden_5 = _combine_states('SchleswigHolstein', 'Hamburg', 'Bremen')
Norden_6 = _combine_states('SchleswigHolstein', 'Niedersachsen')
Norden_7 = _combine_states('SchleswigHolstein', 'Niedersachsen', 'Bremen')
Norden_8 = _combine_states('SchleswigHolstein', 'Bremen')
Norden_9 = _combine_states('Hamburg', 'Niedersachsen', 'Bremen')
Norden_10 = _combine_states('Hamburg', 'Niedersachsen')
Norden_11 = _combine_states('Hamburg', 'Bremen')
Norden_12 = _combine_states('Niedersachsen', 'Bremen')


# Süden Combinations
Süden_1 = _combine_states('BayernNORD', 'BadenWürttemberg')
Süden_2 = _combine_states('BayernSÜD', 'BadenWürttemberg')
Süden_3 = _combine_states('BayernNORD', 'BayernSÜD')

# Zentrum Combinations
Zentrum_1 = _combine_states('Hessen', 'Thüringen')
Zentrum_2 = _combine_states('Hessen', 'Thüringen', 'BayernNORD')
Zentrum_3 = _combine_states('BayernNORD', 'Thüringen')
Zentrum_4 = _combine_states('Hessen', 'BayernNORD')

In [13]:
# Keep these lists complete: every region frame defined above has to appear
# in the list of its group.

# Create lists for all combination groups
Norden_dfs = [
    Norden_1, Norden_2, Norden_3, Norden_4, Norden_5, Norden_6,
    Norden_7, Norden_8, Norden_9, Norden_10, Norden_11, Norden_12,
]
Osten_dfs = [
    Osten_1, Osten_2, Osten_3, Osten_4, Osten_5, Osten_6, Osten_7,
    Osten_8, Osten_9, Osten_10, Osten_11, Osten_12, Osten_13, Osten_14,
]
Westen_dfs = [
    Westen_1, Westen_2, Westen_3, Westen_4, Westen_5,
    Westen_6, Westen_7, Westen_8, Westen_9, Westen_10,
]
Süden_dfs = [
    Süden_1, Süden_2, Süden_3,
]
Zentrum_dfs = [
    Zentrum_1, Zentrum_2, Zentrum_3, Zentrum_4,
]

In [14]:
# Create list of dataframes that are used for training the model:
# first every single state, then the collapsible states and regions
dataframes = [
    # All states
    SchleswigHolstein, Bremen, Hessen, Hamburg, Thüringen, NordrheinWestfalen,
    MecklenburgVorpommern, BayernNORD, BayernSÜD, Sachsen, Brandenburg, Berlin,
    Saarland, SachsenAnhalt, BadenWürttemberg, Niedersachsen, RheinlandPfalz,
    # Region combinations
    *Norden_dfs,
    *Osten_dfs,
    *Westen_dfs,
    *Süden_dfs,
    *Zentrum_dfs,
]

In [15]:
# TODO: delete this cell — disabled debugging override that would restrict the run to BayernNORD and BayernSÜD

"""states = ["BayernNORD", "BayernSÜD"]
dfs = [Süden_3]
dataframes = []
dataframes = dfs
validation_X_TestSets = [BayernNORD_Validation_X_TestSet, BayernSÜD_Validation_X_TestSet]
validation_y_TestSets = [BayernNORD_Validation_y_TestSet, BayernSÜD_Validation_y_TestSet]"""

'states = ["BayernNORD", "BayernSÜD"]\ndfs = [Süden_3]\ndataframes = []\ndataframes = dfs\nvalidation_X_TestSets = [BayernNORD_Validation_X_TestSet, BayernSÜD_Validation_X_TestSet]\nvalidation_y_TestSets = [BayernNORD_Validation_y_TestSet, BayernSÜD_Validation_y_TestSet]'

In [16]:
# Start values for the per-state search: the best score seen so far (0) and
# the description of the model that reached it (empty until a model scores
# above 0)
Saarland_best_score, Saarland_best_model = 0, []
RheinlandPfalz_best_score, RheinlandPfalz_best_model = 0, []
BayernNORD_best_score, BayernNORD_best_model = 0, []
BayernSÜD_best_score, BayernSÜD_best_model = 0, []
Sachsen_best_score, Sachsen_best_model = 0, []
SachsenAnhalt_best_score, SachsenAnhalt_best_model = 0, []
SchleswigHolstein_best_score, SchleswigHolstein_best_model = 0, []
Bremen_best_score, Bremen_best_model = 0, []
Hessen_best_score, Hessen_best_model = 0, []
Hamburg_best_score, Hamburg_best_model = 0, []
Thüringen_best_score, Thüringen_best_model = 0, []
NordrheinWestfalen_best_score, NordrheinWestfalen_best_model = 0, []
MecklenburgVorpommern_best_score, MecklenburgVorpommern_best_model = 0, []
Brandenburg_best_score, Brandenburg_best_model = 0, []
Berlin_best_score, Berlin_best_model = 0, []
BadenWürttemberg_best_score, BadenWürttemberg_best_model = 0, []
Niedersachsen_best_score, Niedersachsen_best_model = 0, []

In [17]:
######
####
###
##
#
# TODO: the grid search MUST be improved — e.g. for Schleswig-Holstein the default random forest model with random_state = 0 already reaches over 80% performance!
#
##
###
####
#####

In [18]:

# Train one random forest per training frame (single states and regions) and
# evaluate every model on the test set of every state. For each state the
# best score and the model that reached it are kept in <state>_best_score /
# <state>_best_model.

# Create list for model scores
state_prediction_score = []

# Clear the scores_file
scores_path = r'C:\Users\soube\OneDrive\Desktop\Hammudi\Bachelorarbeit\Repository\AP-rent-determination\tree_models\randomForest_scores.txt'
with open(scores_path, 'w'):
    pass

# The evaluation walks these three sequences in lockstep, so they have to
# describe the same states in the same order; at least the lengths are checked.
assert len(states) == len(validation_X_TestSets) == len(validation_y_TestSets), 'states and validation sets differ in length'

for df in dataframes:

    # Create feature and label lists
    y_train = df[LabelList]
    X_train = df.drop(["qm2_rent", "state"], axis = 1)
    feature_list = list(X_train.columns)

    # States contained in this training frame and its number of rows
    trained_on = df["state"].unique()
    train_rows = df.shape[0]

    # Instantiate model
    rf = RandomForestRegressor(bootstrap=False, max_depth=None, max_features=50, n_estimators=100, random_state = 0, min_samples_leaf= 1,min_samples_split= 2)

    # Train the model on training data
    rf.fit(X_train, y_train.values.ravel())

    # Text blocks for the scores file, written once per trained model
    report_entries = []

    for state_name, X, y in zip(states, validation_X_TestSets, validation_y_TestSets):

        # Use the Regressors's predict method on the test data
        predictions = rf.predict(X)

        assert len(predictions) == len(y), 'Length of predictions is not len y_test'
        # Calculate relative prediction errors in percent
        errors = [100 * (abs(predicted - actual) / actual) for predicted, actual in zip(predictions, y)]

        # Count of predictions that deviate by at most 10 %
        count_good_predictions = sum(1 for error in errors if error <= 10)

        # Proportion of good predictions for the Testset (percent, 2 decimals)
        good_predictions = round(np.mean(100 * (count_good_predictions / len(errors))), 2)

        state_prediction_score.append(["Prediction on dataframe: " , trained_on.tolist(), "Evaluating with Dataframe: ", state_name, good_predictions, train_rows])

        # Description stored as <state>_best_model when this model is the new best
        candidate = ["Prediction on dataframe: " , trained_on.tolist(), "Evaluating with Dataframe: ", state_name, "Prediction score on test data: ", good_predictions, "Number of rows of training data: ", train_rows]

        # Compare performance of every state with every model to get best model for every state
        if state_name == "SchleswigHolstein" and good_predictions > SchleswigHolstein_best_score:
            SchleswigHolstein_best_score, SchleswigHolstein_best_model = good_predictions, candidate
        elif state_name == "Bremen" and good_predictions > Bremen_best_score:
            Bremen_best_score, Bremen_best_model = good_predictions, candidate
        elif state_name == "Hessen" and good_predictions > Hessen_best_score:
            Hessen_best_score, Hessen_best_model = good_predictions, candidate
        elif state_name == "Hamburg" and good_predictions > Hamburg_best_score:
            Hamburg_best_score, Hamburg_best_model = good_predictions, candidate
        elif state_name == "Thüringen" and good_predictions > Thüringen_best_score:
            Thüringen_best_score, Thüringen_best_model = good_predictions, candidate
        elif state_name == "NordrheinWestfalen" and good_predictions > NordrheinWestfalen_best_score:
            NordrheinWestfalen_best_score, NordrheinWestfalen_best_model = good_predictions, candidate
        elif state_name == "MecklenburgVorpommern" and good_predictions > MecklenburgVorpommern_best_score:
            MecklenburgVorpommern_best_score, MecklenburgVorpommern_best_model = good_predictions, candidate
        elif state_name == "BayernNORD" and good_predictions > BayernNORD_best_score:
            BayernNORD_best_score, BayernNORD_best_model = good_predictions, candidate
        elif state_name == "BayernSÜD" and good_predictions > BayernSÜD_best_score:
            BayernSÜD_best_score, BayernSÜD_best_model = good_predictions, candidate
        elif state_name == "Sachsen" and good_predictions > Sachsen_best_score:
            Sachsen_best_score, Sachsen_best_model = good_predictions, candidate
        elif state_name == "Brandenburg" and good_predictions > Brandenburg_best_score:
            Brandenburg_best_score, Brandenburg_best_model = good_predictions, candidate
        elif state_name == "Berlin" and good_predictions > Berlin_best_score:
            Berlin_best_score, Berlin_best_model = good_predictions, candidate
        elif state_name == "Saarland" and good_predictions > Saarland_best_score:
            Saarland_best_score, Saarland_best_model = good_predictions, candidate
        elif state_name == "SachsenAnhalt" and good_predictions > SachsenAnhalt_best_score:
            SachsenAnhalt_best_score, SachsenAnhalt_best_model = good_predictions, candidate
        elif state_name == "BadenWürttemberg" and good_predictions > BadenWürttemberg_best_score:
            BadenWürttemberg_best_score, BadenWürttemberg_best_model = good_predictions, candidate
        elif state_name == "Niedersachsen" and good_predictions > Niedersachsen_best_score:
            Niedersachsen_best_score, Niedersachsen_best_model = good_predictions, candidate
        elif state_name == "RheinlandPfalz" and good_predictions > RheinlandPfalz_best_score:
            RheinlandPfalz_best_score, RheinlandPfalz_best_model = good_predictions, candidate

        # Same text layout as before; good_predictions stays numeric instead
        # of being overwritten by its repr string
        report_entries.append(
            "The model got trained on:\n"
            f"{trained_on!r}\n\n"
            "The model got evaluated with:\n"
            f"{state_name!r}\n\n"
            "Model score:\n"
            f"{good_predictions!r}\n\n"
            "Train data shape:\n"
            f"{train_rows!r}\n\n\n\n"
        )

    # Write all scores of this model to the file (one append per model rather
    # than one per evaluation)
    with open(scores_path, 'a') as f:
        f.writelines(report_entries)




# Calculate weighted overall model performance:
# the best score of every state, weighted by the number of rows that state
# has in `dataframe`.
# The original only included 10 of the 17 states, so the overall figure
# ignored SchleswigHolstein, Bremen, Hessen, Hamburg, Thüringen,
# NordrheinWestfalen and MecklenburgVorpommern.
# The <state>_best_score variables are used instead of <state>_best_model[5]:
# both hold the same value once a model has been recorded, but the score is
# also defined (0) when no model ever scored above 0, where indexing the
# empty list would raise IndexError.
best_score_by_state = [
    ("SchleswigHolstein", SchleswigHolstein_best_score),
    ("Bremen", Bremen_best_score),
    ("Hessen", Hessen_best_score),
    ("Hamburg", Hamburg_best_score),
    ("Thüringen", Thüringen_best_score),
    ("NordrheinWestfalen", NordrheinWestfalen_best_score),
    ("MecklenburgVorpommern", MecklenburgVorpommern_best_score),
    ("BayernNORD", BayernNORD_best_score),
    ("BayernSÜD", BayernSÜD_best_score),
    ("Sachsen", Sachsen_best_score),
    ("Brandenburg", Brandenburg_best_score),
    ("Berlin", Berlin_best_score),
    ("Saarland", Saarland_best_score),
    ("SachsenAnhalt", SachsenAnhalt_best_score),
    ("BadenWürttemberg", BadenWürttemberg_best_score),
    ("Niedersachsen", Niedersachsen_best_score),
    ("RheinlandPfalz", RheinlandPfalz_best_score),
]

# One (score, number of rows) pair per state
model_performance = [
    (best_score, len(dataframe.loc[dataframe["state"] == state_name]))
    for state_name, best_score in best_score_by_state
]

# Convert the (score, rows) pairs into a DataFrame
model_performance_df = pd.DataFrame(model_performance, columns = ["score", "inserates"])

# Weighted prediction score
number_of_inserates = model_performance_df["inserates"].sum()

model_performance_df["weighted_score"] = model_performance_df["score"] * model_performance_df["inserates"]

prediction_score = model_performance_df["weighted_score"].sum() / number_of_inserates







# Write best performing model for every state to file.
# Layout: the overall score first, then one "<state>:" heading followed by
# the repr of that state's best model; sections are separated by an empty line.
best_model_by_state = [
    ("SchleswigHolstein", SchleswigHolstein_best_model),
    ("Bremen", Bremen_best_model),
    ("Hessen", Hessen_best_model),
    ("Hamburg", Hamburg_best_model),
    ("Thüringen", Thüringen_best_model),
    ("NordrheinWestfalen", NordrheinWestfalen_best_model),
    ("MecklenburgVorpommern", MecklenburgVorpommern_best_model),
    ("BayernNORD", BayernNORD_best_model),
    ("BayernSÜD", BayernSÜD_best_model),
    ("Sachsen", Sachsen_best_model),
    ("Brandenburg", Brandenburg_best_model),
    ("Berlin", Berlin_best_model),
    ("Saarland", Saarland_best_model),
    ("SachsenAnhalt", SachsenAnhalt_best_model),
    ("BadenWürttemberg", BadenWürttemberg_best_model),
    ("Niedersachsen", Niedersachsen_best_model),
    ("RheinlandPfalz", RheinlandPfalz_best_model),
]

with open(r'C:\Users\soube\OneDrive\Desktop\Hammudi\Bachelorarbeit\Repository\AP-rent-determination\tree_models\randomForest_best_scores.txt', 'w') as f:
    modelperformance = repr(prediction_score)
    f.write(f"Model performance:\n{modelperformance}\n\n")

    last_position = len(best_model_by_state) - 1
    for position, (section_title, section_model) in enumerate(best_model_by_state):
        best_model = repr(section_model)
        f.write(f"{section_title}:\n{best_model}")
        # No separator after the final section
        if position != last_position:
            f.write("\n\n")

In [19]:
######
####
###
##
#
# TODO: the grid search MUST be improved — e.g. for Schleswig-Holstein the default random forest model with random_state = 0 already reaches over 80% performance!
#
##
###
####
#####

In [20]:
# NOTE: this rebinds the name `pprint` from the module imported in the first
# cell to the function of the same name.
from pprint import pprint

# Show every evaluation record collected by the training loop: the states the
# model was trained on, the state it was evaluated with, the score and the
# number of training rows.
pprint(state_prediction_score)

[['Prediction on dataframe: ',
  ['SchleswigHolstein'],
  'Evaluating with Dataframe: ',
  'SchleswigHolstein',
  75.65,
  1688],
 ['Prediction on dataframe: ',
  ['SchleswigHolstein'],
  'Evaluating with Dataframe: ',
  'Bremen',
  31.78,
  1688],
 ['Prediction on dataframe: ',
  ['SchleswigHolstein'],
  'Evaluating with Dataframe: ',
  'Hessen',
  34.01,
  1688],
 ['Prediction on dataframe: ',
  ['SchleswigHolstein'],
  'Evaluating with Dataframe: ',
  'Hamburg',
  23.86,
  1688],
 ['Prediction on dataframe: ',
  ['SchleswigHolstein'],
  'Evaluating with Dataframe: ',
  'Thüringen',
  9.65,
  1688],
 ['Prediction on dataframe: ',
  ['SchleswigHolstein'],
  'Evaluating with Dataframe: ',
  'NordrheinWestfalen',
  15.15,
  1688],
 ['Prediction on dataframe: ',
  ['SchleswigHolstein'],
  'Evaluating with Dataframe: ',
  'MecklenburgVorpommern',
  6.33,
  1688],
 ['Prediction on dataframe: ',
  ['SchleswigHolstein'],
  'Evaluating with Dataframe: ',
  'BayernNORD',
  37.64,
  1688],
 ['P