# Regression Trees & Random Forest Regression (apartment data)

In this notebook, we will use Regression Tree and Random Forest Regressors to predict rental apartment prices based on various features like living area, number of rooms and more. We will evaluate the models using goodness-of-fit measures like R-squared.

## Libraries and settings

In [13]:
# Libraries
import os
import numpy as np
import pandas as pd
from sklearn import tree
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
from sklearn.datasets import make_regression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

# Show current working directory
print(os.getcwd())

/workspaces/python_machine_learning_basics/CART_RandomForest


In [25]:
# Import data from csv to pandas dataframe named 'df'
df = pd.read_csv('./Data/autoscout24_data.csv', sep=";", encoding='utf-8')

# Change the column names to lowercase
# df.columns = df.columns.str.lower()

# Display the first 5 rows of the dataframe
df.head()

# column names
list(df.columns)


['Url_Short',
 'Offer_Id',
 'Scrape_Dt',
 'Type',
 'Price_Raw',
 'HP_Raw',
 'Init_Regist',
 'Mileage_Raw',
 'Fuel_Type',
 'Transmission',
 'Dealer_Name',
 'Dealer_PLZ',
 'Dealer_City',
 'Dealer_Street_House_Nr',
 'Dealer_Telnr',
 'Init_Regist_MY',
 'Init_Regist_Month',
 'Init_Regist_Year',
 'Init_Regist_Dt']

## Import the car data

In [30]:
# Define columns for import
columns = [ 'Url_Short',
            'Offer_Id',
            'Scrape_Dt',
            'Type',
            'Price_Raw',
            'HP_Raw',
            'Init_Regist',
            'Mileage_Raw',
            'Fuel_Type',
            'Transmission',
            'Dealer_Name',
            'Dealer_PLZ',
            'Dealer_City',
            'Dealer_Street_House_Nr',
            'Dealer_Telnr',
            'Init_Regist_MY',
            'Init_Regist_Month',
            'Init_Regist_Year',
            'Init_Regist_Dt']

# Read and select columns
df_orig = pd.read_csv("./Data/autoscout24_data.csv", sep=";", encoding='utf-8')[columns]

# Rename variable 'web-scraper-order' to 'apmt_id'
df_orig = df_orig.rename(columns={'offer_id': 'id'})

# Remove missing values
df = df_orig.dropna()
df.head(5)

# Remove duplicates
df = df.drop_duplicates()

# numeric values
# df['Price_Raw'] = pd.to_numeric(df['Price_Raw'], errors='coerce')

# Remove some 'extreme' values
# df = df.loc[(df['Price_Raw'] >= 1000) & # Bereich festlegen (1000-5000)
            # (df['Price_Raw'] <= 100000)]

# Reset index
df = df.reset_index(drop=True)

print(df.shape)
df.head(5)

(3971, 19)


Unnamed: 0,Url_Short,Offer_Id,Scrape_Dt,Type,Price_Raw,HP_Raw,Init_Regist,Mileage_Raw,Fuel_Type,Transmission,Dealer_Name,Dealer_PLZ,Dealer_City,Dealer_Street_House_Nr,Dealer_Telnr,Init_Regist_MY,Init_Regist_Month,Init_Regist_Year,Init_Regist_Dt
0,https://www.autoscout24.ch/7324420,7324420,2020_03_17 07:34:45,AUDI A5 Sportback 3.0 TDI quattro S-tronic (Li...,CHF 22'500.–,245 PS,10.2014,75'000 km,Diesel,Automatisiertes Schaltgetriebe,***confidential***,8488,Turbenthal,Mettlenstrasse 3,***confidential***,10.2014,10.0,2014.0,2014-10
1,https://www.autoscout24.ch/7512768,7512768,2020_03_17 07:34:55,MERCEDES-BENZ SLK 200 7G-Tronic (Cabriolet),CHF 23'749.–,184 PS,6.2013,46'655 km,Benzin,Automat sequentiell,***confidential***,3186,Düdingen,Brugerastrasse 60,***confidential***,6.2013,6.0,2013.0,2013-06
2,https://www.autoscout24.ch/7512034,7512034,2020_03_17 07:35:03,MERCEDES-BENZ C 350 Avantgarde 4Matic 7G-Troni...,CHF 18'500.–,306 PS,6.2011,138'955 km,Benzin,Automat sequentiell,***confidential***,1262,Eysins,1262 Eysins,***confidential***,6.2011,6.0,2011.0,2011-06
3,https://www.autoscout24.ch/7512728,7512728,2020_03_17 07:35:06,MERCEDES-BENZ A 45 AMG 4Matic Speedshift 7G-DC...,CHF 36'000.–,360 PS,8.2015,43'000 km,Benzin,Automatisiertes Schaltgetriebe,***confidential***,4314,Zeiningen,Am Stutz 21,***confidential***,8.2015,8.0,2015.0,2015-08
4,https://www.autoscout24.ch/7490242,7490242,2020_03_17 07:35:16,AUDI A5 Sportback 2.0 TFSI Sport quattro S-tro...,CHF 48'500.–,252 PS,9.2018,43'300 km,Benzin,Automatisiertes Schaltgetriebe,***confidential***,3250,Lyss,3250 Lyss,***confidential***,9.2018,9.0,2018.0,2018-09


## Regression Tree
See also: https://data36.com/regression-tree-python-scikit-learn

### Create train and test samples for the regression tree (train = 80%, test = 20% of the data)

In [None]:
# Create train and test samples
X_train, X_test, y_train, y_test = train_test_split(df[['area', 
                                                        'rooms',
                                                        'pop_dens',
                                                        'mean_taxable_income',
                                                        'dist_supermarket']], 
                                                        df['price'], # y = price, abhängige Variable
                                                        test_size=0.20, 
                                                        random_state=42)

# Show X_train
print('X_train:')
print(X_train.head(), '\n')

# Show y_train
print('y_train:')
print(y_train.head())

### Fit the regression tree model

In [None]:
# Entscheidungsbaum-Regressor (Regressionsmodel das auf Entscheidungsbaum basiert) erstellt und trainiert, um Vorhersagen für einen Testdatensatz
# DecisionTreeRegressor = Machine-Learning-Modell aus der scikit-learn-Bibliothek, das verwendet wird, um Vorhersagen zu treffen.

# Create decision tree regressor object
reg = DecisionTreeRegressor(random_state=20, max_depth=3) # random_state = zufälliger Startwert, max_depth = Tiefe des Baumes auf drei Ebenen, um eine (Overfitting) zu verhindern

# Train decision tree regressor 
reg = reg.fit(X_train, y_train) 

# Predict the response for test dataset
y_pred = reg.predict(X_test)

### Calculate coefficient of determination (R-squared)

In [None]:
# Calculate coefficient of determination
print('R-squared:', round(r2_score(y_test, y_pred), 4)) # auf vier Dezimalstellen runden


### Print text representation of the regression tree

In [None]:
# Text representation of the regression tree
text_representation = tree.export_text(reg, 
                                       feature_names=list(X_train.columns))

# Print text_representation
print(text_representation)

# Die Ausgabe zeigt die Entscheidungsregeln des Baums in einer Art verschachtelten If-Else-Struktur:
# Jede Zeile stellt eine Bedingung in einem Knoten dar, z. B. wie area <= 99.50, dass der Baum sich an dieser Stelle verzweigt.
# Wenn die Bedingung erfüllt ist, folgt der Baum dem rechten Zweig (pop_dens); andernfalls geht es nach links resp. unten zum nächsten area Knotenpunkt.
# Blattknoten am Ende des Baums repräsentieren die vorhergesagten Werte für die Zielvariable.

### Vizualizing the regression tree

In [None]:
fig = plt.figure(figsize=(12,6))
_ = tree.plot_tree(reg, 
                   feature_names=list(X_train.columns),  
                   class_names=['price'], # Handelt um Regression, gibt es keine Klassen wie bei Klassifikationsmodellen, aber class_names kann eine Bezeichnung für die Zielvariable (abhängige) sein
                   filled=True, #Farbliche Hervorhebung der Knotenpunkte
                   fontsize=9,
                   label='root', #Labels für Knotenpunkte werden angezeigt
                   rounded=True) # Ecken abrunden

## Random Forest Regression
For details see: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html

### Create train and test samples for the random forest (train = 80%, test = 20% of the data)

In [None]:
# Create train and test samples (the names X2_ and y2_ were used because X_ and y_ were already used above)
X2_train, X2_test, y2_train, y2_test = train_test_split(df[['area', 
                                                            'rooms',
                                                            'pop_dens',
                                                            'mean_taxable_income',
                                                            'dist_supermarket']], 
                                                            df['price'], 
                                                            test_size=0.20, 
                                                            random_state=42)

# Show X2_train
print('X2_train:')
print(X2_train.head(), '\n')

# Show y2_train
print('y2_train:')
print(y2_train.head())

### Fit the Random Forest Regression

In [None]:

reg_rf = RandomForestRegressor(n_estimators=500, # 500 Bäume im Random Forest
                               max_depth=10, # tiefe des Modells anpassen. max 10 Ebenen
                               random_state=5) # auf scikit learn -> randomforest eingeben
reg_rf.fit(X2_train, y2_train)

# Calculate coefficient of determination (R-squared)
print('R-squared:', round(reg_rf.score(X2_test, y2_test), 4))

### Show feature importance

In [None]:
cols = X2_train.columns

# Derive feature importance from random forest
importances = reg_rf.feature_importances_ # Wichtigkeit der Merkmale ermitteln
std         = np.std([tree.feature_importances_ for tree in reg_rf.estimators_], axis=0) # Standardabweichung der Werte um Stabilität zu ermitteln
indices     = np.argsort(importances)[::-1] # sortiert Indizies der Merkmale in absteigender Reihenfolge (wichtigste Merkmale zuerst)

# Print col-names and importances-values
print( cols[indices] )
print( importances[indices] )

# Barplot with feature importance
df_fi = pd.DataFrame({'features':cols,'importances': importances}) # Dataframe erstellen für Wichtigkeit der Merkmale
df_fi.sort_values('importances', inplace=True)
df_fi.plot(kind='barh', #barplot horizontal
           y='importances', 
           x='features', 
           color='darkred', 
           figsize=(6,3))

plt.show()

### Jupyter notebook --footer info-- (please always provide this at the end of each submitted notebook)

In [None]:
import os
import platform
import socket
from platform import python_version
from datetime import datetime

print('-----------------------------------')
print(os.name.upper())
print(platform.system(), '|', platform.release())
print('Datetime:', datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
print('Python Version:', python_version())
print('-----------------------------------')