# Classification with RandomForest
## Steps
  1. Introduction and Use-Case
  2. Dataset and preparation
  3. Classification
  4. Further Look and Conclusion

# Code (Data preprocessing)

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Install dependencies
!pip install --user shapely
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425345 sha256=3f32eb6f27738229936e7d1a4cef69cceb91a0bf6244e764364d26c5733c9121
  Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.0


In [None]:
# PySpark Session
from pyspark.sql import SparkSession
spark = SparkSession.builder\
        .master("local")\
        .appName("Colab")\
        .config('spark.ui.port', '4050')\
        .getOrCreate()

In [None]:
# Load Data
df_merged_nonan = spark.read.csv('/content/drive/MyDrive/DA_Data/Merged_Cleaned_Data/merged_featureset_nonan.csv', header=True)
df_merged_with_nan = spark.read.csv('/content/drive/MyDrive/DA_Data/Merged_Cleaned_Data/merged_featureset_with_nan.csv', header=True)

df_merged_nonan.printSchema()
df_merged_nonan.show()
#df_merged_with_nan.printSchema()

root
 |-- Stadtteil: string (nullable = true)
 |-- Mietpreis pro qm: string (nullable = true)
 |-- Gruenflaeche: string (nullable = true)
 |-- AQI: string (nullable = true)
 |-- Fahrradhausanzahl: string (nullable = true)
 |-- Flaeche: string (nullable = true)
 |-- Anzahl_Punkte: string (nullable = true)

+---------------+-----------------+------------------+------------------+-----------------+------------------+-------------+
|      Stadtteil| Mietpreis pro qm|      Gruenflaeche|               AQI|Fahrradhausanzahl|           Flaeche|Anzahl_Punkte|
+---------------+-----------------+------------------+------------------+-----------------+------------------+-------------+
|      Allermöhe| 9.67255494038344|           64.5282| 2.211251302234055|              0.0| 8.645889439254942|          0.0|
|     Alsterdorf|11.17867462862816|           43.2205|  2.16680208937899|              0.0| 3.152300871916889|       4698.0|
|     Altengamme| 9.47397938223054|            1.2187| 2.21125130223

In [None]:
# Checking Data

#district_name = "Barmbek Süd"

# Filter DataFrame for the specified district
#barmbek_sud_data = merged_social_data.filter(col("Stadtteil") == district_name)

# Show the DataFrame
#barmbek_sud_data.show()


In [None]:
# Cast Data to Double
from pyspark.sql.types import *
from pyspark.sql.functions import col

# Double
df_merged_nonan = df_merged_nonan.withColumn("Mietpreis pro qm", df_merged_nonan["Mietpreis pro qm"].cast(DoubleType()))
df_merged_nonan = df_merged_nonan.withColumn("Gruenflaeche", df_merged_nonan["Mietpreis pro qm"].cast(DoubleType()))
df_merged_nonan = df_merged_nonan.withColumn("AQI", df_merged_nonan["AQI"].cast(DoubleType()))
df_merged_nonan = df_merged_nonan.withColumn("Flaeche", df_merged_nonan["Flaeche"].cast(DoubleType()))

#Integer
df_merged_nonan = df_merged_nonan.withColumn("Fahrradhausanzahl", df_merged_nonan["Fahrradhausanzahl"].cast(IntegerType()))
df_merged_nonan = df_merged_nonan.withColumn("Anzahl_Punkte", df_merged_nonan["Anzahl_Punkte"].cast(IntegerType()))

# To Pandas
mer_nonan = df_merged_nonan.toPandas()

## DataSet Preparation (Test, Train Data Set and Labeling)

In [None]:
# Dependencies

# Data Processing
import pandas as pd
import numpy as np

# Modelling
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from scipy.stats import randint
from sklearn.preprocessing import StandardScaler

In [1]:
# Labeling
le = LabelEncoder()
mer_nonan['Stadtteil'] = le.fit_transform(mer_nonan['Stadtteil'])

# Split the data into features (X) and target (y)
#features_to_drop = ['Fahrradhausanzahl', 'Flaeche', 'Gruenflaeche','AQI']
X = mer_nonan.drop(['Stadtteil'], axis=1)  # Assuming 'Stadtteil' is the target
#X = mer_nonan.drop(['Stadtteil'] + features_to_drop, axis=1)  # Assuming 'Stadtteil' is the target
y = mer_nonan['Stadtteil']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(y_train.value_counts())

NameError: name 'LabelEncoder' is not defined

In [None]:
from ctypes import c_int64
from sklearn.ensemble import *
from sklearn.preprocessing import StandardScaler

from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from itertools import product


# Preprocessing with StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train Model
rf = RandomForestClassifier(class_weight='balanced')
rf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = rf.predict(X_test)
###############################################################################
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')  # Specify the average parameter
recall = recall_score(y_test, y_pred, average='macro')  # Specify the average parameter

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)


# Check the distribution of predicted classes
print("Predicted Class Distribution:")
print(pd.Series(y_pred).value_counts())

# Explore the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Display feature importances
#feature_importances = pd.Series(rf.feature_importances_, index=X_train.columns)
#print("Feature Importances:")
#print(feature_importances)

# Tree Visualisation
from sklearn.tree import export_graphviz
from IPython.display import Image
import graphviz

# Export the first three decision trees from the forest

for i in range(3):
    tree = rf.estimators_[i]
    dot_data = export_graphviz(tree,
                               feature_names=X_train.columns,
                               filled=True,
                               max_depth=2,
                               impurity=False,
                               proportion=True)
    graph = graphviz.Source(dot_data)
    display(graph)

Accuracy: 0.0
Precision: 0.0
Recall: 0.0
Predicted Class Distribution:
10     3
94     2
49     2
17     1
73     1
58     1
24     1
41     1
43     1
101    1
18     1
25     1
3      1
2      1
39     1
64     1
71     1
dtype: int64
Confusion Matrix:
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


AttributeError: 'numpy.ndarray' object has no attribute 'columns'

# Why its not working - Ideas

- Data Exploration
  - data is imbalanced -> oversampling, undersampling, different weights
- Data Preprocessing
  - missing values, outliers, feature scaling
- Model parameters
  - n_estimators, max_depth
- Use other classifier

## Hyperparameter Tuning

In [None]:
from sklearn.model_selection import StratifiedKFold
param_dist = {'n_estimators': randint(50,1000),
              'max_depth': randint(1,20)}

# Create a random forest classifier
rf = RandomForestClassifier()

# Use random search to find the best hyperparameters
rand_search = RandomizedSearchCV(rf,
                                 param_distributions = param_dist,
                                 n_iter=20,
                                 cv=2)

# Fit the random search object to the data
rand_search.fit(X_train, y_train)

In [None]:
# Create a variable for the best model
best_rf = rand_search.best_estimator_

# Print the best hyperparameters
print('Best hyperparameters:',  rand_search.best_params_)

#Evaluation


In [2]:
# Create the confusion matrix
cm = confusion_matrix(y_test, y_pred)

ConfusionMatrixDisplay(confusion_matrix=cm).plot();

NameError: name 'confusion_matrix' is not defined