# Decision Tree

## Loading the dataset

In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn import tree

In [2]:
# Load the CSV at 'Data/cleaned_data.csv' (path relative to the working directory)
# into the DataFrame used by every later cell.
df = pd.read_csv(filepath_or_buffer='Data/cleaned_data.csv')

In [3]:
# Remove the column-display cap so every column of the wide frame is rendered,
# then preview the first five rows.
pd.options.display.max_columns = None
df.head(5)

Unnamed: 0,ResponseID,ExtendedSessionID,UserID,PedPed,Barrier,CrossingSignal,AttributeLevel,ScenarioTypeStrict,NumberOfCharacters,DiffNumberOFCharacters,Saved,Country,Man,Woman,Pregnant,Stroller,OldMan,OldWoman,Boy,Girl,Homeless,LargeWoman,LargeMan,Criminal,MaleExecutive,FemaleExecutive,FemaleAthlete,MaleAthlete,FemaleDoctor,MaleDoctor,Dog,Cat,Finance_access,ICT,Industry_activity,Overall_index,Research_and_development,Skills,Total,Males,Females,Passengers,Pedestrians
0,222HpiEf2LtAwEg62,-1232628507_1597557389,1597557000.0,0,1,0,Female,Gender,2,0,0,UKR,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0.75,0.5,0.65,0.65,0.5,0.65,13.25,21.75,5.95,7.412,5.6984
1,222KuWty7pNeiv77a,1654911454_3639764894860440.0,3639765000000000.0,1,0,0,Low,Social Status,2,0,0,USA,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0.9,0.65,0.8,1.0,1.0,0.75,12.5,17.85,7.25,3.9603,1.9737
2,222LDp4wz24C3chzj,-1679158262_3623236506.0,3623237000.0,0,0,0,Fat,Fitness,2,0,0,DEU,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0.8,0.8,0.9,0.9,0.8,0.75,4.2,6.35,2.15,1.912,0.612
3,222dZwp7jYt7FrkfQ,781757349_7305361930957958.0,7305362000000000.0,1,0,0,Male,Gender,1,0,0,USA,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.9,0.65,0.8,1.0,1.0,0.75,12.5,17.85,7.25,3.9603,1.9737
4,222fkCAzoe6MAnMsP,-624226515_2260272466.0,2260272000.0,0,0,0,Female,Gender,1,0,1,NLD,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0.8,0.8,0.9,0.95,0.7,0.85,3.8,5.65,1.95,1.444,0.3496


## Data preparation

In [4]:
# Drop 'ResponseID', 'ExtendedSessionID', 'UserID' and 'Country';
# none of them appears in the feature list used for the model below.
dropped_cols = ['ResponseID', 'ExtendedSessionID', 'UserID', 'Country']
df = df.drop(columns=dropped_cols)

In [5]:
# Standardize the numerical columns with StandardScaler (per-column zero mean
# and unit variance).
# NOTE(review): the scaler is fitted on the full frame before the train/test
# split, so statistics of the future test rows feed into the scaling. Consider
# fitting on the training split only if this preprocessing is reused elsewhere.
num_cols = [
    'NumberOfCharacters', 'DiffNumberOFCharacters',
    'Man', 'Woman', 'Pregnant', 'Stroller',
    'OldMan', 'OldWoman', 'Boy', 'Girl',
    'Homeless', 'LargeWoman', 'LargeMan', 'Criminal',
    'MaleExecutive', 'FemaleExecutive', 'FemaleAthlete', 'MaleAthlete',
    'FemaleDoctor', 'MaleDoctor', 'Dog', 'Cat',
    'Finance_access', 'ICT', 'Industry_activity', 'Overall_index',
    'Research_and_development', 'Skills',
    'Total', 'Males', 'Females', 'Passengers', 'Pedestrians',
]
scaler = StandardScaler().fit(df[num_cols])
df[num_cols] = scaler.transform(df[num_cols])

In [6]:
# Preview the frame again, now without the dropped columns and with the
# numerical columns standardized. All columns are shown.
pd.set_option("display.max_columns", None)
df.head(n=5)

Unnamed: 0,PedPed,Barrier,CrossingSignal,AttributeLevel,ScenarioTypeStrict,NumberOfCharacters,DiffNumberOFCharacters,Saved,Man,Woman,Pregnant,Stroller,OldMan,OldWoman,Boy,Girl,Homeless,LargeWoman,LargeMan,Criminal,MaleExecutive,FemaleExecutive,FemaleAthlete,MaleAthlete,FemaleDoctor,MaleDoctor,Dog,Cat,Finance_access,ICT,Industry_activity,Overall_index,Research_and_development,Skills,Total,Males,Females,Passengers,Pedestrians
0,0,1,0,Female,Gender,-0.668147,-0.483425,0,-0.525204,-0.525207,-0.233453,-0.227917,-0.358872,1.579438,-0.349868,-0.35,-0.276869,1.921181,-0.35003,-0.227553,-0.304811,-0.304627,-0.35901,-0.358738,-0.288292,-0.288278,-0.299642,-0.299416,-0.643837,-1.399251,-1.720819,-1.557053,-1.10532,-0.622866,0.830778,0.997528,0.59001,2.645953,3.68565
1,1,0,0,Low,Social Status,-0.668147,-0.483425,0,-0.525204,-0.525207,-0.233453,-0.227917,-0.358872,-0.359132,-0.349868,-0.35,4.995655,-0.350023,-0.35003,-0.227553,-0.304811,-0.304627,-0.35901,-0.358738,-0.288292,-0.288278,-0.299642,-0.299416,0.805231,-0.200616,0.06293,0.993461,1.228532,0.190557,0.682811,0.516045,1.139455,0.443457,0.290316
2,0,0,0,Fat,Fitness,-0.668147,-0.483425,0,1.166728,-0.525207,-0.233453,-0.227917,-0.358872,-0.359132,-0.349868,-0.35,-0.276869,1.921181,-0.35003,-0.227553,-0.304811,-0.304627,-0.35901,-0.358738,-0.288292,-0.288278,-0.299642,-0.299416,-0.160815,0.998019,1.252096,0.264742,0.294991,0.190557,-0.954696,-0.903712,-1.016062,-0.863542,-0.950972
3,1,0,0,Male,Gender,-1.342726,-0.483425,0,-0.525204,-0.525207,-0.233453,-0.227917,1.580655,-0.359132,-0.349868,-0.35,-0.276869,-0.350023,-0.35003,-0.227553,-0.304811,-0.304627,-0.35901,-0.358738,-0.288292,-0.288278,-0.299642,-0.299416,0.805231,-0.200616,0.06293,0.993461,1.228532,0.190557,0.682811,0.516045,1.139455,0.443457,0.290316
4,0,0,0,Female,Gender,-1.342726,-0.483425,1,-0.525204,-0.525207,-0.233453,-0.227917,-0.358872,-0.359132,-0.349868,-0.35,-0.276869,-0.350023,-0.35003,-0.227553,-0.304811,2.593688,-0.35901,-0.358738,-0.288292,-0.288278,-0.299642,-0.299416,-0.160815,0.998019,1.252096,0.629101,-0.171779,1.00398,-1.033612,-0.990132,-1.100592,-1.162168,-1.190168


## Train and evaluate the model

In [7]:
# Separate the model inputs from the target column.
label = 'Saved'
feature_cols = [
    'NumberOfCharacters', 'DiffNumberOFCharacters',
    'Man', 'Woman', 'Pregnant', 'Stroller',
    'OldMan', 'OldWoman', 'Boy', 'Girl',
    'Homeless', 'LargeWoman', 'LargeMan', 'Criminal',
    'MaleExecutive', 'FemaleExecutive', 'FemaleAthlete', 'MaleAthlete',
    'FemaleDoctor', 'MaleDoctor', 'Dog', 'Cat',
    'Finance_access', 'ICT', 'Industry_activity', 'Overall_index',
    'Research_and_development', 'Skills',
    'Total', 'Males', 'Females', 'Passengers', 'Pedestrians',
]
X = df.loc[:, feature_cols]  # Feature matrix
y = df.loc[:, label]  # Target variable

In [8]:
# Split X and y into training (70%) and testing (30%) sets with a fixed seed.
# No separate validation set is created here; model selection below relies on
# cross-validation over the training split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.3,
)

In [9]:
# Hyperparameter search space handed to the grid search over the decision tree.
param_grid = dict(
    # Maximum depth of the tree (None places no limit on depth)
    max_depth=[3, 5, 10, None],
    # Minimum number of samples required to split an internal node
    min_samples_split=[2, 5, 10],
    # Minimum number of samples required to be at a leaf node
    min_samples_leaf=[1, 2, 5],
    # Function used to measure the quality of a split
    criterion=['gini', 'entropy'],
)

In [10]:
# 5-fold shuffled cross-validation with a fixed seed, so the folds are the same
# on every run.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# Exhaustive search over param_grid, scored by accuracy on each fold.
# The estimator is seeded as well: scikit-learn randomly permutes the features
# at every split, so without random_state ties between equally good splits can
# be broken differently from run to run, changing the selected parameters and
# the reported accuracy.
grid_search = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=param_grid,
    cv=kfold,
    scoring='accuracy',
)

In [None]:
# Run the cross-validated grid search on the training split only.
grid_search.fit(X_train, y_train)

# Keep the winning estimator (GridSearchCV refits it on all of X_train by
# default) together with the parameter combination that produced it.
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_
print(f"Best parameters: {best_params}")

In [None]:
# Predict the held-out test split with the tuned model.
y_pred = best_model.predict(X_test)

# Report the share of test rows whose prediction matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Model accuracy after hyperparameter tuning: {accuracy * 100:.2f}%")

In [None]:
# Visualize the decision tree.
# - feature_names / class_names label the nodes with the training column names
#   and the fitted class labels instead of anonymous x[i] / y[i] indices.
# - max_depth only limits how many levels are drawn: the search grid allows an
#   unbounded tree (max_depth=None), which would be unreadable and slow to render.
# - The trailing semicolon stops the notebook from echoing the returned list of
#   annotation objects under the figure.
tree.plot_tree(
    best_model,
    max_depth=3,
    feature_names=list(X_train.columns),
    class_names=[str(cls) for cls in best_model.classes_],
    filled=True,
    fontsize=8,
);