# Decision Tree Classifier

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler,LabelEncoder,MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.impute import SimpleImputer

In [2]:
df=sns.load_dataset('titanic')
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [3]:
#drop deck column
df.drop('deck',axis=1,inplace=True)

#impute missing values of age and fare
imputer= SimpleImputer(strategy='median')
df[['age','fare']] = imputer.fit_transform(df[['age','fare']])

#impute the missing values of embarked and embarked_town
imputer= SimpleImputer(strategy='most_frequent')
df[['embarked','embark_town']]= imputer.fit_transform(df[['embarked','embark_town']])


In [5]:
df.isnull().sum()

survived       0
pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           0
embarked       0
class          0
who            0
adult_male     0
embark_town    0
alive          0
alone          0
dtype: int64

In [8]:
df.dtypes

survived          int64
pclass            int64
sex              object
age             float64
sibsp             int64
parch             int64
fare            float64
embarked         object
class          category
who              object
adult_male         bool
embark_town      object
alive            object
alone              bool
dtype: object

In [9]:
#encode the categorical and object varaiable using label encoder
le=LabelEncoder()
for col in df.columns:
    if df[col].dtype=='object' or df[col].dtype=='category':
        df[col]=le.fit_transform(df[col])



In [10]:
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,embark_town,alive,alone
0,0,3,1,22.0,1,0,7.25,2,2,1,True,2,0,False
1,1,1,0,38.0,1,0,71.2833,0,0,2,False,0,1,False
2,1,3,0,26.0,0,0,7.925,2,2,2,False,2,1,True
3,1,1,0,35.0,1,0,53.1,2,0,2,False,2,1,False
4,0,3,1,35.0,0,0,8.05,2,2,1,True,2,0,True


In [11]:
#split the data into x and y
X=df.drop('survived',axis=1)
y=df['survived']
#split the data into train and test
#split the  data into train and test
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

In [15]:
#create and train the model with prediction
model=DecisionTreeClassifier(criterion='entropy')
model.fit(X_train,y_train)

#predict the model
y_pred=model.predict(X_test)

#evaluate the model
print('confusion_matrix\n',confusion_matrix(y_pred,y_test))
print('classification_report\n',classification_report(y_pred,y_test))


confusion_matrix
 [[105   0]
 [  0  74]]
classification_report
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       105
           1       1.00      1.00      1.00        74

    accuracy                           1.00       179
   macro avg       1.00      1.00      1.00       179
weighted avg       1.00      1.00      1.00       179



In [16]:
#save the decisopn tree
from sklearn.tree import export_graphviz
export_graphviz(model,out_file='./saved Models/decision_tree.dot',feature_names=X.columns,rounded=True,filled=True)

In [1]:
import math

In [2]:
#Example Dataset
#Let's say we have a dataset with two classes, A and B
#Suppose in a dataset of 10 elements, 4 are of class A and 6 are of class B

#Number of elements in each class
n_A=4
n_B=6
total=n_A+n_B


In [3]:
#lets calculate the proportion
proportion_A=n_A/total
proportion_B=n_B/total
#print the proportions
print("Proportion of class A:",proportion_A)
print("Proportion of class B:",proportion_B)

Proportion of class A: 0.4
Proportion of class B: 0.6


In [4]:
#Enteopy calculate
entropy= -proportion_A*math.log2(proportion_A)-proportion_B*math.log2(proportion_B)

print("Entropy:",entropy)

Entropy: 0.9709505944546686


In [5]:
# Gini impurity
gini=1-proportion_A**2 - proportion_B**2 

print("Gini Impurity:",gini)

Gini Impurity: 0.48


In [6]:
#information gain
#Assuming a  aplit on some features divides the dataset into two subsets
#subset 1:2 elements of A, 3 Of  B
#subset 2:2 elements of A, 3 of 3
#Entropy and size for each subset
n_1_A,n_1_B=2,3
n_2_A,n_2_B=2,3

p_1_A=n_1_A / (n_1_A +n_1_B)
p_1_B=n_1_B / (n_1_A +n_1_B)
entropy_1= -p_1_A * math.log2(p_1_A) - p_1_B * math.log2(p_1_B) if p_1_A and p_1_B else 0

p_2_A=n_2_A / (n_2_A +n_2_B)
p_2_B=n_2_B / (n_2_A +n_2_B)
entropy_2= -p_2_A * math.log2(p_2_A) - p_2_B * math.log2(p_2_B) if p_2_A and p_2_B else 0
#calculate information gain
info_gain=entropy-((n_1_A + n_1_B)/total * entropy_1 + (n_2_A + n_2_B)/total * entropy_2)
print('information Gain:',info_gain)


information Gain: 0.0
