### Import Libraries

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the Titanic training set; the "utf-8-sig" codec drops a leading BOM if the file has one.

data = pd.read_csv("titanic_train.csv", sep=",", encoding="utf-8-sig")

In [3]:
# Preview the first five rows to sanity-check the load.

data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# Summarise dtypes and non-null counts per column to spot missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [5]:
# Drop the identifier / free-text columns that are not used as features below.

col_to_drop = ["PassengerId", "Name", "Ticket", "Cabin"]
data_clean = data.drop(columns=col_to_drop)

In [6]:
# Encode the categorical "Sex" column as integer codes with scikit-learn's LabelEncoder.
# The same `le` instance is re-fitted by later cells, so it only remembers its most recent fit.

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

data_clean["Sex"] = le.fit_transform(data_clean["Sex"])

In [7]:
# "Embarked" has two missing entries (889 of 891 non-null). Impute them with the most
# frequent port *before* encoding so the encoder never sees NaN and cannot turn the
# missing values into a category of their own.
embarked_mode = data_clean["Embarked"].mode()[0]
data_clean["Embarked"] = le.fit_transform(data_clean["Embarked"].fillna(embarked_mode))

In [8]:
# Fill missing "Age" values with the column mean. The fill is restricted to "Age"
# so that a gap in any other column can never be overwritten with an age.

data_clean = data_clean.fillna({"Age": data_clean["Age"].mean()})

In [9]:
# Inspect the remaining column names (e.g. to spot stray leading/trailing whitespace).
# If any name needed cleaning, it could be normalised with:
# data_clean.columns = data_clean.columns.str.strip()
data_clean.columns

Index(['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare',
       'Embarked'],
      dtype='object')

In [10]:
# "Survived" is the label; every other column is a feature.
# Selecting by name matters here: "Survived" is the first column, so slicing off the
# last column would make "Embarked" the label and leave "Survived" among the features.
X = data_clean.drop(columns="Survived")  # Feature variables
Y = data_clean["Survived"]               # Label variable
print(X.shape, Y.shape)

(891, 7) (891,)


##### Function to compute the entropy

In [11]:
def entropy(col):
    """Return the Shannon entropy (in bits) of the values in ``col``.

    ``col`` must expose ``.shape[0]`` (e.g. a NumPy array or pandas Series).
    The entropy is ``-sum(p * log2(p))`` over the relative frequency ``p`` of
    every distinct value; an empty ``col`` yields ``0.0``.
    """
    # np.unique returns (distinct values, occurrences of each); only the counts matter here.
    _, frequencies = np.unique(col, return_counts=True)
    total = float(col.shape[0])

    result = 0.0
    for freq in frequencies:
        prob = freq / total
        result -= prob * np.log2(prob)

    return result

In [12]:
# This function splits the input DataFrame into two subsets based on a specified feature (f_key) and threshold value (f_val).

def divide_data(data, f_key, f_val):
    """Split ``data`` into two frames around a threshold on one column.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to partition. Any index is accepted.
    f_key : str
        Name of the column whose values are compared with the threshold.
    f_val : scalar
        Threshold value.

    Returns
    -------
    (x_left, x_right) : tuple of pandas.DataFrame
        ``x_right`` holds the rows where ``data[f_key] > f_val``; ``x_left`` holds
        every other row (values ``<= f_val``, and NaN, because ``NaN > f_val`` is
        False). Both keep the original row labels, column order and dtypes.
    """
    # One vectorised comparison replaces the row-by-row copy, which grew the output
    # frames one ``.loc`` assignment at a time and required a 0..n-1 integer index.
    goes_right = data[f_key] > f_val
    x_left = data[~goes_right]
    x_right = data[goes_right]

    return x_left, x_right


In [13]:
# Demo of the splitter: partition the rows on the encoded "Sex" column at a threshold of 0.5.

x_left, x_right = divide_data(data_clean, f_key="Sex", f_val=0.5)
# print(x_left)
# print(x_right)

In [14]:
def information_gain(data, f_key, f_value, target="Survived"):
    """Information gain from splitting ``data`` on ``f_key`` at ``f_value``.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows of the current node; must contain ``f_key`` and ``target``.
    f_key : str
        Column to split on.
    f_value : scalar
        Threshold passed to ``divide_data``.
    target : str, default "Survived"
        Column whose entropy is measured.

    Returns
    -------
    float or int
        ``entropy(parent) - (w_left * entropy(left) + w_right * entropy(right))``
        where each weight is that side's share of the rows. Returns the sentinel
        ``-1000000000`` when the split leaves either side empty.
    """
    left, right = divide_data(data, f_key, f_value)

    # A split that puts every sample on one side separates nothing. Checking this
    # before computing the fractions also avoids dividing by zero for empty input.
    if left.shape[0] == 0 or right.shape[0] == 0:
        return -1000000000  # minimum information gain

    # Share of the node's samples (rows) that fall on each side of the split.
    total = data.shape[0]
    left_frac = float(left.shape[0]) / total
    right_frac = float(right.shape[0]) / total

    weighted_child_entropy = left_frac * entropy(left[target]) + right_frac * entropy(right[target])

    return entropy(data[target]) - weighted_child_entropy

In [15]:
# for fx in X.columns:
#     print(fx)
#     print(information_gain(data_clean, fx, data_clean[fx].mean()))

### Decision Tree classifier

In [16]:
class DesicionTree:
    """Binary decision-tree classifier that splits each node at a feature's mean.

    The class name keeps its original spelling so existing references still resolve.

    Each node picks the feature whose mean-threshold split gives the highest
    information gain, stores the majority label of its rows, and recurses on the
    two halves until a split leaves one side empty or ``max_depth`` is reached.
    """

    # Feature columns considered when ``train`` is called without an explicit list.
    DEFAULT_FEATURES = ('Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked')

    def __init__(self, depth = 0, max_depth = 5):
        """Create an untrained node.

        Parameters
        ----------
        depth : int
            Depth of this node in the tree (0 for the root).
        max_depth : int
            Nodes at this depth (or deeper) are not split any further.
        """
        self.left = None        # sub-tree for rows with feature value <= fval
        self.right = None       # sub-tree for rows with feature value > fval
        self.fkey = None        # name of the feature this node splits on
        self.fval = None        # threshold (mean of ``fkey`` over the node's rows)
        self.depth = depth
        self.max_depth = max_depth
        self.target = None      # majority label of the node's rows: "Survived" or "Dead"

    @staticmethod
    def _majority_label(rows):
        """Return "Survived" if at least half of ``rows`` survived, otherwise "Dead"."""
        if rows["Survived"].mean() >= 0.5:
            return "Survived"
        return "Dead"

    def train(self, x_train, features=None):
        """Recursively grow the tree from ``x_train``.

        Parameters
        ----------
        x_train : pandas.DataFrame
            Training rows; must contain the feature columns and "Survived".
        features : sequence of str, optional
            Columns to consider for splitting. Defaults to ``DEFAULT_FEATURES``.
        """
        if features is None:
            features = self.DEFAULT_FEATURES
        features = list(features)

        # The candidate threshold for each feature is that feature's mean in this node.
        thresholds = [x_train[name].mean() for name in features]
        gains = [
            information_gain(x_train, name, cut)
            for name, cut in zip(features, thresholds)
        ]

        best = int(np.argmax(gains))   # first feature wins on ties
        self.fkey = features[best]
        self.fval = thresholds[best]
        print("Making Tree features is", self.fkey)

        # Every node (leaf or not) records the majority label of its own rows.
        self.target = self._majority_label(x_train)

        # Split the node's rows on the chosen feature.
        data_left, data_right = divide_data(x_train, self.fkey, self.fval)
        data_left = data_left.reset_index(drop = True)
        data_right = data_right.reset_index(drop = True)

        # Base cases: the split separated nothing, or the depth budget is used up.
        split_is_empty = data_left.shape[0] == 0 or data_right.shape[0] == 0
        if split_is_empty or self.depth >= self.max_depth:
            return

        # Recursive case: grow the right sub-tree first, then the left one.
        self.right = DesicionTree(depth = self.depth+1, max_depth = self.max_depth)
        self.right.train(data_right, features)

        self.left = DesicionTree(depth = self.depth+1, max_depth = self.max_depth)
        self.left.train(data_left, features)
        return

    def predict(self, test):
        """Return the predicted label ("Survived" or "Dead") for one sample.

        ``test`` is any mapping-like row (e.g. a pandas Series) indexable by
        feature name. The sample follows the right branch when its value for this
        node's feature exceeds the threshold, otherwise the left branch; when the
        chosen branch does not exist, the node's own majority label is returned.
        """
        if test[self.fkey] > self.fval:
            child = self.right
        else:
            child = self.left

        if child is None:
            return self.target
        return child.predict(test)
        

In [17]:
# NOTE(review): `d` is not referenced by any later cell shown here; the tree that is actually trained is `dtree`.
d = DesicionTree()

# Train - Test data

In [18]:
# Sequential 70/30 split: the first 70% of rows train the tree, the remainder is held out.
# The held-out frame is re-indexed from 0 so its rows can be addressed by position-like labels.
split = int(0.7 * len(data_clean))
train_data = data_clean.iloc[:split]
test_data = data_clean.iloc[split:].reset_index(drop = True)

In [19]:
# Create the classifier with its default settings (depth 0, max_depth 5); training happens in the next cell.
dtree = DesicionTree()

In [20]:
# Fit the tree on the training split; every node prints the feature it chose to split on.
dtree.train(train_data)

Making Tree features is Sex
Making Tree features is Fare
Making Tree features is Pclass
Making Tree features is SibSp
Making Tree features is Parch
Making Tree features is Age
Making Tree features is Age
Making Tree features is Age
Making Tree features is Age
Making Tree features is Age
Making Tree features is Age
Making Tree features is Age
Making Tree features is SibSp
Making Tree features is Parch
Making Tree features is Embarked
Making Tree features is Age
Making Tree features is Age
Making Tree features is Parch
Making Tree features is Age
Making Tree features is Embarked
Making Tree features is Age
Making Tree features is Age
Making Tree features is Fare
Making Tree features is SibSp
Making Tree features is Pclass
Making Tree features is Fare
Making Tree features is Fare
Making Tree features is Fare
Making Tree features is SibSp
Making Tree features is Age
Making Tree features is Fare
Making Tree features is Embarked
Making Tree features is Pclass
Making Tree features is Embarked

**Observation:**
The output indicates the process of building the decision tree. The lines "Making Tree features is..." correspond to the key feature that the tree is using to split the data at each node. Based on the highest information gain, the decision tree chooses a feature to divide the data and build the tree structure recursively.

In the output, you can see the tree selecting different features at different depths to create the splits. This process continues until a termination condition is met: either a split leaves one side with no samples, or the node reaches the specified maximum depth.

So we can say that this decision tree seems to be effectively learning the important features and their corresponding thresholds to make classification decisions.

In [21]:
# Predict a label ("Survived" / "Dead") for every row of the held-out set.

y_pred = [dtree.predict(test_data.loc[row]) for row in range(test_data.shape[0])]

In [22]:
# Ground-truth "Survived" values for the held-out rows, as a NumPy array.
y_actual = np.array(test_data["Survived"])

In [23]:
# Convert the predicted labels to the 0/1 coding used by the "Survived" column.
# An explicit mapping replaces re-fitting the shared label encoder: a freshly fitted
# encoder numbers whatever classes it sees in sorted order, so a prediction list that
# contained only "Survived" would have been coded as 0.
# The integer keys let this cell be re-run on predictions that are already converted.

label_to_int = {"Dead": 0, "Survived": 1, 0: 0, 1: 1}
y_pred = np.array([label_to_int[label] for label in y_pred])

In [25]:
# Accuracy = fraction of held-out rows whose predicted code equals the true one.
acc = (y_pred == y_actual).sum() / y_pred.shape[0]
print("Accuracy:", acc)

Accuracy: 0.8097014925373134


**Conclusion:**

Based on the results obtained from training and evaluating the Decision Tree classifier:

The Decision Tree model achieved an accuracy of approximately 80.97% on the test dataset. This indicates that the model is able to predict the survival outcomes of passengers with a satisfactory level of accuracy. The model's performance suggests that it successfully learned important patterns and relationships within the training data, allowing it to generalize reasonably well to unseen data.