In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the passenger records from train.csv (path is relative to the working directory).
data = pd.read_csv('train.csv')

In [3]:
data.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,3.0,0.0,"O'Donoghue, Ms. Bridget",female,,0.0,0.0,364856,7.75,,Q,,,
1,2.0,0.0,"Morley, Mr. Henry Samuel (""Mr Henry Marshall"")",male,39.0,0.0,0.0,250655,26.0,,S,,,
2,2.0,1.0,"Smith, Miss. Marion Elsie",female,40.0,0.0,0.0,31418,13.0,,S,9,,
3,3.0,1.0,"Goldsmith, Mrs. Frank John (Emily Alice Brown)",female,31.0,1.0,1.0,363291,20.525,,S,C D,,"Strood, Kent, England Detroit, MI"
4,3.0,1.0,"McCoy, Miss. Agnes",female,,2.0,0.0,367226,23.25,,Q,16,,


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1009 entries, 0 to 1008
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   pclass     1009 non-null   float64
 1   survived   1009 non-null   float64
 2   name       1009 non-null   object 
 3   sex        1009 non-null   object 
 4   age        812 non-null    float64
 5   sibsp      1009 non-null   float64
 6   parch      1009 non-null   float64
 7   ticket     1009 non-null   object 
 8   fare       1008 non-null   float64
 9   cabin      229 non-null    object 
 10  embarked   1008 non-null   object 
 11  boat       374 non-null    object 
 12  body       98 non-null     float64
 13  home.dest  582 non-null    object 
dtypes: float64(7), object(7)
memory usage: 110.5+ KB


In [5]:
# Columns left out of the feature set: text/identifier fields (name, ticket, home.dest),
# mostly-missing fields (cabin, boat, body) and 'embarked'.
drop_columns = ['name', 'ticket', 'cabin', 'boat' , 'body', 'home.dest','embarked']


In [6]:
# Work on a reduced copy; `data` itself is left unmodified.
data_clean = data.drop(drop_columns, axis =1)

In [7]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [8]:
# Replace the 'sex' strings with integer codes (female -> 0, male -> 1 in the output below).
data_clean['sex'] = le.fit_transform(data_clean['sex'])


In [9]:
data_clean.head()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare
0,3.0,0.0,0,,0.0,0.0,7.75
1,2.0,0.0,1,39.0,0.0,0.0,26.0
2,2.0,1.0,0,40.0,0.0,0.0,13.0
3,3.0,1.0,0,31.0,1.0,1.0,20.525
4,3.0,1.0,0,,2.0,0.0,23.25


In [10]:
data_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1009 entries, 0 to 1008
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   pclass    1009 non-null   float64
 1   survived  1009 non-null   float64
 2   sex       1009 non-null   int32  
 3   age       812 non-null    float64
 4   sibsp     1009 non-null   float64
 5   parch     1009 non-null   float64
 6   fare      1008 non-null   float64
dtypes: float64(6), int32(1)
memory usage: 51.4 KB


In [11]:
# Impute every missing value with the mean of its OWN column.
# Filling the whole frame with the mean age would also write an age (~29.8)
# into the one missing 'fare' entry. sklearn's SimpleImputer is an alternative.
data_clean = data_clean.fillna(data_clean.mean(numeric_only=True))

In [12]:
data_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1009 entries, 0 to 1008
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   pclass    1009 non-null   float64
 1   survived  1009 non-null   float64
 2   sex       1009 non-null   int32  
 3   age       1009 non-null   float64
 4   sibsp     1009 non-null   float64
 5   parch     1009 non-null   float64
 6   fare      1009 non-null   float64
dtypes: float64(6), int32(1)
memory usage: 51.4 KB


In [13]:
# Separate the predictor columns (X) from the label column 'survived' (Y).
X = data_clean.drop(columns='survived')
Y = data_clean['survived']

In [14]:
X.head()

Unnamed: 0,pclass,sex,age,sibsp,parch,fare
0,3.0,0,29.838978,0.0,0.0,7.75
1,2.0,1,39.0,0.0,0.0,26.0
2,2.0,0,40.0,0.0,0.0,13.0
3,3.0,0,31.0,1.0,1.0,20.525
4,3.0,0,29.838978,2.0,0.0,23.25


In [15]:
Y.head()

0    0.0
1    0.0
2    1.0
3    1.0
4    1.0
Name: survived, dtype: float64

In [16]:
type(Y)

pandas.core.series.Series

In [17]:
type(X)

pandas.core.frame.DataFrame

In [26]:
# define entropy and information gain

def entropy(col):
    """Return the Shannon entropy, in bits, of the values in ``col``.

    Parameters
    ----------
    col : array-like with a ``shape`` attribute (e.g. numpy array or pandas Series)
        The observations whose value distribution is measured.

    Returns
    -------
    float
        ``-sum(p * log2(p))`` over the distinct values of ``col``, where ``p``
        is each value's relative frequency. An empty ``col`` yields ``0.0``.
    """
    _, frequencies = np.unique(col, return_counts=True)
    total = float(col.shape[0])

    bits = 0.0
    for occurrences in frequencies:
        prob = occurrences / total
        bits += -1.0 * prob * np.log2(prob)

    return bits
        

In [31]:
def divide_data(x_data, fkey, fval):
    """Split a DataFrame in two on a single feature threshold.

    Parameters
    ----------
    x_data : pandas.DataFrame
        Rows to partition.
    fkey : column label
        Feature (column of ``x_data``) the split is made on.
    fval : scalar
        Threshold compared against ``x_data[fkey]``.

    Returns
    -------
    (x_left, x_right) : tuple of pandas.DataFrame
        ``x_right`` holds the rows where ``x_data[fkey] > fval``; ``x_left``
        holds all remaining rows (including rows whose feature value is NaN).
        Row order, index labels, columns and dtypes of ``x_data`` are kept.
    """
    # A boolean mask works for any index (no assumption of labels 0..n-1),
    # keeps column dtypes intact and avoids a Python-level loop over rows.
    goes_right = x_data[fkey] > fval

    # `~goes_right` (rather than `<=`) so NaN comparisons fall to the left side,
    # matching the `if val > fval ... else` semantics.
    x_left = x_data.loc[~goes_right]
    x_right = x_data.loc[goes_right]

    return x_left, x_right

In [20]:
def information_gain(x_data, fkey, fval, target='survived'):
    """Entropy reduction obtained by splitting ``x_data`` on ``fkey`` at ``fval``.

    Parameters
    ----------
    x_data : pandas.DataFrame
        Rows to evaluate; must contain the columns ``fkey`` and ``target``.
    fkey : column label
        Feature the split is made on.
    fval : scalar
        Split threshold passed to ``divide_data``.
    target : column label, default 'survived'
        Column holding the class labels whose entropy is measured.

    Returns
    -------
    float or int
        Parent entropy minus the size-weighted entropy of the two children.
        If the split sends every row to one side (this includes an empty
        ``x_data``), the sentinel ``-10000000`` is returned instead.
    """
    left, right = divide_data(x_data, fkey, fval)
    n_total = x_data.shape[0]
    n_left = left.shape[0]
    n_right = right.shape[0]

    # Degenerate split: all examples on one side. Checked before any division
    # so that an empty frame yields the sentinel rather than ZeroDivisionError.
    if n_left == 0 or n_right == 0:
        return -10000000  # min information gain

    weight_left = float(n_left) / n_total
    weight_right = float(n_right) / n_total

    children_entropy = (weight_left * entropy(left[target])
                        + weight_right * entropy(right[target]))

    return entropy(x_data[target]) - children_entropy

In [34]:
# Print, for every feature, the information gain of a single split at that
# feature's mean value, evaluated over the whole cleaned dataset.
for fx in X.columns:
    print(fx)
    print(information_gain(data_clean, fx, data_clean[fx].mean()))

pclass
0.055456910002982474
sex
0.19274737190850932
age
0.001955929827451075
sibsp
0.006492394392888956
parch
0.01975608012294816
fare
0.04242793401428169


In [42]:
class DecisionTree:
    """Binary decision tree node that splits on a feature's mean value.

    Every node stores the feature (``fkey``) and threshold (``fval``) of its
    split, its two children (``left`` / ``right``, ``None`` on leaves) and the
    majority label of the rows it was trained on (``target``).
    """

    def __init__(self, depth = 0 , max_depth = 5):
        """Create an untrained node at ``depth`` of a tree limited to ``max_depth``."""
        self.depth = depth
        self.max_depth = max_depth
        # Filled in by train().
        self.fkey = None
        self.fval = None
        self.left = None
        self.right = None
        self.target = None

    @staticmethod
    def _majority_label(rows):
        """Return 'survived' if the mean of ``rows.survived`` is >= 0.5, else 'dead'."""
        if rows.survived.mean() >= 0.5:
            return 'survived'
        return 'dead'

    def train(self, X_train):
        """Fit this node, and recursively its children, on ``X_train``.

        ``X_train`` is a DataFrame containing the six feature columns listed
        below plus the ``survived`` label column. The node picks the feature
        with the highest information gain for a split at that feature's mean,
        prints the choice, and recurses unless the split is degenerate or the
        depth limit has been reached. Returns ``None``.
        """
        candidates = ['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare']

        # Score every candidate feature for a split at its mean value.
        gains = [
            information_gain(X_train, name, X_train[name].mean())
            for name in candidates
        ]

        self.fkey = candidates[np.argmax(gains)]
        print('Making Tree feature is : ', self.fkey)
        self.fval = X_train[self.fkey].mean()

        lower, upper = divide_data(X_train, self.fkey, self.fval)
        lower = lower.reset_index(drop=True)
        upper = upper.reset_index(drop=True)

        # A node is a leaf when the split leaves one side empty or when the
        # maximum depth has been reached.
        one_side_empty = lower.shape[0] == 0 or upper.shape[0] == 0
        is_leaf = one_side_empty or self.depth >= self.max_depth

        if not is_leaf:
            # Grow the subtree one level deeper on each partition.
            self.left = DecisionTree(depth=self.depth + 1, max_depth=self.max_depth)
            self.left.train(lower)

            self.right = DecisionTree(depth=self.depth + 1, max_depth=self.max_depth)
            self.right.train(upper)

        # Every node, leaf or internal, records the majority label of its rows.
        self.target = self._majority_label(X_train)
        return

    def predict(self, test):
        """Return 'survived' or 'dead' for one sample.

        ``test`` is any mapping-like object indexable by feature name (for
        example a DataFrame row). The sample is routed to the right child when
        its ``fkey`` value exceeds ``fval`` and to the left child otherwise;
        when the chosen child does not exist, this node's ``target`` is returned.
        """
        branch = self.right if test[self.fkey] > self.fval else self.left
        if branch is None:
            return self.target
        return branch.predict(test)
    
            
        
                                      

In [44]:
# Train/test split: the first 70% of rows (in file order, no shuffling) are used
# for training and the remaining 30% for testing. No validation set is created.
split = int(0.7*data_clean.shape[0])
train_data = data_clean[:split]
test_data = data_clean[split:]
# Re-number the test rows from 0 so they have a fresh 0..n-1 index.
test_data = test_data.reset_index(drop= True)



In [45]:
print(test_data.shape, train_data.shape)

(303, 7) (706, 7)


In [46]:
# Root node with the constructor defaults (depth=0, max_depth=5).
dt = DecisionTree()

In [47]:
# Fit the tree recursively; train() prints the split feature chosen at every node.
dt.train(train_data)

Making Tree feature is :  sex
Making Tree feature is :  pclass
Making Tree feature is :  parch
Making Tree feature is :  fare
Making Tree feature is :  fare
Making Tree feature is :  fare
Making Tree feature is :  fare
Making Tree feature is :  age
Making Tree feature is :  age
Making Tree feature is :  age
Making Tree feature is :  fare
Making Tree feature is :  pclass
Making Tree feature is :  age
Making Tree feature is :  age
Making Tree feature is :  age
Making Tree feature is :  age
Making Tree feature is :  age
Making Tree feature is :  sibsp
Making Tree feature is :  fare
Making Tree feature is :  fare
Making Tree feature is :  parch
Making Tree feature is :  age
Making Tree feature is :  age
Making Tree feature is :  age
Making Tree feature is :  parch
Making Tree feature is :  fare
Making Tree feature is :  parch
Making Tree feature is :  age
Making Tree feature is :  fare
Making Tree feature is :  fare
Making Tree feature is :  age
Making Tree feature is :  age
Making Tree fe

In [48]:
# Feature selected for the split at the root of the tree.
print(dt.fkey)

sex


In [None]:
y_pred = []
 