# Importing libraries
**pandas** for data manipulation and analysis <br>
**numpy** for numerical computing <br>
**matplotlib** for printing graphs <br>
**sklearn.metrics** for 'accuracy_score' <br>
**train_test_split** for splitting data <br>
**DecisionTreeClassifier** for predictions <br>
**ParameterGrid**

In [1]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import sklearn.metrics as metrics
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import ParameterGrid

# Helps plotting graphs (source: BI-ML1, first notebook: 01_tutorial_cs_template)
%matplotlib inline

In [2]:
data = pd.read_csv("data.csv")
print("First 10 records\n-----------------")
display(data.head(10))
print("Basic info\n-----------------")
display(data.info())
print("Number of uniques records\n-----------------")
display(data.nunique())

First 10 records
-----------------


Unnamed: 0,ID,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,home.dest
0,0,1,3,"Andersson, Mr. August Edvard (""Wennerstrom"")",male,27.0,0,0,350043,7.7958,,S,
1,1,1,3,"Dahl, Mr. Karl Edwart",male,45.0,0,0,7598,8.05,,S,"Australia Fingal, ND"
2,2,0,2,"Watson, Mr. Ennis Hastings",male,,0,0,239856,0.0,,S,Belfast
3,3,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S,
4,4,0,1,"Meyer, Mr. Edgar Joseph",male,28.0,1,0,PC 17604,82.1708,,C,"New York, NY"
5,5,0,3,"Torfa, Mr. Assad",male,,0,0,2673,7.2292,,C,
6,6,0,1,"Astor, Col. John Jacob",male,47.0,1,0,PC 17757,227.525,C62 C64,C,"New York, NY"
7,7,1,2,"Drew, Master. Marshall Brines",male,8.0,0,2,28220,32.5,,S,"Greenport, NY"
8,8,1,1,"Harper, Mrs. Henry Sleeper (Myna Haxtun)",female,49.0,1,0,PC 17572,76.7292,D33,C,"New York, NY"
9,9,1,2,"Beane, Mrs. Edward (Ethel Clarke)",female,19.0,1,0,2908,26.0,,S,"Norwich / New York, NY"


Basic info
-----------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 13 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   ID         1000 non-null   int64  
 1   survived   1000 non-null   int64  
 2   pclass     1000 non-null   int64  
 3   name       1000 non-null   object 
 4   sex        1000 non-null   object 
 5   age        797 non-null    float64
 6   sibsp      1000 non-null   int64  
 7   parch      1000 non-null   int64  
 8   ticket     1000 non-null   object 
 9   fare       999 non-null    float64
 10  cabin      222 non-null    object 
 11  embarked   999 non-null    object 
 12  home.dest  573 non-null    object 
dtypes: float64(2), int64(5), object(6)
memory usage: 101.7+ KB


None

Number of uniques records
-----------------


ID           1000
survived        2
pclass          3
name          999
sex             2
age            94
sibsp           7
parch           7
ticket        754
fare          249
cabin         156
embarked        3
home.dest     314
dtype: int64

# Preparing dataset

Drop columns we wont need in our dataset for testing

Get rid of 'NULL' values or change them to number

Change 'object' datatypes:

    1) Ordinal = object -> category -> use: cat.codes
    
    2) Nominal = one-hot encoding -> use: pd.get_dummies()

In [3]:
# Drop columns we wont need
drop_columns = ['ID' ,'name', 'cabin', 'home.dest']
test_data = data.drop(drop_columns, axis = 1)

# Get number of 'NULL' values for each column
print(test_data.isnull().sum())

# Change 'NULL' values to 0 and check result
test_data = test_data.fillna(0)
#print(test_data.isnull().sum())

# 'sex' and 'embarked' are nominal
test_data = pd.get_dummies(test_data)

survived      0
pclass        0
sex           0
age         203
sibsp         0
parch         0
ticket        0
fare          1
embarked      1
dtype: int64


# Decision tree
### Prepare explained variable and datasets

Split data into train, validation and test datasets

In [4]:
random_seed = 333

# Get rid of out explained variable in dataset
Xdata = test_data.drop("survived", axis = 1)
ydata = test_data.survived

# Split data: 60% for train data, 40% rest
Xtrain, Xtest, ytrain, ytest = train_test_split(Xdata, ydata, test_size=0.4, random_state=random_seed) 
print("Xtrain: ", Xtrain.shape, "\nytrain: ", ytrain.shape, "\nXtest: ", Xtest.shape, "\nytest: ", ytest.shape, "\n")

# Split rest of data (40%) into valid and test data, 50%/50%
Xval, Xtest, yval, ytest = train_test_split(Xtest, ytest, test_size=0.5, random_state=random_seed) 
print("Xval: ", Xval.shape, "\nyval: ", yval.shape, "\nXtest: ", Xtest.shape, "\nytest: ", ytest.shape)

Xtrain:  (600, 765) 
ytrain:  (600,) 
Xtest:  (400, 765) 
ytest:  (400,) 

Xval:  (200, 765) 
yval:  (200,) 
Xtest:  (200, 765) 
ytest:  (200,)


### Find hyperparameters and learn the model
Find best hyperparameters for our model

Let the model learn with these hyperparameters

In [5]:
# Find best hyperparameters (depths, criterions)
get_param = {
    'max_depth': range(1,101), 
    'criterion': ['entropy', 'gini']
}
param_comb = ParameterGrid(get_param)

# Try all combinations (depth={1, 100} * criterions={'entropy', 'gini'})
val_acc = []
train_acc = []
for params in param_comb:
    clf = DecisionTreeClassifier(max_depth=params['max_depth'], criterion=params['criterion'])
    clf.fit(Xtrain, ytrain)
    train_acc.append(metrics.accuracy_score(ytrain, clf.predict(Xtrain)))
    val_acc.append(metrics.accuracy_score(yval, clf.predict(Xval)))

# Get best parameters from function above
best_param = param_comb[np.argmax(val_acc)]
print("\nBest_params:", best_param, "\n")

# Final result
clf = DecisionTreeClassifier(**best_param)
clf.fit(Xtrain, ytrain)
print('Accuracy score (train): {0:.6f}'.format(metrics.accuracy_score(ytrain, clf.predict(Xtrain))))
print('Accuracy score (validation): {0:.6f}'.format(metrics.accuracy_score(yval, clf.predict(Xval))))
print('Accuracy score (test): {0:.6f}'.format(metrics.accuracy_score(ytest, clf.predict(Xtest))))


Best_params: {'max_depth': 11, 'criterion': 'entropy'} 

Accuracy score (train): 0.938333
Accuracy score (validation): 0.800000
Accuracy score (test): 0.795000


# Decision tree for evaluation.cvs
Load new dataset

Correct dataset like in 'data.csv'

Try our model on new dataset

In [6]:
eval_data = pd.read_csv("evaluation.csv")

# Drop the same columns like in data.csv
eval_data = eval_data.drop(drop_columns, axis = 1)

# Change NULL values to 0
eval_data = eval_data.fillna(0)

# 'sex' and 'embarked' are nominal -> one_hot_encoding = get_dummies()
eval_data = pd.get_dummies(eval_data)

# Change dataset for testing 'evaluation.csv'
Xdata = eval_data

# Final result
clf = DecisionTreeClassifier(**best_param)
clf.fit(Xtrain, ytrain)
print('Accuracy score (evaluation.csv): {0:.6f}'.format(metrics.accuracy_score(ytest, clf.predict(Xtest))))

Accuracy score (evaluation.csv): 0.800000


# KNN