In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn import tree
from sklearn.model_selection import KFold
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [7]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

PassengerID = test["PassengerId"]

#Cleaning up the data a bit

#Want to convert cabin to 0 = doesn't have cabin and 1 = does have cabin
train['Cabin'] = train["Cabin"].apply(lambda x: 1 if type(x) == str else 0)
train.head(20)

#Cobine siblings and parents to get a number of how many family members each passanger had on board
#Change gender from male/female to 0/1
train["Family_Members"] = train["SibSp"] + train["Parch"]
train["Sex"] = train["Sex"].replace(["male", "female"], [0, 1])

#Round the fare so cutoffs for categorization can be easily detemined 
train["Fare"] = train["Fare"].round()

#This will help decide where to make cutoffs (<=50, <=100, <=200, <=300, >300)
#sorted(train["Fare"].unique())
train.loc[train["Fare"] <= 50, "Fare"] = 0
train.loc[(train["Fare"] > 50) & (train["Fare"] <=100), "Fare"] = 1
train.loc[(train["Fare"] > 100) & (train["Fare"] <=200), "Fare"] = 2
train.loc[(train["Fare"] > 200) & (train["Fare"] <=300), "Fare"] = 3
train.loc[train["Fare"] > 300, "Fare"] = 4
#sorted(train["Fare"].unique())
#Data type is a float for Fare, so want to make it integers
train["Fare"] = train["Fare"].astype(int)

#Create groupings for Age <=11, <=18, <=25, <=35, <=50, <=65, 65> 
train["Age"] = train["Age"].round().fillna(0)
train["Age"] = train["Age"].astype(int)

train.loc[train["Age"] <= 11, "Age"] = 0
train.loc[(train["Age"] > 11) & (train["Age"] <= 18), "Age"] = 1
train.loc[(train["Age"] > 18) & (train["Age"] <= 25), "Age"] = 2
train.loc[(train["Age"] > 25) & (train["Age"] <= 35), "Age"] = 3
train.loc[(train["Age"] > 35) & (train["Age"] <= 50), "Age"] = 4
train.loc[(train["Age"] > 50) & (train["Age"] <= 65), "Age"] = 5
train.loc[train["Age"] > 65, "Age"] = 6

#Will ignore title for now but want to add it in later
#Extract title from Name
train["Title"] = train["Name"].astype(str).apply(lambda x: x.split()[1])
#train["Title"].unique()
train["Title"] = train["Title"].replace(["Mme.", "Mlle.", "Ms."], ["Mrs.", "Miss.", "Miss."]) 
#train["Title"] = train["title"].replace(["Planke"])

#Drop columns that have been recalculated or won't be used
drop_columns = ["Ticket", "SibSp", "Parch", "Embarked", "Name", "Title"]
train = train.drop(drop_columns, axis = 1)


X = train.values[:, 2:8]
y = train.Survived

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 1)

clsf = DecisionTreeClassifier().fit(X_train, y_train)
y_pred = clsf.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))


cv = KFold(n_splits=10)            # Desired number of Cross Validation folds
accuracies = list()
max_attributes = len(list(test))
depth_range = range(1, max_attributes + 1)

for depth in depth_range:
    fold_accuracy = []
    tree_model = tree.DecisionTreeClassifier(max_depth = depth)
    # print("Current max depth: ", depth, "\n")
    for train_fold, valid_fold in cv.split(train):
        f_train = train.loc[train_fold] # Extract train data with cv indices
        f_valid = train.loc[valid_fold] # Extract valid data with cv indices
        
        model = tree_model.fit(X = f_train.drop(['Survived'], axis=1), 
                               y = f_train["Survived"]) # We fit the model with the fold train data
        valid_acc = model.score(X = f_valid.drop(['Survived'], axis=1), 
                                y = f_valid["Survived"])# We calculate accuracy with the fold validation data
        fold_accuracy.append(valid_acc)   
        
        
    avg = sum(fold_accuracy)/len(fold_accuracy)
    accuracies.append(avg)
    # print("Accuracy per fold: ", fold_accuracy, "\n")
    # print("Average accuracy: ", avg)
    # print("\n")
    
# Just to show results conveniently
df = pd.DataFrame({"Max Depth": depth_range, "Average Accuracy": accuracies})
df = df[["Max Depth", "Average Accuracy"]]
print(df.to_string(index=False))

Accuracy: 0.7350746268656716
 Max Depth  Average Accuracy
         1          0.786729
         2          0.771099
         3          0.804707
         4          0.775531
         5          0.799064
         6          0.782322
         7          0.769963
         8          0.741785
         9          0.731710
        10          0.769900
        11          0.749725
