In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Load the Titanic training set into the working frame.
# NOTE(review): '.../' looks like a placeholder directory -- confirm the real
# location of titanic_train.csv before running.
titanic_df = pd.read_csv('.../titanic_train.csv')
# Preview the first rows of the freshly loaded data.
titanic_df.head()

In [None]:
print('\n ## Train data information ## \n')
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
titanic_df.info()

In [None]:
# 891 Rows
# 12 Columns
# 2 float64 columns
# 5 int64 columns
# 5 object columns
# And so on

In [None]:
# Fill missing values: the mean age for Age, the sentinel 'N' for Cabin and Embarked.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the selected Series: that chained in-place form
# operates on an intermediate object, emits a warning on pandas 2.x and leaves
# the frame unchanged once Copy-on-Write is enabled.
titanic_df['Age'] = titanic_df['Age'].fillna(titanic_df['Age'].mean())
titanic_df['Cabin'] = titanic_df['Cabin'].fillna('N')
titanic_df['Embarked'] = titanic_df['Embarked'].fillna('N')
# Total count of remaining nulls across every column.
print('Number of Null is', titanic_df.isnull().sum().sum())

In [None]:
# Print the frequency of each distinct value for the categorical columns.
for column in ('Sex', 'Cabin', 'Embarked'):
    print(f'Distribution of {column} value: \n', titanic_df[column].value_counts())

In [None]:
# Some Cabin values contain more than one cabin code in a single entry
# Assumption: the first character indicates the level of the room (cabin), and that is the important part

In [None]:
# Keep only the leading character of every Cabin value.
titanic_df['Cabin'] = titanic_df['Cabin'].str[:1]
# Show the first few shortened values as a sanity check.
print(titanic_df['Cabin'].head())

In [None]:
# Count passengers for every (Sex, Survived) combination.
# Survived is 0 for passengers who died and 1 for passengers who survived.
titanic_df.groupby(['Sex', 'Survived'])['Survived'].count()

In [None]:
# Bar plot of the Survived column grouped by Sex.
sns.barplot(x='Sex', y='Survived', data=titanic_df)

In [None]:
# Bar plot of the Survived column grouped by Pclass, with one bar per Sex within each class.
sns.barplot(x='Pclass', y='Survived', hue='Sex', data=titanic_df)

In [None]:
def get_category(age):
    """Map a numeric age to a named age band.

    Bands are defined by inclusive upper bounds:
    <= -1 'Unknown', <= 5 'Baby', <= 12 'Child', <= 18 'Teenager',
    <= 25 'Student', <= 35 'Young Adult', <= 60 'Adult', otherwise 'Elderly'.

    Args:
        age: Age as a number; a missing value (NaN/None) is accepted.

    Returns:
        The band label as a string. Missing ages return 'Unknown'.
    """
    # NaN compares False against every bound, so without this guard a missing
    # age would fall through all the checks and be reported as 'Elderly'.
    if pd.isna(age):
        return 'Unknown'

    # (inclusive upper bound, label), checked in ascending order.
    age_bands = (
        (-1, 'Unknown'),
        (5, 'Baby'),
        (12, 'Child'),
        (18, 'Teenager'),
        (25, 'Student'),
        (35, 'Young Adult'),
        (60, 'Adult'),
    )
    for upper_bound, label in age_bands:
        if age <= upper_bound:
            return label
    return 'Elderly'

plt.figure(figsize=(10,6))

# Left-to-right order of the bars on the x axis. Every label must match a value
# returned by get_category exactly; a misspelled label matches no rows and its
# slot is drawn empty (the last entry was previously spelled 'Eldery').
group_names = ['Unknown', 'Baby', 'Child', 'Teenager', 'Student', 'Young Adult', 'Adult', 'Elderly']

# Age_cat is a temporary helper column: it is added for this plot only and
# dropped again right after so the frame keeps its original columns.
titanic_df['Age_cat'] = titanic_df['Age'].apply(get_category)
sns.barplot(x='Age_cat', y='Survived', hue='Sex', data=titanic_df, order=group_names)
titanic_df.drop('Age_cat', axis=1, inplace=True)

In [None]:
# This is to convert the string characteristics to the numerical
from sklearn import preprocessing

def encode_features(dataDF):
    """Label-encode the 'Cabin', 'Sex' and 'Embarked' columns of ``dataDF``.

    A separate ``LabelEncoder`` is fitted on each column, and the encoded
    values overwrite that column in the frame that was passed in.

    Args:
        dataDF: DataFrame containing the three columns listed above.

    Returns:
        The same DataFrame object, with the three columns replaced.
    """
    for column in ('Cabin', 'Sex', 'Embarked'):
        fitted_encoder = preprocessing.LabelEncoder().fit(dataDF[column])
        dataDF[column] = fitted_encoder.transform(dataDF[column])
    return dataDF

# Label-encode the Cabin, Sex and Embarked columns of the working frame.
titanic_df = encode_features(titanic_df)
# Preview the encoded result.
titanic_df.head()

In [None]:
# Handle Null
def fillna(df):
    """Fill the missing values of the Age, Cabin, Embarked and Fare columns.

    Age is filled with the column mean, Cabin and Embarked with the sentinel
    string 'N', and Fare with 0. The columns are replaced on the frame that was
    passed in, so the caller's DataFrame is modified.

    Args:
        df: DataFrame containing the 'Age', 'Cabin', 'Embarked' and 'Fare' columns.

    Returns:
        The same DataFrame object, with the four columns filled.
    """
    # Assign the filled Series back to the column. Calling
    # df[col].fillna(..., inplace=True) acts on an intermediate object: it warns
    # on pandas 2.x and does not update df at all under Copy-on-Write.
    df['Age'] = df['Age'].fillna(df['Age'].mean())
    df['Cabin'] = df['Cabin'].fillna('N')
    df['Embarked'] = df['Embarked'].fillna('N')
    df['Fare'] = df['Fare'].fillna(0)
    return df

# Eliminate Unnecessary features
def drop_features(df):
    """Remove the PassengerId, Name and Ticket columns from ``df``.

    The columns are dropped in place on the frame that was passed in.

    Args:
        df: DataFrame containing the three columns listed above.

    Returns:
        The same DataFrame object, without those columns.
    """
    unused_columns = ['PassengerId', 'Name', 'Ticket']
    df.drop(columns=unused_columns, inplace=True)
    return df

# Execute label encoding
def format_features(df):
    """Shorten Cabin to its first character, then label-encode the categoricals.

    The frame that was passed in is modified: 'Cabin' is cut down to its leading
    character, after which 'Cabin', 'Sex' and 'Embarked' are label-encoded.

    Args:
        df: DataFrame with string-valued 'Cabin', 'Sex' and 'Embarked' columns.

    Returns:
        The same DataFrame object, with the three columns encoded.
    """
    df['Cabin'] = df['Cabin'].str[:1]
    # The per-column LabelEncoder loop is exactly what encode_features does, so
    # reuse it rather than keeping a second copy of the same logic.
    return encode_features(df)

# Call the previous preprocessing function
def transform_features(df):
    """Run the whole preprocessing pipeline on ``df``.

    The steps are applied in this order: ``fillna`` -> ``drop_features`` ->
    ``format_features``. Each step receives the result of the previous one.

    Args:
        df: Raw feature DataFrame.

    Returns:
        The DataFrame returned by the final step.
    """
    for step in (fillna, drop_features, format_features):
        df = step(df)
    return df

In [None]:
# Preview the current state of titanic_df.
titanic_df.head()

In [None]:
# Reload the raw data set so the feature and label data sets can be built from unmodified data.
# NOTE(review): '.../' looks like a placeholder directory -- confirm the real path.
titanic_df = pd.read_csv('.../titanic_train.csv')

In [None]:
# The label is the Survived column; every other column is a feature.
y_titanic_df = titanic_df['Survived']

# Drop the label from a copy of the frame and run the preprocessing pipeline on it.
X_titanic_df = transform_features(titanic_df.drop(columns='Survived'))

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as the test set.
# random_state=11 fixes the shuffle so every run produces the same split; remove it to get a different split on each run.
X_train, X_test, y_train, y_test=train_test_split(X_titanic_df, y_titanic_df, test_size=0.2, random_state=11)


In [None]:
# This section compares three scikit-learn classifiers: DecisionTreeClassifier, RandomForestClassifier, and LogisticRegression
# (the first two are tree-based models; LogisticRegression is a linear model)
# solver='liblinear' is an optimization algorithm for LogisticRegression that performs well on relatively small binary-classification data sets

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Create the three classifiers that are compared below.
dt_clf = DecisionTreeClassifier(random_state=11)
rf_clf = RandomForestClassifier(random_state=11)
lr_clf = LogisticRegression(solver='liblinear')

# Decision tree: train on the training split, score on the held-out split.
dt_pred = dt_clf.fit(X_train, y_train).predict(X_test)
print(f'Accuracy of DecisionTreeClassifier: {accuracy_score(y_test, dt_pred):.4f}')

# Random forest: same procedure.
rf_pred = rf_clf.fit(X_train, y_train).predict(X_test)
print(f'Accuracy of RandomForestClassifier: {accuracy_score(y_test, rf_pred):.4f}')

# Logistic regression: same procedure.
lr_pred = lr_clf.fit(X_train, y_train).predict(X_test)
print(f'Accuracy of LogisticRegression: {accuracy_score(y_test, lr_pred):.4f}')

In [None]:
# Even without any optimization work and with a fairly small data set, lr_clf performed better than the other two classifiers

In [None]:
from sklearn.model_selection import KFold

def exec_kfold(clf, folds=5):
    """Run K-fold cross validation for ``clf`` and print the accuracy of each fold.

    The data comes from the module-level ``X_titanic_df`` (features) and
    ``y_titanic_df`` (labels). Folds are produced by ``KFold(n_splits=folds)``.
    The accuracy of every fold and the mean over all folds are printed; nothing
    is returned.

    Args:
        clf: Classifier to evaluate. It is cloned for every fold, so the object
            passed in is never refitted and keeps whatever state it had.
        folds: Number of folds to split the data into.
    """
    # Imported locally so this cell does not depend on any other import cell.
    from sklearn.base import clone

    kfold = KFold(n_splits=folds)
    # Convert to arrays once; the fold indices below are positional.
    features = X_titanic_df.values
    labels = y_titanic_df.values
    scores = []

    for iter_count, (train_index, test_index) in enumerate(kfold.split(features)):
        # Fit a fresh copy with the same hyper-parameters. Fitting clf directly
        # would leave the caller's model trained on the last fold's subset only.
        fold_clf = clone(clf)
        fold_clf.fit(features[train_index], labels[train_index])
        predictions = fold_clf.predict(features[test_index])
        accuracy = accuracy_score(labels[test_index], predictions)
        scores.append(accuracy)
        print('Cross validation {0} accuracy: {1:.4f}'.format(iter_count, accuracy))

    mean_score = np.mean(scores)
    print("Average accuracy: {0:.4f}".format(mean_score))

# Evaluate the decision tree with 5-fold cross validation.
exec_kfold(dt_clf, folds=5)


In [None]:
from sklearn.model_selection import cross_val_score

# Let cross_val_score handle the splitting, fitting and scoring over 5 folds.
scores = cross_val_score(dt_clf, X_titanic_df, y_titanic_df, cv=5)

# Report the accuracy of each fold, then the mean over all folds.
for iter_count, accuracy in enumerate(scores):
    print(f'Accuracy of Cross validation {iter_count}: {accuracy:.4f}')

print(f'Average accuracy: {np.mean(scores):.4f}')


In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid explored for the decision tree.
parameters = {
    'max_depth': [2, 3, 5, 10],
    'min_samples_split': [2, 3, 5],
    'min_samples_leaf': [1, 5, 8],
}

# Search the grid with 5-fold cross validation on the training split only.
grid_dclf = GridSearchCV(dt_clf, param_grid=parameters, scoring='accuracy', cv=5)
grid_dclf.fit(X_train, y_train)

print('GridSearchCVs the optimized hyper parameter:', grid_dclf.best_params_)
print(f'GridSearchCVs the highest accuracy: {grid_dclf.best_score_:.4f}')
best_dclf = grid_dclf.best_estimator_

# Score the best estimator found by the search on the held-out test split.
dpredictions = best_dclf.predict(X_test)
accuracy = accuracy_score(y_test, dpredictions)
print(f'DecisionTreeClassifier accuracy in the test data set: {accuracy:.4f}')
