In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score

In [3]:
import preprocessing
import algorithms

In [4]:
# Load the labelled training file and split it into the target column
# ('Survived') and the remaining feature columns.
titanic_df = pd.read_csv('./titanic/train.csv')
y_titanic_df = titanic_df['Survived']
X_titanic_df = titanic_df.drop(columns='Survived')

# Apply the project's preprocessing helpers in order; each one receives the
# frame returned by the previous step.
# NOTE(review): the submission cell reads validation['PassengerId'] after these
# same three helpers have run, which suggests PassengerId is still a column of
# the feature matrix used for training — confirm, and consider excluding it.
for preprocess_step in (
    preprocessing.process_null_data,
    preprocessing.drop_unnecessary_features,
    preprocessing.process_label_encoding,
):
    X_titanic_df = preprocess_step(X_titanic_df)

In [5]:
# Hold out 30% of the rows as a test set. random_state is fixed so the split,
# and every accuracy figure computed from it, is identical on each
# Restart & Run All instead of changing from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X_titanic_df, y_titanic_df, test_size=0.3, random_state=42
)

In [6]:
# Short model key passed to algorithms.processing -> label printed in the report.
model_labels = {
    'dtc': 'DecisionTreeClassifier',
    'rfc': 'RandomForestClassifier',
    'lr': 'LogisticRegression',
}

for model, model_name in model_labels.items():
    # algorithms.processing is a project helper. Its result is scored against
    # y_test below, so it is presumably the predictions for X_test — confirm
    # against algorithms.py.
    predict_result = algorithms.processing(X_train, X_test, y_train, X_titanic_df, y_titanic_df, model)
    score_pct = accuracy_score(y_test, predict_result) * 100.0
    print('{0} Accuracy: {1:.4f}%'.format(model_name, score_pct))
    print('=' * 50)

DecisionTreeClassifier Accuracy: 73.5075%
RandomForestClassifier Accuracy: 77.2388%
LogisticRegression Accuracy: 76.8657%


In [7]:
# Base decision tree shared by the cross-validation and grid-search cells.
# scikit-learn permutes the features at every split, so equally good splits can
# be resolved differently between runs; a fixed random_state makes the fitted
# trees (and the reported scores) reproducible.
dtc = DecisionTreeClassifier(random_state=42)

In [8]:
# 5-fold cross-validation of the base tree on the full preprocessed training set.
# NOTE(review): in the recorded output fold 0 scores ~53% while the other folds
# score 77-82%. An integer cv does not shuffle rows, so this may be a row-order
# effect (e.g. an ID-like feature) — worth investigating.
scores = cross_val_score(dtc, X_titanic_df, y_titanic_df, cv=5)

fold_line = "Cross Validation {0} Accuracy: {1:.4f}%"
for fold_index in range(len(scores)):
    print(fold_line.format(fold_index, scores[fold_index] * 100.0))

# Mean accuracy across the folds, reported as a percentage.
print("Average Accuracy: {0:.2f}%".format(scores.mean() * 100.0))

Cross Validation 0 Accuracy: 53.0726%
Cross Validation 1 Accuracy: 79.2135%
Cross Validation 2 Accuracy: 81.4607%
Cross Validation 3 Accuracy: 77.5281%
Cross Validation 4 Accuracy: 82.0225%
Average Accuracy: 74.66%


In [9]:
# Hyper-parameter grid for the decision tree: 4 x 3 x 3 = 36 combinations,
# each scored by accuracy with 5-fold cross-validation on the training split.
parameters = {
    'max_depth': [2, 3, 5, 10],
    'min_samples_split': [2, 3, 5],
    'min_samples_leaf': [1, 5, 8],
}

# fit() returns the fitted search object, so construction and fitting are chained.
grid_dclf = GridSearchCV(
    dtc,
    param_grid=parameters,
    scoring='accuracy',
    cv=5,
).fit(X_train, y_train)

# Keep the best estimator found by the search; later cells use it for prediction.
best_dclf = grid_dclf.best_estimator_

# Score the selected tree on the held-out test split.
dpredictions = best_dclf.predict(X_test)
accuracy = accuracy_score(y_test, dpredictions)
print('Test Set DecisionTreeClassifier Accuracy : {0:.4f}%'.format(accuracy * 100.0))

Test Set DecisionTreeClassifier Accuracy : 76.4925%


In [10]:
# Load the second Titanic file; later cells preprocess it and predict on it
# to build the submission.
validation = pd.read_csv(filepath_or_buffer='./titanic/test.csv')

In [11]:
# Apply the same three preprocessing helpers, in the same order, that were used
# on the training features: null handling -> feature dropping -> label encoding.
# NOTE(review): if process_label_encoding fits its encoders on whatever frame
# it receives, the codes here may not match the training codes — confirm in
# preprocessing.py.
validation = preprocessing.process_label_encoding(
    preprocessing.drop_unnecessary_features(
        preprocessing.process_null_data(validation)
    )
)

In [12]:
# Predict on exactly the columns the model was trained on, in training order.
# Selecting X_train.columns raises a clear KeyError if a feature is missing from
# `validation`, and keeps the prediction correct if the two frames ever differ
# in column order or gain extra columns.
result = best_dclf.predict(validation[X_train.columns])

In [26]:
# Two-column submission frame: the passenger id taken from the preprocessed
# validation frame, plus the model's prediction for that row.
submission = validation[['PassengerId']].assign(Survived=result)

In [27]:
# Write the submission to disk; index=False keeps the row index out of the file
# so it contains only the PassengerId and Survived columns.
submission.to_csv(path_or_buf='./submission.csv', index=False)