In [39]:
import pandas as pd

# # load the train and test CSVs from the local titanic/ folder #
train = pd.read_csv('./titanic/train.csv')
test = pd.read_csv('./titanic/test.csv')

# # both frames go through the same feature-engineering steps, so keep them in one list to loop over #
train_test_data = [train, test]

# # create Title column from Name: the first run of letters that is immediately followed by a '.' #
for dataset in train_test_data:
    dataset['Title'] = dataset['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)

# # set numeric value in Title field: 0/1/2 for the three common titles, 3 for every other listed title #
title_mapping = {
    "Mr" : 0,
    "Miss" : 1,
    "Mrs" : 2,
    "Master" : 3,
    "Dr" : 3,
    "Rev" : 3,
    "Col" : 3,
    "Major" : 3,
    "Mlle" : 3,
    "Countess" : 3,
    "Ms" : 3,
    "Lady" : 3,
    "Jonkheer" : 3,
    "Don" : 3,
    "Dona" : 3,
    "Mme" : 3,
    "Capt" : 3,
    "Sir" : 3,
}

# # code used for titles that are not listed in title_mapping, and for names with no title at all #
OTHER_TITLE_CODE = 3

for dataset in train_test_data:
    # # Series.map turns unknown keys into NaN; a NaN Title would drop the row out of the #
    # # groupby('Title') used for Age imputation, so unknown titles join the "other" bucket #
    dataset['Title'] = (
        dataset["Title"]
        .map(title_mapping)
        .fillna(OTHER_TITLE_CODE)
        .astype('int64')
    )



# # encode Sex as an integer: male -> 0, female -> 1 (any other value becomes NaN) #
sex_mapping = {"male": 0, "female": 1}

for dataset in train_test_data:
    dataset['Sex'] = dataset['Sex'].map(sex_mapping)


# # fill missing Age with the median Age of the passenger's Title group #
# # the medians are learned from train only and reused for test, so both frames get the same #
# # preprocessing and no test-set statistic feeds into the features #
age_median_by_title = train.groupby('Title')['Age'].median()
# # fallback for a Title group that has no known Age in train (or a missing Title) #
age_median_overall = train['Age'].median()

# # set numeric value in Age field: (-inf, 16] -> 0, (16, 26] -> 1, (26, 36] -> 2, (36, 62] -> 3, (62, inf) -> 4 #
# # float('inf') is used instead of np.inf because numpy is only imported in a later cell #
age_bin_edges = [float('-inf'), 16, 26, 36, 62, float('inf')]

for dataset in train_test_data:
    age_fallback = dataset['Title'].map(age_median_by_title).fillna(age_median_overall)
    dataset['Age'] = dataset['Age'].fillna(age_fallback)
    # # one pd.cut call (right-closed bins) replaces the chain of in-place .loc overwrites, which #
    # # was only correct because every bin code happens to be smaller than the lowest edge #
    # # astype(float) keeps the column dtype the original code produced #
    dataset['Age'] = pd.cut(dataset['Age'], bins=age_bin_edges, labels=False).astype(float)

# # Embarked: missing values are filled with 'S' (noted as the port most passengers embarked from), #
# # then the port letter is integer-coded: S -> 0, C -> 1, Q -> 2 #
embarked_mapping = {"S": 0, "C": 1, "Q": 2}

for dataset in train_test_data:
    dataset['Embarked'] = dataset['Embarked'].fillna('S').map(embarked_mapping)

# # Missing Fare: fill with the median Fare of the passenger's Pclass #
# # the medians are learned from train only and reused for test, so both frames get the same #
# # preprocessing and no test-set statistic feeds into the features #
fare_median_by_pclass = train.groupby('Pclass')['Fare'].median()
# # fallback for a Pclass that has no known Fare in train #
fare_median_overall = train['Fare'].median()

# # Fare classification: (-inf, 17] -> 0, (17, 30] -> 1, (30, 100] -> 2, (100, inf) -> 3 #
fare_bin_edges = [float('-inf'), 17, 30, 100, float('inf')]

for dataset in train_test_data:
    fare_fallback = dataset['Pclass'].map(fare_median_by_pclass).fillna(fare_median_overall)
    dataset['Fare'] = dataset['Fare'].fillna(fare_fallback)
    # # one pd.cut call (right-closed bins) replaces the chain of in-place .loc overwrites; #
    # # astype(float) keeps the column dtype the original code produced #
    dataset['Fare'] = pd.cut(dataset['Fare'], bins=fare_bin_edges, labels=False).astype(float)


# # Cabin: keep only the first character of the cabin string and rescale it to a number #
# # letters that are not in the mapping become NaN and are imputed below like missing cabins #
cabin_mapping = {
    "A" : 0,
    "B" : 0.4,
    "C" : 0.8,
    "D" : 1.2,
    "E" : 1.6,
    "F" : 2.0,
    "G" : 2.4,
    "T" : 2.8,
}

for dataset in train_test_data:
    dataset['Cabin'] = dataset['Cabin'].str[:1].map(cabin_mapping)

# # Missing Cabin: fill with the median cabin code of the passenger's Pclass #
# # the medians are learned from train only and reused for test, so the same Pclass is imputed #
# # with the same value in both frames #
cabin_median_by_pclass = train.groupby('Pclass')['Cabin'].median()
# # fallback for a Pclass that has no known Cabin in train #
cabin_median_overall = train['Cabin'].median()

for dataset in train_test_data:
    cabin_fallback = dataset['Pclass'].map(cabin_median_by_pclass).fillna(cabin_median_overall)
    dataset['Cabin'] = dataset['Cabin'].fillna(cabin_fallback)

# # sizes 1..11 are rescaled in steps of 0.4 (1 -> 0.0, 2 -> 0.4, ..., 11 -> 4.0) #
# # (size - 1) * 4 / 10 gives exactly the same floats as writing the decimals out by hand #
family_size_mapping = {size: (size - 1) * 4 / 10 for size in range(1, 12)}

# # create new column FamilySize = SibSp + Parch + 1, then rescale it #
# # sizes outside 1..11 are not in the mapping and end up as NaN #
for dataset in train_test_data:
    family_size = dataset['SibSp'] + dataset['Parch'] + 1
    dataset['FamilySize'] = family_size.map(family_size_mapping)

# # delete unnecessary feature from dataset #
feature_drop = ['Ticket', 'SibSp', 'Parch', 'Name']

# # the drops are done in place so the frames referenced from train_test_data are updated too #
test.drop(columns=feature_drop, inplace=True)
# # PassengerId is removed from train only; test still carries it after this step #
train.drop(columns=feature_drop + ['PassengerId'], inplace=True)

In [77]:
import numpy as np
# # importing classifier module #
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# # KFold for cross-validation #
from sklearn.model_selection import KFold
# # to evaluate model accuracy across multiple data splits #
from sklearn.model_selection import cross_val_score


# # Creates a 10-fold cross-validator, shuffling data before splitting, random_state=0 ensures reproducibility of the folds #
K_fold = KFold(n_splits=10, shuffle=True, random_state=0)

# # The tree is seeded as well: features are permuted at random at every split and ties between #
# # equally good splits are broken by that order, so an unseeded tree scores differently on each run #
clf = DecisionTreeClassifier(random_state=0)

# # features are every remaining column except the target; the target is Survived #
X = train.drop('Survived', axis=1)
y = train['Survived']

# # Performs 10-fold cross-validation, scoring='accuracy': accuracy is used as the performance metric, n_jobs=1: runs on a single CPU core #
score = cross_val_score(clf, X, y, cv=K_fold, n_jobs=1, scoring='accuracy')

# # decision tree score, prints the average accuracy across all 10 folds as a percentage #
print(round(np.mean(score)*100, 2))

80.14


In [79]:
# # Train Final Model with Random Forest, creates a Random Forest model with 13 trees #
# # random_state is fixed so that re-running the notebook writes the same submission file #
clf = RandomForestClassifier(n_estimators=13, random_state=0)

# # Trains the Random Forest model on the entire training set #
clf.fit(X, y)

# # Select exactly the training features, in training order: this leaves PassengerId out #
# # (it's not a feature) and raises a KeyError right here if test is missing a feature #
test_data = test[X.columns]
# # Predict on Test Set #
prediction = clf.predict(test_data)

# # Create Submission File: one row per test passenger with its predicted Survived value #
submission = pd.DataFrame({
    "PassengerId": test['PassengerId'],
    "Survived": prediction
})

submission.to_csv('titanic/submission.csv', index=False)