In [40]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

In [152]:
# TRAIN CLEANING
# Load the training file; the first CSV column becomes the row index.
train = pd.read_csv("train.csv", sep=",", header=0, index_col=0)

# NA fills
# The filled column is assigned back instead of calling
# `train.<col>.fillna(..., inplace=True)`: that chained form operates on an
# intermediate object, triggers a pandas warning, and under Copy-on-Write
# leaves `train` untouched, so the missing values would survive.
train["Age"] = train["Age"].fillna(train["Age"].mean())      # column mean
train["Fare"] = train["Fare"].fillna(train["Fare"].mean())   # column mean
# Most frequent category.
train["Embarked"] = train["Embarked"].fillna(train["Embarked"].value_counts().idxmax())
# Rows with a missing label get -1.
train["Survived"] = train["Survived"].fillna(-1)
# extract title from name
def get_title(name):
    """Extract the title embedded in a passenger name.

    The title is the text between the first comma and the following period,
    with surrounding whitespace removed, e.g. ``"Braund, Mr. Owen"`` -> ``"Mr"``.

    Args:
        name: Raw name value.

    Returns:
        The extracted title, or ``'Unknown'`` when ``name`` is not a string,
        contains no period, or contains no comma.
    """
    # A non-string value (e.g. NaN from an empty cell) cannot be searched with `in`.
    if not isinstance(name, str) or '.' not in name:
        return 'Unknown'
    parts = name.split(',')
    # A period without any comma used to raise IndexError on parts[1].
    if len(parts) < 2:
        return 'Unknown'
    return parts[1].split('.')[0].strip()

# Sorted, de-duplicated raw titles found in the training names.
titles = sorted(set(train.Name.map(get_title)))

# Normalize the titles
def replace_titles(x):
    """Map a row's raw title onto a smaller set of titles.

    Args:
        x: Row-like object indexable by ``'Title'`` (always read) and
           ``'Sex'`` (read only when the title is ``'Dr'``).

    Returns:
        ``'Mr'``, ``'Mrs'`` or ``'Miss'`` for the titles listed below;
        ``'Dr'`` becomes ``'Mr'`` for males and ``'Mrs'`` otherwise;
        any other title is returned unchanged.
    """
    raw = x['Title']

    # 'Dr' is the only title whose replacement depends on the row's sex.
    if raw == 'Dr':
        return 'Mr' if x['Sex'] == 'male' else 'Mrs'

    # (replacement, raw titles folded into it)
    groups = (
        ('Mr', ('Capt', 'Col', 'Don', 'Jonkheer', 'Major', 'Rev', 'Sir', 'Master')),
        ('Mrs', ('the Countess', 'Mme', 'Lady')),
        ('Miss', ('Mlle', 'Ms')),
    )
    for replacement, variants in groups:
        if raw in variants:
            return replacement

    # Not listed anywhere above: keep as-is.
    return raw

# First store the raw title parsed from each name, then overwrite it with the
# normalized title (replace_titles reads the row's 'Title' and, for 'Dr', 'Sex').
train['Title'] = train['Name'].map(get_title)
train['Title'] = train.apply(replace_titles, axis='columns')

# Sex -> one indicator column per category; the original column is removed.
SexNum = pd.get_dummies(train['Sex'])
train = train.drop(columns='Sex').join(SexNum)

# Embarked -> indicator columns, same procedure.
EmbarkedNum = pd.get_dummies(train['Embarked'])
train = train.drop(columns='Embarked').join(EmbarkedNum)

# Title -> indicator columns, same procedure.
TitleNum = pd.get_dummies(train['Title'])
train = train.drop(columns='Title').join(TitleNum)

# family size: SibSp + Parch, computed column-wise and attached as 'FSize'
fsiz = (train['SibSp'] + train['Parch']).to_frame('FSize')
train = train.join(fsiz)

# drop useless columns
# NOTE: 'FSize', added just above, is in this list too, so it does not
# remain in the cleaned frame.
train = train.drop(columns=['Name', 'Cabin', 'Ticket', 'Parch', 'SibSp', 'FSize'])

In [162]:
# TEST CLEANING
# Load the test file; the first CSV column becomes the row index.
test = pd.read_csv("test.csv", sep=",", header=0, index_col=0)

# NA fills
# The filled column is assigned back instead of calling
# `test.<col>.fillna(..., inplace=True)`: that chained form operates on an
# intermediate object, triggers a pandas warning, and under Copy-on-Write
# leaves `test` untouched, so the missing values would survive.
# NOTE(review): the fill values are computed from the test file itself.
test["Age"] = test["Age"].fillna(test["Age"].mean())      # column mean
test["Fare"] = test["Fare"].fillna(test["Fare"].mean())   # column mean
# Most frequent category.
test["Embarked"] = test["Embarked"].fillna(test["Embarked"].value_counts().idxmax())

# extract title from name
def get_title(name):
    """Extract the title embedded in a passenger name.

    Redefines the helper of the same name from the train-cleaning cell.
    The title is the text between the first comma and the following period,
    with surrounding whitespace removed, e.g. ``"Braund, Mr. Owen"`` -> ``"Mr"``.

    Args:
        name: Raw name value.

    Returns:
        The extracted title, or ``'Unknown'`` when ``name`` is not a string,
        contains no period, or contains no comma.
    """
    # A non-string value (e.g. NaN from an empty cell) cannot be searched with `in`.
    if not isinstance(name, str) or '.' not in name:
        return 'Unknown'
    parts = name.split(',')
    # A period without any comma used to raise IndexError on parts[1].
    if len(parts) < 2:
        return 'Unknown'
    return parts[1].split('.')[0].strip()

# Sorted, de-duplicated raw titles found in the test names
# (rebinds the `titles` name assigned in the train-cleaning cell).
titles = sorted(set(test.Name.map(get_title)))

# Normalize the titles
def replace_titles(x):
    """Map a row's raw title onto a smaller set of titles.

    Redefines the helper of the same name from the train-cleaning cell.

    Args:
        x: Row-like object indexable by ``'Title'`` (always read) and
           ``'Sex'`` (read only when the title is ``'Dr'``).

    Returns:
        ``'Mr'``, ``'Mrs'`` or ``'Miss'`` for the titles listed below;
        ``'Dr'`` becomes ``'Mr'`` for males and ``'Mrs'`` otherwise;
        any other title is returned unchanged.
    """
    raw = x['Title']

    # 'Dr' is the only title whose replacement depends on the row's sex.
    if raw == 'Dr':
        return 'Mr' if x['Sex'] == 'male' else 'Mrs'

    # (replacement, raw titles folded into it)
    groups = (
        ('Mr', ('Capt', 'Col', 'Don', 'Jonkheer', 'Major', 'Rev', 'Sir', 'Master')),
        ('Mrs', ('the Countess', 'Mme', 'Lady')),
        ('Miss', ('Mlle', 'Ms')),
    )
    for replacement, variants in groups:
        if raw in variants:
            return replacement

    # Not listed anywhere above: keep as-is.
    return raw

# First store the raw title parsed from each name, then overwrite it with the
# normalized title (replace_titles reads the row's 'Title' and, for 'Dr', 'Sex').
test['Title'] = test['Name'].map(get_title)
test['Title'] = test.apply(replace_titles, axis='columns')

# Sex -> one indicator column per category; the original column is removed.
SexNum = pd.get_dummies(test['Sex'])
test = test.drop(columns='Sex').join(SexNum)

# Embarked -> indicator columns, same procedure.
EmbarkedNum = pd.get_dummies(test['Embarked'])
test = test.drop(columns='Embarked').join(EmbarkedNum)

# Title -> indicator columns, same procedure.
TitleNum = pd.get_dummies(test['Title'])
test = test.drop(columns='Title').join(TitleNum)

# family size: SibSp + Parch, computed column-wise and attached as 'FSize'
fsiz = (test['SibSp'] + test['Parch']).to_frame('FSize')
test = test.join(fsiz)

# drop useless columns
# NOTE: 'FSize', added just above, is in this list too, so it does not
# remain in the cleaned frame.
test = test.drop(columns=['Name', 'Cabin', 'Ticket', 'Parch', 'SibSp', 'FSize'])

# replace_titles returns 'Dona' unchanged, so a row with that title yields its
# own indicator column. Remove it when present; errors='ignore' keeps this cell
# from raising KeyError when no such column was produced.
# presumably the training frame has no 'Dona' column -- verify against the data
test = test.drop(columns=['Dona'], errors='ignore')

In [180]:
# Features are every cleaned training column except the label; the cleaned test
# frame is used unchanged as the prediction input.
X_train = train.drop("Survived", axis=1)
Y_train = train["Survived"]
X_test  = test

# The estimators in the following cells are fitted on X_train and then asked to
# predict on X_test, so both frames must expose the same columns in the same
# order. Check it here so a mismatch is reported with the offending names.
assert list(X_test.columns) == list(X_train.columns), (
    "train/test feature columns differ: "
    + str(sorted(set(X_train.columns) ^ set(X_test.columns), key=str))
)

In [181]:
# Logistic Regression
# Fit on the training features, predict the test rows, and report accuracy
# measured on the training data itself (percent, two decimals).
logreg = LogisticRegression().fit(X_train, Y_train)
Y_pred = logreg.predict(X_test)
acc_log = round(100 * logreg.score(X_train, Y_train), 2)
acc_log



80.25

In [170]:
# Support Vector Machines
# Fit on the training features, predict the test rows, and report accuracy
# measured on the training data itself (percent, two decimals).
svc = SVC().fit(X_train, Y_train)
Y_pred = svc.predict(X_test)
acc_svc = round(100 * svc.score(X_train, Y_train), 2)
acc_svc




89.11

In [171]:
# k-Nearest Neighbors (3 neighbours)
# Fit on the training features, predict the test rows, and report accuracy
# measured on the training data itself (percent, two decimals).
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, Y_train)
Y_pred = knn.predict(X_test)
acc_knn = round(100 * knn.score(X_train, Y_train), 2)
acc_knn

84.51

In [173]:
# Stochastic Gradient Descent
# SGD training is randomized; a fixed random_state makes the fitted model, its
# predictions and the reported score repeat on every run.
sgd = SGDClassifier(random_state=42)
sgd.fit(X_train, Y_train)
Y_pred = sgd.predict(X_test)
# Accuracy on the training data itself, in percent with two decimals.
acc_sgd = round(sgd.score(X_train, Y_train) * 100, 2)
acc_sgd

71.94

In [175]:
# Decision Tree
# A fixed random_state makes the fitted tree, its predictions and the reported
# score repeat on every run.
decision_tree = DecisionTreeClassifier(random_state=42)
decision_tree.fit(X_train, Y_train)
Y_pred = decision_tree.predict(X_test)
# Accuracy on the training data itself, in percent with two decimals.
acc_decision_tree = round(decision_tree.score(X_train, Y_train) * 100, 2)
acc_decision_tree

97.98

In [176]:
# Random Forest
# 100 trees; a fixed random_state makes the fitted forest, its predictions and
# the reported score repeat on every run.
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)
random_forest.fit(X_train, Y_train)
Y_pred = random_forest.predict(X_test)
# Accuracy on the training data itself, in percent with two decimals.
# (The score is computed once; an earlier duplicate call discarded its result.)
acc_random_forest = round(random_forest.score(X_train, Y_train) * 100, 2)
acc_random_forest

97.98

In [182]:
# Summary table: one row per fitted model with its accuracy on the training
# data (the acc_* values computed in the cells above), shown best-first.
# These are not held-out scores.
models = pd.DataFrame({
    'Model': ['Logistic Regression',
              'Support Vector Machines',
              'KNN',
              'Stochastic Gradient Descent',  # label previously misspelled "Decent"
              'Decision Tree',
              'Random Forest'],
    'Score': [acc_log, acc_svc, acc_knn, acc_sgd, acc_decision_tree, acc_random_forest]})
models.sort_values(by='Score', ascending=False)

Unnamed: 0,Model,Score
4,Decision Tree,97.98
5,Random Forest,97.98
1,Support Vector Machines,89.11
2,KNN,84.51
0,Logistic Regression,80.25
3,Stochastic Gradient Decent,71.94
