# Titanic Classifier Final

In [387]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler


In [388]:
df = pd.read_csv('./data/train.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [389]:
df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


### Missing data handling

In [390]:
df_new = df.copy()
new_ages = df[['Sex','Age','Pclass']].groupby(['Sex','Pclass']).median()
df_new['Age']=df.apply(\
                   lambda x: new_ages.loc[x['Sex']].loc[x['Pclass']][0] \
                   if pd.isnull(x['Age']) else x['Age'], axis=1)

In [391]:
df_new.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            891 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [392]:
#If Cabin is not Nan set it 1
#Else set them to 0

df_new['Cabin'].fillna(0,inplace=True)
df_new.loc[df_new['Cabin'] != 0, 'Cabin'] = 1
df_new.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,0,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,1,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,0,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,1,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,0,S


In [393]:
#Drop three columns, Name, Ticket, PassengerId
df_new.drop('Name',axis=1, inplace=True)
df_new.drop('Ticket',axis=1, inplace=True)
df_new.drop('PassengerId',axis=1, inplace=True)
df_new.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,male,22.0,1,0,7.25,0,S
1,1,1,female,38.0,1,0,71.2833,1,C
2,1,3,female,26.0,0,0,7.925,0,S
3,1,1,female,35.0,1,0,53.1,1,S
4,0,3,male,35.0,0,0,8.05,0,S


In [394]:
#Categorical data to numerical data
lb_make = LabelEncoder()
for c in df_new.columns:
    if 'object' == df_new[c].dtype:
        df_new[c] = lb_make.fit_transform(df_new[c].astype(str))
df_new.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,1,22.0,1,0,7.25,0,2
1,1,1,0,38.0,1,0,71.2833,1,0
2,1,3,0,26.0,0,0,7.925,0,2
3,1,1,0,35.0,1,0,53.1,1,2
4,0,3,1,35.0,0,0,8.05,0,2


In [395]:
#Normalizing
scaler = MinMaxScaler() 
scaled_values = scaler.fit_transform(df_new) 
df_new.loc[:,:] = scaled_values
df_new.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0.0,1.0,1.0,0.271174,0.125,0.0,0.014151,0.0,0.666667
1,1.0,0.0,0.0,0.472229,0.125,0.0,0.139136,1.0,0.0
2,1.0,1.0,0.0,0.321438,0.0,0.0,0.015469,0.0,0.666667
3,1.0,0.0,0.0,0.434531,0.125,0.0,0.103644,1.0,0.666667
4,0.0,1.0,1.0,0.434531,0.0,0.0,0.015713,0.0,0.666667


In [396]:
from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.model_selection import train_test_split

In [397]:
classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="rbf", probability=True),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    GaussianNB(priors=None)
]

In [398]:
y = df_new.pop('Survived')
X = df_new
X_train,X_test,y_train,y_test = train_test_split(X ,y, test_size=0.2)

In [399]:
#From https://www.kaggle.com/jeffd23/10-classifier-showdown-in-scikit-learn

#Logging for Visual Comparison
log_cols=["Classifier", "Accuracy", "Log Loss"]
log = pd.DataFrame(columns=log_cols)

for clf in classifiers:
    clf.fit(X_train, y_train)
    name = clf.__class__.__name__
    
    print("="*30)
    print(name)
    
    print('****Results****')
    train_predictions_1 = clf.predict(X_test)
    print(train_predictions_1.sum())
    acc = accuracy_score(y_test, train_predictions_1)
    print("Accuracy: {:.4%}".format(acc))
    
    train_predictions_2 = clf.predict_proba(X_test)
    ll = log_loss(y_test, train_predictions_2)
    print("Log Loss: {}".format(ll))
    
    log_entry = pd.DataFrame([[name, acc*100, ll]], columns=log_cols)
    log = log.append(log_entry)
    
print("="*30)

KNeighborsClassifier
****Results****
58.0
Accuracy: 79.8883%
Log Loss: 3.86786681597381
SVC
****Results****
51.0
Accuracy: 79.3296%
Log Loss: 0.5060009828685266
DecisionTreeClassifier
****Results****
63.0
Accuracy: 75.9777%
Log Loss: 8.125994715707684
RandomForestClassifier
****Results****
61.0
Accuracy: 82.6816%
Log Loss: 2.235333590015011
GaussianNB
****Results****
61.0
Accuracy: 78.2123%
Log Loss: 0.6886509770726547


### Applying to the test data

In [400]:
#Full training with full train data
#Pick the most accurate classifier
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.0)
clf = RandomForestClassifier()
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [401]:
#Load the test data

df = pd.read_csv('./data/test.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [402]:
#Same thing... in one cell.
df_new = df.copy()
new_ages = df[['Sex','Age','Pclass']].groupby(['Sex','Pclass']).median()
df_new['Age']=df.apply(\
                   lambda x: new_ages.loc[x['Sex']].loc[x['Pclass']][0] \
                   if pd.isnull(x['Age']) else x['Age'], axis=1)
df_new['Cabin'].fillna(0,inplace=True)
df_new.loc[df_new['Cabin'] != 0, 'Cabin'] = 1
df_new.drop('Name',axis=1, inplace=True)
df_new.drop('Ticket',axis=1, inplace=True)
PassengerIds = df_new.pop('PassengerId')

lb_make = LabelEncoder()
for c in df_new.columns:
    if 'object' == df_new[c].dtype:
        df_new[c] = lb_make.fit_transform(df_new[c].astype(str))
df_new.fillna(0,inplace=True)
scaler = MinMaxScaler() 
scaled_values = scaler.fit_transform(df_new) 
df_new.loc[:,:] = scaled_values
df_new.head()



Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,1.0,1.0,0.452723,0.0,0.0,0.015282,0.0,0.5
1,1.0,0.0,0.617566,0.125,0.0,0.013663,0.0,1.0
2,0.5,1.0,0.815377,0.0,0.0,0.018909,0.0,0.5
3,1.0,1.0,0.353818,0.0,0.0,0.016908,0.0,1.0
4,1.0,0.0,0.287881,0.125,0.111111,0.023984,0.0,1.0


In [403]:
result = clf.predict(df_new)

In [404]:
output = pd.DataFrame(PassengerIds)

In [405]:
output['Survived'] = result.astype(int)

In [406]:
output.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0


In [407]:
output.to_csv('./data/output.csv', index=False)

In [None]:
### Make better 