In [482]:
# import statements
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
import sklearn

In [483]:
# read in data
train_df = pd.read_csv("data/train.csv") 
test_df = pd.read_csv("data/test.csv")

In [484]:
# preview the datatest_df.head()
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [485]:
test_df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [486]:
# see info about data
train_df.info()
print("----------------------------")
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
----------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare

In [487]:
# a naive solution, ignoring data cleaning and other useful relationships



In [488]:
# set nan for age to the mean of the age column
train_df['Age'] = train_df.groupby(['Pclass'])['Age'].transform(lambda x: x.fillna(x.mean()))
test_df['Age'] = test_df.groupby(['Pclass'])['Age'].transform(lambda x: x.fillna(x.mean()))

# set nan fare to the mean of the fare column
train_df['Fare'] = train_df.groupby(['Pclass'])['Fare'].transform(lambda x: x.fillna(x.mean()))
test_df['Fare'] = test_df.groupby(['Pclass'])['Fare'].transform(lambda x: x.fillna(x.mean()))

# make nan for sex male since men are more likely to not
# survive, and most dont survive
train_df['Sex'] = train_df.groupby(['Pclass'])['Sex'].transform(lambda x: x.fillna('male'))
test_df['Sex'] = test_df.groupby(['Pclass'])['Sex'].transform(lambda x: x.fillna('male'))

#train_df['Age'] = train_df['Age'].astype(int)
#test_df['Age'] = test_df['Age'].astype(int)

#train_df.loc[train_df['Age'] <= 16, 'Age'] = 0
#train_df.loc[(train_df['Age'] > 16) & (train_df['Age'] <= 32), 'Age'] = 1
#train_df.loc[(train_df['Age'] > 32) & (train_df['Age'] <= 48), 'Age'] = 2
#train_df.loc[(train_df['Age'] > 48) & (train_df['Age'] <= 64), 'Age'] = 3
#train_df.loc[(train_df['Age'] > 64), 'Age'] = 4

#test_df.loc[ test_df['Age'] <= 16, 'Age'] = 0
#test_df.loc[(test_df['Age'] > 16) & (test_df['Age'] <= 32), 'Age'] = 1
#test_df.loc[(test_df['Age'] > 32) & (test_df['Age'] <= 48), 'Age'] = 2
#test_df.loc[(test_df['Age'] > 48) & (test_df['Age'] <= 64), 'Age'] = 3
#test_df.loc[(test_df['Age'] > 64), 'Age'] = 4



In [489]:
# make everything ints
train_df['Age'] = train_df['Age'].astype(int)
test_df['Age'] = test_df['Age'].astype(int)
train_df['Fare'] = train_df['Fare'].astype(int)
# test_df['Fare'] = test_df['Fare'].astype(int)
# set men to 1 and female to 0
train_df['Sex'] = train_df['Sex'].map(lambda s : 1 if s == 'male' else 0)
test_df['Sex'] = test_df['Sex'].map(lambda s : 1 if s == 'male' else 0)

# clean data
test_df1 = test_df.drop(['Ticket','Cabin', 'Embarked', 'Name'], axis=1)
train_df1 = train_df.dropna()
train_df1 = train_df.drop(['Ticket','Cabin', 'Embarked', 'Name', 'PassengerId'], axis=1)
train_df1.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,0,3,1,22,1,0,7
1,1,1,0,38,1,0,71
2,1,3,0,26,0,0,7
3,1,1,0,35,1,0,53
4,0,3,1,35,0,0,8


In [490]:

# define data
X_train = train_df1.drop("Survived",axis=1)
Y_train = train_df1["Survived"]
X_test  = test_df1.drop("PassengerId",axis=1).copy()

In [491]:
# test random forest classifier
random_forest = RandomForestClassifier(n_estimators=100)
#random_forest = RandomForestClassifier(n_estimators=100, criterion='entropy', max_depth=10, max_features='sqrt', min_samples_split=5)

random_forest.fit(X_train, Y_train)

Y_pred = random_forest.predict(X_test)

random_forest.score(X_train, Y_train)

0.95735129068462399

In [492]:
# gradient boost
#gradient_boost = GradientBoostingClassifier(n_estimators=100,loss='exponential',max_features='log2')
gradient_boost = GradientBoostingClassifier(n_estimators=100)
gradient_boost.fit(X_train, Y_train)

Y_pred_2 = gradient_boost.predict(X_test)

gradient_boost.score(X_train, Y_train)

0.89450056116722787

In [493]:
# extra tree
#extra_tree = ExtraTreesClassifier(n_estimators=100,criterion='gini',max_depth=10,max_features='log2',min_samples_split=10)
extra_tree = ExtraTreesClassifier(n_estimators=100)
extra_tree.fit(X_train, Y_train)

Y_pred_3 = extra_tree.predict(X_test)

extra_tree.score(X_train, Y_train)

0.95735129068462399

In [494]:
# kNearest Neighbors
knn = sklearn.neighbors.KNeighborsClassifier(n_neighbors = 7,weights='distance')
knn.fit(X_train, Y_train)
Y_pred_4 = knn.predict(X_test)
acc_knn = round(knn.score(X_train, Y_train) * 100, 2)
acc_knn

95.74

In [495]:
# Ada boost
ada_boost = AdaBoostClassifier(n_estimators=100,algorithm='SAMME')

ada_boost.fit(X_train, Y_train)

Y_pred_5 = ada_boost.predict(X_test)

ada_boost.score(X_train, Y_train)

0.82379349046015715

In [496]:
# Gaussian Naive Bayes

# kNearest Neighbors
knn = sklearn.neighbors.KNeighborsClassifier(n_neighbors = 7,weights='distance')
knn.fit(X_train, Y_train)
Y_pred_4 = knn.predict(X_test)
acc_knn = round(knn.score(X_train, Y_train) * 100, 2)
acc_knn

95.74

In [497]:
# set which alg you want to predict
Y_pred = Y_pred_6

In [498]:
# create a submission CSV with 
submission = pd.DataFrame({
        "PassengerId": test_df["PassengerId"],
        "Survived": Y_pred
    })
submission.to_csv('titanic.csv', index=False, columns=["Survived", "PassengerId"])

In [499]:
# attempt at improving our solution
df = train_df.append(test_df)

df['Age'] = df.groupby(['Pclass'])['Age'].transform(lambda x: x.fillna(x.mean()))

# set nan fare to the mean of the fare column
df['Fare'] = df.groupby(['Pclass'])['Fare'].transform(lambda x: x.fillna(x.mean()))

# set nan fare to the mean of the fare column
df['Survived'] = df.groupby(['Pclass'])['Survived'].transform(lambda x: x.fillna(0))

# make nan for sex male since men are more likely to not
# survive, and most dont survive
df['Sex'] = df.groupby(['Pclass'])['Sex'].transform(lambda x: x.fillna(1))

# make everything ints
df['Age'] = df['Age'].astype(int)

df['Fare'] = df['Fare'].astype(int)

df.head()

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket
0,22,,S,7,"Braund, Mr. Owen Harris",0,1,3,1,1,0.0,A/5 21171
1,38,C85,C,71,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,0,1,1.0,PC 17599
2,26,,S,7,"Heikkinen, Miss. Laina",0,3,3,0,0,1.0,STON/O2. 3101282
3,35,C123,S,53,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,0,1,1.0,113803
4,35,,S,8,"Allen, Mr. William Henry",0,5,3,1,0,0.0,373450


In [500]:
# Family
# Set a family column to a bool.
# True if the passenger has family on the boat, F if not
df['FamilySize'] = df['SibSp'] + df['Parch']
df.head()

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket,FamilySize
0,22,,S,7,"Braund, Mr. Owen Harris",0,1,3,1,1,0.0,A/5 21171,1
1,38,C85,C,71,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,0,1,1.0,PC 17599,1
2,26,,S,7,"Heikkinen, Miss. Laina",0,3,3,0,0,1.0,STON/O2. 3101282,0
3,35,C123,S,53,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,0,1,1.0,113803,1
4,35,,S,8,"Allen, Mr. William Henry",0,5,3,1,0,0.0,373450,0


In [501]:
# split by "." to extract people's titles
df['Title'] = df['Name'].map(lambda name:name.split(',')[1].split('.')[0].strip())
df.head()

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket,FamilySize,Title
0,22,,S,7,"Braund, Mr. Owen Harris",0,1,3,1,1,0.0,A/5 21171,1,Mr
1,38,C85,C,71,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,0,1,1.0,PC 17599,1,Mrs
2,26,,S,7,"Heikkinen, Miss. Laina",0,3,3,0,0,1.0,STON/O2. 3101282,0,Miss
3,35,C123,S,53,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,0,1,1.0,113803,1,Mrs
4,35,,S,8,"Allen, Mr. William Henry",0,5,3,1,0,0.0,373450,0,Mr


In [502]:
# map embark chars to ints
def embark(e):
    if e == "S":
        return 0
    elif e == "C":
        return 1
    elif e == "Q":
        return 2
    else:
        return 3
# map embarked to ints
df['Embarked'] = df['Embarked'].map(lambda e: embark(e))
# create columns for if the passenger is in a family or is alone
df['Alone'] = df['FamilySize'].map(lambda s : 1 if s == 0 else 0)
df['Family'] = df['FamilySize'].map(lambda s : 1 if s>=3 else 0)

# map cabin NaN -> 0 else int representation of the first char in the string
df['Cabin'] = df['Cabin'].map(lambda c : 0 if pd.isnull(c) else ord(str(c[0])))

# take all of the titles and make seperate columns for each
titles_feature = pd.get_dummies(df['Title'],prefix='Title')
df = pd.concat([df,titles_feature],axis=1)

# drop columns we wont use anymore
df = df.drop(['Name','Ticket', 'Title'], axis=1)

# df.info()
df.head()

Unnamed: 0,Age,Cabin,Embarked,Fare,Parch,PassengerId,Pclass,Sex,SibSp,Survived,...,Title_Master,Title_Miss,Title_Mlle,Title_Mme,Title_Mr,Title_Mrs,Title_Ms,Title_Rev,Title_Sir,Title_the Countess
0,22,0,0,7,0,1,3,1,1,0.0,...,0,0,0,0,1,0,0,0,0,0
1,38,67,1,71,0,2,1,0,1,1.0,...,0,0,0,0,0,1,0,0,0,0
2,26,0,0,7,0,3,3,0,0,1.0,...,0,1,0,0,0,0,0,0,0,0
3,35,67,0,53,0,4,1,0,1,1.0,...,0,0,0,0,0,1,0,0,0,0
4,35,0,0,8,0,5,3,1,0,0.0,...,0,0,0,0,1,0,0,0,0,0


In [503]:
# partition data into train and test data
# (previously I merged the data so we would keep the same column #)
train = df[0:890]
test = df[891:1309]

In [504]:
# drop PassengerId and Survived
X_train = train.drop(["Survived"],axis=1)
Y_train = train["Survived"]
X_test  = test.drop(["Survived"],axis=1).copy()


In [505]:
# re-test Ada boost and create the .csv file for submission 
# This gets 78.5%
ada_boost = AdaBoostClassifier(n_estimators=100,algorithm='SAMME')

ada_boost.fit(X_train, Y_train)

Y_pred_5 = ada_boost.predict(X_test)

submission = pd.DataFrame({
    "Survived": Y_pred_5,
    "PassengerId": test["PassengerId"]
    })
submission.to_csv('titanic.csv', index=False)

In [471]:
# An attempt at a different solution using Stratified K-Fold to test classifiers.
# Did not majorly improve my results; got 77%
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
clf = ExtraTreesClassifier(n_estimators=200)
clf = clf.fit(X_train, Y_train)

In [472]:
# show the features of the clf and their importances
# importance means how important that feature is to predicting properly
features = pd.DataFrame()
features['feature'] = X_train.columns
features['importance'] = clf.feature_importances_
features.sort_values(['importance'],ascending=False)


Unnamed: 0,feature,importance
5,PassengerId,0.166031
0,Age,0.140473
7,Sex,0.118449
24,Title_Mr,0.111767
3,Fare,0.111116
6,Pclass,0.066435
1,Cabin,0.05872
25,Title_Mrs,0.03854
21,Title_Miss,0.036789
2,Embarked,0.031602


In [473]:
# create training models
model = SelectFromModel(clf, prefit=True)
training = model.transform(X_train)
training.shape

(890, 9)

In [474]:
# make sure all values in df are non-NaN.
X_test = X_test.fillna(method='ffill')

#create testing models
testing = model.transform(X_test)
testing.shape


(418, 9)

In [475]:
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.cross_validation import StratifiedKFold
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble.gradient_boosting import GradientBoostingClassifier

# Use grid-search to classify with 5-folds. 
# Not 100% on all of this code, it works but confusing.
forest = RandomForestClassifier(max_features='sqrt')

parameter_grid = {'max_depth':[5],'n_estimators': [220],'criterion': ['gini','entropy']}

grid_search = GridSearchCV(forest,
                           param_grid=parameter_grid,
                           cv=StratifiedKFold(Y_train, n_folds=5))

grid_search.fit(training, Y_train)

# print best fold score.
print('Best score: {}'.format(grid_search.best_score_))
print('Best parameters: {}'.format(grid_search.best_params_))

pipeline = grid_search
output = pipeline.predict(testing).astype(int)
df_output = pd.DataFrame()
df_output['PassengerId'] = test['PassengerId']
df_output['Survived'] = output
df_output[['PassengerId','Survived']].to_csv('titanic-k-fold.csv',index=False)

Best score: 0.803370786517
Best parameters: {'n_estimators': 220, 'criterion': 'entropy', 'max_depth': 5}
