In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Machine Learning Imports
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split

# For evaluating our ML results
from sklearn import metrics



In [2]:
titanic_df = pd.read_csv('train.csv')
titanic_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.05,,S


In [3]:
titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 90.5+ KB


In [4]:
# filling Age for passengers for whom Age is not available
titanic_df['Age'].fillna(titanic_df['Age'].median(),inplace='true')
titanic_df['Embarked'].fillna('S',inplace='true')

In [5]:
titanic_df['Family_Count'] = titanic_df['SibSp'] + titanic_df['Parch']
titanic_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Family_Count
0,1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C85,C,1
2,3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.925,,S,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1,C123,S,1
4,5,0,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.05,,S,0


In [6]:
def sex_to_gender(sex):
    if sex == 'male':
        return 0
    else:
        return 1

def cabin_to_deck(cabin):
    if str(cabin) == 'NA':
        return 'NA'
    else:
        return cabin[0]

In [7]:
titanic_df['Gender'] = titanic_df['Sex'].apply(sex_to_gender)
# titanic_df['Gender'] = map(sex_to_gender,titanic_df['Sex'])
titanic_df['Cabin'].fillna('NA', inplace =True)
#titanic_df.drop('Deck',axis=1,inplace =True)
titanic_df['Deck'] = map(cabin_to_deck,titanic_df['Cabin'])
titanic_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Family_Count,Gender,Deck
0,1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S,1,0,
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C85,C,1,1,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.925,,S,0,1,
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1,C123,S,1,1,C
4,5,0,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.05,,S,0,0,


In [8]:
# For logistic regression, we'll need to create dummies
dummy_pclass = pd.get_dummies(titanic_df['Pclass'])
dummy_embarked = pd.get_dummies(titanic_df['Embarked'])
#dummy_gender = pd.get_dummies(titanic_df['Gender'])
dummy_family = pd.get_dummies(titanic_df['Family_Count'])
dummy_deck = pd.get_dummies(titanic_df['Deck'])

# Create column names for the new DataFrames
dummy_pclass.columns = ['class1','class2','class3']
#dummy_gender.columns = ['male','female']
dummy_family.columns = ['member0','member1','member2','member3','member4','member5','member6','member7','member10']
dummy_deck.columns = ['deckA','deckB','deckC','deckD','deckE','deckF','deckG','deck_NA','deckT']

# Combining all the dummies
# dummies = pd.concat([dummy_embarked,dummy_pclass,dummy_gender,dummy_family,dummy_deck],axis=1)
dummies = pd.concat([dummy_embarked,dummy_pclass,dummy_family,dummy_deck],axis=1)
dummies.head()

Unnamed: 0,C,Q,S,class1,class2,class3,member0,member1,member2,member3,...,member10,deckA,deckB,deckC,deckD,deckE,deckF,deckG,deck_NA,deckT
0,0,0,1,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
1,1,0,0,1,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
2,0,0,1,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,0,0,1,1,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
4,0,0,1,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [9]:
dummies.columns

Index([u'C', u'Q', u'S', u'class1', u'class2', u'class3', u'member0',
       u'member1', u'member2', u'member3', u'member4', u'member5', u'member6',
       u'member7', u'member10', u'deckA', u'deckB', u'deckC', u'deckD',
       u'deckE', u'deckF', u'deckG', u'deck_NA', u'deckT'],
      dtype='object')

In [9]:
dummies.drop(['deckA','deckF','deckG','deckT','member3','member4','member5','member6','member7','member10'], axis =1, inplace= True)

In [10]:
dummies.head()

Unnamed: 0,C,Q,S,class1,class2,class3,member0,member1,member2,deckB,deckC,deckD,deckE,deck_NA
0,0,0,1,0,0,1,0,1,0,0,0,0,0,1
1,1,0,0,1,0,0,0,1,0,0,1,0,0,0
2,0,0,1,0,0,1,1,0,0,0,0,0,0,1
3,0,0,1,1,0,0,0,1,0,0,1,0,0,0
4,0,0,1,0,0,1,1,0,0,0,0,0,0,1


In [11]:
titanic_df.head(1)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Family_Count,Gender,Deck
0,1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S,1,0,


In [29]:
# Other than the variables for which dummies are created; drop Name, Parch, SibSp, Ticket and Cabin from X
X = titanic_df.drop(['Name','Survived','PassengerId','Parch','SibSp','Ticket','Cabin','Deck','Pclass','Embarked','Sex','Family_Count'],axis=1)
X.head()

Unnamed: 0,Age,Fare,Gender
0,22,7.25,0
1,38,71.2833,1
2,26,7.925,1
3,35,53.1,1
4,35,8.05,0


In [30]:
# Now combining all features and dummies
X = pd.concat([X,dummies],axis=1)
X.head()

Unnamed: 0,Age,Fare,Gender,C,Q,S,class1,class2,class3,member0,member1,member2,deckB,deckC,deckD,deckE,deck_NA
0,22,7.25,0,0,0,1,0,0,1,0,1,0,0,0,0,0,1
1,38,71.2833,1,1,0,0,1,0,0,0,1,0,0,1,0,0,0
2,26,7.925,1,0,0,1,0,0,1,1,0,0,0,0,0,0,1
3,35,53.1,1,0,0,1,1,0,0,0,1,0,0,1,0,0,0
4,35,8.05,0,0,0,1,0,0,1,1,0,0,0,0,0,0,1


In [31]:
# Now setting up the target

Y = titanic_df['Survived']
Y.head()

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

In [32]:
# Flatten array
Y = np.ravel(Y)

# Check result
Y[:10]

array([0, 1, 1, 1, 0, 0, 0, 0, 1, 1], dtype=int64)

In [33]:
# Split the data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y)

# Make a new log_model
log_model = LogisticRegression()

# Now fit the new model
log_model.fit(X_train, Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [34]:
# Predict the classes of the testing data set
class_predict = log_model.predict(X_test)

# Compare the predicted classes to the actual test classes
print metrics.accuracy_score(Y_test,class_predict)

0.816143497758


In [44]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 223 entries, 66 to 51
Data columns (total 17 columns):
Age        223 non-null float64
Fare       223 non-null float64
Gender     223 non-null int64
C          223 non-null float64
Q          223 non-null float64
S          223 non-null float64
class1     223 non-null float64
class2     223 non-null float64
class3     223 non-null float64
member0    223 non-null float64
member1    223 non-null float64
member2    223 non-null float64
deckB      223 non-null float64
deckC      223 non-null float64
deckD      223 non-null float64
deckE      223 non-null float64
deck_NA    223 non-null float64
dtypes: float64(16), int64(1)
memory usage: 31.4 KB


In [35]:
# Use zip to bring the column names and the np.transpose function to bring together the coefficients from the model
coeff_df = DataFrame(zip(X.columns, np.transpose(log_model.coef_)))
coeff_df

Unnamed: 0,0,1
0,Age,[-0.0356036168155]
1,Fare,[0.00444757033199]
2,Gender,[2.3566912916]
3,C,[0.212544930727]
4,Q,[-0.0252150217764]
5,S,[-0.400583399802]
6,class1,[0.380583782415]
7,class2,[0.200522006442]
8,class3,[-0.794359279708]
9,member0,[0.871322615107]


In [36]:
titanic_test_df = pd.read_csv('test.csv')
titanic_test_df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [37]:
titanic_test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 39.2+ KB


In [38]:
titanic_test_df['Age'].fillna(titanic_test_df['Age'].median(),inplace='true')
titanic_test_df['Fare'].fillna(titanic_test_df['Fare'].median(),inplace='true')

In [39]:
titanic_test_df['Gender'] = titanic_test_df['Sex'].apply(sex_to_gender)
# titanic_df['Gender'] = map(sex_to_gender,titanic_df['Sex'])
titanic_test_df['Cabin'].fillna('NA', inplace =True)
#titanic_df.drop('Deck',axis=1,inplace =True)
titanic_test_df['Deck'] = map(cabin_to_deck,titanic_test_df['Cabin'])
titanic_test_df['Family_Count'] = titanic_test_df['SibSp'] + titanic_test_df['Parch']
# titanic_df['Family'] = map(family_or_not,titanic_df['Family_Count'])
titanic_test_df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Gender,Deck,Family_Count
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,0,,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,1,,1
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,0,,0
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,0,,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,1,,2


In [40]:
# creating dummies to make features similar to training set
test_dummy_pclass = pd.get_dummies(titanic_test_df['Pclass'])
test_dummy_embarked = pd.get_dummies(titanic_test_df['Embarked'])
#test_dummy_gender = pd.get_dummies(titanic_test_df['Gender'])
test_dummy_family = pd.get_dummies(titanic_test_df['Family_Count'])
test_dummy_deck = pd.get_dummies(titanic_test_df['Deck'])

# Create column names for the new DataFrames
test_dummy_pclass.columns = ['class1','class2','class3']
#test_dummy_gender.columns = ['male','female']
test_dummy_family.columns = ['member0','member1','member2','member3','member4','member5','member6','member7','member10']
test_dummy_deck.columns = ['deckA','deckB','deckC','deckD','deckE','deckF','deckG','deck_NA']

# Combining all the dummies
# test_dummies = pd.concat([test_dummy_embarked,test_dummy_pclass,test_dummy_gender,test_dummy_family,test_dummy_deck],axis=1)
test_dummies = pd.concat([test_dummy_embarked,test_dummy_pclass,test_dummy_family,test_dummy_deck],axis=1)

# Dropping some columns to avoid multi-collinearity
test_dummies.drop(['deckA','deckF','deckG','member3','member4','member5','member6','member7','member10'], axis =1, inplace= True)

test_dummies.head()

Unnamed: 0,C,Q,S,class1,class2,class3,member0,member1,member2,deckB,deckC,deckD,deckE,deck_NA
0,0,1,0,0,0,1,1,0,0,0,0,0,0,1
1,0,0,1,0,0,1,0,1,0,0,0,0,0,1
2,0,1,0,0,1,0,1,0,0,0,0,0,0,1
3,0,0,1,0,0,1,1,0,0,0,0,0,0,1
4,0,0,1,0,0,1,0,0,1,0,0,0,0,1


In [41]:
X_Test2 = titanic_test_df.drop(['Name','PassengerId','Parch','SibSp','Ticket','Cabin','Deck','Pclass','Embarked','Sex','Family_Count'],axis=1)

In [42]:
X_Test2 = pd.concat([X_Test2,test_dummies],axis=1)
X_Test2.head()

Unnamed: 0,Age,Fare,Gender,C,Q,S,class1,class2,class3,member0,member1,member2,deckB,deckC,deckD,deckE,deck_NA
0,34.5,7.8292,0,0,1,0,0,0,1,1,0,0,0,0,0,0,1
1,47.0,7.0,1,0,0,1,0,0,1,0,1,0,0,0,0,0,1
2,62.0,9.6875,0,0,1,0,0,1,0,1,0,0,0,0,0,0,1
3,27.0,8.6625,0,0,0,1,0,0,1,1,0,0,0,0,0,0,1
4,22.0,12.2875,1,0,0,1,0,0,1,0,0,1,0,0,0,0,1


In [45]:
# Predict the classes of the testing data set
class_predict3 = log_model.predict(X_Test2)

In [46]:
Y_Predicted4 = Series(class_predict3)

In [47]:
Y_Predicted4.shape

(418L,)

In [48]:
final_prediction = titanic_test_df['PassengerId']
final_prediction = pd.concat([final_prediction,Y_Predicted4],axis=1)
final_prediction.columns=['PassengerId','Survived']
final_prediction.tail()

Unnamed: 0,PassengerId,Survived
413,1305,0
414,1306,1
415,1307,0
416,1308,0
417,1309,0


In [49]:
final_prediction['Survived'].value_counts()

0    264
1    154
Name: Survived, dtype: int64

In [50]:
final_prediction.to_csv('final_prediction_4.csv',index=False)