In [1]:
import pandas as pd
import sklearn 
import numpy as np
import os,csv

In [157]:
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score,cross_validate, train_test_split
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
data_path = os.path.join(os.environ['HOME'],'Documents/Random/ml-revision/titanic')

In [7]:
training_data = pd.read_csv(os.path.join(data_path,'train.csv'))

In [13]:
test_data = pd.read_csv(os.path.join(data_path,'test.csv'))

In [8]:
training_data.head(30)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [9]:
training_data.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [14]:
test_data.columns

Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [16]:
test_data.head(20)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
5,897,3,"Svensson, Mr. Johan Cervin",male,14.0,0,0,7538,9.225,,S
6,898,3,"Connolly, Miss. Kate",female,30.0,0,0,330972,7.6292,,Q
7,899,2,"Caldwell, Mr. Albert Francis",male,26.0,1,1,248738,29.0,,S
8,900,3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",female,18.0,0,0,2657,7.2292,,C
9,901,3,"Davies, Mr. John Samuel",male,21.0,2,0,A/4 48871,24.15,,S


In [17]:
test_data.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [18]:
training_data.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Embarked         2
dtype: int64

In [40]:
#Join the two dataframes before removing NaNs and doing train test split:
dataset = pd.concat([training_data,test_data], sort = True)

In [41]:
dataset.head()

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket
0,22.0,,S,,"Braund, Mr. Owen Harris",0,1,3,male,1,0.0,A/5 21171
1,38.0,,C,,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,female,1,1.0,PC 17599
2,26.0,,S,,"Heikkinen, Miss. Laina",0,3,3,female,0,1.0,STON/O2. 3101282
3,35.0,,S,,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,female,1,1.0,113803
4,35.0,,S,,"Allen, Mr. William Henry",0,5,3,male,0,0.0,373450


In [42]:
dataset.isna().sum()

Age              86
Cabin          1039
Embarked          0
Fare            713
Name              0
Parch             0
PassengerId       0
Pclass            0
Sex               0
SibSp             0
Survived        418
Ticket            0
dtype: int64

In [72]:
#Remove rows with NaNs:
#Drop columns Cabin and Fare 
dataset.drop(['Cabin','Fare'], axis = 1, inplace = True)


In [75]:
#Now, drop rows with any NaN present
filtered_dataset = dataset.dropna() #By default, drops rows with any NaN values present

In [80]:
df = filtered_dataset.copy()
df['Last Name'] = df['Name'].apply(lambda x: x.split(',')[0])

In [88]:
df['Title'] = df['Name'].apply(lambda x: x.split(',')[1].split('.')[0])

In [133]:
df['Survived'] = df['Survived'].apply(lambda x: int(x))

In [134]:
#Performing feature selection and training of classifier as part of pipeline(to avoid "data leakage"):

df.head()

Unnamed: 0,Age,Embarked,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket,Last Name,Title
0,22.0,S,0,1,3,male,1,0,A/5 21171,Braund,Mr
1,38.0,C,0,2,1,female,1,1,PC 17599,Cumings,Mrs
2,26.0,S,0,3,3,female,0,1,STON/O2. 3101282,Heikkinen,Miss
3,35.0,S,0,4,1,female,1,1,113803,Futrelle,Mrs
4,35.0,S,0,5,3,male,0,0,373450,Allen,Mr


In [135]:
#Drop Name column:
df.drop(['Name'], axis = 1, inplace = True)

KeyError: "['Name'] not found in axis"

In [150]:
#Creating model with only categorical fields:
df_new = df[['Embarked','Pclass','Sex','SibSp','Title','Survived']].copy()

In [151]:
df_encoded = df_new.apply(LabelEncoder().fit_transform)

In [152]:
df_encoded

Unnamed: 0,Embarked,Pclass,Sex,SibSp,Title,Survived
0,2,2,1,1,11,0
1,0,0,0,1,12,1
2,2,2,0,0,8,1
3,2,0,0,1,12,1
4,2,2,1,0,11,0
...,...,...,...,...,...,...
885,1,2,0,0,12,0
886,2,1,1,0,14,0
887,2,0,0,0,8,1
889,0,0,1,0,11,1


In [153]:
y = df_encoded.pop('Survived')
X = df_encoded

In [154]:
#Perform train-test split:
X_train,X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2)

In [155]:
#Creating pipeline:
#The pipeline steps are as follows:
#1. Choose top 3 of the 5 features using chi-square test
#2. Fit logistic regression classifier
clf = make_pipeline(SelectKBest(chi2, k=3),
                    LogisticRegression())

In [156]:
cross_val_score(clf,X_train,y_train, cv = 5).mean()



0.7734745482405513

In [158]:
lr = LogisticRegression()
lr.fit(X_train,y_train)
# select the same features on the test set, predict, and get the test accuracy:
#X_test_selected = selector.transform(X_test)
y_pred = lr.predict(X_test)
accuracy_score(y_test, y_pred)



0.7762237762237763