### Import Packages and Load Data

In [1]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
%matplotlib inline

# Let wide/long frames render in full (up to 500 rows and 500 columns).
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500

In [2]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
from sklearn import set_config
from sklearn.preprocessing import OrdinalEncoder,OneHotEncoder, FunctionTransformer, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score,cross_validate
import xgboost as xgb

In [3]:
# Directory holding the Kaggle Titanic CSVs. Defaults to the original author's
# checkout; set the TITANIC_DATA_DIR environment variable to run elsewhere.
DATA_DIR = Path(os.environ.get(
    'TITANIC_DATA_DIR',
    '/Users/jpmcelyea/GIT_Code/Data-Science/Kaggle-Titanic/titanic',
))
# Labelled training passengers (includes the Survived target).
titanic_training = pd.read_csv(DATA_DIR / 'train.csv')

In [4]:
# Directory holding the Kaggle Titanic CSVs. Defaults to the original author's
# checkout; set the TITANIC_DATA_DIR environment variable to run elsewhere.
DATA_DIR = Path(os.environ.get(
    'TITANIC_DATA_DIR',
    '/Users/jpmcelyea/GIT_Code/Data-Science/Kaggle-Titanic/titanic',
))
# Unlabelled passengers to predict for the submission file.
titanic_test = pd.read_csv(DATA_DIR / 'test.csv')

### Data Preprocessing + Feature Engineering

In [5]:
# Preview the first rows of the raw training frame before any feature engineering.
titanic_training.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [6]:
# Use the passenger id as the row label so it is not treated as a feature.
titanic_training = titanic_training.set_index('PassengerId')

In [7]:
# Split the free-text Cabin field into up to three "<letters><digits>" tokens.
# Rows where no such token is found get NaN in all three columns.
titanic_training[['Cabin1', 'Cabin2', 'Cabin3']] = (
    titanic_training['Cabin']
    .str.extract(r'(?P<Cabin1>[A-Z]+\d+)\s?(?P<Cabin2>[A-Z]+\d+)?\s?(?P<Cabin3>[A-Z]+\d+)?')
)

In [8]:
# The raw text columns are not used as model inputs; Cabin has already been
# split into Cabin1..Cabin3.
titanic_training = titanic_training.drop(columns=['Ticket', 'Name', 'Cabin'])

In [9]:
# Leading letter run of the first cabin token (NaN when there is no token).
titanic_training['CabinLetter'] = (
    titanic_training['Cabin1'].str.extract(r'(?P<Cabin1_Code>[A-Z]+)', expand=False)
)

In [10]:
# Replace each extracted cabin token with a 0/1 "present" flag ...
titanic_training['Cabin1'] = titanic_training['Cabin1'].notna().astype(int)
titanic_training['Cabin2'] = titanic_training['Cabin2'].notna().astype(int)
titanic_training['Cabin3'] = titanic_training['Cabin3'].notna().astype(int)
# ... and count how many cabin tokens (0-3) each passenger has.
titanic_training['CabinNum'] = titanic_training[['Cabin1', 'Cabin2', 'Cabin3']].sum(axis=1)

In [11]:
# 1 when the passenger is listed against more than one cabin, else 0.
titanic_training['MultipleCabins'] = titanic_training['CabinNum'].gt(1).astype(int)

In [12]:
# The intermediate cabin columns were only needed to derive CabinLetter and
# MultipleCabins.
titanic_training = titanic_training.drop(
    columns=['Cabin1', 'Cabin2', 'Cabin3', 'CabinNum']
)

In [13]:
# Preview the training frame after the cabin feature engineering.
titanic_training.head()

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,CabinLetter,MultipleCabins
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,0,3,male,22.0,1,0,7.25,S,,0
2,1,1,female,38.0,1,0,71.2833,C,C,0
3,1,3,female,26.0,0,0,7.925,S,,0
4,1,1,female,35.0,1,0,53.1,S,C,0
5,0,3,male,35.0,0,0,8.05,S,,0


In [14]:
# Number of missing values per column; show only columns that have any.
nullseries = titanic_training.isna().sum()
nullseries.loc[nullseries.gt(0)]

Age            177
Embarked         2
CabinLetter    691
dtype: int64

In [15]:
# Same check expressed as a fraction of all rows.
nullseries = titanic_training.isna().sum().div(len(titanic_training))
nullseries.loc[nullseries.gt(0)]

Age            0.198653
Embarked       0.002245
CabinLetter    0.775533
dtype: float64

In [16]:
# Summary statistics (count, mean, spread, quartiles) for the numeric columns.
titanic_training.describe()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,MultipleCabins
count,891.0,891.0,714.0,891.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208,0.022447
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429,0.148214
min,0.0,1.0,0.42,0.0,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104,0.0
50%,0.0,3.0,28.0,0.0,0.0,14.4542,0.0
75%,1.0,3.0,38.0,1.0,0.0,31.0,0.0
max,1.0,3.0,80.0,8.0,6.0,512.3292,1.0


In [17]:
# Mark the discrete features as pandas categoricals; the model pipelines pick
# their one-hot-encoded columns by this dtype.
titanic_training = titanic_training.astype(
    dict.fromkeys(('Sex', 'Embarked', 'CabinLetter', 'Pclass'), 'category')
)

In [18]:
# Check the column dtypes after the categorical conversion.
titanic_training.dtypes

Survived             int64
Pclass            category
Sex               category
Age                float64
SibSp                int64
Parch                int64
Fare               float64
Embarked          category
CabinLetter       category
MultipleCabins       int64
dtype: object

### Model Selection

In [19]:
# Target vector and feature matrix (every column except the target).
y = titanic_training['Survived']
X = titanic_training.drop(columns=['Survived'])

In [20]:
# Hold out 30% of the labelled rows for a final check; the seed is fixed so the
# split is reproducible.
train_X, test_X, train_y, test_y = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=202,
)

In [21]:
# k-nearest-neighbours baseline: one-hot encode the categorical columns, pass
# the remaining columns through, mean-impute missing values, standardise, and
# classify.
knn_pipeline = Pipeline(steps=[
    ('preprocessing',
     ColumnTransformer(
         transformers=[
             ('dummies',
              # handle_unknown='ignore': a level that was absent from the rows
              # the encoder was fitted on (e.g. a rare level that falls only
              # in a validation fold) is encoded as all zeros instead of
              # raising at transform time.
              OneHotEncoder(handle_unknown='ignore'),
              X.select_dtypes(include=['category']).columns),
         ],
         remainder='passthrough',
         # Always return a dense array. With the default threshold (0.3) the
         # output switches to a sparse matrix when the encoded data is sparse
         # enough, and StandardScaler (which centres by default) rejects
         # sparse input.
         sparse_threshold=0,
     )),
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
    ('model', KNeighborsClassifier()),
])

In [22]:
# 5-fold cross-validated accuracy of the k-NN pipeline on the training split.
scores = cross_val_score(
    knn_pipeline,
    train_X,
    train_y,
    scoring='accuracy',
    cv=5,
)

np.mean(scores)

0.7944774193548387

In [23]:
# Fit on the training split, then report accuracy on the held-out 30%
# (fit returns the pipeline itself, so the calls can be chained).
knn_pipeline.fit(train_X, train_y).score(test_X, test_y)

0.7798507462686567

In [24]:
# Random-forest pipeline: one-hot encode the categorical columns, pass the
# remaining columns through, mean-impute missing values, and classify.
rf_pipeline = Pipeline(steps=[
    ('preprocessing',
     ColumnTransformer(
         transformers=[
             ('dummies',
              # handle_unknown='ignore': a level that was absent from the rows
              # the encoder was fitted on (e.g. a rare level that falls only
              # in a validation fold) is encoded as all zeros instead of
              # raising at transform time.
              OneHotEncoder(handle_unknown='ignore'),
              X.select_dtypes(include=['category']).columns),
         ],
         remainder='passthrough',
     )),
    ('imputer', SimpleImputer(strategy='mean')),
    # Fixed seed so the forest is reproducible between runs.
    ('model', RandomForestClassifier(random_state=123)),
])

In [25]:
# 10-fold cross-validated accuracy of the random-forest pipeline.
scores = cross_val_score(
    rf_pipeline,
    train_X,
    train_y,
    scoring='accuracy',
    cv=10,
)
np.mean(scores)

0.7849462365591398

In [26]:
# Fit on the training split, then report accuracy on the held-out 30%.
rf_pipeline.fit(train_X, train_y).score(test_X, test_y)

0.8022388059701493

In [27]:
# Logistic-regression pipeline. The first level of every categorical column is
# dropped (drop='first') so the dummy columns are not perfectly collinear.
logit_pipeline = Pipeline(steps=[
    ('preprocessing',
     ColumnTransformer(
         transformers=[
             ('dummies',
              OneHotEncoder(drop='first'),
              X.select_dtypes(include=['category']).columns),
         ],
         remainder='passthrough',
     )),
    ('imputer', SimpleImputer(strategy='mean')),
    ('model', LogisticRegression(max_iter=1000, random_state=123)),
])

In [28]:
# 10-fold cross-validated accuracy of the logistic-regression pipeline.
scores = cross_val_score(
    logit_pipeline,
    train_X,
    train_y,
    scoring='accuracy',
    cv=10,
)
np.mean(scores)

0.8202764976958525

In [29]:
# Fit on the training split, then report accuracy on the held-out 30%.
logit_pipeline.fit(train_X, train_y).score(test_X, test_y)

0.7761194029850746

In [30]:
# Gradient-boosted trees. Unlike the other pipelines there is no imputation
# step: the encoded matrix is handed to XGBoost as-is.
xgb_pipeline = Pipeline(steps=[
    ('preprocessing',
     ColumnTransformer(
         transformers=[
             ('dummies',
              OneHotEncoder(drop='first'),
              X.select_dtypes(include=['category']).columns),
         ],
         remainder='passthrough',
     )),
    ('model',
     xgb.XGBClassifier(
         use_label_encoder=False,
         verbosity=0,
         random_state=123,
     )),
])

In [31]:
# 10-fold cross-validated accuracy of the XGBoost pipeline.
scores = cross_val_score(
    xgb_pipeline,
    train_X,
    train_y,
    scoring='accuracy',
    cv=10,
)
np.mean(scores)

0.8090373783922171

In [32]:
# Fit on the training split, then report accuracy on the held-out 30%.
xgb_pipeline.fit(train_X, train_y).score(test_X, test_y)

0.7910447761194029

### Apply the Fitted Models to the Test Data

In [33]:
# Preview the first rows of the raw test frame (no Survived column).
titanic_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [34]:
# Same cabin tokenisation as for the training frame: up to three
# "<letters><digits>" tokens, NaN where none is found.
titanic_test[['Cabin1', 'Cabin2', 'Cabin3']] = (
    titanic_test['Cabin']
    .str.extract(r'(?P<Cabin1>[A-Z]+\d+)\s?(?P<Cabin2>[A-Z]+\d+)?\s?(?P<Cabin3>[A-Z]+\d+)?')
)

In [35]:
# Drop the raw text columns, mirroring the training preprocessing.
titanic_test = titanic_test.drop(columns=['Ticket', 'Name', 'Cabin'])

In [36]:
# Leading letter run of the first cabin token (NaN when there is no token).
titanic_test['CabinLetter'] = (
    titanic_test['Cabin1'].str.extract(r'(?P<Cabin1_Code>[A-Z]+)', expand=False)
)

In [37]:
# Replace each extracted cabin token with a 0/1 "present" flag ...
titanic_test['Cabin1'] = titanic_test['Cabin1'].notna().astype(int)
titanic_test['Cabin2'] = titanic_test['Cabin2'].notna().astype(int)
titanic_test['Cabin3'] = titanic_test['Cabin3'].notna().astype(int)
# ... and count how many cabin tokens (0-3) each passenger has.
titanic_test['CabinNum'] = titanic_test[['Cabin1', 'Cabin2', 'Cabin3']].sum(axis=1)

In [38]:
# 1 when the passenger is listed against more than one cabin, else 0.
titanic_test['MultipleCabins'] = titanic_test['CabinNum'].gt(1).astype(int)

In [39]:
# The intermediate cabin columns were only needed to derive CabinLetter and
# MultipleCabins.
titanic_test = titanic_test.drop(
    columns=['Cabin1', 'Cabin2', 'Cabin3', 'CabinNum']
)

In [40]:
# Give the test frame the same categorical dtypes as the training frame.
titanic_test = titanic_test.astype(
    dict.fromkeys(('Sex', 'Embarked', 'CabinLetter', 'Pclass'), 'category')
)

In [41]:
# Preview the test frame after preprocessing; PassengerId is still a regular
# column here (it was moved to the index only for the training frame).
titanic_test.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,CabinLetter,MultipleCabins
0,892,3,male,34.5,0,0,7.8292,Q,,0
1,893,3,female,47.0,1,0,7.0,S,,0
2,894,2,male,62.0,0,0,9.6875,Q,,0
3,895,3,male,27.0,0,0,8.6625,S,,0
4,896,3,female,22.0,1,1,12.2875,S,,0


In [42]:
# Build the submission frame from the XGBoost predictions. PassengerId is kept
# for the output but removed from the features passed to the pipeline.
final_data = {
    'PassengerId': titanic_test['PassengerId'],
    'Survived': xgb_pipeline.predict(titanic_test.drop(columns=['PassengerId'])),
}
submission = pd.DataFrame(final_data)

In [43]:
# Check the submission layout: one PassengerId and one 0/1 Survived prediction per row.
submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,1
4,896,1


In [53]:
# Write the XGBoost predictions without the positional index column.
submission.to_csv(
    'submission_1.csv',
    index=False,
)

In [52]:
# Share of each outcome in the labelled data, as a reference for the predictions.
titanic_training['Survived'].value_counts().div(len(titanic_training))

0    0.616162
1    0.383838
Name: Survived, dtype: float64

In [50]:
# Share of each predicted outcome in the current submission.
submission['Survived'].value_counts().div(len(submission))

0    0.62201
1    0.37799
Name: Survived, dtype: float64

In [54]:
# (rows, columns) of the submission frame.
submission.shape

(418, 2)

In [60]:
# Rebuild the submission frame, this time from the logistic-regression
# predictions. This overwrites the earlier `submission` variable.
final_data = {
    'PassengerId': titanic_test['PassengerId'],
    'Survived': logit_pipeline.predict(titanic_test.drop(columns=['PassengerId'])),
}
submission = pd.DataFrame(final_data)

In [62]:
# Write the logistic-regression predictions without the positional index column.
submission.to_csv(
    'submission_2.csv',
    index=False,
)

In [61]:
# Share of each predicted outcome in the current submission.
submission['Survived'].value_counts().div(len(submission))

0    0.600478
1    0.399522
Name: Survived, dtype: float64

In [None]:
##Example Submission
#example_submission = pd.read_csv('/Users/jpmcelyea/GIT_Code/Data-Science/Kaggle-Titanic/titanic/gender_submission.csv')
#example_submission.head()