### Import Packages and Load Data

In [1]:
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

In [2]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
from sklearn import set_config
from sklearn.preprocessing import OrdinalEncoder,OneHotEncoder, FunctionTransformer, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score,cross_validate
from sklearn.ensemble import VotingClassifier
import xgboost as xgb

In [3]:
titanic_training = pd.read_csv('/Users/jpmcelyea/GIT_Code/Data-Science/Kaggle-Titanic/titanic/train.csv')

In [4]:
titanic_test = pd.read_csv('/Users/jpmcelyea/GIT_Code/Data-Science/Kaggle-Titanic/titanic/test.csv')

### Data Preprocessing + Feature Engineering

In [5]:
titanic_training.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [6]:
##Preprocessing and Feature Engineering

titanic_training.set_index("PassengerId", inplace=True)

titanic_training[['Cabin1', 'Cabin2', 'Cabin3']] = titanic_training['Cabin'].\
                str.extract(r'(?P<Cabin1>[A-Z]+\d+)\s?(?P<Cabin2>[A-Z]+\d+)?\s?(?P<Cabin3>[A-Z]+\d+)?')

titanic_training['CabinLetter'] = titanic_training['Cabin1'].str.extract(r'(?P<Cabin1_Code>[A-Z]+)')

titanic_training['Cabin1'] = np.where(titanic_training['Cabin1'].isnull(), 0, 1)
titanic_training['Cabin2'] = np.where(titanic_training['Cabin2'].isnull(), 0, 1)
titanic_training['Cabin3'] = np.where(titanic_training['Cabin3'].isnull(), 0, 1)
titanic_training['CabinNum'] = titanic_training['Cabin1'] + titanic_training['Cabin2'] + titanic_training['Cabin3']

titanic_training['CabinLetter'] = titanic_training['CabinLetter'].fillna(value = 'N')

titanic_training[['TicketLetter', 'TicketNum']] = titanic_training['Ticket'].str.split(" ", 1, expand=True)

titanic_training[['TicketLetter', 'TicketNum']] = titanic_training['Ticket'].\
                str.extract(r'([\w\/\.]+)\s(\d+)')

titanic_training['TicketSpecial'] = np.where(pd.isna(titanic_training['TicketLetter']), 0 ,1)

titanic_training[['NameTitle']] = titanic_training['Name'].\
                str.extract(r'((?<=\,\s)[\w]+)')

titanic_training['NameTitle'] = np.where(titanic_training['NameTitle'].\
                                         isin(['Mr', 'Mrs', 'Miss']), 'Standard', 'Elite')

for column in ('Sex', 'Embarked','CabinLetter','Pclass', 'NameTitle'):
    titanic_training[column] = titanic_training[column].astype('category')
    
titanic_training.\
drop(columns=['Cabin1', 'Cabin2','Cabin3','Ticket', 'Name','Cabin','TicketLetter', 'TicketNum'], inplace = True)

In [7]:
nullseries = titanic_training.isnull().sum()
print(nullseries[nullseries > 0])

nullseries = titanic_training.isnull().sum()/len(titanic_training)
print(nullseries[nullseries > 0])

Age         177
Embarked      2
dtype: int64
Age         0.198653
Embarked    0.002245
dtype: float64


In [8]:
#titanic_training.describe()
#titanic_training.dtypes

In [9]:
titanic_training.head()

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,CabinLetter,CabinNum,TicketSpecial,NameTitle
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,0,3,male,22.0,1,0,7.25,S,N,0,1,Standard
2,1,1,female,38.0,1,0,71.2833,C,C,1,1,Standard
3,1,3,female,26.0,0,0,7.925,S,N,0,1,Standard
4,1,1,female,35.0,1,0,53.1,S,C,1,0,Standard
5,0,3,male,35.0,0,0,8.05,S,N,0,0,Standard


### Model Selection

In [10]:
X = titanic_training.drop('Survived', axis=1)
y = titanic_training['Survived']

In [11]:
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.3, random_state = 202)

In [12]:
knn_pipeline = Pipeline(steps =[
    ['preprocessing', 
     ColumnTransformer(remainder='passthrough',
     transformers=[
             ('imputer', SimpleImputer(strategy='mean'), ['Age', 'Fare']),
             ('dummies', OneHotEncoder(), X.select_dtypes(include=['category']).columns)
              ])]
    ,('scaler', StandardScaler())
    ,('model', KNeighborsClassifier())
    ])

In [13]:
scores = cross_val_score(knn_pipeline, train_X, train_y, 
                        scoring=('accuracy'), cv=5)

scores.mean()

0.8137419354838709

In [14]:
knn_pipeline.fit(train_X, train_y)
knn_pipeline.score(test_X, test_y)

0.7723880597014925

In [15]:
rf_pipeline = Pipeline(steps =[
    ['preprocessing', 
     ColumnTransformer(remainder='passthrough',
     transformers=[
             ('imputer', SimpleImputer(strategy='mean'), ['Age', 'Fare']),
             ('dummies', OneHotEncoder(), X.select_dtypes(include=['category']).columns)
              ])]
    ,('model', RandomForestClassifier(random_state =123))
    ])

In [16]:
scores = cross_val_score(rf_pipeline, train_X, train_y, 
                        scoring=('accuracy'), cv=10)
scores.mean()

0.8057091653865847

In [17]:
rf_pipeline.fit(train_X, train_y)
rf_pipeline.score(test_X, test_y)

0.8059701492537313

In [18]:
logit_pipeline = Pipeline(steps =[
    ['preprocessing', 
     ColumnTransformer(remainder='passthrough',
     transformers=[
             ('imputer', SimpleImputer(strategy='mean'), ['Age', 'Fare']),
             ('dummies', OneHotEncoder(), X.select_dtypes(include=['category']).columns)
              ])]
    ,('model', LogisticRegression(max_iter=1000,random_state=123))
    ])

In [19]:
scores = cross_val_score(logit_pipeline, train_X, train_y, 
                        scoring=('accuracy'), cv=10)
scores.mean()

0.8299027137736814

In [20]:
logit_pipeline.fit(train_X, train_y)
logit_pipeline.score(test_X, test_y)

0.7873134328358209

In [21]:
xgb_pipeline = Pipeline(steps =[
    ['preprocessing', 
     ColumnTransformer(remainder='passthrough',
     transformers=[(
             'dummies', OneHotEncoder(drop='first'), X.select_dtypes(include=['category']).columns)
              ])]
    ,('model', xgb.XGBClassifier(use_label_encoder=False, verbosity=0, random_state=123))
    ])

In [22]:
scores = cross_val_score(xgb_pipeline, train_X, train_y, 
                        scoring=('accuracy'), cv=10)
scores.mean()

0.8042754736303124

In [23]:
xgb_pipeline.fit(train_X, train_y)
xgb_pipeline.score(test_X, test_y)

0.8059701492537313

### Tune the Model

In [24]:
param_grid = {'model__n_neighbors': list(range(1, 10))}

In [25]:
grid_knn = GridSearchCV(
    estimator=knn_pipeline,
    param_grid=param_grid,
    n_jobs = -1,
    scoring = 'accuracy',
    cv=10)

grid_knn.fit(train_X, train_y);

In [26]:
print(grid_knn.best_score_)
print(grid_knn.best_params_)
knn_model_best = grid_knn.best_estimator_;
print(knn_model_best.score(test_X, test_y))

0.8299283154121863
{'model__n_neighbors': 5}
0.7723880597014925


In [27]:
param_grid = {
'model__n_estimators': [100,350,500],
'model__max_features': ['log2','auto','sqrt'],
'model__min_samples_leaf': list(range(2, 30)),
'model__max_depth': list(range(1, 10))
    
}

In [28]:
random_rf = RandomizedSearchCV(
        estimator=rf_pipeline,
        param_distributions = param_grid, 
        n_iter = 25,
        n_jobs= -1, 
        scoring='accuracy', 
        cv = 10)

random_rf.fit(train_X, train_y);

In [29]:
print(random_rf.best_score_)
print(random_rf.best_params_)
random_rf_model_best = random_rf.best_estimator_;
print(random_rf_model_best.score(test_X, test_y))

0.8219150025601639
{'model__n_estimators': 500, 'model__min_samples_leaf': 3, 'model__max_features': 'auto', 'model__max_depth': 7}
0.7761194029850746


In [30]:
param_grid = {
'model__n_estimators': [25,50,100],
'model__max_features': ['sqrt'],
'model__min_samples_leaf': [2,3,4,5,6,7,8],
'model__max_depth': [4,5,6]
}

In [31]:
grid_rf = GridSearchCV(
    estimator=rf_pipeline,
    param_grid=param_grid,
    n_jobs = -1,
    scoring = 'accuracy',
    cv=10)

grid_rf.fit(train_X, train_y);

In [32]:
print(grid_rf.best_score_)
print(grid_rf.best_params_)
rf_model_best = grid_rf.best_estimator_;
print(rf_model_best.score(test_X, test_y))

0.8347414234511008
{'model__max_depth': 4, 'model__max_features': 'sqrt', 'model__min_samples_leaf': 3, 'model__n_estimators': 50}
0.7873134328358209


In [33]:
param_grid = {'model__penalty' : ['l1', 'l2'],
    'model__C' : np.logspace(-4, 4, 20),
    'model__solver' : ['liblinear']}

In [34]:
grid_logit = GridSearchCV(
    estimator=logit_pipeline,
    param_grid=param_grid,
    n_jobs = -1,
    scoring = 'accuracy',
    cv=10)

grid_logit.fit(train_X, train_y);

In [35]:
print(grid_logit.best_score_)
print(grid_logit.best_params_)
logit_model_best = grid_logit.best_estimator_;
print(logit_model_best.score(test_X, test_y))

0.8331541218637992
{'model__C': 4.281332398719396, 'model__penalty': 'l1', 'model__solver': 'liblinear'}
0.7761194029850746


In [36]:
param_grid = {'model__learning_rate': np.arange(0.05, 1, 0.05),
    'model__max_depth': np.arange(3, 10, 1),
    'model__n_estimators': np.arange(50, 200, 50)}

In [37]:
random_xgb = RandomizedSearchCV(
        estimator=xgb_pipeline,
        param_distributions = param_grid, 
        n_iter = 25,
        n_jobs= -1, 
        scoring='accuracy', 
        cv = 10)

random_xgb.fit(train_X, train_y);

In [38]:
print(random_xgb.best_score_)
print(random_xgb.best_params_)
random_xgb_model_best = random_xgb.best_estimator_;
print(random_xgb_model_best.score(test_X, test_y))

0.8363543266769072
{'model__n_estimators': 100, 'model__max_depth': 3, 'model__learning_rate': 0.1}
0.8097014925373134


In [39]:
param_grid = {'model__learning_rate': [0.20, 0.25,0.40],
    'model__max_depth': [2,3,4,5],
    'model__n_estimators': [20,30,50,100]}

In [40]:
grid_xgb = GridSearchCV(
    estimator=xgb_pipeline,
    param_grid=param_grid,
    n_jobs = -1,
    scoring = 'accuracy',
    cv=10)

grid_xgb.fit(train_X, train_y);

In [41]:
print(grid_xgb.best_score_)
print(grid_xgb.best_params_)
xgb_model_best = grid_xgb.best_estimator_;
print(xgb_model_best.score(test_X, test_y))

0.8475934459805428
{'model__learning_rate': 0.2, 'model__max_depth': 2, 'model__n_estimators': 100}
0.8097014925373134


### Create Ensemble Model

In [42]:
ensemble = VotingClassifier(estimators=[
        ('knn', knn_model_best), 
        ('lr', logit_model_best), 
        ('rf', rf_model_best), 
        ('xgb', xgb_model_best)],
        voting='soft')

In [43]:
ensemble.fit(train_X, train_y);

In [44]:
scores = cross_val_score(ensemble, train_X, train_y, 
                        scoring=('accuracy'), cv=10)
scores.mean()

0.8459805427547362

In [45]:
ensemble.score(test_X, test_y)

0.7947761194029851

### Fit Model To Test Data

In [46]:
titanic_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [47]:
##Preprocessing and Feature Engineering

titanic_test[['Cabin1', 'Cabin2', 'Cabin3']] = titanic_test['Cabin'].\
                str.extract(r'(?P<Cabin1>[A-Z]+\d+)\s?(?P<Cabin2>[A-Z]+\d+)?\s?(?P<Cabin3>[A-Z]+\d+)?')

titanic_test['CabinLetter'] = titanic_test['Cabin1'].str.extract(r'(?P<Cabin1_Code>[A-Z]+)')

titanic_test['Cabin1'] = np.where(titanic_test['Cabin1'].isnull(), 0, 1)
titanic_test['Cabin2'] = np.where(titanic_test['Cabin2'].isnull(), 0, 1)
titanic_test['Cabin3'] = np.where(titanic_test['Cabin3'].isnull(), 0, 1)
titanic_test['CabinNum'] = titanic_test['Cabin1'] + titanic_test['Cabin2'] + titanic_test['Cabin3']

titanic_test['CabinLetter'] = titanic_test['CabinLetter'].fillna(value = 'N')

titanic_test[['TicketLetter', 'TicketNum']] = titanic_test['Ticket'].str.split(" ", 1, expand=True)

titanic_test[['TicketLetter', 'TicketNum']] = titanic_test['Ticket'].\
                str.extract(r'([\w\/\.]+)\s(\d+)')

titanic_test['TicketSpecial'] = np.where(pd.isna(titanic_test['TicketLetter']), 0 ,1)

titanic_test[['NameTitle']] = titanic_test['Name'].\
                str.extract(r'((?<=\,\s)[\w]+)')

titanic_test['NameTitle'] = np.where(titanic_test['NameTitle'].\
                                         isin(['Mr', 'Mrs', 'Miss']), 'Standard', 'Elite')

for column in ('Sex', 'Embarked','CabinLetter','Pclass', 'NameTitle'):
    titanic_test[column] = titanic_test[column].astype('category')
    
titanic_test.\
drop(columns=['Cabin1', 'Cabin2','Cabin3','Ticket', 'Name','Cabin','TicketLetter', 'TicketNum'], inplace = True)

In [48]:
titanic_test.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,CabinLetter,CabinNum,TicketSpecial,NameTitle
0,892,3,male,34.5,0,0,7.8292,Q,N,0,0,Standard
1,893,3,female,47.0,1,0,7.0,S,N,0,0,Standard
2,894,2,male,62.0,0,0,9.6875,Q,N,0,0,Standard
3,895,3,male,27.0,0,0,8.6625,S,N,0,0,Standard
4,896,3,female,22.0,1,1,12.2875,S,N,0,0,Standard


### Final Submission

In [54]:
#convert output to dataframe 
final_data = {'PassengerId': titanic_test.PassengerId, 'Survived': xgb_model_best.predict(titanic_test.drop('PassengerId', axis = 1))}
submission = pd.DataFrame(data=final_data)

In [55]:
titanic_training.Survived.value_counts()/len(titanic_training.Survived)

0    0.616162
1    0.383838
Name: Survived, dtype: float64

In [56]:
submission.Survived.value_counts()/len(submission.Survived)

0    0.641148
1    0.358852
Name: Survived, dtype: float64

In [57]:
submission.shape
submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0


In [58]:
submission.to_csv('submission_4.csv', index =False)