### Import Packages and Load Data

In [1]:
import os

import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# Raise pandas' display limits so frames with up to 500 rows / 500 columns
# are rendered without truncation.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 500)

In [106]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn import preprocessing
from sklearn import set_config
from sklearn.preprocessing import OrdinalEncoder,OneHotEncoder, FunctionTransformer, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score,cross_validate
from sklearn.ensemble import VotingClassifier
import xgboost as xgb

In [3]:
# Directory holding the Kaggle Titanic CSV files. Set the TITANIC_DATA_DIR
# environment variable to point at your own copy; when it is unset the
# original hard-coded location is used, so existing setups keep working.
titanic_data_dir = os.environ.get(
    'TITANIC_DATA_DIR',
    '/Users/jpmcelyea/GIT_Code/Data-Science/Kaggle-Titanic/titanic')

# Labelled passengers (includes the 'Survived' target column).
titanic_training = pd.read_csv(os.path.join(titanic_data_dir, 'train.csv'))

In [125]:
# Directory holding the Kaggle Titanic CSV files. Set the TITANIC_DATA_DIR
# environment variable to point at your own copy; when it is unset the
# original hard-coded location is used, so existing setups keep working.
titanic_data_dir = os.environ.get(
    'TITANIC_DATA_DIR',
    '/Users/jpmcelyea/GIT_Code/Data-Science/Kaggle-Titanic/titanic')

# Passengers to predict for the submission file.
titanic_test = pd.read_csv(os.path.join(titanic_data_dir, 'test.csv'))

### Data Preprocessing + Feature Engineering

In [5]:
# Preview the first five raw training rows.
titanic_training.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [6]:
##Preprocessing and Feature Engineering

def engineer_features(frame):
    """Return a copy of ``frame`` with the engineered Titanic features.

    Parameters
    ----------
    frame : pandas.DataFrame
        Passenger records containing at least the 'Cabin', 'Ticket', 'Name',
        'Sex', 'Embarked' and 'Pclass' columns.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) in which 'Cabin', 'Ticket'
        and 'Name' are replaced by:

        * CabinLetter   - letter prefix of the first cabin code, 'N' when no
                          cabin code could be parsed
        * CabinNum      - number of cabin codes parsed from 'Cabin' (0 to 3)
        * TicketSpecial - 1 when the ticket looks like "<prefix> <digits>",
                          otherwise 0
        * NameTitle     - 'Standard' for Mr / Mrs / Miss, 'Elite' for
                          everything else (including names with no title found)

        'Sex', 'Embarked', 'CabinLetter', 'Pclass' and 'NameTitle' are cast
        to the pandas 'category' dtype.
    """
    # Up to three "<letters><digits>" cabin codes per passenger; codes that
    # are absent come back as NaN.
    cabin_codes = frame['Cabin'].str.extract(
        r'(?P<Cabin1>[A-Z]+\d+)\s?(?P<Cabin2>[A-Z]+\d+)?\s?(?P<Cabin3>[A-Z]+\d+)?')
    cabin_letter = cabin_codes['Cabin1'].str.extract(
        r'(?P<Cabin1_Code>[A-Z]+)', expand=False)

    # A ticket counts as "special" when a prefix token is followed by a
    # whitespace character and a number. (The earlier positional
    # str.split(" ", 1, expand=True) call was removed: its result was
    # overwritten by this extract, and pandas >= 2.0 rejects positional `n`.)
    ticket_parts = frame['Ticket'].str.extract(r'([\w\/\.]+)\s(\d+)')

    # First word following ", " in the name, e.g. "Mr" in "Braund, Mr. Owen Harris".
    titles = frame['Name'].str.extract(r'((?<=\,\s)[\w]+)', expand=False)

    engineered = frame.assign(
        CabinLetter=cabin_letter.fillna(value='N'),
        CabinNum=cabin_codes.notna().sum(axis=1),
        TicketSpecial=np.where(ticket_parts[0].isna(), 0, 1),
        NameTitle=np.where(titles.isin(['Mr', 'Mrs', 'Miss']), 'Standard', 'Elite'),
    ).drop(columns=['Ticket', 'Name', 'Cabin'])

    categorical_columns = ('Sex', 'Embarked', 'CabinLetter', 'Pclass', 'NameTitle')
    return engineered.astype({column: 'category' for column in categorical_columns})


# PassengerId is an identifier, not a feature: move it to the index before
# engineering the model inputs.
titanic_training = engineer_features(titanic_training.set_index("PassengerId"))

In [7]:
# Missing-value report: absolute counts first, then the share of rows,
# listing only the columns that actually contain nulls.
null_counts = titanic_training.isna().sum()
print(null_counts.loc[null_counts.gt(0)])

nullseries = null_counts.div(len(titanic_training))
print(nullseries.loc[nullseries.gt(0)])

Age         177
Embarked      2
dtype: int64
Age         0.198653
Embarked    0.002245
dtype: float64


In [None]:
#titanic_training.describe()
#titanic_training.dtypes

In [8]:
# Preview the first five rows after feature engineering.
titanic_training.head(n=5)

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,CabinLetter,CabinNum,TicketSpecial,NameTitle
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,0,3,male,22.0,1,0,7.25,S,N,0,1,Standard
2,1,1,female,38.0,1,0,71.2833,C,C,1,1,Standard
3,1,3,female,26.0,0,0,7.925,S,N,0,1,Standard
4,1,1,female,35.0,1,0,53.1,S,C,1,0,Standard
5,0,3,male,35.0,0,0,8.05,S,N,0,0,Standard


### Model Selection

In [9]:
# Separate the target column from the model inputs.
y = titanic_training['Survived']
X = titanic_training.drop(columns=['Survived'])

In [101]:
# One-hot encode every 'category' column of X (dropping the first level of
# each) and pass all remaining columns through untouched.
xgb_categorical_columns = X.select_dtypes(include=['category']).columns
xgb_preprocessor = ColumnTransformer(
    transformers=[
        ('dummies', OneHotEncoder(drop='first'), xgb_categorical_columns),
    ],
    remainder='passthrough',
)

# Encoded features feed an XGBoost classifier with a fixed seed.
xgb_pipeline = Pipeline(steps=[
    ('preprocessing', xgb_preprocessor),
    ('model', xgb.XGBClassifier(use_label_encoder=False, verbosity=0, random_state=123)),
])

In [109]:
# Column-wise preprocessing for the SVM:
#   * 'Age' and 'Fare' - fill missing values with the column mean
#   * 'category' columns - one-hot encode; handle_unknown='ignore' makes a
#     level that was not present when the encoder was fitted (e.g. one that
#     only occurs in a validation fold or in the test set) encode as all
#     zeros instead of raising at transform time
#   * everything else - passed through unchanged
svm_categorical_columns = X.select_dtypes(include=['category']).columns
svm_preprocessor = ColumnTransformer(
    transformers=[
        ('imputer', SimpleImputer(strategy='mean'), ['Age', 'Fare']),
        ('dummies', OneHotEncoder(handle_unknown='ignore'), svm_categorical_columns),
    ],
    remainder='passthrough',
)

# Standardise all preprocessed features before the support vector classifier.
svm_pipeline = Pipeline(steps=[
    ('preprocessing', svm_preprocessor),
    ('scaler', StandardScaler()),
    ('model', SVC(random_state=123)),
])

In [110]:
# Baseline for the untuned SVM pipeline: mean accuracy over 5 CV folds.
svm_cv_options = {'scoring': 'accuracy', 'cv': 5}
scores = cross_val_score(svm_pipeline, X, y, **svm_cv_options)

scores.mean()

0.8294143493817087

In [93]:
#xgb_pipeline = Pipeline(steps =[
#    ['preprocessing', 
#     ColumnTransformer(remainder='passthrough',
#     transformers=[
#             ('dummies', OneHotEncoder(drop='first'), ['Pclass', 'Embarked', 'CabinLetter']),
#             ('ordinal', OrdinalEncoder(), ['Sex', 'NameTitle']),
#              ])]
#    ,('model', xgb.XGBClassifier(use_label_encoder=False, verbosity=0, random_state=123))
#    ])

### Tune the Model

In [65]:
#param_grid = {'model__max_depth': [8,12,14],
#             'model__n_estimators': [20,30],
#             'model__learning_rate':[0.2,0.25,0.3],
#             'model__colsample_bytree': [0.8,0.9,1]}

In [102]:
# Hyperparameter grid for the pipeline step named 'model'
# (the "model__" prefix routes each key to that step).
param_grid = dict(
    model__max_depth=[7, 8, 9, 10],
    model__n_estimators=[20, 50],
    model__learning_rate=[0.04, 0.05, 0.06],
)

In [103]:
# Exhaustive search over param_grid for the XGBoost pipeline, scored by
# accuracy with 10-fold cross-validation, using all available cores.
xgb_search_options = dict(param_grid=param_grid, scoring='accuracy', cv=10, n_jobs=-1)
grid_xgb = GridSearchCV(estimator=xgb_pipeline, **xgb_search_options)

grid_xgb.fit(X, y);

In [104]:
# Report the best cross-validated accuracy, the winning hyperparameters and
# the accuracy of the best estimator on the same X, y the search was fitted on.
xgb_model_best = grid_xgb.best_estimator_
print(
    grid_xgb.best_score_,
    grid_xgb.best_params_,
    xgb_model_best.score(X, y),
    sep='\n',
)

0.8383770287141074
{'model__learning_rate': 0.05, 'model__max_depth': 9, 'model__n_estimators': 50}
0.9180695847362514


In [None]:
0.8383770287141074
{'model__learning_rate': 0.05, 'model__max_depth': 9, 'model__n_estimators': 50}
0.9180695847362514

In [71]:
# Read the cv_results property into a dataframe & print it out
#cv_results_df = pd.DataFrame(grid_xgb.cv_results_)
#print(cv_results_df)

# Extract and print the column with a dictionary of hyperparameters used
#column = cv_results_df.loc[:, ['params']]
#print(column)

In [121]:
# Hyperparameter grid for the SVM pipeline's 'model' step. Note that this
# rebinds the name param_grid.
param_grid = dict(
    model__C=[4.28],
    model__gamma=['scale', 'auto'],
)

In [122]:
# Exhaustive search over param_grid for the SVM pipeline, scored by accuracy
# with 10-fold cross-validation, using all available cores.
svm_search_options = dict(param_grid=param_grid, scoring='accuracy', cv=10, n_jobs=-1)
grid_svm = GridSearchCV(estimator=svm_pipeline, **svm_search_options)

grid_svm.fit(X, y);

In [123]:
# Report the best cross-validated accuracy, the winning hyperparameters and
# the accuracy of the best estimator on the same X, y the search was fitted on.
svm_model_best = grid_svm.best_estimator_
print(
    grid_svm.best_score_,
    grid_svm.best_params_,
    svm_model_best.score(X, y),
    sep='\n',
)

0.838414481897628
{'model__C': 4.28, 'model__gamma': 'scale'}
0.8765432098765432


In [None]:
0.838414481897628
{'model__C': 4.281332398719396, 'model__gamma': 'scale'}
0.8765432098765432

### Preprocess Test Data for Prediction

In [126]:
# Preview the first five raw test rows.
titanic_test.head(n=5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [127]:
##Preprocessing and Feature Engineering
# Builds CabinLetter, CabinNum, TicketSpecial and NameTitle for the test
# passengers. 'PassengerId' is left as a regular column here.

# Up to three "<letters><digits>" cabin codes per passenger; codes that are
# absent come back as NaN.
test_cabin_codes = titanic_test['Cabin'].str.extract(
    r'(?P<Cabin1>[A-Z]+\d+)\s?(?P<Cabin2>[A-Z]+\d+)?\s?(?P<Cabin3>[A-Z]+\d+)?')
test_cabin_letter = test_cabin_codes['Cabin1'].str.extract(
    r'(?P<Cabin1_Code>[A-Z]+)', expand=False)

# A ticket counts as "special" when a prefix token is followed by a whitespace
# character and a number. (The earlier positional str.split(" ", 1, expand=True)
# call was removed: its result was overwritten by this extract, and
# pandas >= 2.0 rejects positional `n`.)
test_ticket_parts = titanic_test['Ticket'].str.extract(r'([\w\/\.]+)\s(\d+)')

# First word following ", " in the name, e.g. "Mr" in "Kelly, Mr. James".
test_titles = titanic_test['Name'].str.extract(r'((?<=\,\s)[\w]+)', expand=False)

titanic_test = titanic_test.assign(
    # 'N' marks passengers for whom no cabin code could be parsed.
    CabinLetter=test_cabin_letter.fillna(value='N'),
    # Number of cabin codes parsed from 'Cabin' (0 to 3).
    CabinNum=test_cabin_codes.notna().sum(axis=1),
    TicketSpecial=np.where(test_ticket_parts[0].isna(), 0, 1),
    # Mr / Mrs / Miss are 'Standard'; any other (or missing) title is 'Elite'.
    NameTitle=np.where(test_titles.isin(['Mr', 'Mrs', 'Miss']), 'Standard', 'Elite'),
).drop(columns=['Ticket', 'Name', 'Cabin'])

titanic_test = titanic_test.astype({
    column: 'category'
    for column in ('Sex', 'Embarked', 'CabinLetter', 'Pclass', 'NameTitle')
})

In [128]:
# Preview the first five test rows after feature engineering.
titanic_test.head(n=5)

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,CabinLetter,CabinNum,TicketSpecial,NameTitle
0,892,3,male,34.5,0,0,7.8292,Q,N,0,0,Standard
1,893,3,female,47.0,1,0,7.0,S,N,0,0,Standard
2,894,2,male,62.0,0,0,9.6875,Q,N,0,0,Standard
3,895,3,male,27.0,0,0,8.6625,S,N,0,0,Standard
4,896,3,female,22.0,1,1,12.2875,S,N,0,0,Standard


### Final Submission

In [129]:
# Predict with the tuned SVM pipeline and pair each prediction with its
# passenger id. 'PassengerId' is removed from the inputs first.
test_features = titanic_test.drop(columns=['PassengerId'])
final_data = {
    'PassengerId': titanic_test['PassengerId'],
    'Survived': svm_model_best.predict(test_features),
}
submission = pd.DataFrame(data=final_data)

In [130]:
# Share of each outcome class among the training labels.
titanic_training['Survived'].value_counts().div(len(titanic_training))

0    0.616162
1    0.383838
Name: Survived, dtype: float64

In [131]:
# Share of each predicted class in the submission, for comparison with the
# class balance of the training labels.
submission['Survived'].value_counts().div(len(submission))

0    0.65311
1    0.34689
Name: Survived, dtype: float64

In [132]:
# Only the last expression of a cell is rendered, so the shape has to be
# printed explicitly to appear alongside the preview.
print(submission.shape)
submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0


In [133]:
# Write the submission file without the DataFrame index column.
submission.to_csv(path_or_buf='submission_8.csv', index=False)