In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import matplotlib.pyplot as plt
import seaborn as sns
import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# Read the train and test CSVs, using PassengerId as the row index of both.
data, data_test = (
    pd.read_csv(csv_path, index_col='PassengerId')
    for csv_path in ('/kaggle/input/titanic/train.csv',
                     '/kaggle/input/titanic/test.csv')
)

In [3]:
# Preview the first three rows of the training frame.
data.iloc[:3]

# Data Cleaning & Feature Engineering

In [4]:
# Number of missing values in each column of the training frame.
data.isna().sum(axis=0)

In [5]:
# Number of missing values in each column of the test frame.
data_test.isna().sum(axis=0)

## Drop High-Cardinality Categorical Features

In [6]:
# How often each distinct Ticket value occurs in the training frame.
data.loc[:, 'Ticket'].value_counts()

In [7]:
# How often each distinct Cabin value occurs in the training frame.
data.loc[:, 'Cabin'].value_counts()

In [8]:
# Remove the Name, Ticket and Cabin columns from both the train and test frames.
dropped_cols = ['Name', 'Ticket', 'Cabin']
data = data.drop(columns=dropped_cols)
data_test = data_test.drop(columns=dropped_cols)

## Encode Categorical Features

In [9]:
# Mean of Survived within each Sex group.
data['Survived'].groupby(data['Sex']).mean()

Female passengers had a higher chance of survival than male passengers.

In [10]:
# Binary indicator column: 1 where Sex equals 'male', 0 otherwise.
data['Male'] = data['Sex'].eq('male').astype(int)
data_test['Male'] = data_test['Sex'].eq('male').astype(int)

In [11]:
# How often each Embarked value occurs in the training frame.
data.loc[:, 'Embarked'].value_counts()

In [12]:
# Fill missing Embarked values with 'S' (presumably the most frequent port in
# the value counts shown earlier -- confirm against that output).
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the selected Series: that chained in-place form
# raises a FutureWarning in pandas 2.2+ and does not modify the DataFrame
# once Copy-on-Write is enabled.
data['Embarked'] = data['Embarked'].fillna('S')

In [13]:
# Mean of Survived within each Embarked group.
data['Survived'].groupby(data['Embarked']).mean()

In [14]:
# Mean Fare within each Embarked group.
data['Fare'].groupby(data['Embarked']).mean()

In [15]:
# log(Fare + 1), computed on the whole column at once rather than
# element-by-element through apply(); the +1 keeps a fare of 0 finite.
data['log_Fare'] = np.log(data['Fare'] + 1)

# Fill any missing test fares with the test-set median. The filled Series is
# assigned back to the column: calling fillna(inplace=True) on the selected
# Series is a chained in-place update that warns in pandas 2.2+ and has no
# effect on the DataFrame under Copy-on-Write.
data_test['Fare'] = data_test['Fare'].fillna(data_test['Fare'].median())
data_test['log_Fare'] = np.log(data_test['Fare'] + 1)

In [16]:
# Box plot of log_Fare for each value of Survived.
sns.set_theme(palette='colorblind', style='ticks')
sns.boxplot(x='Survived', y='log_Fare', data=data)
sns.despine()  # remove the top and right spines
plt.show()

The port of embarkation also appears to affect the chance of survival, but this is probably because fares differ between ports: passengers who paid higher fares had a higher chance of survival.

In [17]:
# Add 0/1 indicator columns Embarked_S and Embarked_C to both frames.
# Any other Embarked value is left as the implicit baseline (both columns 0).
for frame in (data, data_test):
    for port in ('S', 'C'):
        frame['Embarked_' + port] = frame['Embarked'].eq(port).astype(int)

In [18]:
# Survival rate and group size for each SibSp value.
data['Survived'].groupby(data['SibSp']).agg(['mean', 'count'])

In [19]:
# Survival rate and group size for each Parch value.
data['Survived'].groupby(data['Parch']).agg(['mean', 'count'])

In [20]:
# Columns fed to the models; the same list is applied to train and test so the
# two matrices have identical column order.
feature_cols = ['Age', 'Male', 'Pclass', 'SibSp', 'Parch', 'log_Fare', 'Embarked_S', 'Embarked_C']

# Take explicit copies: X and X_test are modified later (imputed Age is written
# back into them), and writing into a selection of another DataFrame can raise
# SettingWithCopyWarning and leaves it unclear which object is being changed.
X = data.loc[:, feature_cols].copy()
y = data['Survived']
X_test = data_test.loc[:, feature_cols].copy()

## Handle Missing Values

In [21]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Model-based imputation of Age from the other feature columns.
imp = IterativeImputer(max_iter=20, random_state=0)

# Look up Age's position instead of hard-coding column 0, so the cell keeps
# working if the feature order changes.
age_idx = X.columns.get_loc('Age')

# Fit the imputer on the training features only ...
imp_cols = imp.fit_transform(X)
X['Age'] = imp_cols[:, age_idx]

# ... and reuse the fitted imputer for the test features. Calling
# fit_transform() here would train a second, different imputation model on the
# test set, so train and test Age values would be filled inconsistently.
imp_cols_test = imp.transform(X_test)
X_test['Age'] = imp_cols_test[:, age_idx]

In [22]:
# Box plot of Age for each value of Survived, drawn from `data`
# (the Age column of `data` itself is not modified by the imputation cell).
sns.set_theme(palette='colorblind', style='ticks')
sns.boxplot(x='Survived', y='Age', data=data)
sns.despine()  # remove the top and right spines
plt.show()

Age does not affect the chance of survival significantly.

In [23]:
# Preview the first three rows of the training feature matrix.
X.iloc[:3]

In [24]:
# Preview the first three rows of the test feature matrix.
X_test.iloc[:3]

# Modeling

In [25]:
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier,XGBRFClassifier

# Candidate classifiers as (short label, unfitted estimator) pairs.
# Every estimator is seeded with random_state=0.
models = [
    ('LR', LogisticRegression(solver='liblinear', random_state=0)),
    ('SVM', SVC(random_state=0)),
    ('RFC', RandomForestClassifier(random_state=0)),
    ('XGB', XGBClassifier(random_state=0)),
    ('XGBRF', XGBRFClassifier(random_state=0)),
]


In [26]:
# Cross-validate every candidate model and collect the per-fold accuracies.
results = []
names = []
scoring = 'accuracy'

# The splitter does not depend on the model, so build it once instead of on
# every loop iteration. KFold without shuffling is deterministic, so every
# model is scored on the same 10 folds.
kfold = model_selection.KFold(n_splits=10)

for name, model in models:
    cv_results = model_selection.cross_val_score(model, X, y, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    # label: mean accuracy (standard deviation across folds)
    print("%s: %f (%f)" % (name, cv_results.mean(), cv_results.std()))

# Box plot comparing the fold-score distribution of each algorithm.
# The axes object is passed explicitly rather than relying on pyplot's
# implicit "current axes".
fig, ax = plt.subplots(figsize=(12, 6))
fig.suptitle('Algorithm Comparison')
sns.boxplot(data=results, ax=ax)
ax.set_xticklabels(names)
sns.despine()
plt.show()

## XGBoost Random Forest Classifier

In [27]:
from sklearn.model_selection import GridSearchCV

# Exhaustive 5-fold grid search over XGBRFClassifier hyperparameters,
# scored by accuracy.
xgbrfc = XGBRFClassifier(random_state=0)

# 4 x 4 x 3 = 48 parameter combinations.
params = [
    dict(
        n_estimators=[120, 130, 140, 150],
        subsample=[0.9, 0.8, 0.7, 0.6],
        colsample_bynode=[0.5, 0.75, 1],
    )
]

gs_xgb = GridSearchCV(xgbrfc, param_grid=params, scoring='accuracy', cv=5, verbose=1)
gs_xgb.fit(X, y)

print('Best Params: ', gs_xgb.best_params_)
print('Best Scores: ', gs_xgb.best_score_)

# Prediction

In [28]:
# Train the final model on the full training set with the hyperparameters
# selected by the grid search (gs_xgb). Reading them from best_params_ instead
# of hard-coding literal values keeps this cell in sync with the search if the
# grid, the data or the library versions change.
xgbrfc = XGBRFClassifier(random_state=0, **gs_xgb.best_params_).fit(X, y)

# Predicted Survived labels for the test passengers.
y_test = xgbrfc.predict(X_test)

In [29]:
# One row per test passenger: the index of X_test is turned back into a
# regular column by reset_index(), next to the predicted Survived label.
submission = pd.DataFrame({'Survived': y_test}, index=X_test.index).reset_index()
submission

In [30]:
# Write the submission file to the working directory, without the row index.
submission.to_csv(path_or_buf='submission.csv', index=False)