# Setting Up

In [1]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
import re
import warnings
from datetime import datetime

from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, KBinsDiscretizer, OrdinalEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier

from xgboost import XGBClassifier

import tensorflow as tf
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler
from tensorflow.keras.optimizers import Adam

warnings.filterwarnings('ignore')

In [2]:
# Collect every file that lives under the Kaggle input directory.
discovered_files = []
for dirname, _, filenames in os.walk('kaggle/input'):
    for filename in filenames:
        discovered_files.append(os.path.join(dirname, filename))

# os.walk does not guarantee the order in which file names are listed, so the
# datasets are selected by file name instead of by position in the listing.
paths_by_name = {os.path.basename(path): path for path in discovered_files}
expected_names = ['train.csv', 'test.csv', 'gender_submission.csv']
missing_names = [name for name in expected_names if name not in paths_by_name]
if missing_names:
    raise FileNotFoundError(f"Missing input file(s) under 'kaggle/input': {missing_names}")

# `files` keeps the positional layout the rest of the notebook relies on:
# files[0] -> train, files[1] -> test, files[2] -> gender submission.
files = [paths_by_name[name] for name in expected_names]

train_df = pd.read_csv(files[0])
test_df = pd.read_csv(files[1])
gender_df = pd.read_csv(files[2])

In [3]:
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


# Exploratory Data Analysis/Preprocessing

## Statistical summary

In [5]:
train_df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [6]:
# Null values: We can see that Age, Cabin, and Embarked have null values
# We will handle these null values as we preprocess each column at a time
train_df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [7]:
test_df.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

## Feature engineering

**Deleting the PassengerId column: it is merely an index, so it is not needed as a feature**

In [8]:
train_df.drop('PassengerId', axis=1, inplace=True)
test_df.drop('PassengerId', axis=1, inplace=True)

**Preprocessing information from Embarked**

There are only 3 categories for Embarked (S, C, Q), so we can keep this feature. It will be one-hot encoded later on.

In [9]:
train_df['Embarked'].unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [10]:
# Replace null values with the most frequent element (learned from the training set only)
imp_majority = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
train_df[['Embarked']] = imp_majority.fit_transform(train_df[['Embarked']])
# Run the test set through the same fitted imputer, as is done for the other imputed
# columns, so a missing test value would be filled with the training-set mode.
test_df[['Embarked']] = imp_majority.transform(test_df[['Embarked']])

**Creating a new categorical variable for AgeGroups, derived from Age**

Source: https://integrishealth.org/resources/on-your-health/2015/october/stages-of-life-health-for-every-age

In [11]:
# Fill missing ages with the mean age learned from the training set only,
# then reuse that same mean for both frames.
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
imp_mean.fit(train_df[['Age']])
for frame in (train_df, test_df):
    frame[['Age']] = imp_mean.transform(frame[['Age']])

In [12]:
train_df['Age'].min(), train_df['Age'].max()

(0.42, 80.0)

In [13]:
def get_age_group(age):
    """Map an age in years to an ordinal life-stage code.

    Codes: 0 Infant [0, 2), 1 Toddler [2, 5), 2 Child [5, 13), 3 Teenager [13, 20),
    4 Young adult [20, 40), 5 Middle age adult [40, 60), 6 Senior [60, inf).

    The ranges are contiguous half-open intervals, so fractional ages such as
    1.5 or 19.5 always land in a group. Whole-number ages map to the same codes
    as the earlier inclusive integer ranges.

    Args:
        age: age in years (int or float).

    Returns:
        The integer group code, or None when the age is negative or NaN.
    """
    # `not age >= 0` is True for negatives and for NaN (every NaN comparison is False).
    if not age >= 0:
        return None

    # Exclusive upper bound of each group, in ascending order; the index is the group code.
    upper_bounds = (2, 5, 13, 20, 40, 60)
    for group_code, upper_bound in enumerate(upper_bounds):
        if age < upper_bound:
            return group_code
    return len(upper_bounds) # Senior

# Bin the 'Age' feature into the 7 ordinal age groups (codes 0-6) returned by get_age_group
train_df['Age Group'] = train_df['Age'].apply(get_age_group)
test_df['Age Group'] = test_df['Age'].apply(get_age_group)

In [14]:
train_df.drop('Age', axis=1, inplace=True)
test_df.drop('Age', axis=1, inplace=True)

In [15]:
train_df['Age Group'].unique()

array([4, 5, 1, 3, 2, 6, 0])

**Creating new feature for Family Size (Parch + SibSp + Person themself)**

In [16]:
# Family size = parents/children + siblings/spouses + the passenger themself
for frame in (train_df, test_df):
    frame['Family Size'] = frame['Parch'] + frame['SibSp'] + 1

In [17]:
train_df.drop(['Parch', 'SibSp'], axis=1, inplace=True)
test_df.drop(['Parch', 'SibSp'], axis=1, inplace=True)

In [18]:
train_df['Family Size'].unique()

array([ 2,  1,  5,  3,  7,  6,  4,  8, 11])

**Create different categories for Fare, representing different fare groups**

In [None]:
train_df['Fare'].min(), train_df['Fare'].max()

In [None]:
train_df['Fare'].value_counts()

In [None]:
# Replace null fares with the average fare of the training set.
# The imputer is fitted on train_df only; in this cell it is applied to test_df alone.
imp_mean_2 = SimpleImputer(missing_values=np.nan, strategy='mean')
imp_mean_2.fit(train_df[['Fare']])
test_df[['Fare']] = imp_mean_2.transform(test_df[['Fare']])

In [None]:
# Discretize Fare into 5 ordinal bins (codes 0-4); strategy='uniform' requests bins of equal width.
kbins = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')

# Bin edges are learned from the training fares and then reused unchanged for the test fares.
train_df['Fare Bins'] = kbins.fit_transform(train_df[['Fare']])
test_df['Fare Bins'] = kbins.transform(test_df[['Fare']])

In [None]:
train_df.drop('Fare', inplace=True, axis=1)
test_df.drop('Fare', inplace=True, axis=1)

In [None]:
train_df['Fare Bins'].unique()

**Extracting titles from each passenger's name**

In [None]:
def _title_from_name(name):
    """Return the title found between the first comma and the following period of a name."""
    after_surname = name.split(',')[1]
    return after_surname.split('.')[0].strip()


for frame in (train_df, test_df):
    frame['Title'] = frame['Name'].apply(_title_from_name)

In [None]:
train_df.drop('Name', inplace=True, axis=1)
test_df.drop('Name', inplace=True, axis=1)

In [None]:
train_df['Title'].value_counts().sort_index()

In [None]:
test_df['Title'].value_counts().sort_index()

In [None]:
# Several titles indicate a military rank or a noble class; those are merged into one
# group each. The French female titles (Mlle, Mme) are mapped to their English equivalents.
title_groups = {
    'Military': ('Capt', 'Col', 'Major'),
    'Noble': ('Don', 'Dona', 'Jonkheer', 'Lady', 'Sir', 'the Countess'),
    'Miss': ('Mlle',), # Mlle (Mademoiselle) - French title for unmarried woman
    'Mrs': ('Mme',), # Mme (Madame) - French title for married woman
}

# Invert the grouping into a flat {raw title: grouped title} lookup.
replace_map = {
    raw_title: grouped_title
    for grouped_title, raw_titles in title_groups.items()
    for raw_title in raw_titles
}

for frame in (train_df, test_df):
    frame['Title'] = frame['Title'].replace(replace_map)

**Extracting Ticket number from Ticket strings, and getting ticket number length**

Extracting ticket number

In [None]:
# Derive the same three ticket features for both frames.
for frame in (train_df, test_df):
    ticket_number = frame['Ticket'].apply(lambda ticket: ticket.split()[-1]) # Last whitespace-separated token of the ticket string
    frame['Ticket Number'] = ticket_number
    frame['Ticket Number Length'] = ticket_number.apply(len).astype(np.int32)
    frame['Ticket Number Start'] = ticket_number.apply(lambda number: number[0]) # Starting character of the ticket number

In [None]:
train_df[['Ticket', 'Ticket Number', 'Ticket Number Length', 'Ticket Number Start']].head()

In [None]:
print(train_df['Ticket Number Length'].unique())
print(train_df['Ticket Number Start'].unique())

In [None]:
print(test_df['Ticket Number Length'].unique())
print(test_df['Ticket Number Start'].unique())

Analyzing prefixes of each ticket
 - There are too many unique categories for ticket prefixes (*High cardinality*)... For this notebook, I will not extract anything out of this. Prefixes may or may not be useful, but that is something that can be further researched in the future...

In [None]:
temp = train_df['Ticket'].to_frame()
temp['Ticket Content Amount'] = temp['Ticket'].apply(lambda x: len(x.split()))

In [None]:
# Analyzing values of Ticket Content Amount = 2
print(temp[temp['Ticket Content Amount'] == 2]['Ticket'].apply(lambda x: x.split()[0]).unique())

In [None]:
# Analyzing values of Ticket Content Amount = 3
print(temp[temp['Ticket Content Amount'] == 3]['Ticket'].apply(lambda x: x.split()[0]).unique())

print(temp[temp['Ticket Content Amount'] == 3]['Ticket'].apply(lambda x: x.split()[1]).unique())

Drop the Ticket column (and the intermediate Ticket Number column), now that we have extracted features from it

In [None]:
train_df.drop(['Ticket', 'Ticket Number'], axis=1, inplace=True)
test_df.drop(['Ticket', 'Ticket Number'], axis=1, inplace=True)

**Extracting information from Cabin**

In [None]:
# Replace null values with the most frequent element
# The most frequent cabin string is learned from the training set and reused for the test set.
# NOTE(review): 687 of the 891 training cabins are null, so every one of those rows receives the
# same imputed cabin string; the 'Cabin Amount' and 'Deck' features derived below will be
# dominated by that single value — confirm this is intended.
imp_majority_2 = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
train_df[['Cabin']] = imp_majority_2.fit_transform(train_df[['Cabin']])
test_df[['Cabin']] = imp_majority_2.transform(test_df[['Cabin']])

Notice: a passenger can be assigned 0, 1, 2, 3, or 4 cabins. The number of cabins may be a useful feature to extract.

In [None]:
def _count_cabins(cabin):
    """Return how many whitespace-separated cabin identifiers the string contains."""
    return len(cabin.split())


for frame in (train_df, test_df):
    frame['Cabin Amount'] = frame['Cabin'].apply(_count_cabins).astype(np.int32)

In [None]:
train_df['Cabin Amount'].unique()

In [None]:
test_df['Cabin Amount'].unique()

Notice how passengers owning multiple cabins all have cabins on the same deck (the first character of each cabin). We can extract the letter from each cabin, which serves as the Deck. However, pay close attention to the passengers owning 2 cabins: there are 4 strange rows where the string starts with the letter "F", followed by a space and another cabin name.
 - For example, `F G73` in row 75.
  
The F Deck may be important, so we will keep this value for now ([Wikipedia on Titanic Decks](https://en.wikipedia.org/wiki/Titanic#:~:text=F%20Deck%2C%20the%20middle%20deck,pool%20and%20the%20Turkish%20bath.))

In [None]:
pd.concat([
    train_df[(train_df['Cabin'].str.startswith('F')) & (train_df['Cabin Amount'] == 2)],
    test_df[(test_df['Cabin'].str.startswith('F')) & (test_df['Cabin Amount'] == 2)]],
    axis=0
)

In [None]:
# Compile the pattern once; the first run of letters in the cabin string is used as the deck.
deck_pattern = re.compile("([a-zA-Z]+)")


def _deck_of(cabin):
    """Return the first run of letters found in a cabin string."""
    return deck_pattern.search(cabin).group()


for frame in (train_df, test_df):
    frame['Deck'] = frame['Cabin'].apply(_deck_of)

In [None]:
train_df.drop('Cabin', axis=1, inplace=True)
test_df.drop('Cabin', axis=1, inplace=True)

In [None]:
train_df['Deck'].unique()

## Bar Graphs

**View the total number of passengers who did/did not survive**

In [None]:
# Single-panel figure: number of passengers per Survived value.
fig, axes = plt.subplots()
sns.countplot(data=train_df, x='Survived', ax=axes)
axes.set_title('Total Passengers who Did/Did Not Survive')

**View distribution of each group of features: Survived, Pclass, Sex, Embarked, Age Group, Family Size, Fare Bins, Title**

In [None]:
# ----------------------------- Define the correct order for the age groups -----------------------------
# Label i names the age group whose code is i (0 = Infant ... 6 = Senior).
age_custom_labels = [
    'Infant', 'Toddler', 'Child', 'Teen', 'Young Adult', 'Mid. Age Adult', 'Senior'
]

# ----------------------------- Create custom labels for Fare bins -----------------------------
def create_custom_labels(bin_edges):
    """Build one '[low, high)' label for every pair of adjacent bin edges.

    Args:
        bin_edges: sequence of bin boundaries; n edges produce n - 1 labels.

    Returns:
        NumPy array holding the label strings, in edge order.
    """
    lower_edges = bin_edges[:-1]
    upper_edges = bin_edges[1:]
    labels = [f'[{low}, {high})' for low, high in zip(lower_edges, upper_edges)]
    return np.array(labels)

fare_bin_edges = np.round(kbins.bin_edges_[0], 2)
fare_custom_labels = create_custom_labels(fare_bin_edges)

# ----------------------------- Creating plots -----------------------------
fig, axes = plt.subplots(6, 2, figsize=(20, 20))

# One count plot per feature: (grid position, column, explicit category order or None for default).
count_panels = [
    ((0, 0), 'Pclass', None),
    ((0, 1), 'Sex', None),
    ((1, 0), 'Embarked', None),
    ((1, 1), 'Age Group', [0, 1, 2, 3, 4, 5, 6]),
    ((2, 0), 'Family Size', None),
    ((2, 1), 'Fare Bins', [0, 1, 2, 3, 4]),
    ((3, 0), 'Title', None),
    ((3, 1), 'Ticket Number Length', None),
    ((4, 0), 'Ticket Number Start', ['L', '1', '2', '3', '4', '5', '6', '7', '8', '9']),
    ((4, 1), 'Cabin Amount', None),
    ((5, 0), 'Deck', ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'T']),
]
for position, column, category_order in count_panels:
    sns.countplot(x=column, data=train_df, ax=axes[position], order=category_order)

# Show readable names instead of the numeric group/bin codes.
axes[1, 1].set_xticklabels(age_custom_labels);
axes[2, 1].set_xticklabels(fare_custom_labels);

# Eleven panels are drawn on a 6x2 grid; remove the unused last slot.
fig.delaxes(axes[5, 1])

**How many people did or did not survive based on our categories? Let's visualize this below....**

In [None]:
fig, axes = plt.subplots(6, 2, figsize=(20, 20))

# One bar plot of 'Survived' per feature: (grid position, column, explicit category order or None for default).
survival_panels = [
    ((0, 0), 'Pclass', None),
    ((0, 1), 'Sex', None),
    ((1, 0), 'Embarked', None),
    ((1, 1), 'Age Group', [0, 1, 2, 3, 4, 5, 6]),
    ((2, 0), 'Family Size', None),
    ((2, 1), 'Fare Bins', [0, 1, 2, 3, 4]),
    ((3, 0), 'Title', None),
    ((3, 1), 'Ticket Number Length', None),
    ((4, 0), 'Ticket Number Start', ['L', '1', '2', '3', '4', '5', '6', '7', '8', '9']),
    ((4, 1), 'Cabin Amount', None),
    ((5, 0), 'Deck', ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'T']),
]
for position, column, category_order in survival_panels:
    sns.barplot(x=column, y='Survived', data=train_df, ax=axes[position], order=category_order)

# Show readable names instead of the numeric group/bin codes.
axes[1, 1].set_xticklabels(age_custom_labels);
axes[2, 1].set_xticklabels(fare_custom_labels);

# Eleven panels are drawn on a 6x2 grid; remove the unused last slot.
fig.delaxes(axes[5, 1])

## Further Preprocessing: 

Current dataframe:

In [None]:
train_df.head()

In [None]:
train_df.info()

So far, we have preprocessed our data set so that all columns are categorical. Besides the columns that are already assigned numerical integers, preprocessing must still be done on the remaining columns:
 - One Hot Encoding of Nominal Categories: Sex, Embarked, Title, Deck
 - Ordinal Encoding of Ordinal Categories: Ticket Number Start

**One Hot Encoding**

In [None]:
# One-hot encoder for the nominal columns, fitted on the training categories only.
# handle_unknown='ignore' encodes a category that only appears at transform time (e.g. a
# title or deck present in the test set but not in train) as all zeros instead of raising.
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore').set_output(transform='pandas')
ohe.fit(train_df[['Sex', 'Embarked', 'Title', 'Deck']])

In [None]:
ohe.categories_

In [None]:
# Columns replaced by their one-hot expansion.
nominal_columns = ['Sex', 'Embarked', 'Title', 'Deck']

train_encoded = ohe.transform(train_df[nominal_columns])
test_encoded = ohe.transform(test_df[nominal_columns])

# Remove the raw string columns and append the encoded ones on the right.
train_df = pd.concat([train_df.drop(columns=nominal_columns), train_encoded], axis=1)
test_df = pd.concat([test_df.drop(columns=nominal_columns), test_encoded], axis=1)

**Ordinal Encoding**

In [None]:
# Explicit category order for the first character of the ticket number: 'L' first, then the digits 1-9.
# NOTE(review): 'L' presumably comes from tickets whose last token is not numeric — verify against the data.
categories = [['L', '1', '2', '3', '4', '5', '6', '7', '8', '9']]
# The encoder is fitted on the training column and returns a pandas DataFrame from transform.
ode = OrdinalEncoder(categories=categories).set_output(transform='pandas')
ode.fit(train_df[['Ticket Number Start']])

In [None]:
ode.categories_

In [None]:
train_encoded_2 = ode.transform(train_df[['Ticket Number Start']])
test_encoded_2 = ode.transform(test_df[['Ticket Number Start']])

# The encoder's pandas output keeps the input column name. Concatenating first and then
# dropping 'Ticket Number Start' by label would therefore remove the freshly encoded column
# together with the raw one, silently losing the feature. Drop the raw string column first,
# then append the encoded column.
train_df = pd.concat([train_df.drop(columns='Ticket Number Start'), train_encoded_2], axis=1)
test_df = pd.concat([test_df.drop(columns='Ticket Number Start'), test_encoded_2], axis=1)

In [None]:
pd.set_option('display.max_columns', None)
train_df.head()

In [None]:
test_df.head()

In [None]:
# Shapes make sense (Recall, that test_df does not contain Survival column like train_df)
train_df.shape, test_df.shape

# 1) Model Building: Artificial Neural Networks

In [None]:
X_train, y_train = train_df.drop('Survived', axis=1).to_numpy(), to_categorical(train_df['Survived']) # NOTE: to_categorical converts y_train to one hot encoded
X_test = test_df.to_numpy()

X_train.shape, y_train.shape

In [None]:
def create_model():
    """Build an uncompiled feed-forward classifier for the tabular features.

    Architecture: four ReLU Dense layers (128, 64, 32, 16 units), each followed by
    20% dropout, and a softmax output with one unit per one-hot target column.
    The input width is read from the notebook-level `X_train` and the output width
    from the notebook-level `y_train`.

    Returns:
        An uncompiled Keras Sequential model.
    """
    return Sequential([
        # Only the first layer declares the input shape; deeper layers take it from the layer before them.
        Dense(units=128, input_shape=(X_train.shape[1],), activation='relu'),
        Dropout(0.2),
        Dense(units=64, activation='relu'),
        Dropout(0.2),
        Dense(units=32, activation='relu'),
        Dropout(0.2),
        Dense(units=16, activation='relu'),
        Dropout(0.2),
        Dense(units=y_train.shape[1], activation='softmax') # NOTE: to_categorical converts y columns into one-hot encoded format
    ])

**Finding the best learning rate using `LearningRateScheduler` callback**

In [None]:
nn_model = create_model()

EPOCHS = 200
# Learning-rate range test: the schedule returns 1e-9 at epoch 0 and grows from there.
lr_schedule = LearningRateScheduler(lambda epoch: 1e-9 * 10**(epoch / 20)) # Exponentially increase learning rate by a factor of 10 after every 20 epochs
# NOTE(review): the initial rate below (1e-7) differs from the schedule's starting value (1e-9);
# presumably the callback's value takes over once training starts — confirm.
optimizer = Adam(learning_rate=1e-7)
                 
nn_model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
nn_model.summary()

In [None]:
history = nn_model.fit(X_train, y_train, epochs=EPOCHS, batch_size=32, validation_split=0.2, callbacks=[lr_schedule])

In [None]:
# Plot the learning rate vs validation loss
# The per-epoch learning rate is logged under 'learning_rate' by some Keras versions and
# under 'lr' by others, so accept either key.
lr_per_epoch = history.history.get('learning_rate', history.history.get('lr'))

plt.figure(figsize=(10, 6))
plt.semilogx(lr_per_epoch, history.history['val_loss'])
plt.xlabel('Learning Rate')
plt.ylabel('Validation Loss')
plt.title('Learning Rate vs. Validation Loss')
plt.show()

**Using optimal learning rate found to train our model**

In [None]:
nn_model = create_model()

EPOCHS = 200
early_stop = EarlyStopping(monitor='val_loss', patience=25, restore_best_weights=True)
optimizer = Adam(learning_rate=1e-2)
                 
nn_model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
nn_model.summary()

In [None]:
history = nn_model.fit(X_train, y_train, epochs=EPOCHS, batch_size=32, validation_split=0.2, callbacks=[early_stop])

In [None]:
# Per-epoch accuracy on the training and validation splits.
acc, val_acc = (history.history[key] for key in ('accuracy', 'val_accuracy'))
epochs_range = range(len(acc))

fig, ax = plt.subplots()
for series, label in ((acc, 'Training Accuracy'), (val_acc, 'Validation Accuracy')):
    ax.plot(epochs_range, series, label=label)
ax.legend()
ax.set(xlabel='Epochs', ylabel='Accuracy')
ax.set_title('Training and Validation Accuracy')

In [None]:
# Per-epoch loss on the training and validation splits.
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs_range = range(len(loss))

fig, ax = plt.subplots()
ax.plot(epochs_range, loss, label='Training Loss')
ax.plot(epochs_range, val_loss, label='Validation Loss')
ax.legend()
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
# This figure shows loss, so the title says so.
ax.set_title('Training and Validation Loss')

## ANN - Test Data Predictions

In [None]:
# Each row of nn_test_pred holds the softmax outputs for the two one-hot target columns:
# column 0 = Not Survived ([1, 0]), column 1 = Survived ([0, 1]).
nn_test_pred = nn_model.predict(X_test)

# Take the index of the larger probability as the class label. Unlike thresholding at 0.5
# followed by an if/elif, this always yields exactly one label per test row, so the result
# stays aligned with the predictions of the other models.
nn_survived = np.argmax(nn_test_pred, axis=1)

# 2) Model Building: Decision Trees

In [None]:
X_train, y_train2 = train_df.drop('Survived', axis=1).to_numpy(), train_df['Survived'].to_numpy() # Not one-hot encoded format
X_test = test_df.to_numpy()

X_train.shape, y_train2.shape

In [None]:
dt_model = DecisionTreeClassifier()

param_grid = {
    'min_samples_split': [5, 10, 15],
    'max_depth': [10, 20, 30],
    'min_samples_leaf': [1, 2, 4],
    'criterion': ['gini', 'entropy', 'log_loss'],
    'splitter': ['best', 'random']
}

grid_search = GridSearchCV(estimator=dt_model, param_grid=param_grid, scoring='accuracy', cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train2)

In [None]:
best_params = grid_search.best_params_
best_params

In [None]:
grid_search.best_score_

In [None]:
# Rebuild the tree with the winning grid-search settings (every key of best_params is a
# constructor argument), fit on the full training set and predict the test set.
dt_model = DecisionTreeClassifier(**best_params)
dt_survived = dt_model.fit(X_train, y_train2).predict(X_test)

# 3) Model Building: Random Forest

In [None]:
rf_model = RandomForestClassifier()

param_grid = {
    'n_estimators': [50, 100, 150],
    'min_samples_split': [5, 10],
    'max_depth': [5, 10, 15],
    'min_samples_leaf': [4, 5, 6],
    'criterion': ['gini', 'entropy'],
}

grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, scoring='accuracy', cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train2)

In [None]:
best_params = grid_search.best_params_
best_params

In [None]:
grid_search.best_score_

In [None]:
# Rebuild the forest with the winning grid-search settings (every key of best_params is a
# constructor argument), fit on the full training set and predict the test set.
rf_model = RandomForestClassifier(**best_params)
rf_survived = rf_model.fit(X_train, y_train2).predict(X_test)

# 4) Model Building: Ada Boost

In [None]:
ab_model = AdaBoostClassifier()

param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1, 10],
}


grid_search = GridSearchCV(estimator=ab_model, param_grid=param_grid, scoring='accuracy', cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train2)

In [None]:
best_params = grid_search.best_params_
best_params

In [None]:
grid_search.best_score_

In [None]:
# Rebuild AdaBoost with the winning grid-search settings (learning_rate and n_estimators),
# fit on the full training set and predict the test set.
ab_model = AdaBoostClassifier(**best_params)
ab_survived = ab_model.fit(X_train, y_train2).predict(X_test)

# 5) Model Building: Gradient Boost

In [None]:
gb_model = GradientBoostingClassifier()

param_grid = {
  'n_estimators' : [300, 400, 500],
  'learning_rate': [ 0.1, 0.3, 0.6],
  'max_depth': [8, 10, 12],
  'min_samples_leaf': [50, 100, 120],
  'max_features': [0.1, 0.3, 0.5] 
}

grid_search = GridSearchCV(estimator=gb_model, param_grid=param_grid, scoring='accuracy', cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train2)

In [None]:
best_params = grid_search.best_params_
best_params

In [None]:
grid_search.best_score_

In [None]:
# Rebuild gradient boosting with the winning grid-search settings (every key of best_params
# is a constructor argument), fit on the full training set and predict the test set.
gb_model = GradientBoostingClassifier(**best_params)
gb_survived = gb_model.fit(X_train, y_train2).predict(X_test)

# 6) Model Building: XG Boost

In [None]:
xgb_model = XGBClassifier()

param_grid = {
     'booster': ['gbtree', 'gblinear','dart'],
}

grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, scoring='accuracy', cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train2)

In [None]:
best_params = grid_search.best_params_
best_params

In [None]:
grid_search.best_score_

In [None]:
# Rebuild XGBoost with the winning booster from the grid search, then fit on the full
# training set and predict the test set.
xgb_model = XGBClassifier(**best_params)
xgb_model.fit(X_train, y_train2)
xgb_survived = xgb_model.predict(X_test)

# 7) Model Building: KNN

In [None]:
knn_model = KNeighborsClassifier()

param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'p': [1,2],
}

grid_search = GridSearchCV(estimator=knn_model, param_grid=param_grid, scoring='accuracy', cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train2)

In [None]:
best_params = grid_search.best_params_
best_params

In [None]:
grid_search.best_score_

In [None]:
# Rebuild KNN with the winning grid-search settings (every key of best_params is a
# constructor argument), fit on the full training set and predict the test set.
knn_model = KNeighborsClassifier(**best_params)
knn_survived = knn_model.fit(X_train, y_train2).predict(X_test)

# Creating csv file for submission

In [None]:
def majority_votes(*classifications):
    """Combine the label arrays of several models into one label per sample by plurality vote.

    Args:
        *classifications: one 1-D sequence of predicted labels per model; all must have
            the same length (one entry per sample).

    Returns:
        NumPy array with the most-voted label for every sample. When several labels tie,
        the one that received its first vote earliest (in argument order) wins.

    Raises:
        ValueError: if no predictions are given or the sequences differ in length.
    """
    if not classifications:
        raise ValueError("majority_votes needs at least one array of predictions")

    # A length mismatch would silently pair up predictions of different samples.
    lengths = [len(c) for c in classifications]
    if len(set(lengths)) != 1:
        raise ValueError(f"All prediction arrays must have the same length, got {lengths}")

    final_votes = []
    # zip(*...) yields, per sample, the tuple of labels cast by every model.
    for sample_votes in zip(*classifications):
        votes_count = dict()
        for vote in sample_votes:
            votes_count[vote] = votes_count.get(vote, 0) + 1
        # max() returns the first key holding the highest count, i.e. the earliest-voted label on ties.
        final_votes.append(max(votes_count, key=votes_count.get))

    return np.array(final_votes)
            
survived = majority_votes(nn_survived, 
                           dt_survived, 
                           rf_survived, 
                           ab_survived, 
                           gb_survived, 
                           xgb_survived, 
                           knn_survived)

In [None]:
# PassengerId was dropped from test_df during preprocessing, so the ids are re-read from disk.
# NOTE(review): this assumes files[1] is the test CSV — verify against the order in which `files` was built.
passenger_id = pd.read_csv(files[1])['PassengerId'].to_numpy()

# One row per test passenger: the id paired with the ensemble's majority-vote prediction.
submission = pd.DataFrame({
    'PassengerId': passenger_id,
    'Survived': survived
})

submission

In [None]:
# Build the timestamped path once, so the file that is written and the file name that is
# reported are guaranteed to match (two separate datetime.now() calls can differ).
submission_path = f"/kaggle/working/submission_{datetime.now().strftime('%Y-%m-%d_%H%M%S')}.csv"
submission.to_csv(submission_path, index=False)
print(f"Generated file {submission_path}")

# Sources
 - https://towardsdatascience.com/guide-to-encoding-categorical-features-using-scikit-learn-for-machine-learning-5048997a5c79
 - https://annahava.medium.com/too-many-categories-how-to-deal-with-categorical-features-of-high-cardinality-d4563cfe62d6#:~:text=One%20of%20the%20most%20common,model%20and%20get%20modest%20results.
 - https://www.youtube.com/watch?v=6IGx7ZZdS74
 - https://www.ultravioletanalytics.com/blog/kaggle-titanic-competition-part-iv-derived-variables/
 - https://www.kaggle.com/code/ccastleberry/titanic-cabin-features