In [195]:
#### Dependencies ####
import pandas as pd
import numpy as np
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier

In [196]:
#### Params ####
# Single seed reused below by the CV splitter, the train/test split and the model grid.
seed = 42 # fixed value so repeated runs give the same splits and model

In [197]:
#### Loading datasets ####
# Paths are relative to the notebook's working directory.
train_csv_path = 'Data/train.csv'
test_csv_path = 'Data/test.csv'

train_set = pd.read_csv(train_csv_path)
test_set = pd.read_csv(test_csv_path)

In [198]:
#### Removing useless columns ####
def drop(dataset,columns):
    """Return a new frame without the given column label(s).

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to remove columns from; it is not modified.
    columns : label or list of labels
        Column name(s) to remove. A missing label raises ``KeyError``.
    """
    trimmed = dataset.drop(columns=columns)
    return trimmed

# The same columns are removed from both sets, so the test list simply
# points at the training list.
columns_to_drop_train = ['PassengerId', 'Name', 'Ticket', 'Fare', 'Cabin']
columns_to_drop_test = columns_to_drop_train

train_set = drop(train_set, columns_to_drop_train)
test_set = drop(test_set, columns_to_drop_test)

In [199]:
# Print the column summary (non-null counts and dtypes) of each set, train first.
for frame in (train_set, test_set):
    frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       714 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Embarked  889 non-null    object 
dtypes: float64(1), int64(4), object(2)
memory usage: 48.9+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    418 non-null    int64  
 1   Sex       418 non-null    object 
 2   Age       332 non-null    float64
 3   SibSp     418 non-null    int64  
 4   Parch     418 non-null    int64  
 5   Embarked  418 non-null    object 
dtypes: float64(1), int64(3), object(2)
memory usage: 19.7+ KB


In [200]:
#### Handling Missing Values ####
### Embarked: NaN values in train_set only -> fill with the most frequent value.
# mode() can return several values on a tie; .iloc[0] takes the first one
# instead of raising the way .item() does.
train_set['Embarked'] = train_set['Embarked'].fillna(train_set['Embarked'].mode().iloc[0])

### Age: NaN values in both sets -> median impute per (Pclass, Sex) group.
# The medians are learned on the training set only and reused for the test set,
# so the same kind of passenger gets the same fill value in both sets.
age_group_keys = ['Pclass', 'Sex']
age_group_medians = train_set.groupby(age_group_keys)['Age'].median().rename('AgeGroupMedian')
age_overall_median = train_set['Age'].median()


def _impute_age(dataset):
    """Return ``dataset['Age']`` with NaNs filled from the training-set medians.

    Each missing age takes the training median of its (Pclass, Sex) group; if
    that group has no median (unseen or all-NaN in training), the overall
    training median is used instead.
    """
    # Left join on the group keys keeps dataset's own index and row order.
    group_median = dataset.join(age_group_medians, on=age_group_keys)['AgeGroupMedian']
    return dataset['Age'].fillna(group_median).fillna(age_overall_median)


train_set['Age'] = _impute_age(train_set)
test_set['Age'] = _impute_age(test_set)

In [201]:
#### Encoding Categorical Variables ####
### Label encoding, no specific order
def label_encode(dataset,columns):
    """Return a copy of ``dataset`` with the given columns replaced by integer codes.

    Each listed column is converted with ``pd.Categorical(...).codes``: the
    distinct values of that column are sorted and numbered from 0, and missing
    values become -1. Codes are derived from the values present in ``dataset``
    only, so two frames encoded separately share a mapping only if they contain
    the same set of values.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame holding the columns to encode. It is left untouched.
    columns : iterable of labels
        Names of the columns to encode.

    Returns
    -------
    pandas.DataFrame
        A new frame; columns not listed are carried over unchanged.
    """
    # Work on a copy so the caller's frame is not silently modified.
    encoded = dataset.copy()
    for column in columns:
        encoded[column] = pd.Categorical(encoded[column]).codes
    return encoded

columns_to_label_encode_train = ['Survived', 'Sex', 'Embarked', 'Parch', 'SibSp']
# The test set has no 'Survived' column, so it reuses the list minus its first entry.
columns_to_label_encode_test = columns_to_label_encode_train[1:]

train_set = label_encode(train_set, columns_to_label_encode_train)
test_set = label_encode(test_set, columns_to_label_encode_test)

### Ordinal encoding: shift Pclass down by one in both sets
train_set['Pclass'] = train_set['Pclass'] - 1
test_set['Pclass'] = test_set['Pclass'] - 1

In [202]:
### Checking the encoded columns on the first 100 training rows
train_set.iloc[:100]

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Embarked
0,0,2,1,22.0,1,0,2
1,1,0,0,38.0,1,0,0
2,1,2,0,26.0,0,0,2
3,1,0,0,35.0,1,0,2
4,0,2,1,35.0,0,0,2
...,...,...,...,...,...,...,...
95,0,2,1,25.0,0,0,2
96,0,0,1,71.0,0,0,0
97,1,0,1,23.0,0,1,0
98,1,1,0,34.0,0,1,2


In [203]:
#### Scaling data ####
def scale_data(dataset, features, scaler=None):
    """Return a copy of ``dataset`` with ``features`` standardized.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame holding the columns to scale. It is left untouched.
    features : list of labels
        Names of the columns to standardize.
    scaler : fitted scaler, optional
        An already fitted scaler whose ``transform`` is applied as-is. When
        omitted, a new ``StandardScaler`` is fitted on ``dataset[features]``
        (the original behaviour).

    Returns
    -------
    pandas.DataFrame
        A new frame with the scaled columns; other columns are unchanged.
    """
    scaled = dataset.copy()
    if scaler is None:
        scaler = StandardScaler().fit(scaled[features])
    scaled[features] = scaler.transform(scaled[features])
    return scaled

# Fit the scaler on the training ages only and reuse it for the test set, so a
# given age maps to the same scaled value in both sets.
age_features = ["Age"]
age_scaler = StandardScaler().fit(train_set[age_features])
train_set = scale_data(train_set, age_features, scaler=age_scaler)
test_set = scale_data(test_set, age_features, scaler=age_scaler)

In [204]:
# First five training rows, to check the scaled Age column.
train_set.iloc[:5]

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Embarked
0,0,2,1,-0.534891,1,0,2
1,1,0,0,0.668392,1,0,0
2,1,2,0,-0.23407,0,0,2
3,1,0,0,0.442776,1,0,2
4,0,2,1,0.442776,0,0,2


In [205]:
#### Model Fitting ####
### Repeated stratified K-fold: every candidate is scored on 3 x 3 class-balanced folds
skf = RepeatedStratifiedKFold(n_splits=3, n_repeats=3, random_state=seed)

# Name the feature columns once and select them by name from both sets, so the
# positional arrays built below are guaranteed to share the same column order
# (a missing column in test_set raises a KeyError instead of misaligning features).
feature_columns = [column for column in train_set.columns if column != 'Survived']
X = train_set[feature_columns].to_numpy()
y = train_set['Survived'].to_numpy()
X_submission = test_set[feature_columns].to_numpy()

# Hold out 25% of the labelled data; it is only used for the final score printed below.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y, random_state=seed)

### Performing GridSearchCV to find the best hyperparameters
# 'random_state' and 'warm_start' are single-valued entries: they are fixed settings
# passed through the grid, not searched over.
# NOTE(review): GridSearchCV fits a fresh clone per candidate, so warm_start should
# have no effect here - confirm before relying on it.
param_grid = {'max_depth': range(1,10),
              'n_estimators': range(1,20),
              'max_features': ['sqrt', 'log2'],
              'random_state': [seed],
              'warm_start': [True]}

### Fitting the model
search = GridSearchCV(GradientBoostingClassifier(), param_grid=param_grid, n_jobs=-1, cv=skf, verbose=1)
grid = search.fit(X_train, y_train)

print('CV Score: {}'.format(grid.best_score_))
print('Test score:', grid.score(X_test, y_test))
print('Best params:\n{}'.format(grid.best_params_))

Fitting 9 folds for each of 342 candidates, totalling 3078 fits
CV Score: 0.8363385807331996
Test score: 0.7982062780269058
Best params:
{'max_depth': 3, 'max_features': 'sqrt', 'n_estimators': 16, 'random_state': 42, 'warm_start': True}


In [206]:
### Making predictions
predictions = grid.predict(X_submission)

# Read the ids from the test file instead of assuming they run from 892 to 1309.
# test_set keeps the file's row order (only columns are dropped and NaNs filled
# above), so ids and predictions line up row by row; a length mismatch makes the
# DataFrame constructor raise.
passenger_ids = pd.read_csv('Data/test.csv', usecols=['PassengerId'])['PassengerId'].to_numpy()
submission = pd.DataFrame({'PassengerId': passenger_ids, 'Survived': predictions})
submission.to_csv('submission.csv', index=False)