In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import os
import pandas as pd
import numpy as np
import regex as re

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_validate, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
paths = []
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        paths.append(os.path.join(dirname, filename))
        
train_data = pd.read_csv(paths[0])
test_data = pd.read_csv(paths[1])
sample_submission = pd.read_csv(paths[2])
all_data = pd.concat([test_data.assign(Train=False), train_data.assign(Train=True)])

In [3]:
missing_features = [col for col in train_data.columns if col not in test_data.columns]
missing_features

['Survived']

# Go through each column 1 by 1, transforming or cleaning or whatever

### PassengerId
- Drop Id column as it is not relevant
- We'll need to keep a copy of the test ids for the submissions later though

In [4]:
# Keep a copy
PassengerId = test_data.PassengerId

# Drop ID
if 'PassengerId' in all_data.columns:
    all_data.drop(['PassengerId'], axis=1, inplace=True)

### Pclass
- No missing values
- Possible value 1, 2, or 3

### Name
- Probably useless
- We could extract title as new feature
- Number of middle names / surnames or name length could be useful\
- No missing values


In [5]:
# Define function to extract titles from passenger names
def get_title(name):
    title_search = re.search(' ([A-Za-z]+)\.', name)
    # If the title exists, extract and return it.
    if title_search:
        return title_search.group(1)
    return ""

all_data['Title'] = all_data['Name'].map(get_title)
all_data['Title'].value_counts()

Mr          757
Miss        260
Mrs         197
Master       61
Rev           8
Dr            8
Col           4
Ms            2
Major         2
Mlle          2
Sir           1
Countess      1
Capt          1
Don           1
Lady          1
Mme           1
Dona          1
Jonkheer      1
Name: Title, dtype: int64

In [6]:
def simple_title(title):
    if type(title) == str:
        if title in ["Ms", "Mile", "Miss"]:
            return "Miss"
        elif title in ["Mme", "Mrs"]:
            return "Mrs"
        elif title == "Mr":
            return "Mr"
        elif title == "Master":
            return "Master"
        else:
            return "Other"
    else:
        return title
    
all_data['Title'] = all_data['Title'].map(simple_title)

# Convert title to categorical
all_data['Title'] =  all_data['Title'].astype('category').cat.codes

# Create new feature for length of name
all_data['Name_length'] = all_data['Name'].map(len)

In [7]:
if 'Name' in all_data.columns:
    all_data.drop(['Name'], axis=1, inplace=True)

### Ticket
- Not sure, no clear pattern or meaning
- Lets justy drop it XD

In [8]:
if 'Ticket' in all_data.columns:
    all_data.drop(['Ticket'], axis=1, inplace=True)

### Sex
- All values 'male' or 'female'
- Convert to int

In [9]:
all_data['Sex'] = all_data['Sex'].map({'male': 0, 'female': 1}).astype(int)

### Age
- There are 263 missing values here
- Maybe we could convert to age categories (baby, toddler, child, teen, young adult..., middle-aged, ... uknown)
- The missing values dont seem to be correlated with any other attributes
- Age range is from 0.17 to 80

In [10]:
all_data['Age'] = pd.cut(all_data['Age'], 5, labels=["child", "young adult", "adult", "old", "oldest"])
all_data['Age'].cat.add_categories(['unknown'], inplace=True)
all_data.loc[all_data['Age'].isna(), 'Age'] = 'unknown'

  res = method(*args, **kwargs)


### Cabin
- Many missing values (77%) - is a missing value equivalent to not having a cabin ?
- Some passengers actually appear to have more than 1 cabin also
- We could create a new feature 'Cabins', indicating the number of cabins
- Lets drop the original variable

In [11]:
def get_n_cabins(cabin):
    if type(cabin) == str:
        cabins = cabin.strip().split(" ")
        return len(cabins)
    else:
        return 0
    
    
all_data['Cabins'] = all_data['Cabin'].map(get_n_cabins)

if 'Cabin' in all_data.columns:
    all_data.drop(['Cabin'], axis=1, inplace=True)

### Fare
- Has only 1 missing value
- We could try and guess those missing values based on another feature maybe
- An old single guy with no cabin in Pclass 3, probably a low fare
- We can see the mean fare for this type of passenger is 7.65282
- We could try splitting into fare categories in a later version to see if this improves predictions

In [12]:
all_data.loc[all_data['Fare'].isna(), 'Fare'] = 7.65282

all_data['Fare'] = all_data['Fare'].astype(int)

### Pclass
- No missing values
- Pclass is 1, 2, or 3
- High correlation with fare variable and number of cabins but the correlation is negative weirdly
- Lets reverse the order of Pclass from 3, 2, 1 to 1, 2, 3 (not sure if this will make a difference ?)

In [13]:
# Be careful not to run this code an odd number of times!! XD
all_data['Pclass'].replace({3: 1, 1: 3}, inplace=True)

### Sibsp & Parch
- The documentation says this is the number of siblings and the number of parents onboard the titanic respectively
- No missing values in each
- Looking at a correlation matrix shows that all other features are roughly correlated to 'SibSp' by + or - 0.1% how much they are correlated to 'Parch'
- We could maybe combine this into one variable, a future version could treat them differently to see if it makes any differrence

In [14]:
all_data['GroupSize'] = all_data['SibSp'] + all_data['Parch']

In [15]:
if 'SibSp' in all_data.columns:
    all_data.drop(['SibSp'], axis=1, inplace=True)

if 'Parch' in all_data.columns:
    all_data.drop(['Parch'], axis=1, inplace=True)

### Embarked
- Possible values are 'S' 'C' or 'Q' which indicates the port they embarked from
- Two missing values
- I wouldnt imagine this makes much difference to whether or not they survived but lets check a correlation matrix on the train data
- Nope, lets drop it... oops that might of been a bad idea, oh well we can always change in a later version

In [16]:
if 'Embarked' in all_data.columns:
    all_data.drop(['Embarked'], axis=1, inplace=True)

# Ok, so apart from the Train column we created to distinguish between test or train data, that seems to be all of the existing features prepped 
 # 🏁  🏁  🏁


 ## Ok well now lets encode the categorical variables to numeric
 - Luckily we only have to deal with Age

In [17]:
all_data['Age'] = all_data['Age'].cat.codes

### Ok, now lets split back out to train and test sets...

In [18]:
train_data = all_data.loc[all_data.Train == True]

test_data = all_data.loc[all_data.Train == False]

if 'Train' in train_data.columns:
    train_data.drop(['Train'], axis=1, inplace=True)
    
if 'Train' in test_data.columns:
    test_data.drop(['Train'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


### Split training and test data into X and Y sets

In [19]:
X_train = train_data.drop("Survived", axis=1)
Y_train = train_data.Survived

X_test = test_data.copy().drop("Survived", axis=1)

## Lets start building the model!!

#### First scale all the features, since they are all numeric

In [20]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# For version 1 of this notebook, lets only consider classifier models

### Given several types of classifier models, we can now use cross validation on the training data to find the best classifier model

In [21]:
classifiers = {
    "KNN": KNeighborsClassifier(), 
    "LR": LogisticRegression(max_iter=1000), 
    "DT": DecisionTreeClassifier(),
    "RF": RandomForestClassifier(),
    "SVM": SVC(),
    "MLP": MLPClassifier(max_iter=1000),
    "XGB": XGBClassifier(),
    "LGBM": LGBMClassifier()
}

results = dict()
for name, clf in classifiers.items():
    model = clf
    cv_results = cross_validate(
        model, X_train_scaled, Y_train, cv=5,
        scoring=('accuracy')
    )

    results[name] = cv_results['test_score'].mean()
    
from collections import Counter
Counter(results).most_common()



[('SVM', 0.8237775406440274),
 ('RF', 0.8125729709371665),
 ('LGBM', 0.8047329106772958),
 ('MLP', 0.8024731655263324),
 ('XGB', 0.8013495700207145),
 ('KNN', 0.796899127487289),
 ('LR', 0.7934906785512522),
 ('DT', 0.766580880045195)]

### We can see that the SVC (support vector classifier) model has the best score, so lets use it!

#### A cross validation grid search for the best hyperparameters for this model

In [22]:
svc = SVC(random_state=0)
params = {
    "kernel": ['linear', 'rbf', 'poly'],
    #"gamma": [0.1, 1, 10, 100], # I would comment these two hyperparemters out
    #"C": [0.1, 1, 10, 100, 1000], # because otherewise it takes too long to run
    "degree": [0, 1, 2, 3, 4, 5, 6] 
}
clf = GridSearchCV(svc, params, cv=10, verbose=2)
clf.fit(X_train_scaled, Y_train)
print("Best hyperparameter:", clf.best_params_)

Fitting 10 folds for each of 21 candidates, totalling 210 fits
[CV] END ............................degree=0, kernel=linear; total time=   0.0s
[CV] END ............................degree=0, kernel=linear; total time=   0.0s
[CV] END ............................degree=0, kernel=linear; total time=   0.0s
[CV] END ............................degree=0, kernel=linear; total time=   0.0s
[CV] END ............................degree=0, kernel=linear; total time=   0.0s
[CV] END ............................degree=0, kernel=linear; total time=   0.0s
[CV] END ............................degree=0, kernel=linear; total time=   0.0s
[CV] END ............................degree=0, kernel=linear; total time=   0.0s
[CV] END ............................degree=0, kernel=linear; total time=   0.0s
[CV] END ............................degree=0, kernel=linear; total time=   0.0s
[CV] END ...............................degree=0, kernel=rbf; total time=   0.0s
[CV] END ...............................degree

# Lets submit ✌️ 🤓 🥳

In [23]:
Y_pred = clf.predict(X_test_scaled)

submission = pd.DataFrame({
         "PassengerId": PassengerId,
         "Survived": Y_pred.astype(int)
})
submission.to_csv('submission.csv', index=False)