# Titanic - Machine Learning from Disaster

This notebook tackles the [Titanic](https://www.kaggle.com/c/titanic/) competition from Kaggle. Download all the data from Kaggle and put it in the <i>titanic</i> folder.

This notebook runs in the [hsi_env](../environments/hsi_env.yml) environment. See the [README](../environments/README.md) for details.


## 1. Configure and download the dataset from Kaggle

See the [kaggle-api](https://github.com/Kaggle/kaggle-api) documentation for details.

In [None]:
# ! E:/ProgramData/Anaconda3/envs/hsi_env/Scripts/pip install kaggle

In [None]:
# ! kaggle competitions download -c titanic
# ! unzip titanic.zip -d titanic
# ! del titanic.zip

## 2. Read from CSV

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the labelled training set and move the target column to the end:
# `pop` removes 'Survived' from the frame and `assign` appends it again as
# the last column, so features and label can later be split by position.
training = pd.read_csv("titanic/train.csv")
training = training.assign(Survived=training.pop('Survived'))

# Preview the first rows (cell output).
training.head()

### 2.1 Remove NaN

In [None]:
# Imputation values shared between the training and the test pass of
# remove_nan(); they are overwritten when remove_nan() runs with train=True.
age_mean = fare_mean = 0

In [None]:
def remove_nan(X, train=True):
    """Return a copy of a passenger table with its missing values filled.

    Missing ``Age`` and ``Fare`` entries are replaced by column means, and
    missing ``Cabin`` and ``Embarked`` entries by the literal string
    ``"NaN"``, which makes "unknown" a category of its own.

    Parameters
    ----------
    X : pandas.DataFrame
        Table containing the columns ``Age``, ``Fare``, ``Cabin`` and
        ``Embarked``. The frame passed in is left unmodified.
    train : bool, default True
        If true, the ``Age``/``Fare`` means are computed from ``X`` and
        stored in the module-level ``age_mean``/``fare_mean``. If false,
        the values currently stored there are reused, so held-out data is
        imputed with the statistics of the training data.

    Returns
    -------
    pandas.DataFrame
        A new frame with the four columns filled.
    """
    global age_mean, fare_mean

    if train:
        age_mean = X["Age"].mean()
        fare_mean = X["Fare"].mean()

    # Work on a copy so the caller's frame is never modified behind its back.
    X = X.copy()

    X["Age"] = X["Age"].fillna(age_mean)
    X["Fare"] = X["Fare"].fillna(fare_mean)

    X["Cabin"] = X["Cabin"].fillna("NaN")
    X["Embarked"] = X["Embarked"].fillna("NaN")

    return X

In [None]:
# Impute the training set; train=True (the default) also records the Age/Fare
# means in the module-level variables for later use on the test set.
training = remove_nan(training)

In [None]:
# Sanity check: list (up to five) rows that still contain a missing value.
training.loc[training.isna().any(axis="columns")].head()

## 3. Classification

In [None]:
from sklearn.preprocessing import RobustScaler, LabelEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [None]:
# Split by position: every column but the last is a feature, the last column
# ('Survived', moved to the end when the CSV was loaded) is the label.
X = training.iloc[:, :-1]
y = training.iloc[:, -1]

### 3.1 Encoding data

[How to handle categorical data in scikit with pandas](https://www.kaggle.com/getting-started/27270)

In [None]:
# Shared preprocessing objects used by encode_data(): they are fitted when it
# is called with train=True and only applied when called with train=False.
# The ordinal encoder maps categories it did not see during fitting to -1.
titan_oe = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
# Scaler applied to the numeric Age and Fare columns.
norm_sc = RobustScaler()

In [None]:
def encode_data(X, train=True):
    """Turn an imputed passenger table into a numeric feature table.

    Steps, in order:

    * add a ``Male`` column derived from ``Sex`` (``'male'`` -> 0,
      ``'female'`` -> 1; note that, despite the column name, 1 marks
      female passengers; any other value becomes NaN);
    * ordinal-encode ``Cabin`` and ``Embarked`` with the module-level
      ``titan_oe``;
    * scale ``Age`` and ``Fare`` with the module-level ``norm_sc``;
    * drop ``PassengerId``, ``Name``, ``Ticket`` and ``Sex``.

    Parameters
    ----------
    X : pandas.DataFrame
        Table containing the columns named above, without missing values
        in the encoded/scaled columns. The frame passed in is left
        unmodified.
    train : bool, default True
        If true, ``titan_oe`` and ``norm_sc`` are (re)fitted on ``X`` before
        being applied. If false, they are applied as previously fitted, so
        held-out data is transformed exactly like the training data.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the encoded features.
    """
    categorical = ["Cabin", "Embarked"]
    numeric = ["Age", "Fare"]

    # Copy first: the caller may pass a slice of another frame (for example
    # `training.iloc[:, :-1]`), and writing columns into such a slice is a
    # chained assignment that pandas warns about and that could leak the
    # changes into the caller's data.
    X = X.copy()

    X['Male'] = X['Sex'].map({'male': 0, 'female': 1})

    if train:
        titan_oe.fit(X[categorical])
        norm_sc.fit(X[numeric])

    X[categorical] = titan_oe.transform(X[categorical])
    X[numeric] = norm_sc.transform(X[numeric])

    # Identifiers and free text carry no usable signal for the models here;
    # 'Sex' is now represented by 'Male'.
    return X.drop(columns=["PassengerId", "Name", "Ticket", "Sex"])

In [None]:
# Encode the training features; train=True (the default) fits the shared
# encoder and scaler on this data.
X = encode_data(X)
# Preview the encoded features (cell output).
X.head()

### 3.2 Principal Component Analysis (PCA)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.decomposition import PCA

In [None]:
# Apply seaborn's default theme to the matplotlib figures drawn below.
sns.set_theme()

In [None]:
# pca = PCA(n_components = 8) 

# principalComponents = pca.fit_transform(X.values)
# ev=pca.explained_variance_ratio_

# plt.figure(figsize=(12, 6))
# plt.plot(np.cumsum(ev))
# plt.xlabel('Number of components')
# plt.ylabel('Cumulative explained variance')
# plt.grid(True, alpha=0.5)

# plt.show()

In [None]:
# Project the encoded training features onto n_com principal components.
n_com = 8  # chosen because the cumulative explained variance (CEV) is saturated there
# NOTE(review): X appears to have 8 columns after encode_data, in which case
# every component is kept and no dimensions are dropped — confirm this is intended.
pca = PCA(n_components=n_com)

X_pca = pca.fit_transform(X.values)

### 3.4 Check all models

In [None]:
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingRandomSearchCV, GridSearchCV

In [None]:
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

#### Ensemble Methods

In [None]:
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.calibration import CalibratedClassifierCV

In [None]:
# X_train, X_test, y_train, y_test = train_test_split(X.to_numpy(), y.to_numpy(), test_size=0.2)

### 3.4.1 SVM

In [None]:
# rbf_svc = SVC(kernel='rbf', cache_size=1024, verbose=True, max_iter=10000)
# rbf_svc.fit(X_train, y_train)

# y_pred = rbf_svc.predict(X_test)
# print(classification_report(y_test, y_pred, target_names = ["Dead", "Alive"]))

### 3.4.2 LR

In [None]:
# lr = LogisticRegression(n_jobs=-1, max_iter=10000)
# lr.fit(X_train, y_train)

# y_pred = lr.predict(X_test)
# print(classification_report(y_test, y_pred, target_names = ["Dead", "Alive"]))

### 3.4.3 AdaBoost

In [None]:
# ada = AdaBoostClassifier(base_estimator=DecisionTreeClassifier() )

# params = {
#     'n_estimators': [int(x) for x in np.linspace(start = 80, stop = 200, num = 10)],
#     'learning_rate': [0.01, 0.1, 0.5, 1],
#     'base_estimator__criterion': ['gini', 'entropy'],
#     'max_depth': [5, 9, 14, 19],
#     'min_samples_split': [3, 5, 7],
#     'base_estimator__min_samples_leaf': [1, 3, 5],
#     'base_estimator__splitter': ['best', 'random'],
#     'base_estimator__max_features': [None, 'auto', 'log2'],
#     'base_estimator__max_leaf_nodes': [None, 5, 9],
#     'base_estimator__class_weight': [None, 'balanced']
# }

# gs = HalvingRandomSearchCV(
#     ada, param_distributions=params, scoring='accuracy', verbose=1
# )

# gs.fit(X_pca, y)
# gs.best_estimator_, gs.best_params_, gs.best_score_

### 3.4.4 Calibrated 

In [None]:
# calibrated_forest = CalibratedClassifierCV(
#     base_estimator=RandomForestClassifier(n_jobs=-1), 
#     cv=5, n_jobs=-1)

# # RandomForestClassifier().get_params().keys()
# params = {
#     'method': ['sigmoid', 'isotonic'],
#     'ensemble': [True, False],
#     'base_estimator__n_estimators': [int(x) for x in np.linspace(start = 10, stop = 200, num = 10)],
#     'base_estimator__criterion': ['gini', 'entropy'],
#     'base_estimator__max_depth': [None, 5, 10, 15, 20],
#     'base_estimator__min_samples_split': [0.1, 0.3, 0.5, 3, 5, 7],
#     'base_estimator__min_samples_leaf': [0.1, 0.3, 0.5, 1, 3, 5],
#     'base_estimator__max_features': ['sqrt', 'auto', 'log2'],
#     'base_estimator__max_leaf_nodes': [None, 5, 9],
#     'base_estimator__class_weight': [None, 'balanced', 'balanced_subsample']
# }

# gs = HalvingRandomSearchCV(
#     calibrated_forest, param_distributions=params, scoring='accuracy', verbose=1
# )

# gs.fit(X, y)
# gs.best_estimator_, gs.best_params_, gs.best_score_

### 3.4.5 RF

In [None]:
# rf =RandomForestClassifier(n_jobs=-1)

# # RandomForestClassifier().get_params().keys()
# params = {
#     'n_estimators': [int(x) for x in np.linspace(start = 10, stop = 200, num = 10)],
#     'criterion': ['gini', 'entropy'],
#     'max_depth': [None, 5, 10, 15, 20],
#     'min_samples_split': [0.1, 0.3, 0.5, 3, 5, 7],
#     'min_samples_leaf': [0.1, 0.3, 0.5, 1, 3, 5],
#     'max_features': ['sqrt', 'auto', 'log2'],
#     'max_leaf_nodes': [None, 5, 9],
#     'class_weight': [None, 'balanced', 'balanced_subsample']
# }

# gs = HalvingRandomSearchCV(
#     rf, param_distributions=params, scoring='accuracy', verbose=1
# )

# gs.fit(X, y)
# gs.best_estimator_, gs.best_params_, gs.best_score_

### 3.3 K-Fold

In [None]:
def evaluate_all_models(models, X, y, n_splits=10, seed=42, scoring='accuracy'):
    """Cross-validate every model on the same shuffled K-fold split.

    One line per model is printed in the form ``name: mean (std)``.

    Parameters
    ----------
    models : iterable of (str, estimator)
        Label / estimator pairs to evaluate, in order.
    X, y
        Features and labels passed unchanged to ``cross_val_score``.
    n_splits : int, default 10
        Number of folds.
    seed : int, default 42
        ``random_state`` of the shuffled ``KFold``; every model is scored
        on the same splitter configuration.
    scoring : str, default 'accuracy'
        Metric passed to ``cross_val_score``.

    Returns
    -------
    names : list of str
        The model labels, in evaluation order.
    results : list
        The per-fold score array of each model, aligned with ``names``.
    """
    k_fold = model_selection.KFold(n_splits=n_splits, shuffle=True, random_state=seed)

    names = []
    results = []
    for name, model in models:
        cv_results = model_selection.cross_val_score(model, X, y, cv=k_fold, scoring=scoring)
        names.append(name)
        results.append(cv_results)
        print("%s: %f (%f)" % (name, cv_results.mean(), cv_results.std()))

    return names, results

In [None]:
def plot_comparison(names, results):
    """Show a box plot comparing the score distributions of several models.

    Parameters
    ----------
    names : sequence of str
        Tick labels, one per box.
    results : sequence
        One collection of scores per model, aligned with ``names``.
    """
    # The figure grows wider with the number of models so labels stay legible.
    width = int(1.2 * len(names))
    fig, ax = plt.subplots(figsize=(width, 8))
    fig.suptitle('Algorithm Comparison')

    ax.boxplot(results)
    ax.set_xticklabels(names)

    plt.show()

In [None]:
# Candidate classifiers as (label, estimator) pairs, in the order they are
# evaluated and plotted.
#
# The wrapped estimator of every meta-estimator is passed positionally: the
# keyword was renamed from `base_estimator` to `estimator` in newer
# scikit-learn releases, while the positional form works with both.
models = [
    # --- single models ---
    ('LR', LogisticRegression(n_jobs=-1, max_iter=10000)),
    ('LDA', LinearDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier()),
    ('NB', GaussianNB()),
    # NOTE(review): SVC's documented default kernel is 'rbf', so 'SVM' and
    # 'SVM_R' appear to be the same model — confirm whether one was meant to
    # use a different kernel.
    ('SVM', SVC(cache_size=1024, max_iter=10000)),
    ('SVM_R', SVC(kernel='rbf', cache_size=1024, max_iter=10000)),

    # --- bagging ---
    ('BAG_LR', BaggingClassifier(
        LogisticRegression(n_jobs=-1, max_iter=10000),
        n_estimators=100, max_samples=0.5, max_features=0.5)),
    ('BAG_LD', BaggingClassifier(
        LinearDiscriminantAnalysis(),
        n_estimators=100, max_samples=0.5, max_features=0.5)),

    # --- boosting ---
    # Despite the 'AB_LD' label, this boosts decision trees.
    # max_features='sqrt' replaces the former 'auto' alias, which meant the
    # same thing for classification trees and is rejected by newer releases.
    ('AB_LD', AdaBoostClassifier(
        DecisionTreeClassifier(
            class_weight='balanced', max_depth=9, max_features='sqrt',
            min_samples_leaf=3, min_samples_split=3),
        learning_rate=1, n_estimators=133)),

    # --- tree ensembles ---
    ('ET', ExtraTreesClassifier(n_estimators=100, n_jobs=-1, random_state=0)),
    ('RF1', RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=0)),
    ('RF2', RandomForestClassifier(
        class_weight='balanced', criterion='entropy', max_depth=5, max_features='sqrt',
        min_samples_leaf=3, min_samples_split=3, n_estimators=178, n_jobs=-1, random_state=0)),

    # --- probability-calibrated random forest ---
    ('CF', CalibratedClassifierCV(
        RandomForestClassifier(
            class_weight='balanced_subsample', criterion='entropy', max_depth=10,
            max_features='log2', max_leaf_nodes=9, min_samples_split=5,
            n_estimators=178, n_jobs=-1, random_state=0),
        cv=5, method='isotonic', n_jobs=-1)),
]

### 3.3.1 PCA

In [None]:
# Cross-validate every candidate on the PCA-projected features and compare
# the per-fold scores in a box plot.
names, results = evaluate_all_models(models, X_pca, y)
plot_comparison(names, results)

### 3.3.2 All data

In [None]:
# Same comparison on the encoded features without the PCA projection.
names, results = evaluate_all_models(models, X, y)
plot_comparison(names, results)

## 4. Submission 

In [None]:
# Load the unlabelled test set. `df` keeps only the passenger ids and becomes
# the submission table once the predictions are added.
testing = pd.read_csv("titanic/test.csv")
df = testing.filter(items=["PassengerId"], axis="columns")

# Preview the first rows (cell output).
testing.head()

In [None]:
# Impute the test set with the Age/Fare means stored during the training pass
# (train=False reuses them instead of recomputing).
testing = remove_nan(testing, train=False)
# Sanity check: show rows that still contain a missing value.
testing[testing.isnull().any(axis=1)].head()

In [None]:
# Encode the test set with the encoder and scaler already fitted on the
# training data (train=False skips the fitting step).
testing = encode_data(testing, train=False)
# Preview the encoded test features (cell output).
testing.head()

In [None]:
# Model used for the submission: a random forest with the same settings as
# the 'RF1' candidate, trained on all encoded (non-PCA) training features.
rf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=0)
rf.fit(X, y)

In [None]:
# Alternative model: a probability-calibrated random forest trained on the
# PCA-projected features.
# The wrapped forest is passed positionally because its keyword was renamed
# from `base_estimator` to `estimator` in newer scikit-learn releases; the
# positional form works with both.
cf = CalibratedClassifierCV(
    RandomForestClassifier(
        class_weight='balanced_subsample', criterion='entropy', max_depth=10,
        max_features='log2', max_leaf_nodes=9, min_samples_split=5,
        n_estimators=178, n_jobs=-1, random_state=0),
    cv=5, method='isotonic', n_jobs=-1)

cf.fit(X_pca, y)

In [None]:
# Predict survival for the test passengers with the random forest.
df["Survived"] = rf.predict(testing)

# Disabled alternative: predict with the calibrated forest, which needs the
# test features projected by the PCA fitted on the training data.
# t_pca = pca.transform( testing.values )
# df["Survived"] = cf.predict( t_pca )

# Write the submission file (PassengerId and Survived, no index column) and
# show five random rows as a quick check.
df.to_csv("titanic/submission.csv", index=False)
df.sample(5)

In [None]:
# ! kaggle competitions submit -c titanic -f titanic/submission.csv -m "RandomForestClassifier"