# Titanic - Machine Learning from Disaster

This notebook tackles the [Titanic](https://www.kaggle.com/c/titanic/) competition from Kaggle. Download all the data from Kaggle and put it in the <i>titanic</i> folder.

This notebook runs in the [tfgpuenv](../environments/tfgpuenv.yml) environment. See the [README](../environments/README.md) for details.


## 2. Read from CSV

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec

import seaborn as sns

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply seaborn's default theme to the plots drawn below.
sns.set_theme()

In [None]:
# Load the training set, then move the target column to the far right so
# that "everything but the last column" is the feature matrix.
training = pd.read_csv("titanic/train.csv")
survived = training.pop('Survived')
training['Survived'] = survived
training.head()

### 2.2 Encoding data

[How to handle categorical data in scikit with pandas](https://www.kaggle.com/getting-started/27270)

In [None]:
from sklearn.preprocessing import RobustScaler, LabelEncoder, OrdinalEncoder, MinMaxScaler, StandardScaler

In [None]:
# Scaler for the numeric columns; maps each fitted feature into [0, 1].
# https://www.geeksforgeeks.org/standardscaler-minmaxscaler-and-robustscaler-techniques-ml/
titan_sc = MinMaxScaler(feature_range=(0, 1))

# Encoder for the categorical columns; categories that were not seen while
# fitting are encoded as -1 instead of raising.
titan_oe = OrdinalEncoder(
    unknown_value=-1,
    handle_unknown='use_encoded_value',
)

In [None]:
def encode_data(X, train=True):
    """Engineer features and scale/encode a raw Titanic passenger frame.

    The steps are applied in this order:

    1. ``is_m``: 0 when ``Age < 17``; otherwise 1 when ``Sex == "female"``
       and 2 for everyone else. A missing ``Age`` never compares below 17,
       so such rows get 1 or 2.
    2. ``is_a``: 1 when ``SibSp + Parch == 0`` (travelling alone), else 0.
    3. ``Age`` and ``Fare`` are scaled with the module-level ``titan_sc``.
    4. ``Sex`` and ``Embarked`` are encoded with the module-level
       ``titan_oe``; ``Embarked`` is then shifted by +1.
    5. ``PassengerId``, ``Name``, ``Ticket`` and ``Cabin`` are dropped.

    Parameters
    ----------
    X : pandas.DataFrame
        Raw passenger data containing the columns named above. The frame
        passed in is left untouched; all work happens on a copy.
    train : bool, default True
        When True, ``titan_sc`` and ``titan_oe`` are (re)fitted on ``X``
        before transforming. When False, the previously fitted state is
        reused, so the function must have been called with ``train=True``
        beforehand.

    Returns
    -------
    pandas.DataFrame
        The encoded copy of ``X``. Missing values are not imputed here.
    """
    # Work on a private copy: the caller's frame stays intact and pandas does
    # not warn about writing into a slice of another frame.
    X = X.copy()

    # Comparisons against NaN are False, so rows with unknown age fall through
    # to the sex-based categories.
    under_17 = X["Age"] < 17
    is_female = X["Sex"] == "female"
    X["is_m"] = np.select([under_17, is_female], [0, 1], default=2)
    X["is_a"] = ((X["SibSp"] + X["Parch"]) == 0).astype(int)

    numeric_cols = ["Age", "Fare"]
    if train:
        titan_sc.fit(X[numeric_cols])
    X[numeric_cols] = titan_sc.transform(X[numeric_cols])

    categorical_cols = ["Sex", "Embarked"]
    if train:
        titan_oe.fit(X[categorical_cols])
    X[categorical_cols] = titan_oe.transform(X[categorical_cols])
    # Shift by one so the encoder's "unknown" marker (-1) lands on 0.
    X["Embarked"] = X["Embarked"] + 1

    return X.drop(columns=["PassengerId", "Name", "Ticket", "Cabin"])

In [None]:
# Encode every column except the trailing target, then put the untouched
# target back as the last column.
e_X = encode_data(training.copy().iloc[:, :-1])
e_X = e_X.assign(Survived=training['Survived'])
e_X.head()

In [None]:
# profile = ProfileReport(e_X)
# profile

### 2.3 Remove NaN

In [None]:
# Column dtypes and non-null counts of the encoded training frame.
e_X.info()

In [None]:
# Per-column count of missing entries in the encoded training frame.
print('Amount of missing values in each column: ')
e_X.isna().sum()

In [None]:
# Imputation statistics shared between the train and test passes of
# remove_nan; they are overwritten when it runs with train=True.
age_mean_a = age_mean = fare_mean = 0

In [None]:
def remove_nan(X, train=True):
    """Impute missing ``Age``, ``Fare`` and ``Embarked`` values in ``X``.

    * ``Age`` is filled with the mean age of the matching ``is_a`` group
      (``age_mean_a`` for rows with ``is_a == 1``, ``age_mean`` otherwise).
    * ``Fare`` is filled with ``fare_mean``.
    * ``Embarked`` is filled with 0.
    * ``Sex`` and ``Embarked`` are finally cast to ``int``.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame with ``Age``, ``Fare``, ``Sex``, ``Embarked`` and ``is_a``
        columns. It is modified in place.
    train : bool, default True
        When True, the module-level statistics ``age_mean``, ``age_mean_a``
        and ``fare_mean`` are recomputed from ``X`` before imputing. When
        False, the values stored by an earlier ``train=True`` call are used.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    global age_mean, age_mean_a, fare_mean

    if train:
        # Series.mean skips NaN, so the statistics come from known values only.
        age_mean_a = X.loc[X["is_a"] == 1, "Age"].mean()
        age_mean = X.loc[X["is_a"] == 0, "Age"].mean()
        fare_mean = X["Fare"].mean()

    mask = X["Age"].isna()
    X.loc[mask, "Age"] = np.where(X.loc[mask, "is_a"].eq(1), age_mean_a, age_mean)

    # Assign the filled columns back explicitly: an in-place fillna on a
    # column selection is deprecated and has no effect under Copy-on-Write.
    X["Fare"] = X["Fare"].fillna(fare_mean)
    X["Embarked"] = X["Embarked"].fillna(0)

    X[["Sex", "Embarked"]] = X[["Sex", "Embarked"]].astype(int)

    return X

In [None]:
# Impute the training frame, then show any rows that still contain a NaN.
e_X = remove_nan(e_X)
rows_with_nan = e_X.isna().any(axis="columns")
e_X.loc[rows_with_nan].head()

In [None]:
# (e_X < 0).any()

In [None]:
# profile = ProfileReport(e_X)
# profile

### 2.4 data, class division

In [None]:
# Features are every column but the last; the last column is the target.
X = e_X.iloc[:, :-1]
y = e_X.iloc[:, -1]

## 3. Classification

### 3.2 Principal Component Analysis (PCA)

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Fit a PCA that keeps as many components as there are feature columns.
pca = PCA(n_components=X.shape[1])
principalComponents = pca.fit_transform(X.to_numpy())
ev = pca.explained_variance_ratio_

# Running total of the explained variance, with a leading 0 so the curve
# starts at "zero components".
cumulative_ev = np.concatenate(([0], ev)).cumsum()

plt.figure(figsize=(12, 6))
plt.plot(cumulative_ev)
plt.xlabel('Number of components')
plt.ylabel('Cumulative explained variance')
plt.grid(True, alpha=0.5)

plt.show()

In [None]:
# Number of principal components to keep (len(X.columns) would keep all).
n_com = 6
pca = PCA(n_components=n_com)

# Project the feature matrix onto the retained components.
X_pca = pca.fit_transform(X.to_numpy())

### 3.4 Check all models

In [None]:
# import sys
# print(sys.executable)
# print(sys.version)
# print(sys.version_info)

In [None]:
# ! conda install -c anaconda tensorflow-gpu
# ! conda list tensor

In [None]:
# # https://stackoverflow.com/a/66027093/2049763
# ! pip uninstall -y tensorflow-estimator
# ! pip uninstall -y tensorboard

# ! conda install scipy=1.4.1 
# ! conda install -y -c anaconda tensorflow-estimator=2.6
# ! conda install -y -c conda-forge tensorboard=2.6

In [None]:
import tensorflow as tf
# import tensorflow_addons as tfa
from tensorflow import keras

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout 
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping

tf.__version__

In [None]:
# Show what is available, then keep only the columns fed to the network.
print(X.columns)
X_new = X.loc[:, ['Pclass', 'Sex', 'Age', 'Fare', 'is_m', 'is_a']]

In [None]:
# Model architecture
# Model architecture: a fully connected network with light dropout after
# every hidden layer and one output unit per class.
model_m = Sequential([
    Dense(units=16, input_shape=(len(X_new.columns),), activation='relu'),
    Dropout(rate=0.1),
    Dense(units=32, activation='relu'),
    Dropout(rate=0.1),
    Dense(units=64, activation='relu'),
    Dropout(rate=0.1),
    Dense(units=8, activation='relu'),
    Dropout(rate=0.1),
    # The two classes are mutually exclusive and the loss below is sparse
    # categorical cross-entropy, which expects a probability distribution
    # over the classes; softmax provides one, independent sigmoids do not.
    Dense(units=2, activation='softmax'),
])
model_m.summary()

# Model compilation. Labels are integer class ids, hence the sparse loss.
model_m.compile(
    optimizer=Adam(learning_rate=0.001),  # alternative: SGD(lr=0.01, momentum=0.95)
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Stop once the training loss has not improved for 5 consecutive epochs.
# No validation data is passed to fit, so the training loss is monitored.
es = EarlyStopping(monitor='loss', mode='auto', verbose=1, patience=5)
# tqdm_callback = tfa.callbacks.TQDMProgressBar()

# Model training on the full training set.
model_m.fit(
    x=X_new.values,
    y=y.values,
    batch_size=8,
    epochs=50,
    # validation_split=0.20,
    shuffle=True,
    verbose=1,
    callbacks=[es],
)

## 4. Submission 

In [None]:
# Load the test set and keep the passenger ids aside for the submission frame.
testing = pd.read_csv("titanic/test.csv")
df = testing.filter(items=["PassengerId"], axis="columns")

testing.head()

In [None]:
# Encode the test set with the scaler/encoder state fitted on the training data.
testing = encode_data(X=testing, train=False)
testing.head()

In [None]:
# Impute the test set with the statistics stored from the training pass,
# then show any rows that still contain a NaN.
testing = remove_nan(X=testing, train=False)
test_rows_with_nan = testing.isna().any(axis="columns")
testing.loc[test_rows_with_nan].head()

In [None]:
# Preview the test features after encoding and imputation.
testing.head()

### 4.1 Prediction

In [None]:
# Same feature columns, in the same order, as the ones the model was trained on.
XT_new = testing.loc[:, ['Pclass', 'Sex', 'Age', 'Fare', 'is_m', 'is_a']]

In [None]:
# Per-class scores for the test set; the predicted label is the index of the
# highest-scoring class.
y_p = model_m.predict(x=XT_new.values, batch_size=8, verbose=0)
df["Survived"] = y_p.argmax(axis=1)

df.head()

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, cohen_kappa_score

In [None]:
# Predicted class labels for the training features.
y_p1 = model_m.predict(x=X_new.values, batch_size=8, verbose=0).argmax(axis=1)

# Predicted class labels for the test features, reusing the scores in y_p.
# y_p2 = model_m.predict(x=XT_new.values, batch_size=8, verbose=0)
y_p2 = y_p.argmax(axis=1)

# NOTE(review): the second file is a submission CSV, not ground truth, so the
# second report presumably measures agreement with a previously saved
# submission rather than accuracy -- confirm this is intended.
reference_files = ["titanic/train.csv", "titanic/submission.csv"]
for f, r in zip(reference_files, [y_p1, y_p2]):
    result = pd.read_csv(f)

    report = classification_report(result["Survived"].to_numpy(), r)
    print(report)

In [None]:
# df.to_csv("titanic/submission.csv", index=False)
# ! kaggle competitions submit -c titanic -f titanic/submission.csv -m "Keras DNN"