# 1. Loading Data

In [None]:
import pandas as pd
from pandas_profiling import ProfileReport
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import seaborn as sns

In [None]:
# Read the training data from the working directory and preview it.
train_path = 'train.csv'
df_train = pd.read_csv(train_path)
df_train.head()

In [None]:
# Summary statistics of the raw training frame, before any cleaning.
df_train.describe()

# 2. Variable Descriptions

In [None]:
# Column dtypes and non-null counts of the raw training frame.
df_train.info()

In [None]:
# Pclass is a class label rather than a quantity, so store it as object dtype.
df_train = df_train.astype({"Pclass": object})

Categorical variables: Survived, Sex, Pclass, Embarked, Cabin, Name, Ticket

Numerical variables: PassengerId, Age, SibSp, Parch, Fare

# 3. Missing Values

In [None]:
# Report the frame dimensions followed by the per-column count of missing values.
na_per_column = df_train.isna().sum()
print("Rows and Columns:", df_train.shape)
print("NA's:")
print(na_per_column)

### 3.1. Age

In [None]:
# Box plot of Age to inspect its spread before imputing the missing values.
sns.boxplot(x=df_train['Age'])

Fill in NA values of Age with mean

In [None]:
# Round ages to whole years, fill the gaps with the mean of the rounded ages,
# then store the column as integers (the int cast truncates the filled mean).
age_rounded = df_train['Age'].round()
age_filled = age_rounded.fillna(age_rounded.mean())
df_train['Age'] = age_filled.astype(int)

### 3.2. Cabin

Drop variable 'Cabin' because it has more than 75% of NA values

In [None]:
# Remove the Cabin column. The columns are named by keyword because pandas 2.0
# no longer accepts the axis as a positional argument to drop().
df_train = df_train.drop(columns=['Cabin'])

### 3.3. Embarked

In [None]:
# Frequency of each embarkation port (value_counts leaves missing values out).
Count_Emb = df_train['Embarked'].value_counts()

# One bar per port, with the count as bar height.
plt.bar(Count_Emb.index, Count_Emb.values)
plt.show()

Replace 'Embarked' NAs with the majority class

In [None]:
# Fill the missing embarkation ports with 'S'.
df_train = df_train.fillna({'Embarked': 'S'})

# 4. Correlation between features

In [None]:
# Pairwise correlation of the numeric columns only. The frame still holds
# string columns (Name, Ticket, ...), and pandas >= 2.0 raises if corr() is
# asked to correlate them, so they are filtered out explicitly.
df_train.select_dtypes(include='number').corr()

In [None]:
# Heatmap of the numeric-column correlations. Non-numeric columns are dropped
# first because pandas >= 2.0 no longer skips them silently inside corr().
numeric_corr = df_train.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, annot=True, linewidths=.5, fmt='.1f')
plt.show()

The highest correlations are between Parch/SibSp (0.4) and Fare/Survived (0.3)

# 5. Feature Engineering

### 5.1. Sex

Replace male class by 1 and female class by 0

In [None]:
# Recode Sex (male -> 1, female -> 0), keep it as an object column, and
# one-hot encode it, which yields one indicator column per code.
sex_codes = {'male': 1, 'female': 0}
df_train['Sex'] = df_train['Sex'].replace(sex_codes).astype(object)

df_train = pd.get_dummies(df_train, columns=["Sex"])

In [None]:
# Preview the frame after one-hot encoding Sex.
df_train.head()

### 5.2. Age

Turning 'Age' into categories

In [None]:
# Histogram of the imputed ages, used to judge sensible age-group boundaries.
plt.hist(df_train['Age'])
plt.show()

In [None]:
# Age-group edges and their names. With right=False each interval is
# left-closed: [0, 10), [10, 18), [18, 30), [30, 60), [60, 120).
bins = [0, 10, 18, 30, 60, 120]
labels = ['Child', 'Teen', 'Young Adult', 'Adult', 'Old']

age_group = pd.cut(df_train['Age'], bins=bins, labels=labels, right=False)
df_train = df_train.assign(AgeGroup=age_group)

In [None]:
# Replace the numeric Age column with one indicator column per age group.
without_age = df_train.drop(columns=['Age'])
df_train = pd.get_dummies(without_age, columns=["AgeGroup"])

In [None]:
# Preview the frame after encoding the age groups.
df_train.head()

### 5.3. Pclass

In [None]:
# One-hot encode the passenger class: one indicator column per class value.
df_train = pd.get_dummies(df_train, columns = ["Pclass"])

In [None]:
# Preview the frame after one-hot encoding Pclass.
df_train.head()

### 5.4. SibSp and Parch

Create new variable 'Family' (family size on board) by adding up the variables 'SibSp' and 'Parch', plus 1 for the passenger themselves

In [None]:
# Family = siblings/spouses + parents/children + the passenger themselves.
df_train['Family'] = df_train['SibSp'].add(df_train['Parch']).add(1)

In [None]:
# SibSp and Parch are now summarised by Family, so remove them.
df_train = df_train.drop(columns=['SibSp', 'Parch'])

In [None]:
# Preview the frame after replacing SibSp/Parch with Family.
df_train.head()

### 5.5. Embarked

In [None]:
# One-hot encode the embarkation port: one indicator column per port.
df_train = pd.get_dummies(df_train, columns = ["Embarked"])

In [None]:
# Preview the frame after one-hot encoding Embarked.
df_train.head()

### 5.6. Dropping PassengerId, Name and Ticket

In [None]:
# Remove the identifier-like columns so they are not used as model features.
id_like_columns = ['PassengerId', 'Name', 'Ticket']
df_train = df_train.drop(columns=id_like_columns)

### 5.7. Final Dataset

In [None]:
# Dtypes and non-null counts of the fully engineered frame.
df_train.info()

In [None]:
# Preview the fully engineered frame that is passed on to modelling.
df_train.head()

# 6. Modelling

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score 
from sklearn.model_selection import GridSearchCV
from sklearn.inspection import permutation_importance

In [None]:
# Feature matrix: every column except the target. The column is named by
# keyword because pandas 2.0 no longer accepts a positional axis in drop().
X = df_train.drop(columns=['Survived'])
# Target vector: the survival label.
y = df_train['Survived']

Split the dataset into training and testing set

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

Standardization of all features

In [None]:
# Learn the scaling statistics from the training split only, so nothing
# about the test split leaks into preprocessing.
scale = StandardScaler()
scale.fit(X_train)

# Apply the same fitted scaler to both splits.
X_train = scale.transform(X_train)
X_test = scale.transform(X_test)

Test 5 models to see which one performs best

In [None]:
# Candidate models, all seeded identically.
# Order: decision tree, random forest, SVC, neural network (MLP), XGBoost.
classifier = [
    DecisionTreeClassifier(random_state=0),
    RandomForestClassifier(random_state=0),
    SVC(random_state=0),
    MLPClassifier(random_state=0),
    xgb.XGBClassifier(random_state=0),
]

# Accuracy per model, filled in the same order as `classifier`.
train_results, test_results = [], []

In [None]:
# Start from empty result lists so that re-running this cell does not append
# a second copy of every score to the lists created in the previous cell.
train_results = []
test_results = []

# Fit each candidate on the training split and record its accuracy on both
# the training and the held-out test split, in the order of `classifier`.
for clf in classifier:
    clf.fit(X_train, y_train)
    pred_train = clf.predict(X_train)
    pred_test = clf.predict(X_test)
    acc_train = accuracy_score(y_train, pred_train)
    acc_test = accuracy_score(y_test, pred_test)
    train_results.append(acc_train)
    test_results.append(acc_test)

In [None]:
# Training-set accuracy per model, in the order the classifiers were fitted
# (listed below).
train_results

# DTree
# RForest
# SVC
# NN
# XGBoost

In [None]:
# Test-set accuracy per model, in the order the classifiers were fitted
# (listed below).
test_results

# DTree
# RForest
# SVC
# NN
# XGBoost

### Results (accuracy):
* DT = 0.8491620111731844
* RF = 0.8715083798882681
* SVC = 0.8435754189944135
* NN = 0.8268156424581006
* XGB = 0.8659217877094972

## 6.1. Hyperparameter Tuning

Hyperparameter tuning on the 2 best models chosen above, Random Forest and XGBoost

### Random Forest

In [None]:
# Exhaustive grid search over random-forest hyperparameters, scored on
# cross-validated accuracy and fitted on the standardised training split.
rf_base = RandomForestClassifier(random_state = 0)

# Only valid candidates are listed:
#   * n_estimators must be at least 1, so the range starts at 50 rather than 0;
#   * min_samples_split and min_samples_leaf do not accept None.
# Invalid values can never be selected; they only produce failed fits.
rf_param_grid = {'criterion': ['gini', 'entropy'],
                 'n_estimators': range(50, 200, 50),
                 'max_depth': [None, 1, 5, 10, 15, 20, 30, 40],
                 'min_samples_split' : [2, 5, 10, 30],
                 'min_samples_leaf' : [1, 5, 10, 30]}

# `rf` is bound to the fitted search object (not the bare estimator) so that
# best_estimator_ / best_score_ can be read from it afterwards.
rf = GridSearchCV(rf_base, rf_param_grid, scoring="accuracy", n_jobs=-1, verbose=1)
rf.fit(X_train, y_train)

In [None]:
# Random forest refitted with the best hyperparameter combination found.
rf.best_estimator_

In [None]:
# Mean cross-validated accuracy of the best random-forest configuration.
rf.best_score_

* Accuracy = 0.8300009849305624

### XGBoost

In [None]:
# Exhaustive grid search over XGBoost hyperparameters, scored on
# cross-validated accuracy and fitted on the standardised training split.

# The name `xgb` is rebound below from the xgboost module alias to the fitted
# search object. Importing the class directly keeps this cell re-runnable
# after that rebinding.
# NOTE: once this cell has run, `xgb.XGBClassifier` is no longer available;
# re-run the import cell before re-running cells that use the module alias.
from xgboost import XGBClassifier

xgb_base = XGBClassifier(random_state = 0)

# Candidate values are kept inside their valid ranges:
#   * learning_rate starts at 0.05 (a rate of 0 cannot learn anything); the
#     integer-based arange avoids float-step endpoint surprises;
#   * colsample_bylevel / colsample_bytree are fractions in (0, 1], so the
#     previous 0, 1.5 and 2 candidates (and duplicated values) are removed.
xgb_param_grid = {'max_depth': [None, 1, 3, 5, 10],
                  'min_child_weight': [0.0001, 0.001, 0.01, 0.1],
                  'gamma': np.arange(0.0, 40.0, 10.0),
                  'learning_rate': np.round(np.arange(1, 10) * 0.05, 2),
                  'colsample_bylevel': [0.5, 1.0],
                  'colsample_bytree': [0.5, 1.0]}

# `xgb` is bound to the fitted search object so that best_estimator_ /
# best_score_ can be read from it afterwards.
xgb = GridSearchCV(xgb_base, xgb_param_grid, scoring="accuracy", n_jobs=-1, verbose=1)
xgb.fit(X_train, y_train)

In [None]:
# XGBoost model refitted with the best hyperparameter combination found.
xgb.best_estimator_

In [None]:
# Mean cross-validated accuracy of the best XGBoost configuration.
xgb.best_score_

* Accuracy = 0.8342164877376146

## 6.2. Feature Importance

Analyze the feature importance in the xgboost model

In [None]:
# Feature names, in the same column order as the arrays used for training.
features = list(X.columns)

# Permutation importance of the tuned XGBoost search object, measured as the
# change in accuracy on the training split.
results = permutation_importance(xgb, X_train, y_train, scoring='accuracy', random_state=0)

# Mean importance of each feature across the permutation repeats.
importance = results.importances_mean

# Print one line per feature: position, name and mean importance.
for i, (feature_name, score) in enumerate(zip(features, importance)):
    print('%0d %s / Score: %.5f' % (i, feature_name, score))

In [None]:
# plot feature importance
figure(figsize=(10,5))
plt.bar([x for x in range(len(importance))], importance)
plt.show()

### 6.2.1. Iteration with most important features

Select the most important features and run the model one last time to see if it has improved performance

In [None]:
#Select all features with score > 'See graph above to select threshold'
# Keep every feature whose mean permutation importance exceeds the threshold
# (0.01, chosen from the importance graph above), then add the target
# column so the reduced frame can be split again.
features = list(X.columns)
df_fi = [name for name, score in zip(features, importance) if score > 0.01]

df_fi.append('Survived')
df_fi

In [None]:
# Reduced frame: the selected features followed by the Survived target.
df_train_fi = df_train.loc[:, df_fi]
df_train_fi

In [None]:
# The target was appended last, so it is the final column and everything
# before it is a feature.
X_fi = df_train_fi.iloc[:, :-1]
y_fi = df_train_fi.iloc[:, -1]

# Same 80/20 split and seed as the full-feature experiment.
X_train_fi, X_test_fi, y_train_fi, y_test_fi = train_test_split(
    X_fi, y_fi, test_size=0.2, random_state=0
)

In [None]:
# `xgb` is the GridSearchCV object at this point, so this call re-runs the
# whole hyperparameter search on the reduced feature set.
# NOTE(review): unlike the full-feature run, these features are not standardised.
xgb.fit(X_train_fi, y_train_fi)
# Predictions on the reduced test split, as a plain list.
# NOTE(review): y_pred_fi is not scored in the cells visible here.
y_pred_fi = list(xgb.predict(X_test_fi))

In [None]:
# Best XGBoost configuration found when searching on the reduced feature set.
xgb.best_estimator_

In [None]:
# Mean cross-validated accuracy of the best configuration on the reduced feature set.
xgb.best_score_

### Result
* Accuracy = 0.8440460947503201