# Titanic - Machine Learning from Disaster

This notebook works on the [Titanic](https://www.kaggle.com/c/titanic/) competition from Kaggle. Download all the data from Kaggle and put it in the <i>titanic</i> folder.

This notebook uses feature selection techniques. 


## 2. Read from CSV

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec

import seaborn as sns

%matplotlib inline
sns.set_theme()

In [None]:
training = pd.read_csv("titanic/train.csv")
# Pop the target and re-insert it so that 'Survived' ends up as the last
# column; later cells select features and target by position.
survived = training.pop('Survived')
training['Survived'] = survived
training.head()

### 2.2 Encoding data

[How to handle categorical data in scikit with pandas](https://www.kaggle.com/getting-started/27270)

In [None]:
from sklearn.preprocessing import RobustScaler, LabelEncoder, OrdinalEncoder, MinMaxScaler, StandardScaler

In [None]:
# Ordinal encoder shared by encode_data(); categories that were not seen
# during fit are encoded as -1 instead of raising an error.
titan_oe = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

# Scaler shared by encode_data(); maps the fitted columns into the [0, 1] range.
# Background on the available scalers:
# https://www.geeksforgeeks.org/standardscaler-minmaxscaler-and-robustscaler-techniques-ml/
titan_sc = MinMaxScaler(feature_range = (0,1))

In [None]:
def encode_data(X, train=True):
    """Add engineered features, scale numeric columns and encode categoricals.

    Parameters
    ----------
    X : pandas.DataFrame
        Passenger data containing the columns ``Sex``, ``Age``, ``SibSp``,
        ``Parch``, ``Fare``, ``Embarked``, ``PassengerId``, ``Name``,
        ``Ticket`` and ``Cabin``. The frame passed in is left untouched.
    train : bool, default True
        When true, the module-level ``titan_sc`` and ``titan_oe`` are fitted
        on ``X`` before being applied; otherwise their existing fitted state
        is reused (so a ``train=True`` call must have happened earlier).

    Returns
    -------
    pandas.DataFrame
        A new frame with two added columns, scaled ``Age``/``Fare``, encoded
        ``Sex``/``Embarked`` and without the identifier-like columns:

        * ``is_m`` -- 0 when ``Age < 17``, otherwise 1 for ``Sex == "female"``
          and 2 for everything else. A missing age never satisfies
          ``Age < 17``, so such rows get the sex-based code.
        * ``is_a`` -- 1 when ``SibSp + Parch == 0``, otherwise 0.
    """
    global titan_oe, titan_sc

    # Work on a private copy so the caller's frame (often a slice of another
    # frame) is neither mutated nor the source of SettingWithCopy warnings.
    X = X.copy()

    # Conditions are evaluated in order: the age test wins over the sex test.
    X['is_m'] = np.select(
        [X["Age"] < 17, X["Sex"] == "female"],
        [0, 1],
        default=2,
    )
    X['is_a'] = ((X["SibSp"] + X["Parch"]) == 0).astype(int)

    numeric_cols = ["Age", "Fare"]
    if train:
        titan_sc.fit(X[numeric_cols])
    X[numeric_cols] = titan_sc.transform(X[numeric_cols])

    categorical_cols = ["Sex", "Embarked"]
    if train:
        titan_oe.fit(X[categorical_cols])
    X[categorical_cols] = titan_oe.transform(X[categorical_cols])
    # Shift the codes up by one so that the encoder's "unknown" value (-1)
    # becomes 0 and the categories seen during fit start at 1.
    X["Embarked"] = X["Embarked"] + 1

    return X.drop(columns=["PassengerId", "Name", "Ticket", "Cabin"])

In [None]:
# Encode every column except the last one (the 'Survived' target), starting
# from a copy of the raw training frame.
e_X = encode_data(training.copy().iloc[:, :-1])

# Re-attach the target so it is again the last column of the encoded frame.
e_X['Survived'] = training['Survived']
e_X.head()

In [None]:
# profile = ProfileReport(e_X)
# profile

### 2.3 Remove NaN

In [None]:
# Print the frame summary (dtypes and non-null counts) to spot columns with gaps.
e_X.info()

In [None]:
# Count the NaN entries per column of the encoded frame.
print('Amount of missing values in each column: ')
e_X.isnull().sum()

In [None]:
# Imputation statistics shared with remove_nan(). They are placeholders here
# and get overwritten when remove_nan() is called with train=True.
age_mean_a = 0  # mean Age of rows with is_a == 1
age_mean = 0  # mean Age of rows with is_a == 0
fare_mean = 0  # mean Fare over all rows

In [None]:
def remove_nan(X, train=True):
    """Impute missing ``Age``, ``Fare`` and ``Embarked`` values.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame with the columns ``Age``, ``Fare``, ``Sex``, ``Embarked`` and
        ``is_a``. It is modified in place.
    train : bool, default True
        When true, the imputation statistics are computed from ``X`` and
        stored in the module-level ``age_mean_a``, ``age_mean`` and
        ``fare_mean``. When false, the previously stored values are reused.

    Returns
    -------
    pandas.DataFrame
        The same object as ``X``, where

        * missing ``Age`` is replaced by the mean age of rows with the same
          ``is_a`` flag (``age_mean_a`` for ``is_a == 1``, else ``age_mean``),
        * missing ``Fare`` is replaced by ``fare_mean``,
        * missing ``Embarked`` is replaced by 0,
        * ``Sex`` and ``Embarked`` are cast to ``int``.
    """
    global age_mean, age_mean_a, fare_mean

    if train:
        # Statistics are taken before any value is filled in.
        age_mean_a = X.loc[X["is_a"] == 1, "Age"].mean()
        age_mean = X.loc[X["is_a"] == 0, "Age"].mean()
        fare_mean = X["Fare"].mean()

    mask = X["Age"].isna()
    X.loc[mask, "Age"] = np.where(X.loc[mask, "is_a"].eq(1), age_mean_a, age_mean)

    # Assign the filled column back instead of calling fillna(inplace=True) on
    # a column selection: the chained in-place form is deprecated and has no
    # effect on the frame when pandas Copy-on-Write is active.
    X["Fare"] = X["Fare"].fillna(fare_mean)
    X["Embarked"] = X["Embarked"].fillna(0)

    X[["Sex", "Embarked"]] = X[["Sex", "Embarked"]].astype(int)

    return X

In [None]:
e_X = remove_nan(e_X)
# Sanity check: show any rows that still contain a NaN after imputation.
e_X[e_X.isnull().any(axis=1)].head()

In [None]:
# (e_X < 0).any()

In [None]:
# profile = ProfileReport(e_X)
# profile

### 2.4 Split into features and target class

In [None]:
# Features are every column but the last; the target is the last column.
X = e_X.iloc[:, :-1]
y = e_X.iloc[:, -1]

## 3. Feature selection

In [None]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [None]:
# Rescale every feature column into [0, 1] so all inputs share one range and
# are non-negative (the chi-squared scoring used further down needs that).
ss = MinMaxScaler(feature_range = (0,1))
X_scaled = ss.fit_transform(X)
# X_test_scaled = ss.transform(X_test)

### Univariate Selection

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [None]:
# Score every feature with the chi-squared test. k equals the number of
# columns, so no feature is dropped; the selector is only used for its scores.
bestfeatures = SelectKBest( score_func=chi2, k=len(X.columns) )
fit = bestfeatures.fit(X_scaled,y)

dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)

# Put feature names and scores side by side for easier reading.
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Specs','Score']  # name the two columns
print(featureScores.nlargest(10,'Score'))  # print the (up to) 10 highest-scoring features

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression on the scaled features; the first row of its
# coefficient matrix is plotted as a signed importance per feature.
model = LogisticRegression()
model.fit(X_scaled, y)

coef_table = {
    'Attribute': X.columns,
    'Importance': model.coef_[0],
}
importances = pd.DataFrame(data=coef_table).sort_values(
    by='Importance', ascending=False
)

plt.figure(figsize=(8, 8))
plt.bar(x=importances['Attribute'], height=importances['Importance'])
plt.title('Feature importances obtained from coefficients', size=20)
plt.xticks(rotation='vertical')
plt.show()

### Feature Importance

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

model = ExtraTreesClassifier()
model.fit(X_scaled, y)

# Print the importance of each feature as a percentage.
for name, score in zip(X.columns, model.feature_importances_):
    print(f'{name:10s}: {score * 100:.2f}')

plt.figure(figsize=(8, 8))
# Bar chart of all importances, largest first.
feat_importances = pd.Series(model.feature_importances_, index=X.columns)
ranked = feat_importances.nlargest(len(X.columns))
ranked.plot(kind='bar')
plt.show()


In [None]:
# ! pip install xgboost

In [None]:
from collections import Counter

# Tally how many samples fall into each class.
counter = Counter(y)

# Ratio of class-0 to class-1 samples, used as the scale_pos_weight estimate.
negatives, positives = counter[0], counter[1]
sp_weight = negatives / positives
print(f'Estimate: {sp_weight:.3f}')

In [None]:
from xgboost import XGBClassifier

# Train a gradient-boosted classifier, weighting the positive class by the
# class ratio estimated earlier, and plot its built-in feature importances.
XGB_model = XGBClassifier(scale_pos_weight=sp_weight)
XGB_model.fit(X_scaled, y)
importances = pd.DataFrame(data={
    'Attribute': X.columns,
    'Importance': XGB_model.feature_importances_
})
importances = importances.sort_values(by='Importance', ascending=False)

plt.figure(figsize=(8, 8))

plt.bar(x=importances['Attribute'], height=importances['Importance'])
# These values come from the boosted trees, not from model coefficients, so
# the title names the model instead of reusing the logistic-regression wording.
plt.title('Feature importances obtained from XGBoost', size=20)
plt.xticks(rotation='vertical')
plt.show()

In [None]:
# Grid of bar charts: one per XGBoost importance type, plus the model's
# default feature_importances_ in the last cell.
fig1 = plt.figure(figsize=(18, 12))
fig1.subplots_adjust(wspace=.05, hspace=0.25)

booster = XGB_model.get_booster()
# When the model is fitted on a plain array the booster has no feature names
# and reports scores under the keys "f0", "f1", ... in column order.
booster_names = booster.feature_names or [
    f"f{idx}" for idx in range(len(X.columns))
]

for i, f in enumerate(["weight", "gain", "cover", "total_gain", "total_cover"], start=1):
    ax = fig1.add_subplot(2, 3, i)

    # get_score() leaves out features that were never used in a split, so
    # look every feature up by name and default to 0. This keeps the bar
    # heights the same length as, and aligned with, X.columns.
    scores = booster.get_score(importance_type=f)
    h = [scores.get(name, 0.0) for name in booster_names]
    ax.bar(x=X.columns, height=h)
    plt.xticks(rotation='vertical')
    plt.title(" ".join(f.split("_")).capitalize(), size=20)

ax = fig1.add_subplot(2, 3, 6)

h = XGB_model.feature_importances_
ax.bar(x=X.columns, height=h)
plt.xticks(rotation='vertical')
plt.title("Importance", size=20)

# Tree-based importances are not coefficients; name the model in the title.
fig1.suptitle('Feature importance obtained from XGBoost', size=30)

plt.tight_layout()
plt.show()

### Correlation Matrix with Heatmap

In [None]:
corrmat = e_X.corr()
top_corr_features = corrmat.index
plt.figure(figsize=(8, 8))

# Heat map of the pairwise correlations, annotated with the coefficient values.
selected = e_X[top_corr_features]
g = sns.heatmap(selected.corr(), annot=True, cmap="RdYlGn")

In [None]:
plt.figure(figsize=(8, 8))

# Bar chart of each column's correlation with 'Survived'; the last entry
# (the target's correlation with itself) is left out.
target_corr = corrmat['Survived']
plt.bar(target_corr.index[:-1], target_corr[:-1])
plt.xticks(rotation='vertical')
plt.show()