# Ensemble and stacking techniques in the Titanic competition

In [1]:
# Load in libraries

In [2]:
import pandas as pd
import numpy as np
import re
import sklearn
import xgboost as xgb
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls

import warnings
warnings.filterwarnings('ignore')

# Going to use these 5 base models for the stacking
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier)
from sklearn.svm import SVC
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve
from sklearn.linear_model import LinearRegression

ModuleNotFoundError: No module named 'xgboost'

## PDA, Feature engineering and cleaning

In [None]:
# Read the training and test CSVs from the local ./Dataset directory.
train = pd.read_csv('./Dataset/train.csv')
test = pd.read_csv('./Dataset/test.csv')
# Show the first 20 training rows as a quick sanity check.
train.head(20)

In [None]:
# Stack train and test row-wise so that cleaning and feature engineering are
# applied to both in one pass. sort=True sorts the column axis when the two
# frames' columns are not aligned (presumably test has no 'Survived' column -
# TODO confirm). ignore_index is not set, so each frame keeps its own index
# labels and labels can repeat in the combined frame.
dataset = pd.concat([train,test],axis=0,sort=True)

In [None]:
print(train.shape)
print(test.shape)
print(dataset.shape)

In [None]:
dataset.describe()

In [None]:
dataset.isnull().sum()

In [None]:
# Impute missing fares with the mean fare. The filled column is assigned back
# to the frame instead of using chained indexing (`df[col][mask] = value`),
# which may write to a temporary copy and leave `dataset` unchanged.
dataset['Fare'] = dataset['Fare'].fillna(dataset['Fare'].mean())
# Display the remaining number of missing fares (expected: 0).
dataset['Fare'].isnull().sum()

In [None]:
dataset['Embarked'].value_counts()

In [None]:
# Fill missing embarkation ports with 'S' (presumably the most frequent port
# in the value_counts output of the previous cell - verify). Assigning the
# filled column back avoids chained indexing, which may modify a temporary
# copy instead of `dataset`.
dataset['Embarked'] = dataset['Embarked'].fillna('S')

In [None]:
dataset['Embarked'].isnull().sum()

In [None]:
# Correlation heatmap of the numeric columns only. The frame still contains
# text columns at this point (e.g. Name, Sex), and recent pandas versions
# raise on non-numeric data in corr() instead of silently skipping it.
sns.heatmap(dataset.select_dtypes(include='number').corr(), annot=True)

In [None]:
# Age prediction, step 1: build the regression inputs.
# Passengers with a known age form the training set; passengers with a
# missing age are the rows whose age will be predicted.
age_is_missing = dataset['Age'].isnull()
age_features = dataset[['Fare', 'Parch', 'Pclass', 'SibSp']]
age_test_x = age_features[age_is_missing]
age_train_x = age_features[~age_is_missing]
age_train_y = dataset['Age'][~age_is_missing]
print(age_is_missing)

In [None]:
dataset.isnull().sum()

In [None]:
age_train_x.isnull().sum()

In [None]:
age_train_y.isnull().sum()

In [None]:
print(age_train_x.shape)
print(age_train_y.shape)
print(len(age_train_x))
age_train_y.head()
age_train_y.astype('int')

In [None]:
# Age prediction, step 2: fit a linear model on the passengers with a known
# age (fit() returns the fitted estimator, so the call can be chained).
linear_Regression = LinearRegression(fit_intercept=True, n_jobs=None).fit(
    age_train_x, age_train_y
)
# Predict the missing ages and keep the magnitude so that no predicted age
# is negative.
predict = abs(linear_Regression.predict(age_test_x))
print(predict)

In [None]:
# Write the predicted ages into the rows whose Age is missing. A single .loc
# assignment replaces the chained `df[col][mask] = ...` form, which may
# modify a temporary copy and leave `dataset` unchanged.
dataset.loc[dataset['Age'].isnull(), 'Age'] = predict

In [None]:
dataset.isnull().sum()

In [None]:
dataset.head()

In [None]:
dataset['Sex']

In [None]:
dataset['Name'].head()

In [None]:
# Length of the full name string.
dataset['Name_length'] = dataset['Name'].apply(len)
# 1 when a cabin is recorded, 0 when it is missing. A vectorised null check
# replaces the per-element `type(x) == float` test.
dataset['Has_Cabin'] = dataset['Cabin'].notnull().astype(int)
# Siblings/spouses + parents/children + the passenger themselves.
dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1
# 1 when the passenger travels with no family members.
dataset['IsAlone'] = (dataset['FamilySize'] == 1).astype(int)
# Defensive fills: no-ops when these columns have already been imputed.
dataset['Embarked'] = dataset['Embarked'].fillna('S')
dataset['Fare'] = dataset['Fare'].fillna(dataset['Fare'].median())
# Quartile bins of the fare.
# NOTE(review): this interval-valued column is not in the later
# `drop_elements` list; confirm it is removed before fitting any model.
dataset['CategoricalFare'] = pd.qcut(dataset['Fare'], 4)
# Truncate ages to whole years (requires Age to have no missing values).
dataset['Age'] = dataset['Age'].astype(int)
def get_title(name):
    """Return the title embedded in a passenger name.

    The title is the first run of ASCII letters that is preceded by a space
    and immediately followed by a period, e.g. ``"Mr"`` in
    ``"Braund, Mr. Owen Harris"``.

    Args:
        name: Full passenger name string.

    Returns:
        The matched title without the trailing period, or an empty string
        when the name contains no such pattern.
    """
    # Raw string: '\.' is an invalid escape sequence in a normal literal.
    title_search = re.search(r' ([A-Za-z]+)\.', name)
    return title_search.group(1) if title_search else ""
# Extract the title from every name, then fold rare titles and spelling
# variants into the main groups with a single replacement table.
title_aliases = {
    rare: 'Rare'
    for rare in ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
}
title_aliases.update({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})
dataset['Title'] = dataset['Name'].apply(get_title).replace(title_aliases)

# Encode sex as 0 (female) / 1 (male).
dataset['Sex'] = dataset['Sex'].map({'female' : 0, 'male' : 1}).astype(int)
# Encode titles as small integers; titles missing from the table become 0.
title_mapping = {'Mr' : 1, 'Miss':2, 'Mrs' : 3, 'Master' : 4, 'Rare' : 5}
dataset['Title'] = dataset['Title'].map(title_mapping).fillna(0)

# Encode the embarkation port as an integer code.
dataset['Embarked'] = dataset['Embarked'].map({'S' : 0, 'C':1, 'Q' : 2}).astype(int)
# Band the fare into four ordinal groups with right-closed intervals:
# <= 7.91 -> 0, (7.91, 14.454] -> 1, (14.454, 31] -> 2, > 31 -> 3.
fare_bin_edges = [-np.inf, 7.91, 14.454, 31, np.inf]
dataset['Fare'] = pd.cut(dataset['Fare'], bins=fare_bin_edges, labels=False).astype(int)

dataset.head()

In [None]:
# Band Age into five ordinal groups with right-closed intervals:
# <= 16 -> 0, (16, 32] -> 1, (32, 48] -> 2, (48, 64] -> 3, > 64 -> 4.
# A single pd.cut keeps the band edges in one place; a rule of the form
# `Age > 64 & Age <= 64` can never match and would leave ages 49-64 unbanded.
age_bin_edges = [-np.inf, 16, 32, 48, 64, np.inf]
dataset['Age'] = pd.cut(dataset['Age'], bins=age_bin_edges, labels=False).astype(int)

# Remove identifier/text columns and SibSp from the feature frame.
drop_elements = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp']
dataset = dataset.drop(drop_elements, axis=1)

In [None]:
dataset.head()

In [None]:
# Correlation heatmap of the engineered features. Only numeric columns are
# correlated: the frame still holds the categorical 'CategoricalFare' column,
# and recent pandas versions raise on non-numeric data in corr().
plt.figure(figsize=(14,12))
sns.heatmap(dataset.select_dtypes(include='number').corr(), annot=True)

In [None]:
dataset.shape

In [None]:
# Split the combined frame back into its two parts by position: it was built
# as [train, test], so the first len(train) rows are the training passengers
# and the remaining rows are the test passengers.
train_dataset = dataset.iloc[:len(train)]
test_dataset = dataset.iloc[len(train):]

In [None]:
print(train_dataset.shape)
print(test_dataset.shape)

In [None]:
dataset.head()

In [None]:
train_dataset.head()

In [None]:
test_dataset.head()

In [None]:
# Shared settings for the out-of-fold stacking code.
seed = 0      # random_state used for the folds and the base models
nfolds = 5    # number of cross-validation folds
# Row counts of the engineered train/test frames.
ntrain = train_dataset.shape[0]
ntest = test_dataset.shape[0]

In [None]:
# Shuffled K-fold splitter for the out-of-fold predictions. n_splits is tied
# to `nfolds` explicitly so the two cannot drift apart.
kfold = KFold(n_splits=nfolds, shuffle=True, random_state=seed)
class SKlearnHelper(object):
    """Thin wrapper giving every base classifier the same small interface.

    Args:
        clf: Estimator class (not an instance); it is called as
            ``clf(**params)``.
        seed: Value passed to the estimator as ``random_state``.
        params: Optional dict of keyword arguments for the estimator. The
            dict is copied, so the caller's object is never modified.
    """

    def __init__(self, clf, seed=0, params=None):
        # Copy before adding random_state: this tolerates params=None and
        # keeps a shared parameter dict from being mutated.
        params = dict(params) if params is not None else {}
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        """Fit the wrapped estimator on the given data; returns None."""
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        """Return the wrapped estimator's predictions for ``x``."""
        return self.clf.predict(x)

    def fit(self, x, y):
        """Fit the wrapped estimator and return what its ``fit`` returns."""
        return self.clf.fit(x, y)

    def feature_importances(self, x, y):
        """Refit on ``(x, y)`` and print the ``feature_importances_``."""
        print(self.clf.fit(x, y).feature_importances_)
    
    

In [4]:
def get_iff(clf, x_train, y_train, x_test, kf=None):
    """Compute out-of-fold predictions for one base model.

    For every fold the model is trained on the fold's training rows and then
    predicts (a) the fold's held-out rows and (b) the whole test set. The
    held-out predictions are assembled into one vector covering all training
    rows; the per-fold test predictions are averaged.

    Args:
        clf: Object exposing ``train(x, y)`` and ``predict(x)``.
        x_train: Training features; indexed with integer index arrays, so
            it is assumed to be a NumPy array.
        y_train: Training targets, indexed the same way as ``x_train``.
        x_test: Test features, passed to ``clf.predict``.
        kf: Optional splitter exposing ``split(x)`` and ``get_n_splits()``.
            Defaults to the module-level ``kfold``.

    Returns:
        Tuple ``(oof_train, oof_test)`` of column vectors with shapes
        ``(n_train, 1)`` and ``(n_test, 1)``.
    """
    splitter = kfold if kf is None else kf
    n_train = x_train.shape[0]
    n_test = x_test.shape[0]

    oof_train = np.zeros((n_train,))
    # One row of test-set predictions per fold.
    oof_test_skf = np.empty((splitter.get_n_splits(), n_test))

    for i, (train_index, test_index) in enumerate(splitter.split(x_train)):
        clf.train(x_train[train_index], y_train[train_index])

        # Each training row is predicted by a model that did not see it.
        oof_train[test_index] = clf.predict(x_train[test_index])
        oof_test_skf[i, :] = clf.predict(x_test)

    oof_test = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)


In [5]:
# Put in our parameters for said classifiers
# Random Forest parameters
# warm_start is intentionally not enabled: with warm_start=True a second
# fit() call with an unchanged n_estimators grows no new trees, so a model
# refit once per cross-validation fold would keep the forest from the first
# fold and leak held-out rows into the out-of-fold predictions.
rf_params = {
    'n_jobs': -1,
    'n_estimators': 500,
    'max_depth': 6,
    'min_samples_leaf': 2,
    'max_features' : 'sqrt',
    'verbose': 0
}

# Keyword arguments for the remaining base models.

# Extra Trees
et_params = dict(
    n_jobs=-1,
    n_estimators=500,
    max_depth=8,
    min_samples_leaf=2,
    verbose=0,
)

# AdaBoost
ada_params = dict(
    n_estimators=500,
    learning_rate=0.75,
)

# Gradient Boosting
gb_params = dict(
    n_estimators=500,
    max_depth=5,
    min_samples_leaf=2,
    verbose=0,
)

# Support Vector Classifier
svc_params = dict(
    kernel='linear',
    C=0.025,
)

In [None]:
# Random forest base model wrapped in the helper class; the class in this
# notebook is spelled `SKlearnHelper`.
rf = SKlearnHelper(clf=RandomForestClassifier, seed=seed, params=rf_params)
