----

In [1]:
# Load in our libraries
import pandas as pd
import numpy as np
import re
import sklearn
import xgboost as xgb
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls

import warnings
warnings.filterwarnings('ignore')

# Going to use these 5 base models for the stacking
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier, 
                              GradientBoostingClassifier, ExtraTreesClassifier)
from sklearn.svm import SVC
from sklearn.cross_validation import KFold
import sklearn.preprocessing

from utils import *

In [2]:
# Read the train and test splits from the data directory
prefix = 'data/'
train, test = (pd.read_csv(prefix + file_name)
               for file_name in ('train.csv', 'test.csv'))

# Keep the test-set passenger IDs around for later lookups
PassengerId = test['PassengerId']

train.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [3]:
full_data = [train, test]

# Missing fares in either split are filled with the training-set median.
# Computed once up front: filling gaps with the median does not move the
# median, so recomputing it on every iteration was redundant.
train_fare_median = train['Fare'].median()

# Some features of my own that I have added in, followed by the feature
# engineering steps taken from Sina. Everything here is applied identically
# to the train and the test frame.
for dataset in full_data:
    # Gives the length of the name
    dataset['Name_length'] = dataset['Name'].apply(len)
    # Tells whether a passenger had a cabin on the Titanic
    # (1 when a Cabin value is present, 0 when it is missing)
    dataset['Has_Cabin'] = dataset['Cabin'].notnull().astype(int)

    # FamilySize combines SibSp and Parch (+1 for the passenger)
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1
    # IsAlone is derived from FamilySize
    dataset['IsAlone'] = (dataset['FamilySize'] == 1).astype(int)

    # Remove all NULLS in the Embarked and Fare columns
    dataset['Embarked'] = dataset['Embarked'].fillna('S')
    dataset['Fare'] = dataset['Fare'].fillna(train_fare_median)

    # Fill missing ages with random integers drawn from [mean - std, mean + std)
    # of the ages present in this frame.
    # NOTE(review): this uses NumPy's global RNG and no seed is set in the
    # visible cells, so the imputed ages differ from run to run.
    age_avg = dataset['Age'].mean()
    age_std = dataset['Age'].std()
    age_missing = dataset['Age'].isnull()
    age_null_count = age_missing.sum()
    age_null_random_list = np.random.randint(age_avg - age_std, age_avg + age_std, size=age_null_count)
    # Single .loc assignment instead of chained indexing
    # (dataset['Age'][mask] = ...), which may write to a temporary copy.
    dataset.loc[age_missing, 'Age'] = age_null_random_list
    dataset['Age'] = dataset['Age'].astype(int)

# Exploratory binned views, created on the training frame only
train['CategoricalFare'] = pd.qcut(train['Fare'], 4)
train['CategoricalAge'] = pd.cut(train['Age'], 5)
# Define function to extract titles from passenger names
def get_title(name):
    """Return the title embedded in a passenger name, or "" if none is found.

    The title is the first run of ASCII letters that is preceded by a space
    and immediately followed by a period, e.g. "Mr" in
    "Braund, Mr. Owen Harris".
    """
    # Raw string: in a plain literal "\." is an invalid escape sequence and
    # triggers a warning on current Python versions.
    title_search = re.search(r' ([A-Za-z]+)\.', name)
    # If the title exists, extract and return it.
    if title_search:
        return title_search.group(1)
    return ""
# Titles that are too infrequent to keep as their own category
rare_titles = ['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
# Upper edges of the Fare / Age buckets; each bucket is (lower, upper]
fare_cuts = (7.91, 14.454, 31)
age_cuts = (16, 32, 48, 64)

for dataset in full_data:
    # Title: extract from the name, fold rare and variant spellings together,
    # then encode as an integer code (titles outside the mapping become 0)
    title = dataset['Name'].apply(get_title)
    title = title.replace(rare_titles, 'Rare')
    title = title.replace('Mlle', 'Miss').replace('Ms', 'Miss').replace('Mme', 'Mrs')
    dataset['Title'] = title.map(title_mapping).fillna(0)

    # Sex and Embarked: fixed label -> integer code
    dataset['Sex'] = dataset['Sex'].map({'female': 0, 'male': 1}).astype(int)
    dataset['Embarked'] = dataset['Embarked'].map({'S': 0, 'C': 1, 'Q': 2}).astype(int)

    # Fare: replace each value by its bucket index. The masks are taken from a
    # snapshot of the raw fares, so already-written codes are never re-binned.
    fare_raw = dataset['Fare'].copy()
    dataset.loc[fare_raw <= fare_cuts[0], 'Fare'] = 0
    for code, (low, high) in enumerate(zip(fare_cuts, fare_cuts[1:]), start=1):
        dataset.loc[(fare_raw > low) & (fare_raw <= high), 'Fare'] = code
    dataset.loc[fare_raw > fare_cuts[-1], 'Fare'] = len(fare_cuts)
    dataset['Fare'] = dataset['Fare'].astype(int)

    # Age: same bucketing scheme with the age cut points
    age_raw = dataset['Age'].copy()
    dataset.loc[age_raw <= age_cuts[0], 'Age'] = 0
    for code, (low, high) in enumerate(zip(age_cuts, age_cuts[1:]), start=1):
        dataset.loc[(age_raw > low) & (age_raw <= high), 'Age'] = code
    dataset.loc[age_raw > age_cuts[-1], 'Age'] = len(age_cuts)

In [4]:
# Feature selection: drop the identifier / free-text columns together with the
# two helper bin columns, then split the label off from the features.
drop_elements = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp']
train = train.drop(drop_elements + ['CategoricalAge', 'CategoricalFare'], axis = 1)
# .values yields the labels as a 1-D NumPy array; Series.ravel() returned the
# same array but is deprecated in recent pandas releases.
y_train = train['Survived'].values
train = train.drop(['Survived'], axis=1)

In [5]:
# Preview the feature columns that remain after the drops in the previous cell
train.head()

Unnamed: 0,Pclass,Sex,Age,Parch,Fare,Embarked,Name_length,Has_Cabin,FamilySize,IsAlone,Title
0,3,1,1,0,0,0,23,0,2,0,1
1,1,0,2,0,3,1,51,1,2,0,3
2,3,0,1,0,1,0,22,0,1,1,2
3,1,0,2,0,3,0,44,1,2,0,3
4,3,1,2,0,1,0,24,0,1,1,1


In [6]:
# Scale every feature column to unit L2 norm (axis=0 works column-wise), then
# store a labelled copy of the result next to the input data
train_normalized = sklearn.preprocessing.normalize(train, axis=0)
normalized_train_frame = pd.DataFrame(train_normalized, columns=train.columns)
normalized_train_frame.to_csv(prefix + 'normalized_train.csv', index=False)
normalized_train_frame.head()

Unnamed: 0,Pclass,Sex,Age,Parch,Fare,Embarked,Name_length,Has_Cabin,FamilySize,IsAlone,Title
0,0.040935,0.041631,0.021044,0.0,0.0,0.0,0.027021,0.0,0.026849,0.0,0.016653
1,0.013645,0.0,0.042089,0.0,0.053614,0.045835,0.059916,0.070014,0.026849,0.0,0.049958
2,0.040935,0.0,0.021044,0.0,0.017871,0.0,0.025846,0.0,0.013424,0.043153,0.033306
3,0.013645,0.0,0.042089,0.0,0.053614,0.0,0.051692,0.070014,0.026849,0.0,0.049958
4,0.040935,0.041631,0.042089,0.0,0.017871,0.0,0.028196,0.0,0.013424,0.043153,0.016653


In [7]:
# Build the arguments for test_suite from the normalized features and labels.
# NOTE(review): make_test_data is not defined in this notebook; presumably it
# comes from `from utils import *`. Its result is later unpacked as
# (X_train, X_test, y_train, y_test), so it looks like a train/test split;
# confirm against utils.
test_params = make_test_data(train_normalized, y_train)

In [8]:
import catboost_test, decision_tree, fris_stolp_test, knn, logistic_regression, naive_bayes, svm, xg_boost

def test_suite(X_train, X_test, y_train, y_test):
    results = {}
    for test in catboost_test, decision_tree, fris_stolp_test, knn, logistic_regression, naive_bayes, svm, xg_boost:
        print(test.__name__)
        results[test.__name__] = test.adaptive_test(X_train, X_test, y_train, y_test)
    best_params = {}
    for x in results.items():
        best_params[x[0]] = x[1]['best_params']
    def make_dataframe(results):
        keys = [x for x in results.keys()]
        dataframe_data = {'method': keys,
                          'precision': [results[x]['precision'] for x in keys],
                          'recall': [results[x]['recall'] for x in keys],
                          'accuracy': [results[x]['accuracy'] for x in keys],
                          'f1_score': [results[x]['f1_score'] for x in keys]}
        dataframe = pd.DataFrame(dataframe_data)
        return dataframe
    return best_params, make_dataframe(results)

# Tests!

In [9]:
# Run every method on the Titanic split; keep each method's best parameters
# and the metrics table returned by test_suite
best_params, results = test_suite(*test_params)

catboost_test
0:	learn: 0.4664880	total: 55.5ms	remaining: 500ms
1:	learn: 0.4011009	total: 64.8ms	remaining: 259ms
2:	learn: 0.3933774	total: 70.5ms	remaining: 165ms
3:	learn: 0.3772336	total: 87.2ms	remaining: 131ms
4:	learn: 0.3619580	total: 97ms	remaining: 97ms
5:	learn: 0.3543589	total: 106ms	remaining: 70.7ms
6:	learn: 0.3510108	total: 116ms	remaining: 49.7ms
7:	learn: 0.3508418	total: 120ms	remaining: 30ms
8:	learn: 0.3507974	total: 124ms	remaining: 13.8ms
9:	learn: 0.3462586	total: 133ms	remaining: 0us
decision_tree
fris_stolp_test
knn
logistic_regression
naive_bayes
svm
xg_boost


In [10]:
# Metrics table returned by test_suite for the Titanic data (one row per method)
results

Unnamed: 0,accuracy,f1_score,method,precision,recall
0,0.843575,0.8,catboost_test,0.848485,0.756757
1,0.798883,0.73913,decision_tree,0.796875,0.689189
2,0.765363,0.730769,fris_stolp_test,0.695122,0.77027
3,0.821229,0.789474,knn,0.769231,0.810811
4,0.586592,0.0,logistic_regression,0.0,0.0
5,0.798883,0.772152,naive_bayes,0.72619,0.824324
6,0.77095,0.724832,svm,0.72,0.72973
7,0.826816,0.783217,xg_boost,0.811594,0.756757


In [11]:
# 'best_params' entry reported by each method for the Titanic data
best_params

{'catboost_test': {'depth': 6,
  'iterations': 10,
  'learning_rate': 0.5,
  'loss_function': 'Logloss'},
 'decision_tree': {'max_features': 0.59219623757829087, 'max_leaf_nodes': 5},
 'fris_stolp_test': {'threshold': 0.46666794420469615, 'v': 1},
 'knn': {'algorithm': 'ball_tree', 'leaf_size': 35, 'n_neighbors': 5, 'p': 1},
 'logistic_regression': {'C': 0.52096022540611742,
  'max_iter': 200,
  'tol': 7.3026935553490962e-05},
 'naive_bayes': {'alpha': 0.90823425483464049,
  'binarize': 0.019195425030808455},
 'svm': {'C': 8.5757759710475892, 'gamma': 0.55711986588014617},
 'xg_boost': {'learning_rate': 0.0080863849773172986, 'n_estimators': 300}}

In [15]:
from sklearn.datasets import load_iris

# Turn iris into a binary task and run the same suite on it:
# original class 2 becomes label 1, classes 0 and 1 both become label 0.
# The labels are rewritten in place on the loaded dataset object.
iris = load_iris()
iris.target[:] = iris.target == 2
iris_train_normalized = sklearn.preprocessing.normalize(iris.data, axis=0)
iris_labels = iris.target.astype(np.int32)
iris_params = make_test_data(iris_train_normalized, iris_labels)
iris_best_params, iris_results = test_suite(*iris_params)

catboost_test
0:	learn: 0.2308366	total: 4.5ms	remaining: 18ms
1:	learn: 0.1745010	total: 8.73ms	remaining: 13.1ms
2:	learn: 0.0822891	total: 12.4ms	remaining: 8.25ms
3:	learn: 0.0035705	total: 15.9ms	remaining: 3.98ms
4:	learn: 0.0027655	total: 20ms	remaining: 0us
decision_tree
fris_stolp_test
knn
logistic_regression
naive_bayes
svm
xg_boost


In [16]:
# Metrics table returned by test_suite for the binarized iris data
iris_results

Unnamed: 0,accuracy,f1_score,method,precision,recall
0,0.966667,0.947368,catboost_test,0.9,1.0
1,1.0,1.0,decision_tree,1.0,1.0
2,0.966667,0.947368,fris_stolp_test,0.9,1.0
3,0.966667,0.947368,knn,0.9,1.0
4,0.7,0.0,logistic_regression,0.0,0.0
5,0.966667,0.947368,naive_bayes,0.9,1.0
6,0.9,0.857143,svm,0.75,1.0
7,0.966667,0.947368,xg_boost,0.9,1.0


In [18]:
# 'best_params' entry reported by each method for the binarized iris data
iris_best_params

{'catboost_test': {'depth': 3,
  'iterations': 5,
  'learning_rate': 2.0,
  'loss_function': 'Logloss'},
 'decision_tree': {'max_features': 0.69437883016390556, 'max_leaf_nodes': 4},
 'fris_stolp_test': {'threshold': 0.73369053009212104, 'v': 1},
 'knn': {'algorithm': 'brute', 'leaf_size': 30, 'n_neighbors': 9, 'p': 2},
 'logistic_regression': {'C': 0.52096022540611742,
  'max_iter': 200,
  'tol': 7.3026935553490962e-05},
 'naive_bayes': {'alpha': 0.6588679040985248,
  'binarize': 0.096726555741161346},
 'svm': {'C': 7.5622648586258823, 'gamma': 0.50652896753982235},
 'xg_boost': {'learning_rate': 0.0026211785174544257, 'n_estimators': 500}}