In [759]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
mpl.style.use('ggplot')
import seaborn as sns
sns.set(style="darkgrid")

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb

# use these 5 base models for the stacking
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier, 
                              GradientBoostingClassifier, ExtraTreesClassifier)
from sklearn.model_selection import KFold
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score

import warnings
warnings.filterwarnings('ignore')

# loading data
# NOTE(review): DATA_DIR is a machine-specific absolute path. It is kept as the
# default, but it is now the single place to edit when the CSVs live elsewhere.
DATA_DIR = '/Users/jimmylin/input'

train_df = pd.read_csv(DATA_DIR + '/train.csv')
# `.name` is an ad-hoc attribute used only as a label by the print loops below.
train_df.name = "train"
test_df = pd.read_csv(DATA_DIR + '/test.csv')
test_df.name = "test"
# Both frames are processed together by the `for dataset in all_data` loops.
all_data = [train_df, test_df]


ModuleNotFoundError: No module named 'sklearn.cross_validation'

In [None]:
# List the column index of each frame under its label.
for frame in all_data:
    print("{} :".format(frame.name))
    print(frame.columns)
    print("\n")

In [None]:
# Per-frame dtype / non-null overview before any missing values are filled.
for frame in all_data:
    print("{} :".format(frame.name))
    frame.info()
    print("\n")

In [None]:
for dataset in all_data:
    # Assign the filled column back explicitly. Chained indexing
    # (`df[col][mask] = ...`) and `df[col].fillna(..., inplace=True)` act on an
    # intermediate object and may not write through to the frame; with warnings
    # silenced at the top of the notebook that failure would go unnoticed.
    # Missing ages are replaced by the mean age of the same frame.
    dataset["Age"] = dataset["Age"].fillna(dataset["Age"].mean())
    # Missing ports are replaced by the most frequent value of the same frame
    # (Series.mode() already ignores NaN).
    dataset["Embarked"] = dataset["Embarked"].fillna(dataset["Embarked"].mode()[0])

In [None]:
# Same overview again, to compare non-null counts after the fill step.
for frame in all_data:
    print("{} :".format(frame.name))
    frame.info()
    print("\n")

In [None]:
# Summary statistics of the training frame (pandas default column selection).
train_df.describe()

In [None]:
# Summary of the object-dtype columns only (include="O").
train_df.describe(include="O")

In [None]:
# Five randomly drawn rows; no random_state is passed, so the rows change per run.
train_df.sample(5)

In [None]:
# Mean of Survived per Pclass value, highest first.
(
    train_df[["Survived", "Pclass"]]
    .groupby(['Pclass'], as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)

In [None]:
# Mean of Survived per SibSp value, highest first.
(
    train_df[["Survived", "SibSp"]]
    .groupby(['SibSp'], as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)

In [None]:
# Mean of Survived per Parch value, highest first.
(
    train_df[["Survived", "Parch"]]
    .groupby(['Parch'], as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)

In [None]:
# FamilySize is SibSp + Parch + 1, added to both frames.
for frame in all_data:
    relatives = frame['SibSp'] + frame['Parch']
    frame['FamilySize'] = relatives + 1

# Mean of Survived per FamilySize value, highest first.
(
    train_df[['FamilySize', 'Survived']]
    .groupby(['FamilySize'], as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)

In [None]:
# isAlone is 1.0 where FamilySize equals 1 and 0.0 otherwise.
for frame in all_data:
    frame['isAlone'] = (frame['FamilySize'] == 1).astype(float)

# Mean of Survived per isAlone value, highest first.
(
    train_df[['isAlone', 'Survived']]
    .groupby(['isAlone'], as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)

In [None]:
# Mean of Survived per Sex value, highest first.
(
    train_df[["Survived", "Sex"]]
    .groupby(['Sex'], as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)

In [None]:
# Mean of Survived per Embarked value, highest first.
(
    train_df[["Survived", "Embarked"]]
    .groupby(['Embarked'], as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)

In [None]:
# One panel per Survived value, each showing a 20-bin histogram of Age.
g = sns.FacetGrid(train_df, col="Survived")
g.map(plt.hist, "Age", bins=20)

In [None]:
# Grid of 20-bin Age histograms: columns by Pclass, rows by Sex, hue by Survived.
g = sns.FacetGrid(train_df, col="Pclass", row="Sex", hue="Survived")
g.map(plt.hist, "Age", bins=20)

In [None]:
# SibSp, Parch and FamilySize have already been condensed into isAlone above;
# Ticket and Cabin are not referenced by any later cell visible in this notebook.
dropList = ['SibSp','Parch','Ticket','Cabin',"FamilySize"]
# Drop in place so that train_df, test_df and the entries of all_data remain the
# SAME objects. Rebinding train_df/test_df to the frames returned by drop() left
# all_data pointing at the old frames, so every later `for dataset in all_data`
# loop (Age/Fare banding, Title extraction, ...) modified stale copies.
for dataset in all_data:
    dataset.drop(columns=dropList, inplace=True)

In [None]:
# Replace the string categories in Sex and Embarked with integer codes on both frames.
sex_codes = {"male":0, "female":1}
embarked_codes = {"C":0, "Q":1, "S":2}

for frame in (train_df, test_df):
    frame['Sex'] = frame["Sex"].map(sex_codes).astype(int)

for frame in (train_df, test_df):
    frame['Embarked'] = frame["Embarked"].map(embarked_codes).astype(int)

In [None]:
# Replace Age with an ordinal band code 0..4 using 16-year-wide bands.
# The statements run from the lowest band upwards, so the codes already written
# (all <= 16) are never matched again by the later, higher conditions.
for dataset in all_data:
    dataset.loc[ dataset['Age'] <= 16, 'Age'] = 0
    dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 32), 'Age'] = 1
    dataset.loc[(dataset['Age'] > 32) & (dataset['Age'] <= 48), 'Age'] = 2
    dataset.loc[(dataset['Age'] > 48) & (dataset['Age'] <= 64), 'Age'] = 3
    # This line previously had no assignment, so ages above 64 kept their raw value.
    dataset.loc[ dataset['Age'] > 64, 'Age'] = 4

train_df.head()

In [None]:
# Fill missing Fare values in the test frame with the column median (median()
# skips NaN by itself). The result is assigned back because an inplace fillna on
# a column selection may operate on a temporary and leave test_df unchanged.
test_df['Fare'] = test_df['Fare'].fillna(test_df['Fare'].median())

In [None]:
# Replace Fare with an ordinal band code 0..3 using the cut points 7.91, 14.454 and 31.
for frame in all_data:
    fare = frame['Fare'].copy()
    fare_bands = [
        (fare <= 7.91, 0),
        ((fare > 7.91) & (fare <= 14.454), 1),
        ((fare > 14.454) & (fare <= 31), 2),
        (fare > 31, 3),
    ]
    for in_band, code in fare_bands:
        frame.loc[in_band, 'Fare'] = code


In [None]:
import re
def get_title(name):
    """Return the title embedded in a name string, or "" when none is found.

    The title is the first run of ASCII letters that is preceded by a space and
    immediately followed by a period, e.g. "Mr" in "Braund, Mr. Owen Harris".

    Parameters
    ----------
    name : str
        Full name text to search.

    Returns
    -------
    str
        The matched title without the trailing period, or an empty string.
    """
    # Raw string: in a normal literal '\.' is an invalid escape sequence, which
    # recent Python versions warn about. The compiled pattern is unchanged.
    title_search = re.search(r' ([A-Za-z]+)\.', name)

    if title_search:
        return title_search.group(1)
    return ""

# Derive a Title column from Name on both frames.
for frame in all_data:
    frame['Title'] = frame['Name'].map(get_title)
    

In [None]:
# Collapse the uncommon titles into 'Rare' and fold spelling variants into
# 'Miss' / 'Mrs', using one replacement table for both frames.
rare_titles = ['Lady', 'Countess','Capt', 'Col',
               'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
title_fixups = {rare: 'Rare' for rare in rare_titles}
title_fixups.update({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})

for frame in all_data:
    frame['Title'] = frame['Title'].replace(title_fixups)

# Mean of Survived per consolidated title.
title_survival = train_df[['Title', 'Survived']].groupby(['Title'], as_index=False).mean()
print (title_survival)

In [None]:
# Mapping titles
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
for frame in all_data:
    # Titles missing from the mapping come back as NaN from map(); code them as 0.
    frame['Title'] = frame['Title'].map(title_mapping).fillna(0)


In [None]:
# Name has been reduced to Title above, so remove the raw column from both frames.
for frame in all_data:
    frame.drop(columns=['Name'], inplace=True)


In [None]:
# Display the test frame as the cell output.
# NOTE(review): consider test_df.head() to keep the rendered output short.
test_df

In [None]:
ntrain = train_df.shape[0]
ntest = test_df.shape[0]
SEED = 0 # for reproducibility
NFOLDS = 5 # set folds for out-of-fold prediction
# n_splits is tied to NFOLDS instead of silently relying on KFold's default.
# random_state only takes effect together with shuffle=True; current
# scikit-learn raises a ValueError when random_state is set without shuffling.
kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)

# A small helper class that collects the common scikit-learn estimator methods under one roof.
# Totally ripped from Faron's Stacking starter ;)
class SklearnHelper(object):
    """Uniform train/predict wrapper around an estimator class.

    Parameters
    ----------
    clf : callable
        Estimator class (or factory); it is called as ``clf(**params)``.
    seed : int, default 0
        Value passed to the estimator as its ``random_state`` argument.
    params : dict or None, default None
        Keyword arguments for the estimator. ``None`` means no extra arguments.
        The dict is copied, so the caller's object is never modified.
    """

    def __init__(self, clf, seed=0, params=None):
        # Copy first: this makes the None default usable and avoids injecting
        # 'random_state' into a dict the caller may reuse elsewhere.
        params = dict(params) if params is not None else {}
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        """Fit the wrapped estimator on the given features and labels."""
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        """Return the wrapped estimator's predictions for ``x``."""
        return self.clf.predict(x)


def get_oof(clf, x_train, y_train, x_test):
    """Compute out-of-fold predictions for one base model.

    For every fold produced by the module-level ``kf`` splitter the model is
    trained on the fold's training rows, then used to predict the held-out rows
    of ``x_train`` and all rows of ``x_test``.

    Parameters
    ----------
    clf : object
        Wrapper exposing ``train(x, y)`` and ``predict(x)``.
    x_train, y_train : array-like
        Training features and labels; they are indexed with integer index
        arrays, so positional (NumPy-style) indexing is assumed.
    x_test : array-like
        Test features; must expose ``shape``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(oof_train, oof_test)`` as column vectors: the held-out prediction for
        each training row, and the per-fold test predictions averaged over folds.
    """
    # Sizes come from the inputs and the splitter itself so the buffers always
    # match the data that is actually passed in.
    n_train = x_train.shape[0]
    n_test = x_test.shape[0]
    n_folds = kf.get_n_splits()

    oof_train = np.zeros((n_train,))
    oof_test_skf = np.empty((n_folds, n_test))

    # Split the training rows themselves (previously the literal 2 was passed),
    # and let enumerate supply the fold counter (previously the broken `i+`).
    for i, (train_index, test_index) in enumerate(kf.split(x_train)):
        x_tr = x_train[train_index]
        y_tr = y_train[train_index]
        x_te = x_train[test_index]

        clf.train(x_tr, y_tr)

        oof_train[test_index] = clf.predict(x_te)
        oof_test_skf[i, :] = clf.predict(x_test)

    oof_test = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)
    
# Assign the parameters for each of our 5 base models
rf_params = {
    'n_jobs': -1,
    'n_estimators': 575,
    # warm_start must stay off: the same estimator is re-fitted once per fold, and
    # with warm_start=True and an unchanged n_estimators a later fit() keeps the
    # existing trees instead of training on the new fold.
    'warm_start': False,
    #'max_features': 0.2,
    'max_depth': 5,
    'min_samples_leaf': 2,
    'max_features' : 'sqrt',
    'verbose': 3
}
et_params = {
    'n_jobs': -1,
    'n_estimators':575,
    #'max_features': 0.5,
    'max_depth': 5,
    'min_samples_leaf': 3,
    'verbose': 3
}
ada_params = {
    'n_estimators': 575,
    'learning_rate' : 0.95
}

gb_params = {
    'n_estimators': 575,
    #'max_features': 0.2,
    'max_depth': 5,
    'min_samples_leaf': 3,
    'verbose': 3
}
svc_params = {
    'kernel' : 'linear',
    'C' : 0.025
}
