### Set Up

In [None]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib
import matplotlib as plt

### Load Data

In [None]:
import os
import pandas as pd

PROJECT_ROOT_DIR = "."
DATA_PATH = "/data/"
# Plain string concatenation: evaluates to "./data/"
TRAIN_DATA_PATH = PROJECT_ROOT_DIR + DATA_PATH

def load_data(housing_path=TRAIN_DATA_PATH, file_name = "train.csv"):
    """Read one CSV file from a directory into a DataFrame.

    Parameters
    ----------
    housing_path : str
        Directory that contains the file. Defaults to TRAIN_DATA_PATH.
    file_name : str
        Name of the CSV file inside that directory. Defaults to "train.csv".

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file.
    """
    return pd.read_csv(os.path.join(housing_path, file_name))

In [None]:
# Read the default file ("train.csv") and display summary statistics
raw_data = load_data()
raw_data.describe()

In [None]:
# Read "test.csv" from the same default directory
test_data = load_data(file_name="test.csv")

In [None]:
# Split the 'Survived' column (used later as the fit target) from the other columns
y_train = raw_data['Survived']
X_train = raw_data.drop(['Survived'], axis=1)

In [None]:
# Column names, non-null counts and dtypes
raw_data.info()

**Plot a histogram for each numerical attribute**

In [None]:
# Draw histograms of raw_data's columns with 50 bins each, using a 20x15 figure size
raw_data.hist(bins=50, figsize=(20,15))

In [None]:
raw_data_copy = raw_data.copy()
raw_data_copy['Pclass'].value_counts()

In [None]:
data_class1 = raw_data_copy[raw_data_copy.Pclass == 1]
data_class1['Survived'].value_counts()

In [None]:
data_class2 = raw_data_copy[raw_data_copy.Pclass == 2]
data_class2['Survived'].value_counts()

In [None]:
data_class3 = raw_data_copy[raw_data_copy.Pclass == 3]
data_class3['Survived'].value_counts()

In [None]:
def survival_rate_by_pclass(data=None, class_level=1):
    """Return the fraction of rows in one ``Pclass`` whose ``Survived`` equals 1.

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Frame with ``Pclass`` and ``Survived`` columns. When omitted, the
        notebook-level ``raw_data_copy`` is looked up at call time, so the
        function follows later reassignments of that name instead of
        freezing whatever object existed when this cell was run.
    class_level : int
        Value of ``Pclass`` to select.

    Returns
    -------
    float
        Number of selected rows with ``Survived == 1`` divided by the number
        of selected rows, or NaN when no row has the requested class.
    """
    if data is None:
        data = raw_data_copy
    in_class = data[data.Pclass == class_level]
    if len(in_class) == 0:
        # No rows in this class: the rate is undefined, so return NaN
        # instead of raising ZeroDivisionError.
        return float('nan')
    # float() keeps this a true division even without `from __future__ import division`
    return float((in_class.Survived == 1).sum()) / len(in_class)
   
# Survival rate for each of the three classes, keyed 'class_1' .. 'class_3'
survival_rate = {}
for i, index in enumerate(['class_1', 'class_2', 'class_3']):
    survival_rate[index] = survival_rate_by_pclass(class_level=i + 1)

In [None]:
# Display the 'class_N' -> survival rate mapping built in the previous cell
survival_rate

In [None]:
raw_data_copy.info()

In [None]:
# Pairwise correlations between the numeric (and boolean) columns only.
# Non-numeric columns are dropped explicitly: since pandas 2.0,
# DataFrame.corr() raises on them instead of silently skipping them, and
# select_dtypes() works on older pandas versions as well.
corr_matrix = raw_data_copy.select_dtypes(include=['number', 'bool']).corr()

In [None]:
# Correlation of each column in corr_matrix with 'Survived', largest value first
corr_matrix['Survived'].sort_values(ascending=False)

In [None]:
# Same ranking for 'Pclass'
corr_matrix['Pclass'].sort_values(ascending=False)

In [None]:
survival_rate

In [None]:
from pandas.plotting import scatter_matrix

# Pairwise scatter plots of the four selected columns.
# NOTE: the name `attributes` is reassigned in the last cell of the notebook.
attributes = ['Pclass', 'Age', 'Survived', 'Fare']
scatter_matrix(raw_data_copy[attributes], figsize=(12,8))

**Data Cleaning**

In [None]:
# Columns treated as numeric in this section
num_attributes = ['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
raw_data_num = raw_data[num_attributes]

# Columns treated as categorical; note that 'Pclass' appears in both lists
cat_attributes = ['Sex', 'Embarked', 'Pclass']
raw_data_cat = raw_data[cat_attributes]

raw_data_num.info()


**Fill N/A data with the median for numeric attributes**

In [None]:
# median = raw_data_num['Age'].median()
# raw_data_num['Age'].fillna(median)

# `sklearn.preprocessing.Imputer` was removed in Scikit-Learn 0.22. Prefer its
# replacement, SimpleImputer (available since 0.20), and fall back to the old
# class only on older installations.
try:
    from sklearn.impute import SimpleImputer # Scikit-Learn 0.20+
except ImportError:
    from sklearn.preprocessing import Imputer as SimpleImputer

# Learn one median per column of raw_data_num
imputer = SimpleImputer(strategy='median')
imputer.fit(raw_data_num)

# transform() returns a plain array, so wrap it back into a DataFrame with
# the original column names
X = imputer.transform(raw_data_num)
pd_X = pd.DataFrame(X, columns=raw_data_num.columns)
pd_X.info()

In [None]:
# Frequency of each distinct value in the three categorical columns
raw_data_cat['Pclass'].value_counts()

In [None]:
raw_data_cat['Embarked'].value_counts()

In [None]:
raw_data_cat['Sex'].value_counts()

In [None]:
raw_data_cat.info()

In [None]:
# raw_data_cat1 = raw_data_cat.dropna(subset=['Embarked']).drop('Cabin', axis=1).drop('Name', axis=1).drop('Ticket', axis=1)
# Nothing is dropped or copied here: raw_data_cat1 is a second name for the same object
raw_data_cat1 = raw_data_cat

In [None]:
from sklearn.preprocessing import LabelBinarizer

encoder = LabelBinarizer()
# NOTE(review): the variable names say "embarked", but the column being encoded
# is 'Pclass' — confirm which of the two was intended.
raw_embarked = raw_data_cat1['Pclass'].astype(str)
raw_data_cat_1hot_embarked = encoder.fit_transform(raw_embarked)
raw_data_cat_1hot_embarked


In [None]:
# The same encoder object is fitted again, this time on 'Sex'
raw_sex = raw_data_cat1['Sex'].astype(str)
raw_data_cat_1hot_sex = encoder.fit_transform(raw_sex)
# raw_data_cat_1hot_sex

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

# A class to select numerical or categorical columns 
# since Scikit-Learn doesn't handle DataFrames yet
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Transformer that returns only the configured columns of its input.

    Parameters
    ----------
    attribute_names : list
        Key(s) used to index the input in ``transform``.
    """

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Learn nothing from the data; return ``self``."""
        return self

    def transform(self, X):
        """Return ``X`` indexed by the stored attribute names."""
        wanted = self.attribute_names
        return X[wanted]

In [None]:
from sklearn.pipeline import Pipeline
try:
    from sklearn.impute import SimpleImputer # Scikit-Learn 0.20+
except ImportError:
    from sklearn.preprocessing import Imputer as SimpleImputer
# Numeric columns fed to the model (unlike num_attributes, this list has no 'PassengerId' or 'Pclass')
attributes_num = ["Age", "SibSp", "Parch", "Fare"]
# Step 1 selects those columns; step 2 imputes with strategy="median"
num_pipeline = Pipeline([
        ("select_numeric", DataFrameSelector(attributes_num)),
        ("imputer", SimpleImputer(strategy="median")),
    ])

In [None]:
num_pipeline.fit_transform(X_train)

In [None]:
# Inspired from stackoverflow.com/questions/25239958
class MostFrequentImputer(BaseEstimator, TransformerMixin):
    """Fill missing entries of each DataFrame column with that column's most frequent value."""

    def fit(self, X, y=None):
        """Record, for every column of ``X``, the first index entry of its ``value_counts()``."""
        top_values = []
        for column in X:
            counts = X[column].value_counts()
            top_values.append(counts.index[0])
        self.most_frequent_ = pd.Series(top_values, index=X.columns)
        return self

    def transform(self, X, y=None):
        """Return ``X.fillna(...)`` using the per-column values recorded by ``fit``."""
        return X.fillna(self.most_frequent_)

In [None]:
try:
    from sklearn.preprocessing import OrdinalEncoder # just to raise an ImportError if Scikit-Learn < 0.20
    from sklearn.preprocessing import OneHotEncoder
except ImportError:
    from future_encoders import OneHotEncoder # Scikit-Learn < 0.20

In [None]:
attributes_cat = ["Pclass", "Sex", "Embarked"]

# Scikit-Learn 1.2 renamed OneHotEncoder's `sparse` argument to
# `sparse_output`, and 1.4 removed the old name. Try the new spelling first
# and fall back to the old one where the constructor rejects it (TypeError).
try:
    one_hot_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    one_hot_encoder = OneHotEncoder(sparse=False)

# Select the categorical columns, fill gaps with the most frequent value,
# then one-hot encode into a dense array
cat_pipeline = Pipeline([
        ("select_cat", DataFrameSelector(attributes_cat)),
        ("imputer", MostFrequentImputer()),
        ("cat_encoder", one_hot_encoder),
    ])

In [None]:
from sklearn.pipeline import FeatureUnion
# Combine the numeric and categorical sub-pipelines into one transformer
preprocess_pipeline = FeatureUnion(transformer_list=[
        ("num_pipeline", num_pipeline),
        ("cat_pipeline", cat_pipeline),
    ])

In [None]:
# Fit the preprocessing on the training features and keep the transformed result
X_train1 = preprocess_pipeline.fit_transform(X_train)
X_train1

In [None]:
# Same column of raw_data as y_train defined earlier
y_train1 = raw_data['Survived']

### Train a classifier with *SVC*

In [None]:
from sklearn.svm import SVC

svm_clf = SVC(gamma="auto")
svm_clf.fit(X_train1, y_train)

In [None]:
# transform() only: the preprocessing fitted on the training data is reused for the test set
X_test = preprocess_pipeline.transform(test_data)
y_pred = svm_clf.predict(X_test)

In [None]:
from sklearn.model_selection import cross_val_score

# Mean score over 10 cross-validation folds of the training data
svm_scores = cross_val_score(svm_clf, X_train1, y_train1, cv=10)
svm_scores.mean()

In [None]:
# from sklearn.metrics import confusion_matrix
# confusion_matrix(test_data['Survived'], y_pred)

### Train a new classifier with *RandomForest*

In [None]:
from sklearn.ensemble import RandomForestClassifier

# 100 trees, with a fixed random_state
forest_clf = RandomForestClassifier(n_estimators=100, random_state=42)
# Mean score over 10 cross-validation folds, same data as used for the SVC
forest_scores = cross_val_score(forest_clf, X_train1, y_train1, cv=10)
forest_scores.mean()

In [None]:
# SVC does not define `feature_importances_` (accessing it raises
# AttributeError), so the importances are read from the random forest
# instead. The forest has to be fitted before the attribute exists.
forest_clf.fit(X_train1, y_train1)
feature_importances = forest_clf.feature_importances_
feature_importances

In [None]:
# Fit the forest on the full training set so its fitted attributes can be read below
forest_clf.fit(X_train1, y_train1)

In [None]:
feature_importances_rf = forest_clf.feature_importances_

# The forest was trained on the one-hot encoded matrix, so each categorical
# attribute contributes one importance per category. Zipping the importances
# with the raw attribute names would silently truncate the list and attach
# the wrong labels, so expand the categorical names from the fitted encoder.
fitted_cat_pipeline = dict(preprocess_pipeline.transformer_list)["cat_pipeline"]
fitted_cat_encoder = fitted_cat_pipeline.named_steps["cat_encoder"]
attributes_cat_1hot = [
    "{}_{}".format(name, category)
    for name, categories in zip(attributes_cat, fitted_cat_encoder.categories_)
    for category in categories
]

# Numeric columns come first, matching the transformer order in preprocess_pipeline
attributes = attributes_num + attributes_cat_1hot
sorted(zip(feature_importances_rf, attributes), reverse=True)