In [235]:
import pandas as pd
import numpy as np
import os

TITANIC_PATH = '/Users/IlanReinstein/Portfolio/titanic/'

def load_train_data(train_path=TITANIC_PATH):
    csv_path = os.path.join(train_path, "train.csv")
    return pd.read_csv(csv_path)
def load_test_data(test_path = TITANIC_PATH):
    csv_path = os.path.join(test_path, 'test.csv')
    return pd.read_csv(csv_path)

In [236]:
train = load_train_data()
#X_train = train.values[:-1]
#y_train = train.values[-1]
test = load_test_data()
#X_test = test.values[:-1]
#y_test = test.values[-1]
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [237]:
def substrings_in_string(big_string, substrings):
    for substring in substrings:
        if big_string.find(substring) != -1:
            return substring
    print big_string
    return np.nan

def replace_titles(x):
    title=x['Title']
    if title in ['Don', 'Major', 'Capt', 'Jonkheer', 'Rev', 'Col']:
        return 'Mr'
    elif title in ['Countess', 'Mme']:
        return 'Mrs'
    elif title in ['Mlle', 'Ms']:
        return 'Miss'
    elif title =='Dr':
        if x['Sex']=='Male':
            return 'Mr'
        else:
            return 'Mrs'
    else:
        return title

#Extracting Title from the Name column
title_list=['Mrs', 'Mr', 'Master', 'Miss', 'Major', 'Rev',
                    'Dr', 'Ms', 'Mlle','Col', 'Capt', 'Mme', 'Countess',
                    'Don', 'Jonkheer']

train['Title'] = train.Name.map(lambda x: substrings_in_string(x, title_list))
train.Title = train.apply(replace_titles, axis=1)

#Extracting the cabin type to establish the deck of the passenger.
train.Cabin = train.Cabin.replace(np.nan, 'Unknown')
train.Embarked = train.Embarked.replace(np.nan, 'Unknown')
cabin_list = ['A', 'B', 'C', 'D', 'E', 'F', 'T', 'G', 'Unknown']
train['Deck']=train['Cabin'].map(lambda x: substrings_in_string(x, cabin_list))

#Add new Features
train['family_size'] = train.SibSp + train.Parch

#Drop unnecesary columns and maintain
train_clean = train.drop(['Name', 'Ticket','Cabin'], axis = 1)
train_clean.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title,Deck,family_size
0,1,0,3,male,22.0,1,0,7.25,S,Mr,Unknown,1
1,2,1,1,female,38.0,1,0,71.2833,C,Mrs,C,1
2,3,1,3,female,26.0,0,0,7.925,S,Miss,Unknown,0
3,4,1,1,female,35.0,1,0,53.1,S,Mrs,C,1
4,5,0,3,male,35.0,0,0,8.05,S,Mr,Unknown,0


In [238]:
#cat_attr = ['Pclass', 'Sex', 'Title', 'Embarked', 'Deck']
num_attr = ['Age', 'Fare', 'family_size', 'SibSp', 'Parch']
cat_attr = ['Sex', 'Title', 'Deck', 'Embarked']

train_cat = train_clean[cat_attr]
train_num = train_clean[num_attr]
 

from sklearn.preprocessing import LabelBinarizer, OneHotEncoder, LabelEncoder
binar = LabelBinarizer()
encod = LabelEncoder()
one_hot = OneHotEncoder()
train_cat.Sex = binar.fit_transform(train_cat.Sex)
train_cat.Embarked = encod.fit_transform(train_clean.Embarked)
train_cat.Title = encod.fit_transform(train_clean.Title)
train_cat.Deck = encod.fit_transform(train_clean.Deck)

train_cat_clean = one_hot.fit_transform(train_cat)
train_cat_clean = pd.DataFrame(train_cat_clean.toarray())

In [239]:
median = train_num.Age.median()
train_num.Age = train_num.Age.fillna(median)

train_num.Fare = train_num.Fare.replace(0,1)
train_num.loc[:,'log_fare'] = np.log(train_num.Fare)

In [244]:
train_clean_final = pd.concat([train_cat_clean,train_num, train_clean.Survived], axis = 1)

In [245]:
train_clean_final.corr()['Survived']

0              0.543351
1             -0.543351
2              0.085221
3              0.332817
4             -0.563511
5              0.344650
6              0.022287
7              0.175095
8              0.114652
9              0.150716
10             0.150716
11             0.047930
12             0.016040
13            -0.026456
14            -0.316912
15             0.168240
16             0.003650
17            -0.155660
18             0.060095
Age           -0.064910
Fare           0.257141
family_size    0.016639
SibSp         -0.035322
Parch          0.081629
log_fare       0.331805
Survived       1.000000
Name: Survived, dtype: float64

In [243]:
X_train = train_clean_final.values
y_train = train_clean_final.Survived

In [246]:
from sklearn.base import BaseEstimator, TransformerMixin

def indices_of_top_k(arr, k):
    return np.sort(np.argpartition(np.array(arr), -k)[-k:])

class TopFeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, feature_importances, k):
        self.feature_importances = feature_importances
        self.k = k
    def fit(self, X, y=None):
        self.feature_indices_ = indices_of_top_k(self.feature_importances, self.k)
        return self
    def transform(self, X):
        return X[:, self.feature_indices_]

In [248]:
from sklearn.linear_model import SGDClassifier
sgd_clf = SGDClassifier(random_state = 42)
sgd_clf.fit(X_train, y_train)

SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=5, n_jobs=1,
       penalty='l2', power_t=0.5, random_state=42, shuffle=True, verbose=0,
       warm_start=False)

In [258]:
from sklearn.model_selection import cross_val_score, cross_val_predict
cross_val_score(sgd_clf, X_train, y_train, cv = 5, scoring = 'accuracy')

array([ 0.62011173,  0.61452514,  0.7752809 ,  0.75280899,  0.70621469])

In [260]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score
y_train_pred = cross_val_predict(sgd_clf, X_train, y_train, cv = 5)
precision_score(y_train, y_train_pred)


0.6619718309859155