In [1]:
import numpy as np
import pandas as pd
import keras
from keras.utils.vis_utils import model_to_dot
from IPython.display import SVG
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from keras.layers import Dense
from keras.models import Sequential

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

In [3]:
def selectFeature(df):
    data = df.copy()
    
    data['Embarked'] = data['Embarked'].fillna('C')
    
    sex_label = LabelEncoder().fit(data['Sex'].unique())
    
    data['Sex_l'] = data['Sex'].map(lambda x: sex_label.transform([x])[0])
    
    pclass_dummies = pd.get_dummies(data['Pclass'], 'Pclass')
    embarked_dummies = pd.get_dummies(data['Embarked'], 'Embarked')
    
    data = data.join(pclass_dummies, how='inner')
    data = data.join(embarked_dummies, how='inner')
    
    family_tickets = data.groupby('Ticket').filter(lambda x: len(x) > 1)['Ticket'].values
    data['Family'] = data['Ticket'].map(lambda x: x in family_tickets).astype(int)
    
    data['Age_1'] = data['Age'].map(lambda x: int(x < 10.) & (x >= 0.))
    data['Age_2'] = data['Age'].map(lambda x: int(x < 20.) & (x >= 10.))
    data['Age_3'] = data['Age'].map(lambda x: int(x < 30.) & (x >= 20.))
    data['Age_4'] = data['Age'].map(lambda x: int(x < 40.) & (x >= 30.))
    data['Age_5'] = data['Age'].map(lambda x: int(x < 50.) & (x >= 40.))
    data['Age_6'] = data['Age'].map(lambda x: int(x < 60.) & (x >= 50.))
    data['Age_7'] = data['Age'].map(lambda x: int(x < 70.) & (x >= 60.))
    data['Age_8'] = data['Age'].map(lambda x: int(x >= 80.))
    
    data = data.drop(['Ticket', 'Cabin', 'Name', 'Embarked', 'Age', 'Pclass', 'Sex', 'SibSp', 'Parch', 'Fare'], axis=1)
    data = data.fillna(0)
        
    return data

In [4]:
train = selectFeature(train_df)
test = selectFeature(test_df)

In [5]:
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [6]:
train.head()

Unnamed: 0,PassengerId,Survived,Sex_l,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,Embarked_S,Family,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8
0,1,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0
1,2,1,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0
2,3,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0
3,4,1,0,1,0,0,0,0,1,1,0,0,0,1,0,0,0,0
4,5,0,1,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0


In [7]:
train.values[:5]

array([[1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [2, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
       [3, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [4, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0],
       [5, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0]],
      dtype=int64)

In [8]:
def measure(x, y, model):
    model.fit(x, y)
    print('%0.2f' % model.score(x, y), model, '\n')
    
    return model

In [9]:
X_train = train.values[:, 2:]
Y_train = train.values[:, 1]

In [10]:
measure(X_train, Y_train, LogisticRegression())
measure(X_train, Y_train, SVC())
measure(X_train, Y_train, RandomForestClassifier())
measure(X_train, Y_train, DecisionTreeClassifier())

0.81 LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False) 

0.79 SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False) 

0.85 RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False) 

0.85 DecisionTreeClassifier(class_weight=None, criterion='gini', max

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [11]:
model = measure(X_train, Y_train, RandomForestClassifier())

0.85 RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False) 



In [12]:
submission = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': model.predict(test.values[:, 1:])
})

In [13]:
submission.to_csv('lr_submission.csv', index=False)