# Simple Logistic Regression Analysis
* Data: Titanic Disaster Survival Data from Kaggle
* Link: [survival data](https://www.kaggle.com/c/titanic)

In [1]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import sklearn
import re
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

%matplotlib inline

# Report the library versions this notebook was executed with.
for label, module in (('Scikit-learn:\t', sklearn),
                      ('Numpy:\t\t', np),
                      ('Pandas:\t\t', pd),
                      ('Matplotlib:\t', mpl)):
    print(label, module.__version__)

Scikit-learn:	 0.19.0
Numpy:		 1.13.1
Pandas:		 0.20.3
Matplotlib:	 2.0.2


## Read Data

In [2]:
# Load the Kaggle Titanic training and test splits from the local data folder.
train = pd.read_csv('./data/Titanic_Data/train.csv')
test = pd.read_csv('./data/Titanic_Data/test.csv')

## Exploratory Data Analysis

In [3]:
# Preview the first rows of the training set.
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# Preview the first rows of the test set (same columns as train, minus Survived).
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [5]:
# Column dtypes and non-null counts for the training set.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [6]:
# Column dtypes and non-null counts for the test set.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [7]:
# Number of missing values per column in the training set
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [8]:
# Number of missing values per column in the test set
test.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [9]:
# Summary statistics of the numeric training columns.
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [10]:
# Summary statistics of the numeric test columns.
test.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,418.0,418.0,332.0,418.0,418.0,417.0
mean,1100.5,2.26555,30.27259,0.447368,0.392344,35.627188
std,120.810458,0.841838,14.181209,0.89676,0.981429,55.907576
min,892.0,1.0,0.17,0.0,0.0,0.0
25%,996.25,1.0,21.0,0.0,0.0,7.8958
50%,1100.5,3.0,27.0,0.0,0.0,14.4542
75%,1204.75,3.0,39.0,1.0,0.0,31.5
max,1309.0,3.0,76.0,8.0,9.0,512.3292


In [11]:
# define numerical and categorical variables
# (used below to report distinct-value counts per column; 'Survived' is in neither list)
# NOTE(review): PassengerId is listed as numerical but has one distinct value per row,
# so it looks like an identifier rather than a measurement
num_cols = ['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
cat_cols = ['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']

In [12]:
# Distinct-value counts of each numerical column in both splits
# (a missing value is counted as one distinct value).
row_fmt = '{0:12s} train: {1:5d} \t test: {2:5d}'
for name in num_cols:
    print(row_fmt.format(name,
                         train[name].nunique(dropna=False),
                         test[name].nunique(dropna=False)))

PassengerId  train:   891 	 test:   418
Pclass       train:     3 	 test:     3
Age          train:    89 	 test:    80
SibSp        train:     7 	 test:     7
Parch        train:     7 	 test:     8
Fare         train:   248 	 test:   170


In [13]:
# Distinct-value counts per categorical column; a missing value counts as one distinct value
for name in cat_cols:
    n_train = train[name].nunique(dropna=False)
    n_test = test[name].nunique(dropna=False)
    print('{0:12s} train: {1:5d} \t test: {2:5d}'.format(name, n_train, n_test))

Name         train:   891 	 test:   418
Sex          train:     2 	 test:     2
Ticket       train:   681 	 test:   363
Cabin        train:   148 	 test:    77
Embarked     train:     4 	 test:     3


In [14]:
# Share of training rows per Pclass. Each cell is that class's non-null count for the
# column divided by the total row count, so columns with gaps (Age, Cabin) read lower.
train.groupby(['Pclass']).count().sort_index().div(len(train))

Unnamed: 0_level_0,PassengerId,Survived,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
Pclass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0.242424,0.242424,0.242424,0.242424,0.208754,0.242424,0.242424,0.242424,0.242424,0.197531,0.24018
2,0.20651,0.20651,0.20651,0.20651,0.194164,0.20651,0.20651,0.20651,0.20651,0.017957,0.20651
3,0.551066,0.551066,0.551066,0.551066,0.398429,0.551066,0.551066,0.551066,0.551066,0.013468,0.551066


In [15]:
# Share of test rows per Pclass (non-null count per column / total test rows).
test.groupby(['Pclass']).count().sort_index().div(len(test))

Unnamed: 0_level_0,PassengerId,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
Pclass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,0.255981,0.255981,0.255981,0.23445,0.255981,0.255981,0.255981,0.255981,0.191388,0.255981
2,0.222488,0.222488,0.222488,0.210526,0.222488,0.222488,0.222488,0.222488,0.016746,0.222488
3,0.521531,0.521531,0.521531,0.349282,0.521531,0.521531,0.521531,0.519139,0.009569,0.521531


In [16]:
# Share of training rows per SibSp value (non-null count per column / total training rows).
train.groupby(['SibSp']).count().sort_index().div(len(train))

Unnamed: 0_level_0,PassengerId,Survived,Pclass,Name,Sex,Age,Parch,Ticket,Fare,Cabin,Embarked
SibSp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,0.682379,0.682379,0.682379,0.682379,0.682379,0.52862,0.682379,0.682379,0.682379,0.141414,0.680135
1,0.234568,0.234568,0.234568,0.234568,0.234568,0.205387,0.234568,0.234568,0.234568,0.077441,0.234568
2,0.031425,0.031425,0.031425,0.031425,0.031425,0.028058,0.031425,0.031425,0.031425,0.006734,0.031425
3,0.017957,0.017957,0.017957,0.017957,0.017957,0.013468,0.017957,0.017957,0.017957,0.003367,0.017957
4,0.020202,0.020202,0.020202,0.020202,0.020202,0.020202,0.020202,0.020202,0.020202,0.0,0.020202
5,0.005612,0.005612,0.005612,0.005612,0.005612,0.005612,0.005612,0.005612,0.005612,0.0,0.005612
8,0.007856,0.007856,0.007856,0.007856,0.007856,0.0,0.007856,0.007856,0.007856,0.0,0.007856


In [17]:
# Share of test rows per SibSp value (non-null count per column / total test rows).
test.groupby(['SibSp']).count().sort_index().div(len(test))

Unnamed: 0_level_0,PassengerId,Pclass,Name,Sex,Age,Parch,Ticket,Fare,Cabin,Embarked
SibSp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,0.677033,0.677033,0.677033,0.677033,0.511962,0.677033,0.677033,0.674641,0.107656,0.677033
1,0.263158,0.263158,0.263158,0.263158,0.232057,0.263158,0.263158,0.263158,0.098086,0.263158
2,0.033493,0.033493,0.033493,0.033493,0.026316,0.033493,0.033493,0.033493,0.009569,0.033493
3,0.009569,0.009569,0.009569,0.009569,0.009569,0.009569,0.009569,0.009569,0.002392,0.009569
4,0.009569,0.009569,0.009569,0.009569,0.009569,0.009569,0.009569,0.009569,0.0,0.009569
5,0.002392,0.002392,0.002392,0.002392,0.002392,0.002392,0.002392,0.002392,0.0,0.002392
8,0.004785,0.004785,0.004785,0.004785,0.002392,0.004785,0.004785,0.004785,0.0,0.004785


In [18]:
# Share of training rows per Parch value (non-null count per column / total training rows).
train.groupby(['Parch']).count().sort_index().div(len(train))

Unnamed: 0_level_0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Ticket,Fare,Cabin,Embarked
Parch,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,0.760943,0.760943,0.760943,0.760943,0.760943,0.584736,0.760943,0.760943,0.760943,0.158249,0.758698
1,0.132435,0.132435,0.132435,0.132435,0.132435,0.123457,0.132435,0.132435,0.132435,0.043771,0.132435
2,0.089787,0.089787,0.089787,0.089787,0.089787,0.076319,0.089787,0.089787,0.089787,0.025814,0.089787
3,0.005612,0.005612,0.005612,0.005612,0.005612,0.005612,0.005612,0.005612,0.005612,0.0,0.005612
4,0.004489,0.004489,0.004489,0.004489,0.004489,0.004489,0.004489,0.004489,0.004489,0.001122,0.004489
5,0.005612,0.005612,0.005612,0.005612,0.005612,0.005612,0.005612,0.005612,0.005612,0.0,0.005612
6,0.001122,0.001122,0.001122,0.001122,0.001122,0.001122,0.001122,0.001122,0.001122,0.0,0.001122


In [19]:
# Share of test rows per Parch value (non-null count per column / total test rows).
test.groupby(['Parch']).count().sort_index().div(len(test))

Unnamed: 0_level_0,PassengerId,Pclass,Name,Sex,Age,SibSp,Ticket,Fare,Cabin,Embarked
Parch,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,0.77512,0.77512,0.77512,0.77512,0.590909,0.77512,0.77512,0.772727,0.15311,0.77512
1,0.124402,0.124402,0.124402,0.124402,0.119617,0.124402,0.124402,0.124402,0.04067,0.124402
2,0.078947,0.078947,0.078947,0.078947,0.069378,0.078947,0.078947,0.078947,0.014354,0.078947
3,0.007177,0.007177,0.007177,0.007177,0.007177,0.007177,0.007177,0.007177,0.007177,0.007177
4,0.004785,0.004785,0.004785,0.004785,0.002392,0.004785,0.004785,0.004785,0.002392,0.004785
5,0.002392,0.002392,0.002392,0.002392,0.002392,0.002392,0.002392,0.002392,0.0,0.002392
6,0.002392,0.002392,0.002392,0.002392,0.002392,0.002392,0.002392,0.002392,0.0,0.002392
9,0.004785,0.004785,0.004785,0.004785,0.0,0.004785,0.004785,0.004785,0.0,0.004785


In [20]:
# Share of training rows per Sex (non-null count per column / total training rows).
train.groupby(['Sex']).count().sort_index().div(len(train))

Unnamed: 0_level_0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
female,0.352413,0.352413,0.352413,0.352413,0.292929,0.352413,0.352413,0.352413,0.352413,0.108866,0.350168
male,0.647587,0.647587,0.647587,0.647587,0.508418,0.647587,0.647587,0.647587,0.647587,0.12009,0.647587


In [21]:
# Share of test rows per Sex (non-null count per column / total test rows).
test.groupby(['Sex']).count().sort_index().div(len(test))

Unnamed: 0_level_0,PassengerId,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
female,0.363636,0.363636,0.363636,0.303828,0.363636,0.363636,0.363636,0.363636,0.105263,0.363636
male,0.636364,0.636364,0.636364,0.490431,0.636364,0.636364,0.636364,0.633971,0.11244,0.636364


In [22]:
# Share of training rows per port of embarkation. Rows whose Embarked value is missing
# belong to no group, so the shares add up to slightly less than 1.
train.groupby(['Embarked']).count().sort_index().div(len(train))

Unnamed: 0_level_0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin
Embarked,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
C,0.188552,0.188552,0.188552,0.188552,0.188552,0.145903,0.188552,0.188552,0.188552,0.188552,0.077441
Q,0.08642,0.08642,0.08642,0.08642,0.08642,0.031425,0.08642,0.08642,0.08642,0.08642,0.004489
S,0.722783,0.722783,0.722783,0.722783,0.722783,0.621773,0.722783,0.722783,0.722783,0.722783,0.144781


In [23]:
# Share of test rows per port of embarkation (non-null count per column / total test rows).
test.groupby(['Embarked']).count().sort_index().div(len(test))

Unnamed: 0_level_0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin
Embarked,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
C,0.244019,0.244019,0.244019,0.244019,0.196172,0.244019,0.244019,0.244019,0.244019,0.117225
Q,0.110048,0.110048,0.110048,0.110048,0.052632,0.110048,0.110048,0.110048,0.110048,0.002392
S,0.645933,0.645933,0.645933,0.645933,0.545455,0.645933,0.645933,0.645933,0.643541,0.098086


## Feature Engineering

In [24]:
# combine train and test data so the feature engineering below is applied to both at once
# NOTE: the original row labels are kept, so labels 0-417 occur twice in all_data;
# test rows have no Survived value (NaN) after the concat
all_data = pd.concat([train, test])

In [25]:
# First rows of the combined frame (these come from the training split).
all_data.head()

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket
0,22.0,,S,7.25,"Braund, Mr. Owen Harris",0,1,3,male,1,0.0,A/5 21171
1,38.0,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,female,1,1.0,PC 17599
2,26.0,,S,7.925,"Heikkinen, Miss. Laina",0,3,3,female,0,1.0,STON/O2. 3101282
3,35.0,C123,S,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,female,1,1.0,113803
4,35.0,,S,8.05,"Allen, Mr. William Henry",0,5,3,male,0,0.0,373450


In [26]:
# Last rows of the combined frame (these come from the test split; Survived is NaN).
all_data.tail()

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket
413,,,S,8.05,"Spector, Mr. Woolf",0,1305,3,male,0,,A.5. 3236
414,39.0,C105,C,108.9,"Oliva y Ocana, Dona. Fermina",0,1306,1,female,0,,PC 17758
415,38.5,,S,7.25,"Saether, Mr. Simon Sivertsen",0,1307,3,male,0,,SOTON/O.Q. 3101262
416,,,S,8.05,"Ware, Mr. Frederick",0,1308,3,male,0,,359309
417,,,C,22.3583,"Peter, Master. Michael J",1,1309,3,male,1,,2668


In [27]:
# create Title variable
def extract_title(full_name):
    """Return the stripped text between the first and second '.'/',' delimiter of a name."""
    return re.split('[.,]', full_name)[1].strip()


all_data['Title'] = all_data['Name'].map(extract_title)
# Cross-tabulate the extracted titles against Sex to spot titles that need cleaning up.
pd.crosstab(all_data['Title'], all_data['Sex'])

Sex,female,male
Title,Unnamed: 1_level_1,Unnamed: 2_level_1
Capt,0,1
Col,0,4
Don,0,1
Dona,1,0
Dr,1,7
Jonkheer,0,1
Lady,1,0
Major,0,2
Master,0,61
Miss,260,0


In [28]:
# one special case: the female passenger titled 'Dr' is relabelled 'Mrs' here,
# before the bulk mapping in the next cell turns every remaining 'Dr' into 'Mr'
all_data.loc[(all_data['Sex'] == 'female') & (all_data['Title'] == 'Dr'), 'Title'] = 'Mrs'

In [29]:
# Collapse rare honorifics into the two common adult titles:
# every title in `male_name` becomes 'Mr', every title in `female_name` becomes 'Mrs'.
# NOTE(review): 'Mlle' and 'Ms' are mapped to 'Mrs' rather than 'Miss' — confirm this is intended.
male_name = ['Capt', 'Col', 'Don', 'Jonkheer', 'Major', 'Rev', 'Sir', 'Dr']
female_name = ['Dona', 'Lady', 'Mlle', 'Mme', 'Ms', 'the Countess']
# Assign the result back instead of calling replace(..., inplace=True) on the column
# selection: an in-place call on `all_data['Title']` can act on a temporary object and
# leave `all_data` unchanged (this is the behaviour under pandas Copy-on-Write).
all_data['Title'] = (all_data['Title']
                     .replace(male_name, 'Mr')
                     .replace(female_name, 'Mrs'))

In [30]:
# Encode the Title strings as integer codes with LabelEncoder.
# Judging by the counts in the crosstab that follows: 0 = Master, 1 = Miss, 2 = Mr, 3 = Mrs.
# NOTE: the Title column is overwritten, so re-running this cell encodes the integer codes again.
# NOTE(review): the codes are later fed to the model as one numeric feature, which implies
# an ordering between titles — confirm that is acceptable.
le = LabelEncoder()
all_data['Title'] = le.fit_transform(all_data['Title'])

In [31]:
# Check the encoded titles against Sex: each code should map to a single sex.
pd.crosstab(all_data['Title'], all_data['Sex'])

Sex,female,male
Title,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0,61
1,260,0
2,0,782
3,206,0


In [32]:
# Binary-encode Sex: 1 where the value is 'male', 0 otherwise.
# NOTE: the column is overwritten, so running this cell a second time yields all zeros.
all_data['Sex'] = all_data['Sex'].eq('male').astype('int')

In [33]:
# create Ticket binary variable: 1 when the ticket string consists of more than one
# whitespace-separated part, 0 when it is a single token
all_data['abs_col'] = all_data['Ticket'].map(lambda ticket: len(ticket.split()) > 1).astype(int)

### Fill missing values

In [34]:
# replace missing Age with the median Age of the combined (train + test) frame
age_median = all_data['Age'].median(skipna=True)
all_data['Age'] = all_data['Age'].fillna(age_median)

In [35]:
# Disabled: rows with a missing Embarked value are kept
# (the Embarked column itself is dropped from all_data further below).
# all_data = all_data[~all_data['Embarked'].isnull()]

In [36]:
# replace missing Fare with the median Fare of the combined (train + test) frame
fare_median = all_data['Fare'].median(skipna=True)
all_data['Fare'] = all_data['Fare'].where(all_data['Fare'].notnull(), fare_median)

In [37]:
# Drop the columns that are not used as model inputs: Cabin, the raw Name and Ticket
# strings (Title and abs_col were derived from them above), PassengerId and Embarked.
unused_cols = ['Cabin', 'Name', 'Ticket', 'PassengerId', 'Embarked']
all_data = all_data.drop(unused_cols, axis=1)

In [38]:
# log-transform Fare as log(Fare + 1); the +1 keeps a fare of 0 finite (it maps to 0)
# NOTE: the column is overwritten, so re-running this cell applies the transform again
all_data['Fare'] = np.log(all_data['Fare'] + 1)

In [39]:
# Verify that only Survived (unknown for the test rows) still has missing values.
all_data.isnull().sum()

Age           0
Fare          0
Parch         0
Pclass        0
Sex           0
SibSp         0
Survived    418
Title         0
abs_col       0
dtype: int64

In [40]:
# change Parch == 9 to be 0
# (in the raw data the value 9 appears only in the test split; train Parch ranges from 0 to 6)
# NOTE(review): 0 is far from 9 on a numeric scale — confirm this recoding is intended
all_data.loc[all_data['Parch'] == 9, 'Parch'] = 0

In [41]:
# First rows of the engineered frame (training rows).
all_data.head()

Unnamed: 0,Age,Fare,Parch,Pclass,Sex,SibSp,Survived,Title,abs_col
0,22.0,2.110213,0,3,1,1,0.0,2,1
1,38.0,4.280593,0,1,0,1,1.0,3,1
2,26.0,2.188856,0,3,0,0,1.0,1,1
3,35.0,3.990834,0,1,0,1,1.0,3,0
4,35.0,2.202765,0,3,1,0,0.0,2,0


In [42]:
# Last rows of the engineered frame (test rows; Survived is NaN).
all_data.tail()

Unnamed: 0,Age,Fare,Parch,Pclass,Sex,SibSp,Survived,Title,abs_col
413,28.0,2.202765,0,3,1,0,,2,1
414,39.0,4.699571,0,1,0,0,,3,1
415,38.5,2.110213,0,3,1,0,,2,1
416,28.0,2.202765,0,3,1,0,,2,0
417,28.0,3.150952,1,3,1,1,,0,0


In [43]:
# get train and test data set: rows with a Survived label are training rows,
# rows where Survived is missing are test rows
has_label = all_data['Survived'].notnull()
train_tmp = all_data[has_label]
test_tmp = all_data[~has_label]

## Fit logistic regression model

In [44]:
# Model inputs (same columns for both splits) and the training target.
feature_cols = ['Age', 'Fare', 'Parch', 'Pclass', 'Sex', 'SibSp', 'Title', 'abs_col']
train_x = train_tmp[feature_cols]
train_y = train_tmp['Survived']
test_x = test_tmp[feature_cols]

In [45]:
# L2-regularised logistic regression; every constructor argument is spelled out explicitly.
lr_params = dict(penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True,
                 intercept_scaling=1, class_weight=None, random_state=None,
                 solver='liblinear', max_iter=100, multi_class='ovr',
                 verbose=0, warm_start=False, n_jobs=1)
model = LogisticRegression(**lr_params)
model.fit(train_x, train_y)

# Hard class labels, cast to int because the Survived target is stored as float (0.0 / 1.0).
train_y_pred = model.predict(train_x).astype(int)
test_y_pred = model.predict(test_x).astype(int)

# Predicted probability of the second class (column 1 of predict_proba).
train_y_prob = model.predict_proba(train_x)[:, 1]
test_y_prob = model.predict_proba(test_x)[:, 1]

In [46]:
# Accuracy on the training data itself (the same rows the model was fitted on),
# so this is not an out-of-sample estimate.
model.score(train_x, train_y)

0.79797979797979801

In [47]:
# Make submission file
# PassengerId comes from the original `test` frame; test_y_pred lines up with it by
# position because test_x keeps the row order of `test`.
# NOTE: to_csv does not create directories, so ./results must already exist.
result = pd.DataFrame({'PassengerId': test['PassengerId'], 'Survived': test_y_pred})
result.to_csv('./results/Titanic.csv', index=False)

## Cross validation

In [48]:
# Same feature/target selection as in the model-fitting section, repeated verbatim
# before the cross-validation runs.
train_x = train_tmp[['Age', 'Fare', 'Parch', 'Pclass', 'Sex', 'SibSp', 'Title', 'abs_col']]
train_y = train_tmp['Survived']
test_x = test_tmp[['Age', 'Fare', 'Parch', 'Pclass', 'Sex', 'SibSp', 'Title', 'abs_col']]

In [49]:
# 5-fold cross-validated accuracy of a fresh, unfitted logistic-regression model.
# NOTE(review): the Age/Fare medians used for imputation were computed on the combined
# train + test frame before this step, so the folds share that preprocessing.
model = LogisticRegression(
    penalty='l2', dual=False, tol=0.0001, C=1.0,
    fit_intercept=True, intercept_scaling=1,
    class_weight=None, random_state=None,
    solver='liblinear', max_iter=100, multi_class='ovr',
    verbose=0, warm_start=False, n_jobs=1,
)
scores = cross_val_score(model, train_x, train_y, scoring='accuracy', cv=5)
print('Accuracy:\t', *scores)

Accuracy:	 0.782122905028 0.793296089385 0.775280898876 0.780898876404 0.813559322034


In [50]:
# 5-fold cross-validated ROC AUC of a fresh, unfitted logistic-regression model.
model = LogisticRegression(
    penalty='l2', dual=False, tol=0.0001, C=1.0,
    fit_intercept=True, intercept_scaling=1,
    class_weight=None, random_state=None,
    solver='liblinear', max_iter=100, multi_class='ovr',
    verbose=0, warm_start=False, n_jobs=1,
)
scores = cross_val_score(model, train_x, train_y, scoring='roc_auc', cv=5)
print('AUC:\t', *scores)

AUC:	 0.836890645586 0.836956521739 0.840240641711 0.849398395722 0.88262277388


In [51]:
# Sweep 100 evenly spaced probability cut-offs over [0, 1] and print the accuracy each
# one achieves on the training set (train_y_prob comes from the model fitted on that
# same data, so these accuracies are in-sample).
n_train = len(train_y)
for cutoff in np.linspace(0, 1, 100):
    pred = (train_y_prob > cutoff).astype(int)
    accuracy = np.sum(pred == train_y) / n_train
    print(cutoff, '\t', accuracy)

0.0 	 0.383838383838
0.010101010101 	 0.388327721661
0.020202020202 	 0.389450056117
0.030303030303 	 0.391694725028
0.040404040404 	 0.396184062851
0.0505050505051 	 0.414141414141
0.0606060606061 	 0.420875420875
0.0707070707071 	 0.443322109989
0.0808080808081 	 0.460157126824
0.0909090909091 	 0.485970819304
0.10101010101 	 0.514029180696
0.111111111111 	 0.593714927048
0.121212121212 	 0.631874298541
0.131313131313 	 0.664421997755
0.141414141414 	 0.683501683502
0.151515151515 	 0.693602693603
0.161616161616 	 0.708193041526
0.171717171717 	 0.712682379349
0.181818181818 	 0.725028058361
0.191919191919 	 0.728395061728
0.20202020202 	 0.738496071829
0.212121212121 	 0.738496071829
0.222222222222 	 0.744107744108
0.232323232323 	 0.74859708193
0.242424242424 	 0.755331088664
0.252525252525 	 0.762065095398
0.262626262626 	 0.771043771044
0.272727272727 	 0.775533108866
0.282828282828 	 0.776655443322
0.292929292929 	 0.778900112233
0.30303030303 	 0.778900112233
0.313131313131 	 0

In [52]:
# Make submission file using a custom probability cut-off for the positive class.
# 0.545454545455 is approximately 54/99, i.e. one of the grid points of np.linspace(0, 1, 100).
# NOTE(review): presumably chosen from the training-set threshold sweep — confirm.
threshold = 0.545454545455
pred = (test_y_prob > threshold).astype(int)
result = pd.DataFrame({'PassengerId': test['PassengerId'], 'Survived': pred})
result.to_csv('./results/Titanic2.csv', index=False)