In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc

In [2]:
# Load Titanic dataset
url = 'data/titanic.csv'
titanic = pd.read_csv(url, index_col='PassengerId')

In [3]:
titanic.head(10)

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [4]:
titanic.dtypes

Survived      int64
Pclass        int64
Name         object
Sex          object
Age         float64
SibSp         int64
Parch         int64
Ticket       object
Fare        float64
Cabin        object
Embarked     object
dtype: object

In [5]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
Survived    891 non-null int64
Pclass      891 non-null int64
Name        891 non-null object
Sex         891 non-null object
Age         714 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Ticket      891 non-null object
Fare        891 non-null float64
Cabin       204 non-null object
Embarked    889 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB


In [6]:
titanic.describe()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [7]:
titanic.Age.isnull()

PassengerId
1      False
2      False
3      False
4      False
5      False
6       True
7      False
8      False
9      False
10     False
11     False
12     False
13     False
14     False
15     False
16     False
17     False
18      True
19     False
20      True
21     False
22     False
23     False
24     False
25     False
26     False
27      True
28     False
29      True
30      True
       ...  
862    False
863    False
864     True
865    False
866    False
867    False
868    False
869     True
870    False
871    False
872    False
873    False
874    False
875    False
876    False
877    False
878    False
879     True
880    False
881    False
882    False
883    False
884    False
885    False
886    False
887    False
888    False
889     True
890    False
891    False
Name: Age, Length: 891, dtype: bool

In [8]:
list = []
for i in range (0, len(titanic)):
    age = titanic.iloc[i].Age
    age = 'child' if age < 20 else 'adult' if age >= 20 else 'unknown'
    list.append(age)

In [9]:
list

['adult',
 'adult',
 'adult',
 'adult',
 'adult',
 'unknown',
 'adult',
 'child',
 'adult',
 'child',
 'child',
 'adult',
 'adult',
 'adult',
 'child',
 'adult',
 'child',
 'unknown',
 'adult',
 'unknown',
 'adult',
 'adult',
 'child',
 'adult',
 'child',
 'adult',
 'unknown',
 'child',
 'unknown',
 'unknown',
 'adult',
 'unknown',
 'unknown',
 'adult',
 'adult',
 'adult',
 'unknown',
 'adult',
 'child',
 'child',
 'adult',
 'adult',
 'unknown',
 'child',
 'child',
 'unknown',
 'unknown',
 'unknown',
 'unknown',
 'child',
 'child',
 'adult',
 'adult',
 'adult',
 'adult',
 'unknown',
 'adult',
 'adult',
 'child',
 'child',
 'adult',
 'adult',
 'adult',
 'child',
 'unknown',
 'unknown',
 'adult',
 'child',
 'child',
 'adult',
 'adult',
 'child',
 'adult',
 'adult',
 'adult',
 'adult',
 'unknown',
 'unknown',
 'child',
 'adult',
 'adult',
 'adult',
 'unknown',
 'adult',
 'child',
 'adult',
 'child',
 'unknown',
 'adult',
 'adult',
 'adult',
 'adult',
 'adult',
 'adult',
 'adult',
 'unknow

In [10]:
titanic['Age_modified'] = list

In [11]:
titanic.head(3)

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_modified
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,adult
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,adult
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,adult


In [12]:
Age_dummies = pd.get_dummies(titanic.Age_modified, prefix = 'Age')
Age_dummies.sample(n = 10)

Unnamed: 0_level_0,Age_adult,Age_child,Age_unknown
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
195,1,0,0
80,1,0,0
222,1,0,0
47,0,0,1
530,1,0,0
27,0,0,1
405,1,0,0
593,1,0,0
22,1,0,0
263,1,0,0


In [13]:
Embarked_dummies = pd.get_dummies(titanic.Embarked, prefix = 'Embarked')
Embarked_dummies.sample(n = 10)

Unnamed: 0_level_0,Embarked_C,Embarked_Q,Embarked_S
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
119,1,0,0
684,0,0,1
334,0,0,1
163,0,0,1
670,0,0,1
97,1,0,0
866,0,0,1
855,0,0,1
570,0,0,1
839,0,0,1


In [14]:
Sex_dummies = pd.get_dummies(titanic.Sex, prefix = 'Sex')
Sex_dummies.sample (n = 10)

Unnamed: 0_level_0,Sex_female,Sex_male
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1
323,1,0
742,0,1
625,0,1
671,1,0
339,0,1
47,0,1
376,1,0
768,1,0
693,0,1
764,1,0


In [15]:
data = pd.concat([titanic, Age_dummies, Embarked_dummies, Sex_dummies], axis = 1)

In [16]:
data.head(5)

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_modified,Age_adult,Age_child,Age_unknown,Embarked_C,Embarked_Q,Embarked_S,Sex_female,Sex_male
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,adult,1,0,0,0,0,1,0,1
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,adult,1,0,0,1,0,0,1,0
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,adult,1,0,0,0,0,1,1,0
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,adult,1,0,0,0,0,1,1,0
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,adult,1,0,0,0,0,1,0,1


In [17]:
data = data.drop(['Name', 'Sex', 'Age', 'Age_modified', 'Ticket', 'Fare', 'Cabin', 'Embarked'], axis = 1)

In [18]:
data.head(5)

Unnamed: 0_level_0,Survived,Pclass,SibSp,Parch,Age_adult,Age_child,Age_unknown,Embarked_C,Embarked_Q,Embarked_S,Sex_female,Sex_male
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,0,3,1,0,1,0,0,0,0,1,0,1
2,1,1,1,0,1,0,0,1,0,0,1,0
3,1,3,0,0,1,0,0,0,0,1,1,0
4,1,1,1,0,1,0,0,0,0,1,1,0
5,0,3,0,0,1,0,0,0,0,1,0,1


In [19]:
# 변수명 가져오기
col_names = data.columns.values

In [20]:
X = data[col_names[1:]]
Y = data[col_names[0]]

In [21]:
X.head(5)

Unnamed: 0_level_0,Pclass,SibSp,Parch,Age_adult,Age_child,Age_unknown,Embarked_C,Embarked_Q,Embarked_S,Sex_female,Sex_male
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,3,1,0,1,0,0,0,0,1,0,1
2,1,1,0,1,0,0,1,0,0,1,0
3,3,0,0,1,0,0,0,0,1,1,0
4,1,1,0,1,0,0,0,0,1,1,0
5,3,0,0,1,0,0,0,0,1,0,1


In [22]:
Y.head(5)

PassengerId
1    0
2    1
3    1
4    1
5    0
Name: Survived, dtype: int64

In [23]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=123)

In [24]:
# C가 클수록 weak regularization
C_set = [0.1, 1, 10, 1e2, 1e3, 1e4, 1e5, 1e6]

In [25]:
result = []

In [26]:
for C in C_set:
    logreg = LogisticRegression(penalty = 'l2', dual=True, C = C, class_weight='balanced')
    Y_score = logreg.fit(X_train, Y_train).decision_function(X_test)
    fpr, tpr, _ = roc_curve(Y_test, Y_score)
    result.append(auc(fpr, tpr))

In [27]:
result

[0.85084789311408027,
 0.85264645426515928,
 0.85427372387804046,
 0.85427372387804035,
 0.84853545734840696,
 0.82446899623158609,
 0.81761733470366571,
 0.83979958890030837]

In [28]:
max(result)

0.85427372387804046