In [1214]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import KFold
from sklearn import linear_model
import random
import csv
from sklearn import svm
from scipy.stats import mode
import string
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import sklearn.ensemble as ske
from patsy import dmatrices
from sklearn.metrics import accuracy_score

In [1215]:
def substrings_in_string(big_string, substrings):
    for substring in substrings:
        #if string.find(big_string, substring) != -1:
        if big_string.find(substring) != -1:
            return substring
    print(big_string)
    return np.nan

In [1216]:
def phase1clean(df):
    #setting silly values to nan
    df.Fare = df.Fare.map(lambda x: np.nan if x==0 else x)
    
    #Special case for cabins as nan may be signal
    df.Cabin = df.Cabin.fillna('Unknown')    

    #creating a title column from name
    title_list=['Mrs', 'Mr', 'Master', 'Miss', 'Major', 'Rev',
                'Dr', 'Ms', 'Mlle','Col', 'Capt', 'Mme', 'Countess',
                'Don', 'Jonkheer']

    df['Title']=df['Name'].map(lambda x: substrings_in_string(x, title_list))
    
    #replacing all titles with mr, mrs, miss, master
    def replace_titles(x):
        title=x['Title']
        if title in ['Don', 'Major', 'Capt', 'Jonkheer', 'Rev', 'Col']:
            return 'Mr'
        elif title in ['Countess', 'Mme']:
            return 'Mrs'
        elif title in ['Mlle', 'Ms']:
            return 'Miss'
        elif title =='Dr':
            if x['Sex']=='Male':
                return 'Mr'
            else:
                return 'Mrs'
        else:
            return title
    df['Title']=df.apply(replace_titles, axis=1)

    #Turning cabin number into Deck
    cabin_list = ['A', 'B', 'C', 'D', 'E', 'F', 'T', 'G', 'Unknown']
    df['Deck']=df['Cabin'].map(lambda x: substrings_in_string(x, cabin_list))
        
    #Creating new family_size column
    df['Family_Size']=df['SibSp']+df['Parch']
    
    return df

In [1217]:
def phase2clean(train, test):
    #data type dictionary
    data_type_dict={'Pclass':'ordinal', 'Sex':'nominal', 
                    'Age':'numeric', 
                    'Fare':'numeric', 'Embarked':'nominal', 'Title':'nominal',
                    'Deck':'nominal', 'Family_Size':'ordinal'}      

    #imputing nan values
    for df in [train, test]:
        #classmeans = df.pivot_table('Fare', rows='Pclass', aggfunc='mean')
        classmeans = df.pivot_table('Fare', columns='Pclass', aggfunc='mean')
        df.Fare = df[['Fare', 'Pclass']].apply(lambda x: classmeans[x['Pclass']] if pd.isnull(x['Fare']) else x['Fare'], axis=1 )
        meanAge=np.mean(df.Age)
        df.Age=df.Age.fillna(meanAge)
#         modeEmbarked = mode(df.Embarked)[0][0]
#         df.Embarked = df.Embarked.fillna(modeEmbarked)


#    Fare per person
    for df in [train, test]:
        df['Fare_Per_Person']=df['Fare']/(df['Family_Size']+1)
    
    #Age times class
    for df in [train, test]:
        df['Age*Class']=df['Age']*df['Pclass']
    
    data_type_dict['Fare_Per_Person']='numeric'
    data_type_dict['Age*Class']='numeric'
    
    return [train,test, data_type_dict]

In [1218]:
trainpath = 'train.csv'
testpath = 'test.csv'
traindf = pd.read_csv(trainpath)
testdf = pd.read_csv(testpath)

In [1219]:
traindf=phase1clean(traindf)
testdf=phase1clean(testdf)

In [1220]:
traindf.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,Deck,Family_Size
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,Unknown,S,Mr,Unknown,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs,C,1


In [1221]:
traindf, testdf, data_type_dict=phase2clean(traindf, testdf)

In [1222]:
traindf.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,Deck,Family_Size,Fare_Per_Person,Age*Class
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,Unknown,S,Mr,Unknown,1,3.625,66.0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs,C,1,35.6416,38.0


In [1223]:
surv = traindf.groupby('Survived')['PassengerId'].count()
print(surv)

Survived
0    549
1    342
Name: PassengerId, dtype: int64


In [1224]:
traindf.groupby(['Survived','Pclass'])['PassengerId'].count()

Survived  Pclass
0         1          80
          2          97
          3         372
1         1         136
          2          87
          3         119
Name: PassengerId, dtype: int64

In [1225]:
traindf.groupby(['Survived','Pclass','Sex'])['PassengerId'].count()

Survived  Pclass  Sex   
0         1       female      3
                  male       77
          2       female      6
                  male       91
          3       female     72
                  male      300
1         1       female     91
                  male       45
          2       female     70
                  male       17
          3       female     72
                  male       47
Name: PassengerId, dtype: int64

In [1226]:
traindf.groupby(['Survived','Title'])['PassengerId'].count()

Survived  Title 
0         Master     17
          Miss       54
          Mr        447
          Mrs        31
1         Master     23
          Miss      128
          Mr         84
          Mrs       107
Name: PassengerId, dtype: int64

In [1229]:
weight = surv[0]/surv[1]
print(weight)

1.60526315789


In [1230]:
testdf.head(2)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,Deck,Family_Size,Fare_Per_Person,Age*Class
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,Unknown,Q,Mr,Unknown,0,7.8292,103.5
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,Unknown,S,Mrs,Unknown,1,3.5,141.0


In [1231]:
print(data_type_dict)

{'Age*Class': 'numeric', 'Pclass': 'ordinal', 'Age': 'numeric', 'Title': 'nominal', 'Embarked': 'nominal', 'Fare': 'numeric', 'Sex': 'nominal', 'Deck': 'nominal', 'Family_Size': 'ordinal', 'Fare_Per_Person': 'numeric'}


In [1232]:
traindf.groupby('Title')['PassengerId'].count()

Title
Master     40
Miss      182
Mr        531
Mrs       138
Name: PassengerId, dtype: int64

In [1233]:
def genderclass(df):
    sexCate = []
    for item in list(df['Sex']):
        if item == 'female':
            sexCate.append(1)
        elif item == 'male':
            sexCate.append(2)
        else:
            sexCate.append(0)
    gencla = []        
    for i in range(len(sexCate)):
        temp = int(sexCate[i])*int(df['Pclass'][i])
        gencla.append(temp)
    return gencla

In [1234]:
traindf['genderclass'] = genderclass(traindf)

In [1236]:
testdf['genderclass'] = genderclass(testdf)

In [1237]:
def genderclassage(df):
    genclage = []        
    for i in range(df.shape[0]):
        temp = float(df['Age'][i])*float(df['genderclass'][i])
        genclage.append(temp)
    return genclage

In [1238]:
traindf['genderclassage'] = genderclassage(traindf)
testdf['genderclassage'] = genderclassage(testdf)

In [1239]:
traindf.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,Deck,Family_Size,Fare_Per_Person,Age*Class,genderclass,genderclassage
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,Unknown,S,Mr,Unknown,1,3.625,66.0,6,132.0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs,C,1,35.6416,38.0,1,38.0


In [1240]:
traindf['genderclassage'].quantile([0.25,0.5,0.75])

0.25     57.0
0.50     96.0
0.75    168.0
Name: genderclassage, dtype: float64

In [1273]:
def getgenclageCate(df):
    genclageCate=[]
    quan = list(df['genderclassage'].quantile([0.25,0.5,0.75]))
    for item in df['genderclassage']:
        if item < quan[0]:
            genclageCate.append('A')
        elif item < quan[1]:
            genclageCate.append('B')
        elif item < quan[2]:
            genclageCate.append('C')
        else:
            genclageCate.append('D')
    return genclageCate

In [1275]:
traindf['genclageCate'] = getgenclageCate(traindf)
testdf['genclageCate'] = getgenclageCate(testdf)

In [1277]:
traindf.groupby(['Survived','genclageCate'])['PassengerId'].count()

Survived  genclageCate
0         A                50
          B               107
          C               193
          D               199
1         A               172
          B               103
          C                42
          D                25
Name: PassengerId, dtype: int64

In [1278]:
traindf.groupby(['Survived','genderclass'])['PassengerId'].count()

Survived  genderclass
0         1                3
          2               83
          3               72
          4               91
          6              300
1         1               91
          2              115
          3               72
          4               17
          6               47
Name: PassengerId, dtype: int64

In [1279]:
def getgenclaCate(df):
    genclaCate=[]
    for item in df['genderclass']:
        if item==1:
            genclaCate.append('A')
        elif item==2:
            genclaCate.append('B')
        elif item==3:
            genclaCate.append('C')
        else:
            genclaCate.append('D')
    return genclaCate

In [1280]:
traindf['genclaCate'] = getgenclaCate(traindf)
testdf['genclaCate'] = getgenclaCate(testdf)

In [1281]:
traindf.groupby(['Survived','genclaCate'])['PassengerId'].count()

Survived  genclaCate
0         A               3
          B              83
          C              72
          D             391
1         A              91
          B             115
          C              72
          D              64
Name: PassengerId, dtype: int64

In [1282]:
traindf.groupby(['Survived','Title'])['PassengerId'].count()

Survived  Title 
0         Master     17
          Miss       54
          Mr        447
          Mrs        31
1         Master     23
          Miss      128
          Mr         84
          Mrs       107
Name: PassengerId, dtype: int64

In [1351]:
df = traindf
df_test = testdf

#### Force `NaN` to `Unknown`

In [1284]:
df.Embarked = df.Cabin.fillna('Unknown')  
df_test.Embarked = df_test.fillna('Unknown')

In [1285]:
df.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,...,Deck,Family_Size,Fare_Per_Person,Age*Class,genderclass,genderclassage,genclaCate,AgeCate,FareCate,genclageCate
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,...,Unknown,1,3.625,66.0,6,132.0,D,B,A,C
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,...,C,1,35.6416,38.0,1,38.0,A,G,H,A


In [1286]:
df_test.shape

(418, 22)

In [1287]:
df_test.head(2)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,...,Deck,Family_Size,Fare_Per_Person,Age*Class,genderclass,genderclassage,genclaCate,AgeCate,FareCate,genclageCate
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,Unknown,...,Unknown,0,7.8292,103.5,6,207.0,D,F,B,D
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,Unknown,...,Unknown,1,3.5,141.0,3,141.0,C,H,A,C


In [1288]:
df.groupby('Pclass')['PassengerId'].count()

Pclass
1    216
2    184
3    491
Name: PassengerId, dtype: int64

In [1289]:
df.groupby('SibSp')['PassengerId'].count()

SibSp
0    608
1    209
2     28
3     16
4     18
5      5
8      7
Name: PassengerId, dtype: int64

In [1290]:
df.groupby('Parch')['PassengerId'].count()

Parch
0    678
1    118
2     80
3      5
4      4
5      5
6      1
Name: PassengerId, dtype: int64

In [1291]:
df.groupby('Sex')['PassengerId'].count()

Sex
female    314
male      577
Name: PassengerId, dtype: int64

In [1292]:
AgeQuan = list(df['Age'].quantile([0.125, 0.25, 0.375, 0.5, 0.625, 0.75, 0.875, 1.0]))
print(AgeQuan)

[17.0, 22.0, 27.0, 29.69911764705882, 29.69911764705882, 35.0, 45.0, 80.0]


In [1293]:
def getAgeCategory(ageList):
    print(len(ageList))
    ageCate = []
    for age in (ageList):
        if age<=AgeQuan[0]:
            ageCate.append('A')
        elif age<=AgeQuan[1]:
            ageCate.append('B')
        elif age<=AgeQuan[2]:
            ageCate.append('C')
        elif age<=AgeQuan[3]:
            ageCate.append('D')
        elif age<=AgeQuan[4]:
            ageCate.append('E')
        elif age<=AgeQuan[5]:
            ageCate.append('F')
        elif age<=AgeQuan[6]:
            ageCate.append('G')
        elif age<=AgeQuan[7]:
            ageCate.append('H')
        else:
            ageCate.append('I')
    return ageCate

In [1294]:
df['AgeCate']  = getAgeCategory(df['Age'])
df_test['AgeCate']  = getAgeCategory(df_test['Age'])

891
418


In [1295]:
df.groupby(['Survived','AgeCate'])['PassengerId'].count()

Survived  AgeCate
0         A           52
          B           81
          C           63
          D          157
          F           62
          G           69
          H           65
1         A           61
          B           37
          C           43
          D           67
          F           51
          G           45
          H           38
Name: PassengerId, dtype: int64

In [1296]:
df_test.groupby('AgeCate')['PassengerId'].count()

AgeCate
A     41
B     61
C     65
D     18
F    128
G     53
H     52
Name: PassengerId, dtype: int64

In [1297]:
FareCate = list(df['Fare'].quantile([0.125, 0.25, 0.375, 0.5, 0.625, 0.75, 0.875, 1.0]))
print(FareCate)

[7.75, 7.9249999999999998, 10.5, 14.5, 25.671874999999996, 31.274999999999999, 69.549999999999997, 512.32920000000001]


In [1298]:
len(FareCate)

8

In [1299]:
def getFareCate(fareList):
    fareCate = []
    print(len(list(fareList)))
    for fare in list(fareList):
        if fare<FareCate[0]:
            fareCate.append('A')
        elif fare<FareCate[1]:
            fareCate.append('B')
        elif fare<FareCate[2]:
            fareCate.append('C')
        elif fare<FareCate[3]:
            fareCate.append('D')
        elif fare<FareCate[4]:
            fareCate.append('E')
        elif fare<FareCate[5]:
            fareCate.append('F')
        elif fare<FareCate[6]:
            fareCate.append('G')
        elif fare<FareCate[7]:
            fareCate.append('H')
        else:
            fareCate.append('I')
    return fareCate

In [1300]:
df['FareCate']  = getFareCate(df['Fare'])
df_test['FareCate']  = getFareCate(df_test['Fare'])

891
418


In [1301]:
df.groupby(['Survived','FareCate'])['PassengerId'].count()

Survived  FareCate
0         A           75
          B           90
          C           93
          D           69
          E           70
          F           54
          G           57
          H           41
1         A           16
          B           27
          C           23
          D           46
          E           48
          F           53
          G           53
          H           73
          I            3
Name: PassengerId, dtype: int64

In [1302]:
df_test.groupby('FareCate')['PassengerId'].count()

FareCate
A    47
B    65
C    41
D    57
E    54
F    44
G    53
H    56
I     1
Name: PassengerId, dtype: int64

In [1303]:
label = df['Survived']

In [1304]:
passId = df_test['PassengerId']

In [1305]:
delList = ['Name', 'Ticket','Cabin', 'Survived', 'Embarked','PassengerId']

In [1306]:
for col in df:
    if col in delList:
        del df[col]

In [1307]:
for col in df_test:
    if col in delList:
        del df_test[col]

In [1308]:
df.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Title,Deck,Family_Size,Fare_Per_Person,Age*Class,genderclass,genderclassage,genclaCate,AgeCate,FareCate,genclageCate
0,3,male,22.0,1,0,7.25,Mr,Unknown,1,3.625,66.0,6,132.0,D,B,A,C
1,1,female,38.0,1,0,71.2833,Mrs,C,1,35.6416,38.0,1,38.0,A,G,H,A
2,3,female,26.0,0,0,7.925,Miss,Unknown,0,7.925,78.0,3,78.0,C,C,C,B
3,1,female,35.0,1,0,53.1,Mrs,C,1,26.55,35.0,1,35.0,A,F,G,A
4,3,male,35.0,0,0,8.05,Mr,Unknown,0,8.05,105.0,6,210.0,D,F,C,D


In [1309]:
df_test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Title,Deck,Family_Size,Fare_Per_Person,Age*Class,genderclass,genderclassage,genclaCate,AgeCate,FareCate,genclageCate
0,3,male,34.5,0,0,7.8292,Mr,Unknown,0,7.8292,103.5,6,207.0,D,F,B,D
1,3,female,47.0,1,0,7.0,Mrs,Unknown,1,3.5,141.0,3,141.0,C,H,A,C
2,2,male,62.0,0,0,9.6875,Mr,Unknown,0,9.6875,124.0,4,248.0,D,H,C,D
3,3,male,27.0,0,0,8.6625,Mr,Unknown,0,8.6625,81.0,6,162.0,D,C,C,D
4,3,female,22.0,1,1,12.2875,Mrs,Unknown,2,4.09583,66.0,3,66.0,C,B,D,B


In [1310]:
df['Fare'] = df['Fare'].apply(lambda x: float(x))
df['Fare_Per_Person'] = df['Fare_Per_Person'].apply(lambda x: float(x))

In [1311]:
df_test['Fare'] = df_test['Fare'].apply(lambda x: float(x))
df_test['Fare_Per_Person'] = df_test['Fare_Per_Person'].apply(lambda x: float(x))

In [1312]:
def logit_cv(data,label,weig):
    logreg = linear_model.LogisticRegression(fit_intercept=True, C=1e5)
    kf = KFold(n_splits=10, shuffle = True)
    folds = kf.split(data)
    score = []
    for train,test in folds:
        data_train, data_test = data[train],data[test]
        label_train, label_test = label[train],label[test]
        weight = np.ones(len(label_train))
        weight[np.where(label_train>0)]= 1 #weig=1.6
        logreg.fit(data_train, label_train, sample_weight=weight)
        Z = logreg.score(data_test, label_test)
        score.append(Z)
    return [np.mean(score),logreg]

In [1313]:
def forward_fea_select_logit(data, label, n, weig):
    colist = list(data)
    selist = []
    left = len(colist)-n
    cm = []
    maxlist = []
    maxacc = 0
    while (len(colist)>left):
        maxfeaval = 0
        temp = 0
        maxfea = ''
        for col in colist:
            selist.append(col)
            fea = preprocessing.scale(data[selist])
            selist.remove(col)
            [temp,logreg] = logit_cv(fea,label,weig)
            if temp>maxfeaval:
                maxfeaval = temp
                maxfea = col
        colist.remove(maxfea)
        selist.append(maxfea)
        if maxfeaval>maxacc:
            maxacc = maxfeaval
            maxlist.append(maxfea)
        print('Feature List:',maxlist)
        print('Accuracy: %.2f'%maxacc)
    return [maxlist,logreg]

In [1314]:
df_dummy = pd.get_dummies(df)
df_dummy.shape

(891, 49)

In [1315]:
print(list(df_dummy))

['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Family_Size', 'Fare_Per_Person', 'Age*Class', 'genderclass', 'genderclassage', 'Sex_female', 'Sex_male', 'Title_Master', 'Title_Miss', 'Title_Mr', 'Title_Mrs', 'Deck_A', 'Deck_B', 'Deck_C', 'Deck_D', 'Deck_E', 'Deck_F', 'Deck_G', 'Deck_T', 'Deck_Unknown', 'genclaCate_A', 'genclaCate_B', 'genclaCate_C', 'genclaCate_D', 'AgeCate_A', 'AgeCate_B', 'AgeCate_C', 'AgeCate_D', 'AgeCate_F', 'AgeCate_G', 'AgeCate_H', 'FareCate_A', 'FareCate_B', 'FareCate_C', 'FareCate_D', 'FareCate_E', 'FareCate_F', 'FareCate_G', 'FareCate_H', 'FareCate_I', 'genclageCate_A', 'genclageCate_B', 'genclageCate_C', 'genclageCate_D']


In [1342]:
data1 = preprocessing.scale(df_dummy)

In [1343]:
logit_cv(data1,label,weig=1)

[0.82828963795255939,
 LogisticRegression(C=100000.0, class_weight=None, dual=False,
           fit_intercept=True, intercept_scaling=1, max_iter=100,
           multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
           solver='liblinear', tol=0.0001, verbose=0, warm_start=False)]

In [1347]:
N = 40
[selist,logreg] = forward_fea_select_logit(df_dummy, label, N, weight)

Feature List: ['Title_Mr']
Accuracy: 0.79
Feature List: ['Title_Mr', 'SibSp']
Accuracy: 0.82
Feature List: ['Title_Mr', 'SibSp', 'Deck_E']
Accuracy: 0.82
Feature List: ['Title_Mr', 'SibSp', 'Deck_E', 'AgeCate_F']
Accuracy: 0.82
Feature List: ['Title_Mr', 'SibSp', 'Deck_E', 'AgeCate_F', 'FareCate_I']
Accuracy: 0.83
Feature List: ['Title_Mr', 'SibSp', 'Deck_E', 'AgeCate_F', 'FareCate_I', 'genderclassage']
Accuracy: 0.83
Feature List: ['Title_Mr', 'SibSp', 'Deck_E', 'AgeCate_F', 'FareCate_I', 'genderclassage', 'genclaCate_C']
Accuracy: 0.83
Feature List: ['Title_Mr', 'SibSp', 'Deck_E', 'AgeCate_F', 'FareCate_I', 'genderclassage', 'genclaCate_C', 'genclageCate_A']
Accuracy: 0.84
Feature List: ['Title_Mr', 'SibSp', 'Deck_E', 'AgeCate_F', 'FareCate_I', 'genderclassage', 'genclaCate_C', 'genclageCate_A', 'Sex_male']
Accuracy: 0.84
Feature List: ['Title_Mr', 'SibSp', 'Deck_E', 'AgeCate_F', 'FareCate_I', 'genderclassage', 'genclaCate_C', 'genclageCate_A', 'Sex_male', 'Deck_D']
Accuracy: 0.84
Fe

In [1348]:
logreg = linear_model.LogisticRegression(fit_intercept=True, C=1e5)
data = preprocessing.scale(df_dummy[selist])
logreg.fit(data, label)

LogisticRegression(C=100000.0, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [1349]:
Z = logreg.predict(data)
accuracy_score(Z, label)

0.84175084175084181

In [1353]:
df_test_dummy = pd.get_dummies(df_test)
data_test = preprocessing.scale(df_test_dummy[selist])

In [1355]:
res_logit = logreg.predict(data_test)
print(res_logit)

[0 0 0 0 0 0 1 0 1 0 0 0 1 0 1 1 0 0 0 0 0 1 1 0 1 0 1 0 0 0 0 0 0 0 0 0 1
 1 0 0 0 1 0 1 1 0 0 0 1 0 0 0 1 1 0 0 0 0 0 1 0 0 0 0 1 1 1 0 0 1 1 0 1 1
 1 0 0 1 0 1 1 0 0 0 0 0 1 1 1 1 0 0 1 0 0 0 1 0 1 0 1 0 0 0 1 0 0 0 0 0 0
 1 1 1 1 0 0 1 0 1 1 0 1 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0
 0 0 1 0 0 0 0 0 1 1 1 1 1 1 1 0 0 0 0 0 1 0 0 0 0 0 0 1 1 0 1 1 0 0 1 0 1
 0 1 0 0 0 0 0 1 0 1 0 1 1 0 1 1 1 0 1 0 0 1 0 1 0 0 0 0 1 0 0 1 0 1 0 1 0
 1 0 1 1 0 1 0 0 0 1 0 0 0 0 0 0 1 1 1 1 0 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 1
 0 0 0 1 1 0 0 0 0 1 0 0 0 1 0 0 1 0 0 0 0 1 1 1 1 1 0 0 0 1 0 0 1 0 0 0 0
 1 0 0 0 0 1 0 0 1 1 0 1 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0
 0 0 0 0 0 0 1 0 0 0 1 0 1 0 0 0 1 1 0 0 0 1 1 1 0 0 0 0 1 1 0 1 0 0 0 1 0
 0 1 0 0 1 1 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 1 1 0 0 1 0 1 0 0 1 0 1 0 0 0 0
 0 1 1 1 1 1 0 1 0 0 0]


#### Random Forest

In [1363]:
def randomf_cv(data, label):
    clf = RandomForestClassifier(max_depth=100, random_state=0)
    kf = KFold(n_splits=10, shuffle = True)
    folds = kf.split(data)
    score = []
    for train,test in folds:
        data_train, data_test = data[train],data[test]
        label_train, label_test = label[train],label[test]
        clf.fit(data_train, label_train)
        Z = clf.score(data_test, label_test)
        score.append(Z)
    return [np.mean(score),clf]

In [1364]:
print(selist)

['Title_Mr', 'SibSp', 'Deck_E', 'AgeCate_F', 'FareCate_I', 'genderclassage', 'genclaCate_C', 'genclageCate_A', 'Sex_male', 'Deck_D', 'AgeCate_B', 'Age*Class']


In [1365]:
df1 = df_dummy

In [1366]:
# data = preprocessing.scale(df_dummy)
# [res, clf] = randomf_cv(data, label)
# print(res)

In [1367]:
df1['Survived'] = label

In [1368]:
df1.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Family_Size,Fare_Per_Person,Age*Class,genderclass,genderclassage,...,FareCate_E,FareCate_F,FareCate_G,FareCate_H,FareCate_I,genclageCate_A,genclageCate_B,genclageCate_C,genclageCate_D,Survived
0,3,22.0,1,0,7.25,1,3.625,66.0,6,132.0,...,0,0,0,0,0,0,0,1,0,0
1,1,38.0,1,0,71.2833,1,35.64165,38.0,1,38.0,...,0,0,0,1,0,1,0,0,0,1
2,3,26.0,0,0,7.925,0,7.925,78.0,3,78.0,...,0,0,0,0,0,0,1,0,0,1
3,1,35.0,1,0,53.1,1,26.55,35.0,1,35.0,...,0,0,1,0,0,1,0,0,0,1
4,3,35.0,0,0,8.05,0,8.05,105.0,6,210.0,...,0,0,0,0,0,0,0,0,1,0


In [1372]:
form = ''
selist.remove('Age*Class')
for fea in selist[:-1]:
    form = form + str(fea)+' + '
form = form + str(selist[-1])
print(form)

Title_Mr + SibSp + Deck_E + AgeCate_F + FareCate_I + genderclassage + genclaCate_C + genclageCate_A + Sex_male + Deck_D + AgeCate_B


In [1373]:
formula_ml = 'Survived ~ '+form
print(formula_ml)

Survived ~ Title_Mr + SibSp + Deck_E + AgeCate_F + FareCate_I + genderclassage + genclaCate_C + genclageCate_A + Sex_male + Deck_D + AgeCate_B


In [796]:
# formula_ml = 'Survived ~ Title_Mr + SibSp + Deck_E + Sex_male + Sex_female + FareCate_A + FareCate_B + AgeCate_H + FareCate_I + Age'

In [797]:
#formula_ml = 'Survived ~ Pclass + Age + Fare + Family_Size + Sex_female + Fare_Per_Person'

In [1103]:
#feaList = ['Pclass', 'Age', 'Fare', 'Family_Size', 'Sex_female', 'Fare_Per_Person']

In [1374]:
# Create the random forest model and fit the model to our training data
y, x = dmatrices(formula_ml, data=df1, return_type='dataframe')
# RandomForestClassifier expects a 1 demensional NumPy array, so we convert
y = np.asarray(y).ravel()
#instantiate and fit our model
results_rf = ske.RandomForestClassifier(n_estimators=100).fit(x, y)
# Score the results
score = results_rf.score(x, y)
print("Mean accuracy of Random Forest Predictions on the data was: {0}".format(score))

Mean accuracy of Random Forest Predictions on the data was: 0.9124579124579124


In [1375]:
print(x.head())

   Intercept  Title_Mr  SibSp  Deck_E  AgeCate_F  FareCate_I  genderclassage  \
0        1.0       1.0    1.0     0.0        0.0         0.0           132.0   
1        1.0       0.0    1.0     0.0        0.0         0.0            38.0   
2        1.0       0.0    0.0     0.0        0.0         0.0            78.0   
3        1.0       0.0    1.0     0.0        1.0         0.0            35.0   
4        1.0       1.0    0.0     0.0        1.0         0.0           210.0   

   genclaCate_C  genclageCate_A  Sex_male  Deck_D  AgeCate_B  
0           0.0             0.0       1.0     0.0        1.0  
1           0.0             1.0       0.0     0.0        0.0  
2           1.0             0.0       0.0     0.0        0.0  
3           0.0             1.0       0.0     0.0        0.0  
4           0.0             0.0       1.0     0.0        0.0  


In [1376]:
df2 = df_test_dummy[selist]

In [1377]:
df2.insert(0, 'Intercept', 1)
# df2['Intercept'] = 1

In [1378]:
df2.head()

Unnamed: 0,Intercept,Title_Mr,SibSp,Deck_E,AgeCate_F,FareCate_I,genderclassage,genclaCate_C,genclageCate_A,Sex_male,Deck_D,AgeCate_B
0,1,1,0,0,1,0,207.0,0,0,1,0,0
1,1,0,1,0,0,0,141.0,1,0,0,0,0
2,1,1,0,0,0,0,248.0,0,0,1,0,0
3,1,1,0,0,0,0,162.0,0,0,1,0,0
4,1,0,1,0,0,0,66.0,1,0,0,0,1


In [1379]:
res = []
res = list(results_rf.predict(df2))
res_ran = [int(i) for i in res]
print(res_ran)

[0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 

#### SVM RBF Kernel

In [1329]:
def svm_cv(data,label):
    C = 1#1.0 
    clf1 = svm.SVC(kernel='rbf', gamma=0.7, C=C)
    clf2 = svm.SVC(kernel='poly', degree=3, C=C)
    clf3 = svm.SVC(kernel='linear', C=C)
    clf = LogisticRegression(C=C, penalty='l1', tol=0.01)
    
    kf = KFold(n_splits=5, shuffle = True)
    folds = kf.split(data)
    score = []
    for train,test in folds:
        data_train, data_test = data[train],data[test]
        label_train, label_test = label[train],label[test]
        clf1.fit(data_train, label_train)
        clf2.fit(data_train, label_train)
        clf3.fit(data_train, label_train)
        clf.fit(data_train, label_train)
        
        res1 = clf1.predict(data_test)
        res2 = clf2.predict(data_test)
        res3 = clf3.predict(data_test)
        res = clf.predict(data_test)
        
        #pred = (res1+res2+res3+res)/4.0
        pred = res
        pred = [int(i) for i in pred]
        print(pred[:5])
#         Z = clf.score(data_test, label_test)
#         score.append(Z)
        Z = accuracy_score(label_test, pred)
        print(Z)
        score.append(Z)
        model = [clf1, clf2, clf3, clf]
    return [np.mean(score),model]

In [1113]:
#C = 1.0  # SVM regularization parameter
# models = (svm.SVC(kernel='linear', C=C),
#           svm.LinearSVC(C=C),
#           svm.SVC(kernel='rbf', gamma=0.7, C=C),
#           svm.SVC(kernel='poly', degree=3, C=C))
# models = (clf.fit(X, y) for clf in models)
#clf = svm.SVC(kernel='rbf', gamma=0.7, C=C)

In [1330]:
if 'Survived' in list(df_dummy):
    del df_dummy['Survived']

In [1331]:
#data = preprocessing.scale(df_dummy[selist])
data = preprocessing.scale(df_dummy)

In [1332]:
[res, model] = svm_cv(data, label)

[0, 1, 0, 1, 0]
0.793296089385
[1, 0, 0, 0, 0]
0.814606741573
[1, 0, 1, 0, 0]
0.887640449438
[0, 1, 0, 0, 0]
0.803370786517
[1, 1, 0, 0, 0]
0.803370786517


In [1333]:
print(res)

0.820456970686


#### Ensembling method

In [1334]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import AdaBoostClassifier

In [1335]:
# data = preprocessing.scale(df_dummy[selist])
if 'Survived' in list(df_dummy):
    del df_dummy['Survived']
data = preprocessing.scale(df_dummy[selist])

In [1336]:
df_dummy[selist].head()

Unnamed: 0,Title_Mr,SibSp,Pclass,genclaCate_C,Fare,Age*Class,FareCate_I,FareCate_C
0,1,1,3,0,7.25,66.0,0,0
1,0,1,1,0,71.2833,38.0,0,0
2,0,0,3,1,7.925,78.0,0,1
3,0,1,1,0,53.1,35.0,0,0
4,1,0,3,0,8.05,105.0,0,1


In [1337]:
clf = AdaBoostClassifier(n_estimators=100)
scores = cross_val_score(clf, data, label)
scores.mean()  

0.8047138047138046

In [1338]:
print(label[:5])

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64


In [1339]:
data.shape

(891, 8)

In [1340]:
X, y = data, label
C = 1.0
clf1 = LogisticRegression(random_state=1)
clf2 = RandomForestClassifier(random_state=1)
clf3 = GaussianNB()
#clf1 = svm.SVC(kernel='rbf', gamma=0.7, C=C)
# clf2 = svm.SVC(kernel='poly', degree=3, C=C)
# clf3 = svm.SVC(kernel='linear', C=C)

eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='hard')

for clf, method_label in zip([clf1, clf2, clf3, eclf], ['Logistic Regression', 'Random Forest', 'naive Bayes', 'Ensemble']):
    scores = cross_val_score(clf, X, y, cv=5, scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), method_label))

Accuracy: 0.82 (+/- 0.03) [Logistic Regression]
Accuracy: 0.81 (+/- 0.03) [Random Forest]
Accuracy: 0.63 (+/- 0.02) [naive Bayes]
Accuracy: 0.82 (+/- 0.02) [Ensemble]


In [1341]:
eclf.fit(X,y)

VotingClassifier(estimators=[('lr', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=1, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)), ('rf', RandomFore...=False, random_state=1,
            verbose=0, warm_start=False)), ('gnb', GaussianNB(priors=None))],
         n_jobs=1, voting='hard', weights=None)

In [1127]:
data_test = preprocessing.scale(df_test_dummy[selist])

In [1128]:
data_test.shape

(418, 8)

In [1129]:
data.shape

(891, 8)

In [1130]:
print(list(df_test_dummy[selist]))

['Title_Mr', 'SibSp', 'Deck_E', 'FareCate_I', 'AgeCate_C', 'AgeCate_F', 'FareCate_C', 'Sex_male']


In [1131]:
print(list(df_dummy[selist]))

['Title_Mr', 'SibSp', 'Deck_E', 'FareCate_I', 'AgeCate_C', 'AgeCate_F', 'FareCate_C', 'Sex_male']


In [1132]:
result2 = list(eclf.predict(data_test))
print(result2)

[0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 

#### Voting Classifier with GridSearch

In [860]:
from sklearn.model_selection import GridSearchCV
clf1 = LogisticRegression(random_state=1)
clf2 = RandomForestClassifier(random_state=1)
clf3 = GaussianNB()
eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='soft')

params = {'lr__C': [1.0, 100.0], 'rf__n_estimators': [20, 200],}

grid = GridSearchCV(estimator=eclf, param_grid=params, cv=5)
grid = grid.fit(data, label)

In [861]:
scores = cross_val_score(grid, data, label)

In [862]:
print(scores)

[ 0.77777778  0.8047138   0.80808081]


#### Predict test data

In [388]:
# df_test_dummy = pd.get_dummies(df_test)
# data_test = preprocessing.scale(df_test_dummy[selist])
# res = list(logreg.predict(data_test))
# print(res)

In [1380]:
res = res_ran
# res = result2
# res = list(res_logit)

In [1381]:
passId = list(passId)
print(passId)

['PassengerId', 892, 893, 894, 895, 896, 897, 898, 899, 900, 901, 902, 903, 904, 905, 906, 907, 908, 909, 910, 911, 912, 913, 914, 915, 916, 917, 918, 919, 920, 921, 922, 923, 924, 925, 926, 927, 928, 929, 930, 931, 932, 933, 934, 935, 936, 937, 938, 939, 940, 941, 942, 943, 944, 945, 946, 947, 948, 949, 950, 951, 952, 953, 954, 955, 956, 957, 958, 959, 960, 961, 962, 963, 964, 965, 966, 967, 968, 969, 970, 971, 972, 973, 974, 975, 976, 977, 978, 979, 980, 981, 982, 983, 984, 985, 986, 987, 988, 989, 990, 991, 992, 993, 994, 995, 996, 997, 998, 999, 1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015, 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023, 1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031, 1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039, 1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047, 1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055, 1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063, 1064, 1065, 1066, 1067, 1068, 1069, 1070, 1071, 1072, 1073, 

In [1382]:
if passId[0]!="PassengerId":
    passId.insert(0, "PassengerId")
if res[0]!="Survived":
    res.insert(0, "Survived")
result = [passId, res]
print(result[0][:10])
print(result[1][:10])

['PassengerId', 892, 893, 894, 895, 896, 897, 898, 899, 900]
['Survived', 0, 0, 1, 1, 0, 0, 0, 0, 1]


In [1383]:
with open('predict.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerows(np.transpose(result))