In [None]:
# !pip install kaggle
# from google.colab import files
# files.upload()
# !mkdir -p ~/.kaggle
# !cp kaggle.json ~/.kaggle/
# !chmod 600 ~/.kaggle/kaggle.json
# !kaggle competitions download -c titanic
# !ls

In [None]:
# The usuals
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Regular expressions
import re

from sklearn import preprocessing

# LightGBM\XGBoost\GBDT
import lightgbm as lgb
import xgboost as xgb
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression

# Sklearn tools for model training and assessment
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_curve, auc, accuracy_score
from sklearn.model_selection import cross_val_score, ShuffleSplit

from sklearn.ensemble import VotingClassifier

In [None]:
trainRaw = pd.read_csv('train.csv')
testRaw = pd.read_csv('test.csv')

# Concatenate together
TrainRow = trainRaw.shape[0]
allsets = pd.concat([trainRaw, testRaw], axis=0,sort=True)

In [None]:
# Build new features from Cabin
# Seperate Canbin to Letter and Number for who has Canbin
def CabSplit(s):
    """
    Function to try and extract cabin letter and number from the cabin column.
    Runs a regular expression that finds letters and numbers in the
    string. These are held in match.group, if they exist.
    """
    match = re.match(r"([a-z]+)([0-9]+)",s,re.I) #re.I 使匹配对大小写不敏感

    try:
        letter = match.group(1) #满足([a-z]+)的部分
    except:
        letter = ''

    try:
        number = match.group(2) #满足([0-9]+)的部分
    except:
        number = 9999
    return letter, number

# Count the number of Canbin
def DR(s):
    """
    From the cabin string, try and extract letter, number, and number of cabins
    """
    # Check contents
    if isinstance(s, (int,float)): 
      # 若s是int或者float(只有数字形式) 
      # 说明没有Cabin信息 为空 
       letter = ''
       number = ''
       nCabins = 9999
    else:
       # If field isn't empty, split string to letter and unmber. 
       # Some strings contain multiple cabins.
       s = s.split(' ')
       # Count the cabins based on number of splits
       nCabins = len(s)
       # Just take first cabin for letter/number extraction
       s = s[0]

       letter, number = CabSplit(s)

    return [letter, number, nCabins]

# Apply DR function to each cell in Cabin column using pandas apply method.
out = allsets['Cabin'].apply(DR)

# Output tuple with 3 values for each row, convert this to pandas df
out = out.apply(pd.Series)

# name the columns
out.columns = ['Cabletter','Cabnumber','Cabcount']

# Concatenate these columns to the dataset
allsets = pd.concat([allsets,out],axis = 1)      


In [None]:
# Add some family features directly to new columns in the dataset

#size
allsets['fsize'] = allsets['SibSp'] + allsets['Parch'] + 1

#Ratio
allsets['fRatio'] = (allsets['Parch'] + 1)/(allsets['SibSp'] + 1)

#Adult
allsets['Adult'] = allsets['Age'] > 18

In [None]:
# Extract titles from Name column, standardise
titleDict = {
    "Capt": "Officer",
    "Col": "Officer",
    "Major": "Officer",
    "Jonkheer": "Sir",
    "Don": "Sir",
    "Sir": "Sir",
    "Dr": "Dr",
    "Rev": "Rev",
    "theCountess": "Lady",
    "Dona": "Lady",
    "Mme": "Mrs",
    "Mlle": "Miss",
    "Ms": "Mrs",
    "Mr": "Mr",
    "Mrs": "Mrs",
    "Miss": "Miss",
    "Master": "Master",
    "Lady": "Lady"
}

def NameSplit(s, titleDict):
    """
    Extract title from name, replace with value in title dictionary. Also
    return surname.
    """

    # Remove '.' from name string
    s = s.replace('.','')
    # Split on spaces
    s = s.split(' ')
    # get surname
    surname = s[0]

    #get title - loop over titleDict
    #if s matches a key, take the corresponding value as the tile
    title = []
    for k, t in titleDict.items():
      if str(k) in s:
        title.append(t)
        
    if title == []:
        title = 'other'
    else:
        # Title is a list, so extract contents
        title = title[0]
    return surname.strip(','), title

out = allsets['Name'].apply(NameSplit,args=[titleDict])

out = out.apply(pd.Series)
out.columns = ['Surname','Title']
allsets = pd.concat([allsets, out],axis=1)


In [None]:
TicketDict = {
    "A./5.": "A",
    "A.5.": "A",
    "A/4": "A",
    "A/4.": "A",
    "A/5": "A",
    "A/5.": "A",
    "A/S": "A",
    "A4.": "A",
    "C": "C",
    "C.A.":"C",
    "CA.": "C",
    "C.A./": "C",
    "CA": "C",
    "F.C.": "FC",
    "F.C.C.": "FC",
    "LINE":"LINE",
    "PC": "PC",
    "PP": "PC",
    "SOTON/O.Q.": "SO",
    "SOTON/O2": "SO",
    "SOTON/OQ": "SO",
    "STON/O" : "ST",
    "STON/O2.": "ST"
}

def splitTic(s):

    match = re.match(r"([0-9]+)", s, re.I)

    try:
        number = int(match.group(1))
    except:
        number = s

    return number


def SP(s, TicketDict):
    s = splitTic(s)
    if isinstance(s, (int)):
       if s <= 9999:
          tic = 'a'
       if ((s >= 10000) and (s < 99999)):
          tic = 'b'
       if ((s >= 100000) and (s < 999999)):
          tic = 'c'
       else:
          tic = 'd'
    else:
       s = s.split(' ')
      
       tic = [t for k, t in TicketDict.items() if str(k) in s]
      
       if tic == []:
        tic = 'Other'
       else:
        # Title is a list, so extract contents
        tic = tic[0]
      
    return tic

# out = allsets['Ticket'].apply(splitTic, args=[TicketDict])
out = allsets['Ticket'].apply(SP, args = [TicketDict])
out = out.apply(pd.Series)
out.columns = ['Tickettype']

allsets = pd.concat([allsets,out],axis=1)


In [None]:
def Farelevel(s):
    if s <= 10:
       fee = 'aa'
    elif s <= 20:
       fee = 'bb'
    elif s <= 30:
       fee = 'cc'
    elif s <= 40:
       fee = 'dd'
    elif s <= 80:
       fee = 'ee'
    elif s <= 100:
       fee = 'ff'
    elif s <= 200:
       fee = 'gg'
    else:
       fee = 'hh'
    return fee

out = allsets['Fare'].apply(Farelevel)
out = out.apply(pd.Series)
out.columns = ['Fee']

allsets = pd.concat([allsets, out],axis=1)

In [None]:
catCols = ['Sex', 'Embarked', 'Cabletter', 'Cabnumber', 'Surname', 'Title', 'Tickettype']

for c in catCols:
    allsets[c] = pd.factorize(allsets[c])[0]

# Replace missing age value with median
allsets['Age'] = allsets['Age'].fillna(allsets['Age'].median())
allsets['Fare'] = allsets['Fare'].fillna(allsets['Fare'].median())
allsets = allsets.drop(['Ticket', 'Cabin', 'Name'],axis=1)

In [None]:
# Split datasets
train = allsets.iloc[0:TrainRow,:]
test = allsets.iloc[TrainRow:,:]
test = test.drop(['Survived'],axis=1)

In [None]:
train

Unnamed: 0,Age,Embarked,Fare,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Cabletter,Cabnumber,Cabcount,fsize,fRatio,Adult,Surname,Title,Tickettype
0,22.0,0,7.2500,0,1,3,0,1,0.0,0,0,9999,2,0.5,True,0,0,0
1,38.0,1,71.2833,0,2,1,1,1,1.0,1,1,1,2,0.5,True,1,1,1
2,26.0,0,7.9250,0,3,3,1,0,1.0,0,0,9999,1,1.0,True,2,2,2
3,35.0,0,53.1000,0,4,1,1,1,1.0,1,2,1,2,0.5,True,3,1,3
4,35.0,0,8.0500,0,5,3,0,0,0.0,0,0,9999,1,1.0,True,4,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,27.0,0,13.0000,0,887,2,0,0,0.0,0,0,9999,1,1.0,True,658,5,3
887,19.0,0,30.0000,0,888,1,1,0,1.0,6,92,1,1,1.0,True,233,2,3
888,28.0,0,23.4500,2,889,3,1,1,0.0,0,0,9999,4,1.5,False,600,2,6
889,26.0,1,30.0000,0,890,1,0,0,1.0,1,93,1,1,1.0,True,659,0,3


In [None]:
test

Unnamed: 0,Age,Embarked,Fare,Parch,PassengerId,Pclass,Sex,SibSp,Cabletter,Cabnumber,Cabcount,fsize,fRatio,Adult,Surname,Title,Tickettype
0,34.5,2,7.8292,0,892,3,0,0,0,0,9999,1,1.0,True,261,0,3
1,47.0,0,7.0000,0,893,3,1,1,0,0,9999,2,0.5,True,661,1,3
2,62.0,2,9.6875,0,894,2,0,0,0,0,9999,1,1.0,True,662,0,3
3,27.0,0,8.6625,0,895,3,0,0,0,0,9999,1,1.0,True,663,0,3
4,22.0,0,12.2875,1,896,3,1,1,0,0,9999,3,1.0,True,394,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,28.0,0,8.0500,0,1305,3,0,0,0,0,9999,1,1.0,False,863,0,0
414,39.0,1,108.9000,0,1306,1,1,0,1,105,1,1,1.0,True,864,8,1
415,38.5,0,7.2500,0,1307,3,0,0,0,0,9999,1,1.0,True,865,0,7
416,28.0,0,8.0500,0,1308,3,0,0,0,0,9999,1,1.0,False,810,0,3


In [None]:
# Validation set 
X_trainData, X_validData, y_train, y_valid = train_test_split(train.drop(['Survived'],axis=1),
                                             train.Survived, 
                                             test_size = 0.3, 
                                             stratify=train.Survived,
                                             random_state=28)

LightGBM

In [None]:
lgbmodel = lgb.LGBMClassifier(
          boosting_type = 'gbdt', 
          max_depth = -1, 
          objective = 'binary',
          nthread = 3, 
          num_leaves = 6, 
          learning_rate =  0.5, 
          max_bin = 512, 
          subsample_for_bin = 150,
          subsample = 0.7,
          subsample_freq = 1,
          colsample_bytree = 0.65, 
          reg_alpha = 1, 
          reg_lambda = 3, 
          min_split_gain = 0.5,
          min_child_weight = 1,
          min_child_samples = 2,
          scale_pos_weight = 1,
          num_class = 1,
          metric = 'binary_error',
          n_estimators =100)

lgbmodel.fit(X_trainData, y_train, eval_set=[(X_trainData, y_train), (X_validData, y_valid)], eval_metric='error',early_stopping_rounds=50)
 

[1]	training's binary_error: 0.197432	valid_1's binary_error: 0.25
Training until validation scores don't improve for 50 rounds.
[2]	training's binary_error: 0.18138	valid_1's binary_error: 0.242537
[3]	training's binary_error: 0.166934	valid_1's binary_error: 0.238806
[4]	training's binary_error: 0.155698	valid_1's binary_error: 0.227612
[5]	training's binary_error: 0.155698	valid_1's binary_error: 0.242537
[6]	training's binary_error: 0.141252	valid_1's binary_error: 0.220149
[7]	training's binary_error: 0.146067	valid_1's binary_error: 0.208955
[8]	training's binary_error: 0.146067	valid_1's binary_error: 0.208955
[9]	training's binary_error: 0.142857	valid_1's binary_error: 0.197761
[10]	training's binary_error: 0.144462	valid_1's binary_error: 0.197761
[11]	training's binary_error: 0.131621	valid_1's binary_error: 0.182836
[12]	training's binary_error: 0.130016	valid_1's binary_error: 0.190299
[13]	training's binary_error: 0.133226	valid_1's binary_error: 0.190299
[14]	training's 

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=0.65,
               importance_type='split', learning_rate=0.5, max_bin=512,
               max_depth=-1, metric='binary_error', min_child_samples=2,
               min_child_weight=1, min_split_gain=0.5, n_estimators=100,
               n_jobs=-1, nthread=3, num_class=1, num_leaves=6,
               objective='binary', random_state=None, reg_alpha=1, reg_lambda=3,
               scale_pos_weight=1, silent=True, subsample=0.7,
               subsample_for_bin=150, subsample_freq=1)

In [None]:
lgbpredict = lgbmodel.predict(test)
lgbpredict

array([0., 0., 0., 0., 1., 0., 1., 0., 1., 0., 0., 0., 1., 0., 1., 1., 0.,
       0., 0., 1., 0., 0., 1., 0., 1., 0., 1., 0., 1., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 1., 0., 1., 0., 1., 1., 0.,
       0., 1., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 1., 1., 0.,
       1., 1., 1., 0., 1., 1., 1., 1., 0., 1., 0., 1., 0., 0., 0., 0., 0.,
       0., 1., 0., 1., 1., 1., 0., 1., 0., 0., 0., 1., 0., 0., 0., 1., 0.,
       0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 0., 0., 0., 0.,
       1., 1., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0.,
       0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
       0., 0., 0., 1., 0., 1., 1., 1., 0., 1., 0., 0., 0., 0., 0., 1., 1.,
       0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 0., 0., 1., 0., 1., 0., 1.,
       0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 1., 1., 1., 0., 1.,
       0., 0., 1., 0., 1., 0., 0., 0., 0., 1., 0., 1., 1., 0., 1., 0., 1.,
       0., 1., 0., 1., 1.

XGBoost

In [None]:
xgbmodel = xgb.XGBClassifier( 
            max_depth = 5,
            learning_rate = 0.5,
            n_estimators = 100,
            silent = True,
            objective = 'binary:logistic',
            nthread = -1,
            gamma = 0,
            min_child_weight = 5,
            max_delta_step = 0,
            subsampl = 0.7,
            colsample_bytre = 0.5,
            colsample_bylevel = 1,
            reg_alpha = 5,
            reg_lambda = 3,
            scale_pos_weight = 1,
            seed = 28,
            missing = None
)

xgbmodel.fit(X_trainData, y_train, eval_set=[(X_trainData, y_train), 
                                             (X_validData, y_valid)], 
                                              eval_metric='error',
                                              early_stopping_rounds=50)


[0]	validation_0-error:0.17496	validation_1-error:0.220149
Multiple eval metrics have been passed: 'validation_1-error' will be used for early stopping.

Will train until validation_1-error hasn't improved in 50 rounds.
[1]	validation_0-error:0.17496	validation_1-error:0.220149
[2]	validation_0-error:0.162119	validation_1-error:0.201493
[3]	validation_0-error:0.149278	validation_1-error:0.179104
[4]	validation_0-error:0.138042	validation_1-error:0.201493
[5]	validation_0-error:0.133226	validation_1-error:0.190299
[6]	validation_0-error:0.128411	validation_1-error:0.197761
[7]	validation_0-error:0.126806	validation_1-error:0.201493
[8]	validation_0-error:0.123596	validation_1-error:0.19403
[9]	validation_0-error:0.12199	validation_1-error:0.190299
[10]	validation_0-error:0.12199	validation_1-error:0.190299
[11]	validation_0-error:0.12199	validation_1-error:0.190299
[12]	validation_0-error:0.11557	validation_1-error:0.186567
[13]	validation_0-error:0.117175	validation_1-error:0.186567
[1

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytre=0.5, colsample_bytree=1,
              gamma=0, learning_rate=0.5, max_delta_step=0, max_depth=5,
              min_child_weight=5, missing=None, n_estimators=100, n_jobs=1,
              nthread=-1, objective='binary:logistic', random_state=0,
              reg_alpha=5, reg_lambda=3, scale_pos_weight=1, seed=28,
              silent=True, subsampl=0.7, subsample=1, verbosity=1)

In [None]:
xgbpredict = xgbmodel.predict(test)
xgbpredict

array([0., 1., 0., 0., 1., 0., 1., 0., 1., 0., 0., 0., 1., 0., 1., 1., 0.,
       0., 0., 1., 0., 0., 1., 0., 1., 0., 1., 0., 0., 0., 0., 0., 1., 1.,
       0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 1., 1., 0.,
       0., 1., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 1., 1., 0.,
       0., 1., 1., 0., 0., 0., 1., 0., 0., 1., 0., 1., 1., 0., 0., 0., 0.,
       0., 1., 1., 1., 1., 1., 0., 1., 0., 0., 0., 1., 0., 1., 0., 1., 0.,
       0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 0., 0., 1., 0.,
       1., 1., 0., 1., 0., 0., 1., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
       1., 0., 0., 1., 1., 0., 1., 1., 1., 1., 0., 0., 1., 0., 0., 1., 0.,
       0., 0., 0., 0., 0., 1., 1., 0., 1., 1., 0., 0., 1., 0., 1., 0., 1.,
       0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 1., 0., 1., 1., 0., 1., 1.,
       0., 0., 1., 0., 1., 0., 0., 0., 0., 1., 0., 0., 1., 0., 1., 0., 1.,
       0., 1., 0., 1., 1.

GBDT

In [None]:
gbdt = GradientBoostingClassifier(subsample=0.7,learning_rate=0.03,
                                  random_state=28, n_estimators=500,
                                  min_samples_leaf=3)
gbdt.fit(X_trainData, y_train)
train_pred = gbdt.predict(X_trainData)
valid_pred = gbdt.predict(X_validData)

acc_train = gbdt.score(X_trainData, y_train)
acc_valid = gbdt.score(X_validData, y_valid)

print(acc_train)
print(acc_valid)

0.9598715890850722
0.8171641791044776


In [None]:
gbdtpredict = gbdt.predict(test)
gbdtpredict

array([0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 1., 0., 1., 1., 0.,
       0., 0., 1., 0., 0., 1., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 1.,
       1., 0., 0., 0., 0., 1., 0., 0., 0., 1., 1., 0., 0., 0., 1., 1., 0.,
       0., 1., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 1., 1., 0.,
       1., 1., 0., 0., 0., 1., 1., 1., 0., 1., 0., 0., 1., 0., 0., 0., 0.,
       0., 1., 0., 1., 1., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 1., 0.,
       0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 1., 0., 1., 0., 0., 1., 1.,
       1., 1., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
       0., 0., 0., 1., 0., 1., 0., 1., 1., 1., 0., 0., 0., 0., 0., 1., 0.,
       0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 0., 1., 1., 0., 1., 0., 1.,
       0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 1., 1., 1., 1.,
       0., 0., 1., 0., 1., 0., 0., 0., 0., 1., 0., 1., 1., 0., 1., 0., 1.,
       0., 1., 0., 1., 1.

RF

In [None]:
rf = RandomForestClassifier(oob_score=True, max_depth=10, random_state=28, n_estimators=100, max_features=None)
rf.fit(train.drop(['Survived'],axis=1), train['Survived'])
print(rf.oob_score_)

0.835016835016835


Voting

In [None]:
eclf = VotingClassifier(estimators=[('lgb',lgbmodel),('xgb',xgbmodel),('gbdt',gbdt),('RF',rf)], voting='hard')
#使用投票法将三个模型结合在以前，estimotor采用 [(name1,clf1),(name2,clf2),...]这样的输入，和Pipeline的输入相同 voting='hard'表示硬投票
finaltrain = train.drop(['Survived'],axis=1)
finallabel = train.Survived

for clf, clf_name in zip([lgbmodel, xgbmodel, gbdt, rf, eclf],['lgb', 'xgb', 'gbdt', 'rf']):
    scores = cross_val_score(clf, finaltrain , finallabel, cv=5, scoring='accuracy')
    print('Accuracy: {:.2f} (+/- {:.2f}) [{}]'.format(scores.mean(), scores.std(), clf_name))


Accuracy: 0.81 (+/- 0.03) [lgb]
Accuracy: 0.81 (+/- 0.02) [xgb]
Accuracy: 0.80 (+/- 0.06) [gbdt]
Accuracy: 0.79 (+/- 0.09) [rf]


In [None]:
voting_clf = VotingClassifier(estimators=[
                              ('lgb',lgbmodel),
                              ('xgb',xgbmodel),
                              ('gbdt',gbdt),
                              ('RF',rf),
                              ], voting='soft')

voting_clf.fit(X_trainData, y_train)
voting_clf.score(X_validData, y_valid)
# 准确率：0.896

0.8059701492537313

In [None]:
vote = voting_clf.predict(test)
vote

array([0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 1., 1., 0.,
       0., 0., 1., 0., 0., 1., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 1.,
       0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 1., 0., 0., 0., 1., 1., 0.,
       0., 1., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 1., 1., 0.,
       1., 1., 0., 0., 0., 1., 1., 1., 0., 1., 0., 0., 1., 0., 0., 0., 0.,
       0., 1., 0., 1., 1., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 1., 0.,
       0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 1., 0., 1., 0., 0., 1., 1.,
       1., 1., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
       0., 0., 0., 1., 0., 1., 0., 1., 1., 1., 0., 0., 0., 0., 0., 1., 0.,
       0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 0., 1., 1., 0., 1., 0., 1.,
       0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 1., 1., 1., 1.,
       0., 0., 1., 0., 1., 0., 0., 0., 0., 1., 0., 1., 1., 0., 1., 0., 1.,
       0., 1., 0., 1., 1.

In [None]:
submission = pd.DataFrame()
submission['PassengerId'] = testRaw['PassengerId']
submission['Survived'] = np.int32(vote)

In [None]:
submission

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [None]:
submission.to_csv('Titanic.csv',index=False)
from google.colab import files
files.download('Titanic.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>