In [381]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingClassifier
import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [382]:
#reading the csv files into dataframes
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

combine = [train, test]


In [383]:
test_df = test.copy()
test_df.columns

Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [384]:
## Going through the dataset and check how much missing value it has.
train.isnull().sum()/train.isnull().count().sort_values(ascending = False)

PassengerId    0.000000
Survived       0.000000
Pclass         0.000000
Name           0.000000
Sex            0.000000
Age            0.198653
SibSp          0.000000
Parch          0.000000
Ticket         0.000000
Fare           0.000000
Cabin          0.771044
Embarked       0.002245
dtype: float64

In [385]:
test.isnull().sum()/test.isnull().count().sort_values(ascending = False)

PassengerId    0.000000
Pclass         0.000000
Name           0.000000
Sex            0.000000
Age            0.205742
SibSp          0.000000
Parch          0.000000
Ticket         0.000000
Fare           0.002392
Cabin          0.782297
Embarked       0.000000
dtype: float64

In [386]:
## dropping cabin column
train.drop('Cabin', axis =1 , inplace = True)
test.drop('Cabin', axis = 1, inplace = True)

In [387]:
title = []
for dataset in combine:
    #dataset.loc[dataset['Name']]
    #for s in dataset['Name']:
    #    title.append((s.split(",")[1].split(".")[0].strip()))
    dataset['Title'] = dataset['Name'].str.split(", ", expand=True)[1].str.split(".", expand=True)[0]
#s['Title']= title
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Title
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S,Mr
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,Mrs
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S,Miss
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S,Mrs
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S,Mr


In [388]:
print("Avg age of 'Miss' Title travelling without Parents:", round(train[(train.Title=="Miss") & (train.Parch==0)]['Age'].mean()))

Avg age of 'Miss' Title travelling without Parents: 28


In [389]:
print ("Avg age of 'Miss' Title travelling with Parents:", round(train[(train.Title=="Miss") & (train.Parch!=0)]['Age'].mean()))

Avg age of 'Miss' Title travelling with Parents: 12


In [390]:
for dataset in combine:
    dataset.loc[(dataset.Title == "Miss") & (dataset.Parch!= 0), 'Title'] = "FemaleChild"

In [391]:
for df in combine:
    
    df.loc[df['Title']== 'FemaleChild', 'Age'] = 12
    
    df['Fare'].fillna(df['Fare'].median(), inplace = True)
    
    df['Embarked'].fillna(df['Embarked'].mode()[0], inplace = True)
    
    df['Family Size'] = df['SibSp'] + df['Parch']

    df['Lonely'] = np.where(df['Family Size']!= 0, 0,1)

In [392]:
train.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Title,Family Size,Lonely
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S,Mr,1,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,Mrs,1,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S,Miss,0,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S,Mrs,1,0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S,Mr,0,1
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,Q,Mr,0,1
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,S,Mr,0,1
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,S,Master,4,0
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,S,Mrs,2,0
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,C,Mrs,1,0


In [393]:
means = train.groupby(['Sex', 'Title'])
train.Age = means.Age.apply(lambda x: x.fillna(x.mean()))

means = test.groupby(['Sex', 'Title'])
test.Age = means.Age.apply(lambda x: x.fillna(x.mean()))

In [394]:
test.isnull().sum()

PassengerId    0
Pclass         0
Name           0
Sex            0
Age            1
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
Title          0
Family Size    0
Lonely         0
dtype: int64

In [395]:
train[(train.Title == "Ms")]['Age'].mean()

28.0

In [396]:
test.loc[test['Title']== 'Ms', 'Age'] = 28

In [397]:
for df in combine:
    df.drop('Name', axis = 1, inplace = True)
    df.drop('Ticket', axis =1, inplace = True)
    df.drop('Title', axis = 1, inplace = True)
    df.drop('PassengerId', axis = 1, inplace = True)
    df.drop('SibSp', axis = 1, inplace =  True)
    df.drop('Parch', axis = 1, inplace = True)
    #df.drop('Family Size', axis= 1, inplace = True)
    
    df['Sex'] = pd.Categorical(df['Sex'])
    df['Embarked'] = pd.Categorical(df['Embarked'])
    #df['Lonely'] = pd.Categorical(df['Lonely'])
    
    #df = pd.get_dummies(df, drop_first = True)

In [398]:
#Convert categorical to dummy 

train = pd.get_dummies(train, drop_first = True)
test = pd.get_dummies(test, drop_first = True)

In [399]:
train.head()

Unnamed: 0,Survived,Pclass,Age,Fare,Family Size,Lonely,Sex_male,Embarked_Q,Embarked_S
0,0,3,22.0,7.25,1,0,1,0,1
1,1,1,38.0,71.2833,1,0,0,0,0
2,1,3,26.0,7.925,0,1,0,0,1
3,1,1,35.0,53.1,1,0,0,0,1
4,0,3,35.0,8.05,0,1,1,0,1


In [400]:
test.head()

Unnamed: 0,Pclass,Age,Fare,Family Size,Lonely,Sex_male,Embarked_Q,Embarked_S
0,3,34.5,7.8292,0,1,1,1,0
1,3,47.0,7.0,1,0,0,0,1
2,2,62.0,9.6875,0,1,1,1,0
3,3,27.0,8.6625,0,1,1,0,1
4,3,22.0,12.2875,2,0,0,0,1


In [401]:
X = train.loc[:, train.columns!= "Survived"]
y=  train['Survived']

Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size = .2, random_state = 2206)

In [402]:
scaler = MinMaxScaler()
scaled_train = scaler.fit_transform(X)
scaled_test = scaler.transform(Xtest)

In [403]:
gbk = GradientBoostingClassifier()
gbk.fit(Xtrain, ytrain)
ypred = gbk.predict(Xtest)

print("Accuracy: {}".format(metrics.accuracy_score(ytest, ypred)))

Accuracy: 0.8603351955307262


In [404]:
test_df.columns

Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [405]:
#set ids as PassengerId and predict survival 
ids = test_df['PassengerId']
predictions = gbk.predict(test)

In [406]:
#set the output as a dataframe and convert to csv file named submission.csv
submission3 = pd.DataFrame({
    'PassengerId': ids,
    'Survived': predictions,
})

submission3

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [407]:
# Export.

submission3.to_csv('submission3.csv', index=False)