In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.optimizers import SGD, RMSprop, Adagrad, Adadelta, Adam, Adamax, Nadam 
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier # wrapper to use function from sklearn
from tensorflow.keras import backend as be
from tensorflow.keras import layers, Sequential
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, GridSearchCV #training and testing data split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

import xgboost as xgb

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('C:\Projects\Python\Titanic\data'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# import plotting libraries
import matplotlib.pyplot as plt
#import sns for better plots, it is handy to manage subplots
import seaborn as sns

C:\Projects\Python\Titanic\data\test.csv
C:\Projects\Python\Titanic\data\train.csv


In [2]:
data = pd.read_csv("C:/Projects/Python/titanic/data/train.csv")
data.head(5)

gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
  try:
    for gpu in gpus:
      tf.config.experimental.set_memory_growth(gpu, True)
      tf.compat.v1.GPUOptions(per_process_gpu_memory_fraction=0.15)
  except RuntimeError as e:
    print(e)
tf.compat.v1.keras.backend.set_session(tf.compat.v1.Session());

In [3]:
# get mean aged based on titel
data['Initial']=0
for i in data:
    data['Initial']=data.Name.str.extract('([A-Za-z]+)\.') #lets extract the Salutations

data['Initial'].replace(['Mlle','Mme','Ms','Dr','Major','Lady','Countess','Jonkheer','Col','Rev','Capt','Sir','Don'],
                    ['Miss','Miss','Miss','Mr','Mr','Mrs','Mrs','Other','Other','Other','Mr','Mr','Mr'],inplace=True)

## Assigning the NaN Values with the Ceil values of the mean ages
data.loc[(data.Age.isnull())&(data.Initial=='Mr'),'Age']=33
data.loc[(data.Age.isnull())&(data.Initial=='Mrs'),'Age']=36
data.loc[(data.Age.isnull())&(data.Initial=='Master'),'Age']=5
data.loc[(data.Age.isnull())&(data.Initial=='Miss'),'Age']=22
data.loc[(data.Age.isnull())&(data.Initial=='Other'),'Age']=46
data.Age.isnull().any() #So no null values left finally 

# drop some irrelevant information
droplist = ['PassengerId', 'Name', 'Cabin', 'Ticket']
data.drop(droplist,axis=1,inplace=True)

# one hot encoding of categorical values
ohe_initial = pd.get_dummies(data['Initial'], prefix='title')
ohe_embarked = pd.get_dummies(data['Embarked'], prefix='embarked')
# turn sex into integers instead of string
data['Sex'] = data['Sex'].map( {'female': 0, 'male': 1} ).astype(int)

final_df = pd.merge(data, ohe_initial, left_index=True, right_index=True)
print(final_df)
final_df = pd.merge(final_df, ohe_embarked, left_index=True, right_index=True)
print(final_df)
droplist = ['Initial', 'Embarked']
final_df.drop(droplist,axis=1,inplace=True)

print(final_df)

#================================= end data preprocessing ================================

     Survived  Pclass  Sex   Age  SibSp  Parch     Fare Embarked Initial  \
0           0       3    1  22.0      1      0   7.2500        S      Mr   
1           1       1    0  38.0      1      0  71.2833        C     Mrs   
2           1       3    0  26.0      0      0   7.9250        S    Miss   
3           1       1    0  35.0      1      0  53.1000        S     Mrs   
4           0       3    1  35.0      0      0   8.0500        S      Mr   
..        ...     ...  ...   ...    ...    ...      ...      ...     ...   
886         0       2    1  27.0      0      0  13.0000        S   Other   
887         1       1    0  19.0      0      0  30.0000        S    Miss   
888         0       3    0  22.0      1      2  23.4500        S    Miss   
889         1       1    1  26.0      0      0  30.0000        C      Mr   
890         0       3    1  32.0      0      0   7.7500        Q      Mr   

     title_Master  title_Miss  title_Mr  title_Mrs  title_Other  
0               0    

In [4]:
#========================= start data prep
train, test= train_test_split(final_df, test_size=0.15, stratify=final_df["Survived"])

Xtrain = train[train.columns[1:]]
ytrain = train[train.columns[:1]]
Xtest = test[test.columns[1:]]
ytest = test[test.columns[:1]]

# for cross validation we need to take the complete dataset and pass it as it takes care of test train split
X = final_df[final_df.columns[1:]]
y = final_df[final_df.columns[:1]]

scaler = StandardScaler()
X_normed = scaler.fit_transform(X)

print(X_normed[0])

xtrain_val = Xtrain.values
print("xtrain shape before: " + str(xtrain_val.shape))
print("xtrain shape after: " + str(xtrain_val.shape))
ytrain_val = ytrain.values

xtest_val = Xtest.values
print(xtest_val.shape)
ytest_val = ytest.values

print(xtrain_val.shape)
print(ytrain_val.shape)
print(xtest_val.shape)
print(ytest_val.shape)

xtrain_normed = scaler.fit_transform(xtrain_val)
xtest_normed = scaler.fit_transform(xtest_val)

[ 0.82737724  0.73769513 -0.59077149  0.43279337 -0.47367361 -0.50244517
 -0.21680296 -0.51364364  0.82723033 -0.40771358 -0.10101525 -0.48204268
 -0.30756234  0.61930636]
xtrain shape before: (757, 14)
xtrain shape after: (757, 14)
(134, 14)
(757, 14)
(757, 1)
(134, 14)
(134, 1)


In [5]:
def create_model(optimizer, learning_rate, first_layer_nodes, second_layer_nodes):
    if optimizer=='SGD':
        optimizer = SGD(learning_rate = learning_rate)
    if optimizer=='RMSprop':
        optimizer = RMSprop(learning_rate = learning_rate)
    if optimizer=='Adagrad':
        optimizer = Adagrad(learning_rate = learning_rate)
    if optimizer=='Adadelta':
        optimizer = Adadelta(learning_rate = learning_rate)
    if optimizer=='Adam':
        optimizer = Adam(learning_rate = learning_rate)
    if optimizer=='Adamax':
        optimizer = Adamax(learning_rate = learning_rate)
    if optimizer=='Nadam':
        optimizer = Nadam(learning_rate = learning_rate)
    
    model = Sequential()
    model.add(layers.Dense(first_layer_nodes, activation='relu'))
    model.add(layers.Dense(second_layer_nodes, activation='relu'))
    #model.add(layers.Dropout(0.5))
    model.add(layers.Dense(1, activation='sigmoid'))
    #model.add(layers.Dropout(0.2))
    
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model

print(xtest_normed.shape)
print(ytest_val.shape)

#model.fit(xtrain_normed, ytrain_val, validation_data=(xtest_normed, ytest_val), epochs=50, batch_size=4, verbose=2)
sk_model = KerasClassifier(build_fn=create_model, verbose=0)

#scores = cross_val_score(sk_model, X_normed, y, cv=k_fold, scoring='accuracy')
k_fold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

optimizer = [ 'RMSprop', 'Adagrad', 'Adadelta','Adam', 'Adamax']
learning_rate = [0.03, 0.04, 0.05, 0.06]# [0.02, 0.03, 0.04, 0.05, 0.06]
batch_size = [82, 88, 92]
epochs = [3, 5, 6]
first_layer_nodes = [75, 80, 85]
second_layer_nodes = [12, 16, 20]
parameter_grid = dict(learning_rate=learning_rate, optimizer=optimizer, batch_size=batch_size, epochs=epochs, first_layer_nodes=first_layer_nodes, second_layer_nodes=second_layer_nodes)
#Best: 0.839476 using {'batch_size': 64, 'epochs': 3, 'learning_rate': 0.05, 'optimizer': 'Adagrad'}
#0.840587 using {'batch_size': 74, 'epochs': 5, 'learning_rate': 0.03, 'optimizer': 'Adam'}
#Best: 0.840612 using {'batch_size': 64, 'epochs': 3, 'learning_rate': 0.02, 'optimizer': 'Adamax'}
#0.838352 using {'batch_size': 74, 'epochs': 3, 'first_layer_nodes': 55, 'learning_rate': 0.04, 'optimizer': 'Adagrad'}
#Best: 0.840624 using {'batch_size': 88, 'epochs': 6, 'first_layer_nodes': 75, 'learning_rate': 0.05, 'optimizer': 'Adam'}
#Best: 0.845106 using {'batch_size': 92, 'epochs': 5, 'first_layer_nodes': 80, 'learning_rate': 0.05, 'optimizer': 'Adam', 'second_layer_nodes': 16}
gs = GridSearchCV(sk_model, parameter_grid, cv=k_fold, scoring='accuracy', n_jobs=6)
grid_result = gs.fit(X_normed, y)
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

(134, 14)
(134, 1)
Best: 0.845106 using {'batch_size': 92, 'epochs': 5, 'first_layer_nodes': 80, 'learning_rate': 0.05, 'optimizer': 'Adam', 'second_layer_nodes': 16}


In [6]:
# refit the model using the optimal parameters
# edit: i manually set lowered the epochs to 5 to reduce overfitting.
opti_model = create_model('Adam', 0.05, 80, 16)
opti_model.fit(X_normed, y, batch_size=92, epochs=2, shuffle=True)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x23a07669820>

In [None]:
xgb_dmat = xgb.DMatrix(data=xtrain_normed,label=ytrain_val)
xgb_model = xgb.XGBClassifier()
xgb_model.fit(xtrain_normed, ytrain_val)
preds = xgb_model.predict(xtest_normed)
preds = [round(value) for value in preds]
# evaluate predictions
accuracy = accuracy_score(ytest_val, preds)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

In [7]:
# ==================== apply model
test_data = pd.read_csv("C:/Projects/Python/titanic/data/test.csv")

print(test_data.head())

# get mean aged based on titel
test_data['Initial']=0
for i in data:
    test_data['Initial']=test_data.Name.str.extract('([A-Za-z]+)\.') #lets extract the Salutations

test_data['Initial'].replace(['Mlle','Mme','Ms','Dr','Major','Lady','Countess','Dona','Jonkheer','Col','Rev','Capt','Sir','Don'],
                    ['Mrs','Mrs','Miss','Mr','Mr','Mrs','Mrs','Mrs','Other','Other','Other','Mr','Mr','Mr'],inplace=True)


#print(test_data.loc[(test_data.Initial=='Other'),'Age'].mean())

## Assigning the NaN Values with the Ceil values of the mean ages
test_data.loc[(test_data.Age.isnull())&(test_data.Initial=='Mr'),'Age']=32.1
test_data.loc[(test_data.Age.isnull())&(test_data.Initial=='Mrs'),'Age']=38.9
test_data.loc[(test_data.Age.isnull())&(test_data.Initial=='Master'),'Age']=7.4
test_data.loc[(test_data.Age.isnull())&(test_data.Initial=='Miss'),'Age']=21.8
test_data.loc[(test_data.Age.isnull())&(test_data.Initial=='Other'),'Age']=42
print(test_data.loc[test_data.Age.isnull()]) #So no null values left finally 

# retain the passengerID, used for testset
storelist = test_data['PassengerId']
# drop some irrelevant information
droplist = ['PassengerId', 'Name', 'Cabin', 'Ticket']
test_data.drop(droplist,axis=1,inplace=True)

# one hot encoding of categorical values
ohe_initial = pd.get_dummies(test_data['Initial'], prefix='title')
ohe_embarked = pd.get_dummies(test_data['Embarked'], prefix='embarked')
# turn sex into integers instead of string
test_data['Sex'] = test_data['Sex'].map( {'female': 0, 'male': 1} ).astype(int)

new_df = pd.merge(test_data, ohe_initial, left_index=True, right_index=True)
final_test_df = pd.merge(new_df, ohe_embarked, left_index=True, right_index=True)
droplist = ['Initial', 'Embarked']
final_test_df.drop(droplist,axis=1,inplace=True)
print(final_test_df)



   PassengerId  Pclass                                          Name     Sex  \
0          892       3                              Kelly, Mr. James    male   
1          893       3              Wilkes, Mrs. James (Ellen Needs)  female   
2          894       2                     Myles, Mr. Thomas Francis    male   
3          895       3                              Wirz, Mr. Albert    male   
4          896       3  Hirvonen, Mrs. Alexander (Helga E Lindqvist)  female   

    Age  SibSp  Parch   Ticket     Fare Cabin Embarked  
0  34.5      0      0   330911   7.8292   NaN        Q  
1  47.0      1      0   363272   7.0000   NaN        S  
2  62.0      0      0   240276   9.6875   NaN        Q  
3  27.0      0      0   315154   8.6625   NaN        S  
4  22.0      1      1  3101298  12.2875   NaN        S  
Empty DataFrame
Columns: [PassengerId, Pclass, Name, Sex, Age, SibSp, Parch, Ticket, Fare, Cabin, Embarked, Initial]
Index: []
     Pclass  Sex   Age  SibSp  Parch      Fare  ti

In [8]:
# data prep
Xtest = final_test_df

#print(ytest.head())

scaler = StandardScaler()
X_normed = scaler.fit_transform(Xtest)

print(X_normed.shape)

ypred = opti_model.predict(X_normed)
print(ypred.round())

result = pd.DataFrame()
result['PassengerId'] = storelist
print("ypred 152: " + str(ypred[152]))
result["Survived"] = ypred.round()
# hack, the fare is empty, but ill just predict myself
result.iloc[152,1] = 0.0
print(result[result['PassengerId']==1044])
# result[result['PassengerId']==1044, 'Survived'] = 0.0
result["Survived"] = result["Survived"].astype(int)
print(result)
result.to_csv(r'C:\Projects\Python\titanic\my_submission.csv', index=False, header=True)

(418, 14)
[[ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 1.]
 [ 0.]
 [ 1.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 1.]
 [ 0.]
 [ 1.]
 [ 1.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 1.]
 [ 1.]
 [ 0.]
 [ 1.]
 [ 0.]
 [ 1.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 1.]
 [ 1.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 1.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 1.]
 [ 1.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 1.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 0.]
 [ 0.]
 [ 1.]
 [ 1.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 1.]
 [ 0.]
 [ 0.]
 [ 1.]
 [ 0.]
 [ 1.]
 [ 1.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 1.]
 [ 0.]
 [ 1.]
 [ 1.]
 [ 0.]
 [ 0.]
 [ 1.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 1.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 1.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 1.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 0.]
 [ 0.]
 [ 1.]
 [ 0.]
 [ 1.]
 [ 1.]
 [ 0.]
 [ 1.]
 [ 0.]
 [ 0.]
 [ 1.]
 [ 0.]
 [ 1.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 