# Predict the testset!

All right, this file does essentially the same things as the previous exploratory file, but it now runs all the preprocessing on the test set so that predictions can be generated for Kaggle's Titanic competition (https://www.kaggle.com/c/titanic/). In the end, these predictions are saved to a CSV file for use in the competition.

If you have taken the time to read through this kernel, I hope you enjoyed it.
Cheers

In [1]:
#call libraries
import math
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pylab
import seaborn as sns

# Use a compact 6.5 x 5.0 canvas as the default size for all matplotlib figures
plt.rcParams["figure.figsize"] = (6.5, 5.0)

# Keep the output readable: hide FutureWarning messages and switch off pandas'
# chained-assignment warning
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
pd.set_option("mode.chained_assignment", None)

# Seed NumPy's global random generator so repeated runs draw the same numbers
np.random.seed(123)

In [2]:
# Load the Kaggle Titanic test set (the first row of the file holds the column names).
# The location can be overridden through the TITANIC_TEST_CSV environment variable so the
# notebook also runs on other machines; when it is unset, the original local path is used.
TEST_CSV_PATH = os.environ.get(
    "TITANIC_TEST_CSV",
    "C:\\Users\\Jeroen\\Desktop\\Kaggle Datasets\\Titanic\\Excel Files\\test.csv",
)
titanic = pd.read_csv(TEST_CSV_PATH, header = 0)

In [3]:
# Flag each passenger's honorific: one 0/1 indicator column per title found in "Name".
# The substring test runs on the whole column at once instead of looping over row labels,
# so it does not depend on the frame having a 0..n-1 index.
for title in ("Mr.", "Mrs.", "Miss.", "Master."):
    # regex=False: the trailing dot has to match literally, exactly like Python's `in`
    titanic[title] = titanic["Name"].str.contains(title, regex=False).astype("int64")

# 1 minus the number of the four titles found in the name, i.e. 1 when none is present
titanic["Other_Title"] = 1 - (titanic["Master."] + titanic["Miss."] + titanic["Mrs."] + titanic["Mr."])


In [4]:
# Indicator for passengers whose recorded fare is exactly 0. A missing fare compares
# unequal to 0 and therefore yields 0. Computed column-wise rather than row by row.
titanic["Zero_ticket_fare"] = (titanic["Fare"] == 0).astype("int64")


# Replace the placeholder ticket value "LINE" with the ticket number 370160 (kept as text)
titanic.loc[titanic["Ticket"] == "LINE", "Ticket"] = "370160"

In [5]:
# Count how many passengers in this frame share each ticket number.
# Result: one row per distinct ticket with the columns Ticket_group_size and Ticket.
ticket_counts = (
    titanic["Ticket"]
    .value_counts()
    .rename("Ticket_group_size")
    .rename_axis("Ticket")
    .reset_index()
)[["Ticket_group_size", "Ticket"]]

# Attach the group size to every passenger. A left join keeps the passengers in their
# file order, whereas an outer join may reorder the rows by ticket key.
titanic = pd.merge(titanic, ticket_counts, how='left', on='Ticket')

In [6]:
# Per-passenger price: the fare divided by the number of passengers on the same ticket
titanic["Price_per_person"] = titanic["Fare"].div(titanic["Ticket_group_size"])

In [7]:
# One indicator column per passenger class, appended to the right of the frame.
# The labels are assigned by position, so get_dummies must yield exactly three columns.
class_labels = ["First_class", "Second_class", "Third_class"]
class_dummies = pd.get_dummies(titanic["Pclass"])
class_dummies.columns = class_labels
titanic = pd.concat([titanic, class_dummies], axis="columns", sort=False)

In [8]:
# One indicator column per port of embarkation, appended to the right of the frame.
# NOTE(review): get_dummies orders its columns by sorted category value, so if Embarked
# holds the usual port codes C/Q/S these positional labels land on C, Q and S respectively
# (i.e. "Southampton" would mark C). The pickled model presumably relies on the current
# column order and names - confirm against the training notebook before changing them.
port_labels = ["Southampton", "Cherbourg", "Queenstown"]
embarked_dummies = pd.get_dummies(titanic["Embarked"])
embarked_dummies.columns = port_labels
titanic = pd.concat([titanic, embarked_dummies], axis="columns", sort=False)

In [9]:
# Keep the passenger ids aside for the submission file, then discard the identifier
# and raw columns in a single call
titanic_PassengerId = titanic["PassengerId"]
titanic = titanic.drop(
    columns=["PassengerId", "Ticket", "Name", "Cabin", "Fare", "Pclass", "Embarked"]
)

In [10]:
# Plain assignment: titanic_indep is a second name for the same DataFrame object as titanic, not a copy
titanic_indep = titanic

In [11]:
# Store Sex and the indicator columns with pandas' category dtype
category_columns = [
    "Sex", "First_class", "Second_class", "Third_class",
    "Southampton", "Cherbourg", "Queenstown",
]
for column in category_columns:
    titanic_indep[column] = titanic_indep[column].astype('category')

# Re-wrap the columns as CategoricalIndex values (Queenstown is not part of this pass)
for column in category_columns[:-1]:
    titanic_indep[column] = pd.CategoricalIndex(titanic_indep[column])

# Swap the Sex labels for their integer category codes so the column is numeric
titanic_indep['Sex'] = titanic_indep['Sex'].cat.codes


In [12]:
# Scale Age on its own: the column contains missing values, so only the known ages are
# mean-centred and divided by their range, while the missing ones are carried along untouched.
agecolumn = titanic_indep["Age"]
Ages_noNA = agecolumn[agecolumn.notna()]
Ages_yesNa = agecolumn[agecolumn.isna()]

Ages_noNA = (Ages_noNA - Ages_noNA.mean()) / (Ages_noNA.max() - Ages_noNA.min())
# pd.concat keeps the original row labels of both parts (Series.append no longer exists
# in pandas 2.0 and later)
agecolumn = pd.concat([Ages_noNA, Ages_yesNa])

In [13]:
# Every column except Age, each one coerced to a numeric dtype
feature_names = [name for name in titanic_indep.columns if name != "Age"]
restcolumns = titanic_indep[feature_names].apply(pd.to_numeric)

In [14]:
# Mean-centre every column and divide it by its range (max - min), all columns in one step.
# The statistics are computed from this frame itself.
# Casting to float first avoids writing fractional results back into integer-typed columns.
# NOTE: a column holding a single distinct value has range 0 and therefore becomes NaN.
restcolumns = restcolumns.astype("float64")
restcolumns = (restcolumns - restcolumns.mean()) / (restcolumns.max() - restcolumns.min())

In [15]:
# Add the scaled ages back as the last column; pandas matches the values to rows by index label
restcolumns["Age"] = agecolumn

In [16]:
# From here on titanic_indep refers to the scaled feature frame (restcolumns including Age)
titanic_indep = restcolumns

In [17]:
from fancyimpute import KNN
# Fill the remaining missing values in the test-set features with fancyimpute's
# KNN imputer (k = 9).
# The imputer hands back the data without column names, so remember them first.
titanic_cols = list(titanic_indep)
imputed_values = KNN(k = 9).fit_transform(titanic_indep)
titanic_indep = pd.DataFrame(imputed_values, columns=titanic_cols)
# NOTE(review): Age holds mean-centred, range-scaled values at this point, so rounding to
# whole numbers collapses most of its variation - confirm this matches the training notebook.
titanic_indep["Age"] = titanic_indep["Age"].round()

Using TensorFlow backend.


Imputing row 1/418 with 0 missing, elapsed time: 0.044
Imputing row 101/418 with 1 missing, elapsed time: 0.046
Imputing row 201/418 with 0 missing, elapsed time: 0.047
Imputing row 301/418 with 1 missing, elapsed time: 0.049
Imputing row 401/418 with 0 missing, elapsed time: 0.050


In [18]:
import pickle
# Load the previously saved logistic regression model and predict an outcome for
# every row of the prepared test features.
# NOTE: unpickling can execute code stored in the file - only load model files you trust.
filename = 'Logistic_reg_model.sav'
# The with-block closes the file handle as soon as the model has been read
with open(filename, 'rb') as model_file:
    Logistic_reg_model = pickle.load(model_file)
log_regression_result = Logistic_reg_model.predict(titanic_indep)

In [19]:
# Two-column submission table: passenger id next to the predicted Survived value
titanic_final_file = pd.DataFrame(
    {"PassengerId": titanic_PassengerId, "Survived": log_regression_result}
)

In [21]:
# Write the predictions as a CSV without the row index. The destination can be overridden
# through the TITANIC_PREDICTIONS_CSV environment variable; when it is unset, the original
# local path is used.
PREDICTIONS_CSV_PATH = os.environ.get(
    "TITANIC_PREDICTIONS_CSV",
    r'C:\Users\Jeroen\Desktop\Kaggle Datasets\Titanic\Code\Titanic Predictions.csv',
)
titanic_final_file.to_csv(PREDICTIONS_CSV_PATH, index = False)