# Predicting Titanic survivors 
kaggle.com/competitions/titanic/overview

### Extract the data from the csv
All our files are in the same directory

In [9]:
import numpy as np
import pandas as pd
import os
import tensorflow as tf
import re #regexp lib



# We will use the sklearn preprocessing library, as it will be easier to standardize the data.
from sklearn import preprocessing

# Load the data
df_raw_csv_data = pd.read_csv('train.csv')


#copy the df and remove the targets
df_unscaled_inputs_all = df_raw_csv_data.copy()
df_unscaled_inputs_all =  df_unscaled_inputs_all.drop(columns='Survived')

#copy and take only the targets
df_targets_all = df_raw_csv_data.copy()

df_targets_all = df_targets_all['Survived']

df_serving = pd.read_csv('test.csv')

#df_targets_all.shape
#df_unscaled_inputs_all.head()

women = df_raw_csv_data.loc[df_raw_csv_data.Sex == 'female']["Survived"]
rate_women = sum(women)/len(women)

print("% of women who survived:", rate_women)



% of women who survived: 0.7420382165605095


### split columns to make more generic inputs

In [10]:
def preprocess(df):
    df = df.copy()
    
    def normalize_name(x):
        return " ".join([v.strip(",()[].\"'") for v in x.split(" ")])
    
    def ticket_number(x):
        return x.split(" ")[-1]
        
    def ticket_item(x):
        items = x.split(" ")
        if len(items) == 1:
            return "NONE"
        return "_".join(items[0:-1])
    
    df["Name"] = df["Name"].apply(normalize_name)
    df["Ticket_number"] = df["Ticket"].apply(ticket_number)
    df["Ticket_item"] = df["Ticket"].apply(ticket_item)   

    char_remove = [r".",r"/"]
    for char in char_remove:
        df["Ticket_item"] = df["Ticket_item"].replace(char,'',inplace = True)

    
    # add a new column that just takes the letter of the cabin
    df['Cabin_letter'] = df['Cabin'].astype(str).str[0]

    #make a column that looks for "Mrs." in the name to identify married women (easiest thing to find in the name col)
    #GettingLooking the prefix of all Passengers
    df['Title'] = df['Name'].apply(lambda x: re.search(' ([A-Z][a-z]+)\.', x))#.group(1))
    
    
    #fill any NA ages with the average age. It might be better to drop the NAs but there's a lot and I think age is important
    avg_age =df.loc[:,'Age'].mean()
    #avg_age
    df['Age'] = df['Age'].fillna(avg_age)

    #same with fare
    avg_fare =df.loc[:,'Fare'].mean()
    #avg_age
    df['Fare'] = df['Fare'].fillna(avg_fare)

    #fill in empty ports
    Port_filler ='no_port'
    #avg_age
    df['Embarked'] = df['Embarked'].fillna(Port_filler)

    #fill in empty cabins
    Cabin_filler ='no_cabin'
    #avg_age
    df['Cabin_letter'] = df['Cabin_letter'].fillna(Cabin_filler)

    sex_dummies = pd.get_dummies(df['Sex'])
    #sex_dummies

    ticket_dummies = pd.get_dummies(df['Ticket_item'])
    #ticket_dummies

    df['Ticket_type'] = df['Ticket'].apply(lambda x: x[0:3])
    df['Ticket_type'] = df['Ticket_type'].astype('category')
    df['Ticket_type'] = df['Ticket_type'].cat.codes

    df['Ticket_type'] = df['Ticket'].apply(lambda x: x[0:3])
    df['Ticket_type'] = df['Ticket_type'].astype('category')
    df['Ticket_type'] = df['Ticket_type'].cat.codes

    

    cabin_dummies = pd.get_dummies(df['Cabin_letter'], prefix='cabin')

    port_dummies = pd.get_dummies(df['Embarked'], prefix='port')




    df = pd.concat([df, sex_dummies], axis=1)
    #df = pd.concat([df, ticket_dummies], axis=1)
    df = pd.concat([df, cabin_dummies], axis=1)
    df = pd.concat([df, port_dummies], axis=1)
    #df


    #drop unneeded columns
    df_final = df.copy()
    df_final = df_final.drop(columns="Ticket")
    df_final = df_final.drop(columns="PassengerId")
    df_final = df_final.drop(columns="Sex")
    df_final = df_final.drop(columns="male")
    df_final = df_final.drop(columns="Ticket_item")
    df_final = df_final.drop(columns="Cabin")
    df_final = df_final.drop(columns="Cabin_letter")
    df_final = df_final.drop(columns="Embarked")
    df_final = df_final.drop(columns="Name")
    df_final = df_final.drop(columns="Ticket_number")

    return df_final
    
preprocessed_train_df = preprocess(df_unscaled_inputs_all)
preprocessed_serving_df = preprocess(df_serving)



preprocessed_train_df.head(10)
preprocessed_train_df.to_csv('un-dummy_train.csv')

### Get Dummies

In [11]:

#moved this to preprocessing
df_concatenated = preprocessed_train_df
df_concatenated.to_csv('pre_dropped_columns_inputs.csv')

### Drop unneeded columns

In [12]:
# input_features = list(df_concatenated.columns)
# input_features.remove("Ticket")
# input_features.remove("PassengerId")
# input_features.remove("Sex")
# input_features.remove("male")
# input_features.remove("Ticket_item")
# input_features.remove("Cabin")
# input_features.remove("Embarked")
# input_features.remove("Name")
# #input_features.remove("Survived")
# #input_features.remove("Ticket_number")

#print(f"Input features: {input_features}")

df_train_final = df_concatenated.copy()





df_train_final.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Title,Ticket_type,female,cabin_A,cabin_B,...,cabin_D,cabin_E,cabin_F,cabin_G,cabin_T,cabin_n,port_C,port_Q,port_S,port_no_port
0,3,22.0,1,0,7.25,,124,0,0,0,...,0,0,0,0,0,1,0,0,1,0
1,1,38.0,1,0,71.2833,,137,1,0,0,...,0,0,0,0,0,0,1,0,0,0
2,3,26.0,0,0,7.925,,148,1,0,0,...,0,0,0,0,0,1,0,0,1,0
3,1,35.0,1,0,53.1,,3,1,0,0,...,0,0,0,0,0,0,0,0,1,0
4,3,35.0,0,0,8.05,,97,0,0,0,...,0,0,0,0,0,1,0,0,1,0


### Shuffle the dataset

In [13]:
shuffled_indices = np.random.RandomState(seed=11).permutation(df_train_final.index)
df_train_shuffle = df_train_final.reindex(shuffled_indices)

df_targets_shuffle = df_targets_all.reindex(shuffled_indices)
df_targets_shuffle.head()


# # Count how many targets are 1 (meaning that the customer did convert)
# num_one_targets = int(np.sum(shuffled_targets))

# # Set a counter for targets that are 0 (meaning that the customer did not convert)
# zero_targets_counter = 0

# # We want to create a "balanced" dataset, so we will have to remove some input/target pairs.
# # Declare a variable that will do that:
# indices_to_remove = []

# # Count the number of targets that are 0. 
# # Once there are as many 0s as 1s, mark entries where the target is 0.
# for i in range(shuffled_targets.shape[0]):
#     if targets_all[i] == 0:
#         zero_targets_counter += 1
#         if zero_targets_counter > num_one_targets:
#             indices_to_remove.append(i)

# # Create two new variables, one that will contain the inputs, and one that will contain the targets.
# # We delete all indices that we marked "to remove" in the loop above.
# unscaled_inputs_equal_priors = np.delete(shuffled_inputs, indices_to_remove, axis=0)
# targets_equal_priors = np.delete(shuffled_targets, indices_to_remove, axis=0)

431    1
821    1
629    0
626    0
665    0
Name: Survived, dtype: int64

### Standardize the inputs

In [14]:
from sklearn.preprocessing import StandardScaler

#convert data types that don't work
#df_train_final['Ticket_number'] = df_train_final['Ticket_number'].astype(float)

#checking everythig is the right data type
#df_train_final.iloc[:,30:64].dtypes

titanic_scaler = StandardScaler()

# we could exclude our dummy variables from scaling here if we wanted to analyze the coefficients, but let's start easy and just charge forward

titanic_scaler.fit(df_train_shuffle)
scaled_inputs = titanic_scaler.transform(df_train_shuffle)


#scaled_inputs = df_train_shuffle # not actually scaled yet. we are going to use the tensorflow standardizer in our model

  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


### Split the dataset into train, validation, and test

In [15]:
# Count the total number of samples
samples_count = scaled_inputs.shape[0]

# Count the samples in each subset, assuming we want 90-10 distribution of training, validation (test is already separated)
# Naturally, the numbers are integers.
train_samples_count = int(0.9 * samples_count)

# The 'test' dataset contains all remaining data.
validation_samples_count = samples_count - train_samples_count

# Create variables that record the inputs and targets for training
# In our shuffled dataset, they are the first "train_samples_count" observations
train_inputs = scaled_inputs[:train_samples_count]
train_targets = df_targets_all[:train_samples_count]

# Create variables that record the inputs and targets for validation.
# They are the next "validation_samples_count" observations, folllowing the "train_samples_count" we already assigned
validation_inputs = scaled_inputs[train_samples_count:]
validation_targets = df_targets_all[train_samples_count:]


# We balanced our dataset to be 50-50 (for targets 0 and 1), but the training, validation, and test were 
# taken from a shuffled dataset. Check if they are balanced, too. Note that each time you rerun this code, 
# you will get different values, as each time they are shuffled randomly.
# Normally you preprocess ONCE, so you need not rerun this code once it is done.
# If you rerun this whole sheet, the npzs will be overwritten with your newly preprocessed data.

# Print the number of targets that are 1s, the total number of samples, and the proportion for training, validation, and test.
print(np.sum(train_targets), train_samples_count, np.sum(train_targets) / train_samples_count)
print(np.sum(validation_targets), validation_samples_count, np.sum(validation_targets) / validation_samples_count)

#print(train_inputs.dtype)
#print(train_targets.dtype)

308 801 0.38451935081148564
34 90 0.37777777777777777


### Save the three datasets in *.npz

In [16]:
# Save the three datasets in *.npz.
# In the next lesson, you will see that it is extremely valuable to name them in such a coherent way!




train_targets.to_csv('train_targets.csv',index=False,header=False)
np.savetxt('train_inputs.csv', train_inputs, delimiter=',')
validation_targets.to_csv('validation_targets.csv',index=False,header=False)
np.savetxt('validation_inputs.csv', validation_inputs, delimiter=',')

np.savez('Titanic_data_train', inputs=train_inputs, targets=train_targets)
np.savez('Titanic_data_validation', inputs=validation_inputs, targets=validation_targets)
#np.savez('Titanic_data_test', inputs=test_inputs, targets=test_targets)

#train_inputs.head()