In [1]:
import numpy as np
import pandas as pd 
import os 
import tensorflow as tf
import matplotlib.pyplot as plt
from pathlib import Path

In [2]:

current_dir = Path(os.getcwd())
# construct relative path to csv file
csv_file = current_dir.parent/'Titantic_Predictor'/'train.csv'
# read csv file using pandas
train_df = pd.read_csv(csv_file)
print(train_df)

     PassengerId  Survived  Pclass  \
0              1         0       3   
1              2         1       1   
2              3         1       3   
3              4         1       1   
4              5         0       3   
..           ...       ...     ...   
886          887         0       2   
887          888         1       1   
888          889         0       3   
889          890         1       1   
890          891         0       3   

                                                  Name     Sex   Age  SibSp  \
0                              Braund, Mr. Owen Harris    male  22.0      1   
1    Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                               Heikkinen, Miss. Laina  female  26.0      0   
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                             Allen, Mr. William Henry    male  35.0      0   
..                                                 ...     ...   ... 

In [40]:
# imports training data 
split_df = pd.read_csv('train.csv')
# Extracts the Cabin Letter using regex 
split_df['Cabin_Letter'] = train_df['Cabin'].str.extract(r'([A-Za-z]+)')
# Fills cabin letter null values with 0 
split_df['Cabin_Letter'].fillna("N", inplace = True)
# Extracts cabin number using regex
split_df['Cabin_Number'] = train_df['Cabin'].str.extract(r'(\d+)')
# Fills in cabin number null values with 0
# Converts cabin number values to numeric 
split_df['Cabin_Number'] = pd.to_numeric(df['Cabin_Number'], 'coerce').fillna(0)
# Creates a dictionary to map each deck to a number
letter_map = {'N': 0, 'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7, 'T': 8}

split_df['Cabin_Numeric_Letter'] = split_df['Cabin_Letter'].map(letter_map)

print(split_df[['Cabin_Letter', 'Cabin_Number', 'Cabin_Numeric_Letter']].head())

  Cabin_Letter  Cabin_Number  Cabin_Numeric_Letter
0            N           0.0                     0
1            C          85.0                     3
2            N           0.0                     0
3            C         123.0                     3
4            N           0.0                     0


In [38]:
print(split_df[['Cabin_Letter']].value_counts())
print(split_df['Cabin_Number'].value_counts())


Cabin_Letter
N               687
C                59
B                47
D                33
E                32
A                15
F                13
G                 4
T                 1
Name: count, dtype: int64
Cabin_Number
0.0      691
33.0       7
6.0        6
101.0      5
20.0       5
        ... 
111.0      1
21.0       1
12.0       1
14.0       1
148.0      1
Name: count, Length: 93, dtype: int64
Cabin_Numeric_Letter
0    687
3     59
2     47
4     33
5     32
1     15
6     13
7      4
8      1
Name: count, dtype: int64


In [45]:
split_df.info()
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   PassengerId           891 non-null    int64  
 1   Survived              891 non-null    int64  
 2   Pclass                891 non-null    int64  
 3   Name                  891 non-null    object 
 4   Sex                   891 non-null    object 
 5   Age                   714 non-null    float64
 6   SibSp                 891 non-null    int64  
 7   Parch                 891 non-null    int64  
 8   Ticket                891 non-null    object 
 9   Fare                  891 non-null    float64
 10  Cabin                 204 non-null    object 
 11  Embarked              889 non-null    object 
 12  Cabin_Letter          891 non-null    object 
 13  Cabin_Number          891 non-null    float64
 14  Cabin_Numeric_Letter  891 non-null    int64  
dtypes: float64(3), int64(6)

In [48]:
# creates a combined data frame adding the cabin number and letter columns 
combined_df = pd.concat([train_df.reset_index(), split_df.reset_index()], axis = 1)

combined_df.head()


Unnamed: 0,index,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,...,Age.1,SibSp.1,Parch.1,Ticket.1,Fare,Cabin,Embarked,Cabin_Letter,Cabin_Number,Cabin_Numeric_Letter
0,0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,...,22.0,1,0,A/5 21171,7.25,,S,N,0.0,0
1,1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,...,38.0,1,0,PC 17599,71.2833,C85,C,C,85.0,3
2,2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,...,26.0,0,0,STON/O2. 3101282,7.925,,S,N,0.0,0
3,3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,...,35.0,1,0,113803,53.1,C123,S,C,123.0,3
4,4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,...,35.0,0,0,373450,8.05,,S,N,0.0,0
