In [208]:
# Importing dependencies
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from splinter import Browser
from bs4 import BeautifulSoup as soup

In [209]:
# Displaying test csv as a dataframe
test_df = pd.read_csv('Resources/test.csv')
test_df

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [210]:
# Checking for any nulls
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.1+ KB


In [211]:
# Splitting Name to first and last and dropping Name and Cabin columns
test_df[['last_name', 'first_name']] = test_df['Name'].str.split(',', expand=True)
test_df_dropped = test_df[['PassengerId', 'Pclass', 'last_name','first_name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Embarked']]

test_df_dropped

Unnamed: 0,PassengerId,Pclass,last_name,first_name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,892,3,Kelly,Mr. James,male,34.5,0,0,330911,7.8292,Q
1,893,3,Wilkes,Mrs. James (Ellen Needs),female,47.0,1,0,363272,7.0000,S
2,894,2,Myles,Mr. Thomas Francis,male,62.0,0,0,240276,9.6875,Q
3,895,3,Wirz,Mr. Albert,male,27.0,0,0,315154,8.6625,S
4,896,3,Hirvonen,Mrs. Alexander (Helga E Lindqvist),female,22.0,1,1,3101298,12.2875,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,Spector,Mr. Woolf,male,,0,0,A.5. 3236,8.0500,S
414,1306,1,Oliva y Ocana,Dona. Fermina,female,39.0,0,0,PC 17758,108.9000,C
415,1307,3,Saether,Mr. Simon Sivertsen,male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,S
416,1308,3,Ware,Mr. Frederick,male,,0,0,359309,8.0500,S


In [212]:
# Creating Deck column, binning passengers by their class
deck_floors = []

for class_status in test_df_dropped['Pclass']:
    if class_status == 1:
        deck_floors.append('A,B,C,D,E')
    else:
        deck_floors.append('D,E,F,G')

test_df_dropped['Deck'] = deck_floors

test_df_dropped

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df_dropped['Deck'] = deck_floors


Unnamed: 0,PassengerId,Pclass,last_name,first_name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Deck
0,892,3,Kelly,Mr. James,male,34.5,0,0,330911,7.8292,Q,"D,E,F,G"
1,893,3,Wilkes,Mrs. James (Ellen Needs),female,47.0,1,0,363272,7.0000,S,"D,E,F,G"
2,894,2,Myles,Mr. Thomas Francis,male,62.0,0,0,240276,9.6875,Q,"D,E,F,G"
3,895,3,Wirz,Mr. Albert,male,27.0,0,0,315154,8.6625,S,"D,E,F,G"
4,896,3,Hirvonen,Mrs. Alexander (Helga E Lindqvist),female,22.0,1,1,3101298,12.2875,S,"D,E,F,G"
...,...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,Spector,Mr. Woolf,male,,0,0,A.5. 3236,8.0500,S,"D,E,F,G"
414,1306,1,Oliva y Ocana,Dona. Fermina,female,39.0,0,0,PC 17758,108.9000,C,"A,B,C,D,E"
415,1307,3,Saether,Mr. Simon Sivertsen,male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,S,"D,E,F,G"
416,1308,3,Ware,Mr. Frederick,male,,0,0,359309,8.0500,S,"D,E,F,G"


In [213]:
# Extracting last and first names of Age column nulls
test_nulls_last_names = list(test_df_dropped[test_df_dropped.isnull().any(axis=1)]['last_name'])
test_nulls_first_names = list(test_df_dropped[test_df_dropped.isnull().any(axis=1)]['first_name'])

In [214]:
# Scraping data from titanic encyclopedia for unknown ages
browser = Browser('chrome')
url = 'https://titanicfacts.net/titanic-passenger-list/'
browser.visit(url)
html = browser.html
all_classes_tables = pd.read_html(html)

In [215]:
# Writing each class table into a csv
i = 0
place = ['1st', '2nd', '3rd']

for table in all_classes_tables:
    table.to_csv(f'Resources/{place[i]}_class', header=False, index=False)
    i += 1

In [216]:
# Merging all csv's into one dataframe 
pd.concat(map(pd.read_csv, 
              ['Resources/1st_class', 
               'Resources/2nd_class', 
               'Resources/3rd_class']), 
               ignore_index=True).to_csv('Resources/full_online_table', index=False)


In [217]:
# Viewing newly made single dataframe 
full_table_df  = pd.read_csv('Resources/full_online_table')
full_table_df

Unnamed: 0,Surname,First Names,Age,Boarded,Survivor (S) or Victim (†)
0,Allen,Miss Elisabeth Walton,29,Southampton,S
1,Allison,Mr Hudson Joshua Creighton,30,Southampton,†
2,Allison,Mrs Bessie Waldo,25,Southampton,†
3,Allison,Miss Helen Loraine,2,Southampton,†
4,Allison,Master Hudson Trevor,11m,Southampton,S
...,...,...,...,...,...
1312,Youssiff (Sam’Aan),Mr Gerios,45,Cherbourg,†
1313,Zakarian,Mr Ortin,27,Cherbourg,†
1314,Zakarian,Mr Mapriededer,22,Cherbourg,†
1315,Zenni,Mr Philip,22,Cherbourg,S


In [218]:
# Cleaning dataframe by renaming and removing columns
full_table_df = full_table_df[['Surname', 'First Names', 'Age']]
full_table_df = full_table_df.rename(columns={'Surname': 'last_name', 'First Names': 'first_name', 'Age': 'age'})

full_table_df

Unnamed: 0,last_name,first_name,age
0,Allen,Miss Elisabeth Walton,29
1,Allison,Mr Hudson Joshua Creighton,30
2,Allison,Mrs Bessie Waldo,25
3,Allison,Miss Helen Loraine,2
4,Allison,Master Hudson Trevor,11m
...,...,...,...
1312,Youssiff (Sam’Aan),Mr Gerios,45
1313,Zakarian,Mr Ortin,27
1314,Zakarian,Mr Mapriededer,22
1315,Zenni,Mr Philip,22


In [219]:
# Checking for nulls and data types
full_table_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1317 entries, 0 to 1316
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   last_name   1317 non-null   object
 1   first_name  1317 non-null   object
 2   age         1317 non-null   object
dtypes: object(3)
memory usage: 31.0+ KB


In [220]:
# Checking for any unusual values
full_table_df['age'].unique()

array(['29', '30', '25', '2', '11m', '47', '62', '39', '53', '71', '18',
       '24', '26', '32', '60', '50', '36', '37', '46', '31', '19', '28',
       '45', '61', '42', '22', '41', '48', '59', '44', '58', '52', '33',
       '17', '27', '13', '11', '49', '43', '40', '64', '55', '70', '35',
       '51', '34', '4', '23', '54', '57', '38', '21', '16', '65', '20',
       '56', '6', '67', '63', '15', '12', '1', '10m', '8', '7m', '7', '3',
       '14', '9m', '9', '5', '66', '4m', '2m', 'NK', '10', '69', '74',
       '5m'], dtype=object)

In [221]:
# Replacing age values to match all float
full_table_df = full_table_df.replace({'11m': '0.9', 
                                    '10m': '0.8', 
                                    '9m': '0.7',
                                    '7m': '0.5',
                                    '5m': '0.4',
                                    '4m': '0.3', 
                                    '2m': '0.1', 
                                    'NK': '0'})


In [222]:
# Double checking
full_table_df['age'].unique()

array(['29', '30', '25', '2', '0.9', '47', '62', '39', '53', '71', '18',
       '24', '26', '32', '60', '50', '36', '37', '46', '31', '19', '28',
       '45', '61', '42', '22', '41', '48', '59', '44', '58', '52', '33',
       '17', '27', '13', '11', '49', '43', '40', '64', '55', '70', '35',
       '51', '34', '4', '23', '54', '57', '38', '21', '16', '65', '20',
       '56', '6', '67', '63', '15', '12', '1', '0.8', '8', '0.5', '7',
       '3', '14', '0.7', '9', '5', '66', '0.3', '0.1', '0', '10', '69',
       '74', '0.4'], dtype=object)

In [223]:
# Changing age column data type to float to match other csv's
full_table_df_clean = full_table_df.astype({'age': float})

In [224]:
# Checking data types
full_table_df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1317 entries, 0 to 1316
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   last_name   1317 non-null   object 
 1   first_name  1317 non-null   object 
 2   age         1317 non-null   float64
dtypes: float64(1), object(2)
memory usage: 31.0+ KB


In [225]:
# Viewing final clean dataframe
full_table_df_clean

Unnamed: 0,last_name,first_name,age
0,Allen,Miss Elisabeth Walton,29.0
1,Allison,Mr Hudson Joshua Creighton,30.0
2,Allison,Mrs Bessie Waldo,25.0
3,Allison,Miss Helen Loraine,2.0
4,Allison,Master Hudson Trevor,0.9
...,...,...,...
1312,Youssiff (Sam’Aan),Mr Gerios,45.0
1313,Zakarian,Mr Ortin,27.0
1314,Zakarian,Mr Mapriededer,22.0
1315,Zenni,Mr Philip,22.0


In [261]:
# Creating loop to find missing age values from full table to fill in for test csv
test_ages_to_fill = []
names_to_look_over = []

for last_name in test_nulls_last_names:
    search = full_table_df_clean.loc[full_table_df_clean['last_name'] == last_name]

    if len(search) == 1:
        test_ages_to_fill.append(search['age'].values[0])
    else:
        test_ages_to_fill.append(0)
        names_to_look_over.append(last_name)

In [264]:
test_ages_to_fill

[32.0,
 48.0,
 0,
 0,
 31.0,
 24.0,
 37.0,
 24.0,
 30.0,
 0,
 30.0,
 23.0,
 0,
 30.0,
 0,
 0,
 28.0,
 23.0,
 0,
 0,
 37.0,
 20.0,
 0,
 0,
 24.0,
 0,
 0,
 0,
 44.0,
 59.0,
 30.0,
 51.0,
 0,
 32.0,
 36.0,
 0,
 20.0,
 16.0,
 0,
 43.0,
 63.0,
 0,
 29.0,
 34.0,
 20.0,
 0,
 0,
 32.0,
 21.0,
 34.0,
 0,
 0,
 0,
 20.0,
 44.0,
 43.0,
 0,
 0,
 0,
 20.0,
 0,
 0,
 37.0,
 25.0,
 0,
 33.0,
 0,
 0,
 25.0,
 0,
 17.0,
 0,
 0,
 0,
 0,
 19.0,
 0,
 0,
 0,
 0,
 0,
 24.0,
 18.0,
 21.0,
 23.0,
 0,
 0]

In [266]:
print(names_to_look_over)
print(len(names_to_look_over))

['Samaan', 'Johnston', 'Davison', 'Demetri', 'Khalil', "O'Donoghue", 'Foley', 'Ryan', 'Thomas', 'Kiernan', 'McCoy', 'Lefebre', 'Thomas', 'Smyth', 'Lithman', 'Sage', 'Murphy', 'Thomson', 'Moubarek', 'Johnston', 'Khalil', 'MacKay', 'Warren', 'Howard', 'Fox', 'Saade', 'Fleming', 'Ford', 'Nasr', 'Samaan', 'McCarthy', 'Thomas', 'Betros', 'Sage', 'van Billiard', "O'Keefe", 'Sage', 'Caram', "O'Connor", 'Risien', 'Ware', 'Peter']
42


In [267]:
full_table_df_clean.loc[full_table_df_clean['last_name'] == 'Samaan']

Unnamed: 0,last_name,first_name,age
1188,Samaan,Mr Hanna Elias,40.0
1189,Samaan,Mr Elias,17.0
1190,Samaan,Mr Youssef,16.0


In [None]:
# Displaying train csv as a dataframe
train_df = pd.read_csv('Resources/train.csv')
train_df

In [None]:
train_df.info()

In [None]:
# Displaying gender submission csv as a dataframe
gender_submission_df = pd.read_csv('Resources/gender_submission.csv')
gender_submission_df

In [None]:
gender_submission_df.info()