In [551]:
# Importing dependencies
from io import StringIO

import pandas as pd
import tensorflow as tf
from bs4 import BeautifulSoup as soup
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from splinter import Browser

# Test Data

In [552]:
# Loading the test csv into a dataframe and displaying it
test_df = pd.read_csv('Resources/test.csv')
test_df

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [553]:
# Checking non-null counts and dtypes per column to spot any nulls
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.1+ KB


In [554]:
# Splitting Name into last and first name at the first ', ' only (n=1),
# so an extra comma later in a name cannot produce a third column
test_df[['last_name', 'first_name']] = test_df['Name'].str.split(', ', n=1, expand=True)
# Removing periods and stray double quotes from first names to make matching easier later.
# regex=False makes '.' a literal period regardless of the pandas version's default.
test_df['first_name'] = (
    test_df['first_name']
    .str.replace('.', '', regex=False)
    .str.replace('"', '', regex=False)
)
# Keeping only the columns used downstream (this drops the Name and Cabin columns).
# .copy() makes test_df_dropped an independent frame, so later assignments to it
# do not write into a slice of test_df.
test_df_dropped = test_df[['PassengerId', 'Pclass', 'last_name', 'first_name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Embarked']].copy()

test_df_dropped

Unnamed: 0,PassengerId,Pclass,last_name,first_name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,892,3,Kelly,Mr James,male,34.5,0,0,330911,7.8292,Q
1,893,3,Wilkes,Mrs James (Ellen Needs),female,47.0,1,0,363272,7.0000,S
2,894,2,Myles,Mr Thomas Francis,male,62.0,0,0,240276,9.6875,Q
3,895,3,Wirz,Mr Albert,male,27.0,0,0,315154,8.6625,S
4,896,3,Hirvonen,Mrs Alexander (Helga E Lindqvist),female,22.0,1,1,3101298,12.2875,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,Spector,Mr Woolf,male,,0,0,A.5. 3236,8.0500,S
414,1306,1,Oliva y Ocana,Dona Fermina,female,39.0,0,0,PC 17758,108.9000,C
415,1307,3,Saether,Mr Simon Sivertsen,male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,S
416,1308,3,Ware,Mr Frederick,male,,0,0,359309,8.0500,S


In [555]:
# Creating Deck column, binning passengers by their class:
# 1st class gets decks 'A,B,C,D,E', every other class gets 'D,E,F,G'
deck_floors = [
    'A,B,C,D,E' if class_status == 1 else 'D,E,F,G'
    for class_status in test_df_dropped['Pclass']
]

# Adding the whole column in one step instead of row by row. assign() returns a
# new frame, so this does not depend on the index labels being 0..n-1 and the
# cell can be re-run safely.
test_df_dropped = test_df_dropped.assign(Deck=deck_floors)

test_df_dropped

Unnamed: 0,PassengerId,Pclass,last_name,first_name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Deck
0,892,3,Kelly,Mr James,male,34.5,0,0,330911,7.8292,Q,"D,E,F,G"
1,893,3,Wilkes,Mrs James (Ellen Needs),female,47.0,1,0,363272,7.0000,S,"D,E,F,G"
2,894,2,Myles,Mr Thomas Francis,male,62.0,0,0,240276,9.6875,Q,"D,E,F,G"
3,895,3,Wirz,Mr Albert,male,27.0,0,0,315154,8.6625,S,"D,E,F,G"
4,896,3,Hirvonen,Mrs Alexander (Helga E Lindqvist),female,22.0,1,1,3101298,12.2875,S,"D,E,F,G"
...,...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,Spector,Mr Woolf,male,,0,0,A.5. 3236,8.0500,S,"D,E,F,G"
414,1306,1,Oliva y Ocana,Dona Fermina,female,39.0,0,0,PC 17758,108.9000,C,"A,B,C,D,E"
415,1307,3,Saether,Mr Simon Sivertsen,male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,S,"D,E,F,G"
416,1308,3,Ware,Mr Frederick,male,,0,0,359309,8.0500,S,"D,E,F,G"


In [556]:
# Locating the row(s) with a missing Fare and filling them with a flat value of 7
fare_null_index = test_df_dropped.index[test_df_dropped['Fare'].isna()]
test_df_dropped.loc[fare_null_index, 'Fare'] = 7

In [557]:
# Confirming Age is the only column that still has nulls
test_df_dropped.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   last_name    418 non-null    object 
 3   first_name   418 non-null    object 
 4   Sex          418 non-null    object 
 5   Age          332 non-null    float64
 6   SibSp        418 non-null    int64  
 7   Parch        418 non-null    int64  
 8   Ticket       418 non-null    object 
 9   Fare         418 non-null    float64
 10  Embarked     418 non-null    object 
 11  Deck         418 non-null    object 
dtypes: float64(2), int64(4), object(6)
memory usage: 39.3+ KB


In [558]:
# Collecting every row that still contains a null, then pulling out the
# last and first names of those passengers for the age lookup
has_null = test_df_dropped.isna().any(axis='columns')
nulls_only_df = test_df_dropped[has_null]

test_nulls_last_names = nulls_only_df['last_name'].tolist()
test_nulls_first_names = nulls_only_df['first_name'].tolist()

nulls_only_df

Unnamed: 0,PassengerId,Pclass,last_name,first_name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Deck
10,902,3,Ilieff,Mr Ylio,male,,0,0,349220,7.8958,S,"D,E,F,G"
22,914,1,Flegenheim,Mrs Alfred (Antoinette),female,,0,0,PC 17598,31.6833,S,"A,B,C,D,E"
29,921,3,Samaan,Mr Elias,male,,2,0,2662,21.6792,C,"D,E,F,G"
33,925,3,Johnston,Mrs Andrew G (Elizabeth Lily Watson),female,,1,2,W./C. 6607,23.4500,S,"D,E,F,G"
36,928,3,Roth,Miss Sarah A,female,,0,0,342712,8.0500,S,"D,E,F,G"
...,...,...,...,...,...,...,...,...,...,...,...,...
408,1300,3,Riordan,Miss Johanna Hannah,female,,0,0,334915,7.7208,Q,"D,E,F,G"
410,1302,3,Naughton,Miss Hannah,female,,0,0,365237,7.7500,Q,"D,E,F,G"
413,1305,3,Spector,Mr Woolf,male,,0,0,A.5. 3236,8.0500,S,"D,E,F,G"
416,1308,3,Ware,Mr Frederick,male,,0,0,359309,8.0500,S,"D,E,F,G"


In [559]:
# Scraping the passenger list page for the unknown ages
url = 'https://titanicfacts.net/titanic-passenger-list/'
browser = Browser('chrome')
try:
    browser.visit(url)
    html = browser.html
finally:
    # Always close the browser, even if the visit fails, so no Chrome/driver
    # process is left running after the cell finishes
    browser.quit()

# Parsing every <table> on the page into a dataframe. The markup is wrapped in
# StringIO because passing literal HTML to read_html is deprecated in newer pandas.
all_classes_tables = pd.read_html(StringIO(html))

In [560]:
# Writing each scraped table into its own csv, named by position
# (assumes the page lists the tables in 1st, 2nd, 3rd class order - TODO confirm)
place = ['1st', '2nd', '3rd']

for table_number, table in enumerate(all_classes_tables):
    # header=False / index=False: only the table's rows are written, without
    # the dataframe's own column labels or index
    table.to_csv(f'Resources/{place[table_number]}_class', header=False, index=False)

In [561]:
# Reading the three per-class csv's back in, stacking them into one dataframe
# with a fresh 0..n-1 index, and saving the combined table
class_csv_paths = ['Resources/1st_class', 'Resources/2nd_class', 'Resources/3rd_class']
class_frames = [pd.read_csv(csv_path) for csv_path in class_csv_paths]
combined_classes = pd.concat(class_frames, ignore_index=True)
combined_classes.to_csv('Resources/full_online_table', index=False)


In [562]:
# Loading the merged csv back in as a single dataframe and viewing it
full_table_df  = pd.read_csv('Resources/full_online_table')
full_table_df

Unnamed: 0,Surname,First Names,Age,Boarded,Survivor (S) or Victim (†)
0,Allen,Miss Elisabeth Walton,29,Southampton,S
1,Allison,Mr Hudson Joshua Creighton,30,Southampton,†
2,Allison,Mrs Bessie Waldo,25,Southampton,†
3,Allison,Miss Helen Loraine,2,Southampton,†
4,Allison,Master Hudson Trevor,11m,Southampton,S
...,...,...,...,...,...
1312,Youssiff (Sam’Aan),Mr Gerios,45,Cherbourg,†
1313,Zakarian,Mr Ortin,27,Cherbourg,†
1314,Zakarian,Mr Mapriededer,22,Cherbourg,†
1315,Zenni,Mr Philip,22,Cherbourg,S


In [563]:
# Keeping only the name and age columns and renaming them to the
# last_name / first_name / age labels used for matching
full_table_df = (
    full_table_df
    .loc[:, ['Surname', 'First Names', 'Age']]
    .rename(columns={'Surname': 'last_name', 'First Names': 'first_name', 'Age': 'age'})
)

full_table_df

Unnamed: 0,last_name,first_name,age
0,Allen,Miss Elisabeth Walton,29
1,Allison,Mr Hudson Joshua Creighton,30
2,Allison,Mrs Bessie Waldo,25
3,Allison,Miss Helen Loraine,2
4,Allison,Master Hudson Trevor,11m
...,...,...,...
1312,Youssiff (Sam’Aan),Mr Gerios,45
1313,Zakarian,Mr Ortin,27
1314,Zakarian,Mr Mapriededer,22
1315,Zenni,Mr Philip,22


In [564]:
# Checking non-null counts and data types of the scraped table
full_table_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1317 entries, 0 to 1316
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   last_name   1317 non-null   object
 1   first_name  1317 non-null   object
 2   age         1317 non-null   object
dtypes: object(3)
memory usage: 31.0+ KB


In [565]:
# Listing the distinct age strings to spot values that are not plain numbers
full_table_df['age'].unique()

array(['29', '30', '25', '2', '11m', '47', '62', '39', '53', '71', '18',
       '24', '26', '32', '60', '50', '36', '37', '46', '31', '19', '28',
       '45', '61', '42', '22', '41', '48', '59', '44', '58', '52', '33',
       '17', '27', '13', '11', '49', '43', '40', '64', '55', '70', '35',
       '51', '34', '4', '23', '54', '57', '38', '21', '16', '65', '20',
       '56', '6', '67', '63', '15', '12', '1', '10m', '8', '7m', '7', '3',
       '14', '9m', '9', '5', '66', '4m', '2m', 'NK', '10', '69', '74',
       '5m'], dtype=object)

In [566]:
# Replacing the non-numeric age strings so the whole column can be cast to float.
# Month-based ages ('11m', '10m', ...) become one-decimal fractions of a year.
# The mapping is scoped to the 'age' column, so a name that happens to equal
# one of these strings is left untouched.
# NOTE(review): 'NK' is mapped to '0', which is indistinguishable from the 0
# placeholder the matching loop later uses for "no match found" - confirm intended.
full_table_df = full_table_df.replace({'age': {'11m': '0.9',
                                               '10m': '0.8',
                                               '9m': '0.7',
                                               '7m': '0.5',
                                               '5m': '0.4',
                                               '4m': '0.3',
                                               '2m': '0.1',
                                               'NK': '0'}})


In [567]:
# Double checking that every remaining age string is numeric
full_table_df['age'].unique()

array(['29', '30', '25', '2', '0.9', '47', '62', '39', '53', '71', '18',
       '24', '26', '32', '60', '50', '36', '37', '46', '31', '19', '28',
       '45', '61', '42', '22', '41', '48', '59', '44', '58', '52', '33',
       '17', '27', '13', '11', '49', '43', '40', '64', '55', '70', '35',
       '51', '34', '4', '23', '54', '57', '38', '21', '16', '65', '20',
       '56', '6', '67', '63', '15', '12', '1', '0.8', '8', '0.5', '7',
       '3', '14', '0.7', '9', '5', '66', '0.3', '0.1', '0', '10', '69',
       '74', '0.4'], dtype=object)

In [568]:
# Working on a copy and converting the age strings to float to match other csv's
full_table_df_clean = full_table_df.copy()
full_table_df_clean['age'] = full_table_df_clean['age'].astype(float)

In [569]:
# Checking that age is now float64 and nothing became null
full_table_df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1317 entries, 0 to 1316
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   last_name   1317 non-null   object 
 1   first_name  1317 non-null   object 
 2   age         1317 non-null   float64
dtypes: float64(1), object(2)
memory usage: 31.0+ KB


In [570]:
# Viewing the final clean lookup dataframe (last_name, first_name, age)
full_table_df_clean

Unnamed: 0,last_name,first_name,age
0,Allen,Miss Elisabeth Walton,29.0
1,Allison,Mr Hudson Joshua Creighton,30.0
2,Allison,Mrs Bessie Waldo,25.0
3,Allison,Miss Helen Loraine,2.0
4,Allison,Master Hudson Trevor,0.9
...,...,...,...
1312,Youssiff (Sam’Aan),Mr Gerios,45.0
1313,Zakarian,Mr Ortin,27.0
1314,Zakarian,Mr Mapriededer,22.0
1315,Zenni,Mr Philip,22.0


In [571]:
# Looking up every passenger with a missing age in the scraped table.
# An exact, unique (last_name, first_name) match supplies the age; anything else
# gets a 0 placeholder and is queued for a manual look-over.
test_ages_to_fill = []
last_names_to_look_over = []
fillable_test_ages_empty = []

for last_name, first_name in zip(test_nulls_last_names, test_nulls_first_names):
    same_person = (
        (full_table_df_clean['last_name'] == last_name)
        & (full_table_df_clean['first_name'] == first_name)
    )
    matched_ages = full_table_df_clean.loc[same_person, 'age']

    if len(matched_ages) != 1:
        # Zero or several matches: leave a placeholder and record who needs checking
        test_ages_to_fill.append(0)
        last_names_to_look_over.append(last_name)
        fillable_test_ages_empty.append({'last_name': last_name, 'first_name': first_name, 'age': ''})
        continue

    test_ages_to_fill.append(matched_ages.values[0])

In [572]:
# Printing the looked-up ages (0 = still unknown), the last names that need a
# manual look-over, and how many nulls were resolved automatically
print(test_ages_to_fill)
print(last_names_to_look_over)
print(f'We started with {len(test_ages_to_fill)} nulls')
print(f'Our search refined to manually go over only {len(last_names_to_look_over)}')

[32.0, 0, 17.0, 0, 0, 24.0, 0, 24.0, 30.0, 32.0, 0, 23.0, 0, 0, 25.0, 0, 28.0, 23.0, 20.0, 24.0, 0, 0, 0, 0, 24.0, 0, 0, 0, 44.0, 59.0, 30.0, 17.0, 32.0, 0, 0, 20.0, 16.0, 0, 0, 0, 34.0, 29.0, 34.0, 20.0, 0, 0, 32.0, 21.0, 0, 0, 0, 0, 0, 44.0, 43.0, 30.0, 26.0, 28.0, 0, 0, 0, 0, 25.0, 22.0, 33.0, 0, 0, 0, 0, 0, 0, 0, 44.0, 0, 0, 0, 0, 28.0, 0, 0, 0, 0, 21.0, 23.0, 34.0, 0]
['Flegenheim', 'Johnston', 'Roth', 'Franklin', 'Corey', 'Demetri', 'Lamb', "O'Donoghue", 'Willer', 'Shine', 'Thomas', 'Kiernan', 'McCoy', 'Lefebre', 'Thomas', 'Cassebeer', 'Lithman', 'Sage', 'Salomon', 'Rasmussen', 'Thomson', 'Moubarek', 'Hyman', 'Johnston', 'Khalil', 'MacKay', 'Mahon', 'Lennon', 'Saade', 'Fleming', 'Franklin', 'Nasr', 'Samaan', 'Malachard', 'McCarthy', 'Sadowitz', 'Thomas', 'Betros', 'van Billiard', 'Lockyer', "O'Keefe", 'Sage', "O'Connor", 'Risien', 'Wheeler', 'Riordan', 'Peter']
We started with 86 nulls
Our search refined to manually go over only 47


In [573]:
# Printing the unmatched passengers with empty ages, as a template to fill in by hand
print(fillable_test_ages_empty)

[{'last_name': 'Flegenheim', 'first_name': 'Mrs Alfred (Antoinette)', 'age': ''}, {'last_name': 'Johnston', 'first_name': 'Mrs Andrew G (Elizabeth Lily Watson)', 'age': ''}, {'last_name': 'Roth', 'first_name': 'Miss Sarah A', 'age': ''}, {'last_name': 'Franklin', 'first_name': 'Mr Thomas Parham', 'age': ''}, {'last_name': 'Corey', 'first_name': 'Mrs Percy C (Mary Phyllis Elizabeth Miller)', 'age': ''}, {'last_name': 'Demetri', 'first_name': 'Mr Marinko', 'age': ''}, {'last_name': 'Lamb', 'first_name': 'Mr John Joseph', 'age': ''}, {'last_name': "O'Donoghue", 'first_name': 'Ms Bridget', 'age': ''}, {'last_name': 'Willer', 'first_name': 'Mr Aaron (Abi Weller)', 'age': ''}, {'last_name': 'Shine', 'first_name': 'Miss Ellen Natalia', 'age': ''}, {'last_name': 'Thomas', 'first_name': 'Mr John', 'age': ''}, {'last_name': 'Kiernan', 'first_name': 'Mr John', 'age': ''}, {'last_name': 'McCoy', 'first_name': 'Miss Alicia', 'age': ''}, {'last_name': 'Lefebre', 'first_name': 'Mrs Frank (Frances)', 

In [574]:
# Creating list to fill ages for the passengers the automatic lookup could not match
# (presumably researched manually - the values are hard-coded here).
# Order matters: the entries are consumed by position when the 0 placeholders in
# test_ages_to_fill are replaced, so they must stay in the same order as
# last_names_to_look_over.
fillable_test_ages = [
 {'last_name': 'Flegenheim', 'first_name': 'Mrs Alfred (Antoinette)', 'age': 48},
 {'last_name': 'Johnston', 'first_name': 'Mrs Andrew G (Elizabeth Lily Watson)', 'age': 36},
 {'last_name': 'Roth', 'first_name': 'Miss Sarah A', 'age': 31},
 {'last_name': 'Franklin', 'first_name': 'Mr Thomas Parham', 'age': 37},
 {'last_name': 'Corey', 'first_name': 'Mrs Percy C (Mary Phyllis Elizabeth Miller)', 'age': 32},
 {'last_name': 'Demetri', 'first_name': 'Mr Marinko', 'age': 23},
 {'last_name': 'Lamb', 'first_name': 'Mr John Joseph', 'age': 30},
 {'last_name': "O'Donoghue", 'first_name': 'Ms Bridget', 'age': 21},
 {'last_name': 'Willer', 'first_name': 'Mr Aaron (Abi Weller)', 'age': 37},
 {'last_name': 'Shine', 'first_name': 'Miss Ellen Natalia', 'age': 20},
 {'last_name': 'Thomas', 'first_name': 'Mr John', 'age': 34},
 {'last_name': 'Kiernan', 'first_name': 'Mr John', 'age': 25},
 {'last_name': 'McCoy', 'first_name': 'Miss Alicia', 'age': 26},
 {'last_name': 'Lefebre', 'first_name': 'Mrs Frank (Frances)', 'age': 40},
 {'last_name': 'Thomas', 'first_name': 'Mr Charles P', 'age': 20},
 {'last_name': 'Cassebeer', 'first_name': 'Mrs Henry Arthur Jr (Eleanor Genevieve Fosdick)', 'age': 36},
 {'last_name': 'Lithman', 'first_name': 'Mr Simon', 'age': 27},
 {'last_name': 'Sage', 'first_name': 'Miss Ada', 'age': 10},
 {'last_name': 'Salomon', 'first_name': 'Mr Abraham L', 'age': 43},
 {'last_name': 'Rasmussen', 'first_name': 'Mrs (Lena Jacobsen Solvang)', 'age': 63},
 {'last_name': 'Thomson', 'first_name': 'Mr Alexander Morrison', 'age': 36},
 {'last_name': 'Moubarek', 'first_name': 'Mrs George (Omine Amenia Alexander)', 'age': 25},
 {'last_name': 'Hyman', 'first_name': 'Mr Abraham', 'age': 34},
 {'last_name': 'Johnston', 'first_name': 'Master William Arthur Willie', 'age': 8},
 {'last_name': 'Khalil', 'first_name': 'Mrs Betros (Zahie Maria Elias)', 'age': 20},
 {'last_name': 'MacKay', 'first_name': 'Mr George William', 'age': 20},
 {'last_name': 'Mahon', 'first_name': 'Mr John', 'age': 22},
 {'last_name': 'Lennon', 'first_name': 'Miss Mary', 'age': 18},
 {'last_name': 'Saade', 'first_name': 'Mr Jean Nassr', 'age': 20},
 {'last_name': 'Fleming', 'first_name': 'Miss Honora', 'age': 22},
 {'last_name': 'Franklin', 'first_name': 'Mr Charles (Charles Fardon)', 'age': 46},
 {'last_name': 'Nasr', 'first_name': 'Mr Mustafa', 'age': 20},
 {'last_name': 'Samaan', 'first_name': 'Mr Hanna', 'age': 40},
 {'last_name': 'Malachard', 'first_name': 'Mr Noel', 'age': 25},
 {'last_name': 'McCarthy', 'first_name': 'Miss Catherine Katie', 'age': 24},
 {'last_name': 'Sadowitz', 'first_name': 'Mr Harry', 'age': 17},
 {'last_name': 'Thomas', 'first_name': 'Mr Tannous', 'age': 16},
 {'last_name': 'Betros', 'first_name': 'Master Seman', 'age': 10},
 {'last_name': 'van Billiard', 'first_name': 'Master James William', 'age': 10},
 {'last_name': 'Lockyer', 'first_name': 'Mr Edward', 'age': 19},
 {'last_name': "O'Keefe", 'first_name': 'Mr Patrick', 'age': 21},
 {'last_name': 'Sage', 'first_name': 'Mrs John (Annie Bullen)', 'age': 44},
 {'last_name': "O'Connor", 'first_name': 'Mr Patrick', 'age': 23},
 {'last_name': 'Risien', 'first_name': 'Mrs Samuel (Emma)', 'age': 58},
 {'last_name': 'Wheeler', 'first_name': 'Mr Edwin Frederick', 'age': 26},
 {'last_name': 'Riordan', 'first_name': 'Miss Johanna Hannah', 'age': 21},
 {'last_name': 'Peter', 'first_name': 'Master Michael J', 'age': 4}
 ]

In [575]:
# Walking the age list and swapping each 0 placeholder for the next manually
# supplied age, in order, to make a complete list to fill test csv nulls
next_manual = 0
for position, age in enumerate(test_ages_to_fill):
    if age != 0:
        continue
    test_ages_to_fill[position] = fillable_test_ages[next_manual]['age']
    next_manual += 1

In [576]:
# Double checking that no 0 placeholders are left in the list
test_ages_to_fill

[32.0,
 48,
 17.0,
 36,
 31,
 24.0,
 37,
 24.0,
 30.0,
 32.0,
 32,
 23.0,
 23,
 30,
 25.0,
 21,
 28.0,
 23.0,
 20.0,
 24.0,
 37,
 20,
 34,
 25,
 24.0,
 26,
 40,
 20,
 44.0,
 59.0,
 30.0,
 17.0,
 32.0,
 36,
 27,
 20.0,
 16.0,
 10,
 43,
 63,
 34.0,
 29.0,
 34.0,
 20.0,
 36,
 25,
 32.0,
 21.0,
 34,
 8,
 20,
 20,
 22,
 44.0,
 43.0,
 30.0,
 26.0,
 28.0,
 18,
 20,
 22,
 46,
 25.0,
 22.0,
 33.0,
 20,
 40,
 25,
 24,
 17,
 16,
 10,
 44.0,
 10,
 19,
 21,
 44,
 28.0,
 23,
 58,
 26,
 21,
 21.0,
 23.0,
 34.0,
 4]

In [577]:
# Inserting the found ages into the rows where Age is null, top to bottom,
# in the same order as test_ages_to_fill
age_missing = test_df_dropped['Age'].isnull()

# Guard keeps the cell re-runnable: once the nulls are filled there is nothing to insert
if age_missing.any():
    if int(age_missing.sum()) != len(test_ages_to_fill):
        # A length mismatch means the ages would end up on the wrong passengers
        raise ValueError(
            f'{int(age_missing.sum())} null ages but {len(test_ages_to_fill)} ages to fill'
        )
    # One masked assignment: the null mask is computed once instead of once per row,
    # and it does not depend on the index labels being 0..n-1
    test_df_dropped.loc[age_missing, 'Age'] = [float(age) for age in test_ages_to_fill]

test_df_dropped

Unnamed: 0,PassengerId,Pclass,last_name,first_name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Deck
0,892,3,Kelly,Mr James,male,34.5,0,0,330911,7.8292,Q,"D,E,F,G"
1,893,3,Wilkes,Mrs James (Ellen Needs),female,47.0,1,0,363272,7.0000,S,"D,E,F,G"
2,894,2,Myles,Mr Thomas Francis,male,62.0,0,0,240276,9.6875,Q,"D,E,F,G"
3,895,3,Wirz,Mr Albert,male,27.0,0,0,315154,8.6625,S,"D,E,F,G"
4,896,3,Hirvonen,Mrs Alexander (Helga E Lindqvist),female,22.0,1,1,3101298,12.2875,S,"D,E,F,G"
...,...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,Spector,Mr Woolf,male,23.0,0,0,A.5. 3236,8.0500,S,"D,E,F,G"
414,1306,1,Oliva y Ocana,Dona Fermina,female,39.0,0,0,PC 17758,108.9000,C,"A,B,C,D,E"
415,1307,3,Saether,Mr Simon Sivertsen,male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,S,"D,E,F,G"
416,1308,3,Ware,Mr Frederick,male,34.0,0,0,359309,8.0500,S,"D,E,F,G"


In [582]:
# Checking one last time that no column has null values left
test_df_dropped.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   last_name    418 non-null    object 
 3   first_name   418 non-null    object 
 4   Sex          418 non-null    object 
 5   Age          418 non-null    float64
 6   SibSp        418 non-null    int64  
 7   Parch        418 non-null    int64  
 8   Ticket       418 non-null    object 
 9   Fare         418 non-null    float64
 10  Embarked     418 non-null    object 
 11  Deck         418 non-null    object 
dtypes: float64(2), int64(4), object(6)
memory usage: 39.3+ KB


In [580]:
# Checking if age values were inserted correctly: the ages now stored for the
# passengers that originally had nulls must equal test_ages_to_fill, in order
ids_with_nulls = nulls_only_df['PassengerId']

was_null = test_df_dropped['PassengerId'].isin(ids_with_nulls)
inserted_ages = list(test_df_dropped.loc[was_null, 'Age'])

# Fail loudly on a mismatch instead of silently printing nothing
assert inserted_ages == test_ages_to_fill, 'Inserted ages do not match test_ages_to_fill'
print('Holy shit you did it')

Holy shit you did it


# Train Data

In [None]:
# Loading the train csv into a dataframe and displaying it
train_df = pd.read_csv('Resources/train.csv')
train_df

In [None]:
# Checking non-null counts and dtypes of the train data
train_df.info()

# Submission Data

In [None]:
# Loading the gender submission csv into a dataframe and displaying it
gender_submission_df = pd.read_csv('Resources/gender_submission.csv')
gender_submission_df

In [None]:
# Checking non-null counts and dtypes of the submission data
gender_submission_df.info()