In [14]:
# %% [markdown]
# # Titanic Dataset Veracity Analysis
#
# This notebook compares the Kaggle Titanic dataset with external sources (Wikipedia and TitanicFacts.net) to assess data completeness and accuracy, focusing on passenger age information.

# %% [markdown]
# ## 1. Imports and Setup

# %%
import re
from difflib import get_close_matches
from io import StringIO

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns

sns.set_theme(style="whitegrid")
plt.rcParams["figure.figsize"] = (12, 6)

# %% [markdown]
# ## 2. Data Loading


# %%
def load_kaggle_data(train_path="../data/train.csv", test_path="../data/test.csv"):
    """Read the Kaggle Titanic train and test CSV files into one DataFrame.

    Parameters
    ----------
    train_path : str
        Location of the training CSV.
    test_path : str
        Location of the test CSV.

    Returns
    -------
    pandas.DataFrame
        Rows of the training file followed by rows of the test file, with a
        fresh 0..n-1 index. Columns present in only one file are filled with
        NaN for the rows of the other file.
    """
    # Train first, then test, so the stacked row order is stable.
    frames = [pd.read_csv(csv_path) for csv_path in (train_path, test_path)]
    combined = pd.concat(frames, ignore_index=True)
    return combined


def load_wikipedia_data(timeout=30):
    """Load Titanic passenger tables from Wikipedia, fixing smart quotes.

    Parameters
    ----------
    timeout : float, optional
        Seconds to wait for the HTTP request before giving up (default 30).

    Returns
    -------
    list of pandas.DataFrame
        Every table on the page whose header row contains both a ``Name``
        and an ``Age`` column, in page order.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.Timeout
        If the request does not complete within ``timeout`` seconds.
    """
    url = "https://en.wikipedia.org/wiki/Passengers_of_the_Titanic"
    # A timeout keeps "Run All" from hanging forever on a stalled connection.
    resp = requests.get(url, timeout=timeout)
    # Surface HTTP failures directly instead of as a confusing parse error
    # ("No tables found") further down.
    resp.raise_for_status()

    # Replace smart quotes with normal quotes
    smart_quotes = str.maketrans({"“": '"', "”": '"', "‘": "'", "’": "'"})
    html = resp.text.translate(smart_quotes)

    # Passing a literal HTML string to read_html is deprecated and emits a
    # FutureWarning; a file-like wrapper is the supported form.
    tables = pd.read_html(StringIO(html), header=0)

    # Filter for tables with Name and Age columns
    required_columns = ("Name", "Age")
    return [
        table
        for table in tables
        if all(column in table.columns for column in required_columns)
    ]



def load_titanicfacts_data():
    """Fetch the TitanicFacts.net passenger-list page and parse its tables.

    Returns
    -------
    list of pandas.DataFrame
        One frame per HTML table that pandas finds on the page, with each
        table's first row used as the column header.
    """
    source_url = "http://www.titanicfacts.net/titanic-passenger-list.html"
    parsed_tables = pd.read_html(source_url, header=0)
    return parsed_tables


# %%
# Build the three sources compared in this notebook. The Kaggle frame is read
# from local CSV files; the Wikipedia and TitanicFacts loaders fetch live web
# pages, so this cell needs network access.
titanic_kaggle = load_kaggle_data()
wiki_tables = load_wikipedia_data()
facts_tables = load_titanicfacts_data()

  tables = pd.read_html(html, header=0)


In [15]:
# Inspect the combined Kaggle train + test frame returned by load_kaggle_data().
titanic_kaggle

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
1304,1305,,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
1305,1306,,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
1306,1307,,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
1307,1308,,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [16]:
# Check the container type returned by load_titanicfacts_data().
type(facts_tables)

list

In [17]:
# Show the list of passenger tables returned by load_wikipedia_data().
wiki_tables

[                                                  Name Age  \
 0                                  Abelson, Mr. Samuel  30   
 1                    Abelson, Mrs. Anna (née Wizosky?)  28   
 2                             Andrew, Mr. Edgar Samuel  17   
 3                             Andrew, Mr. Frank Thomas  30   
 4                                Angle, Mr. William A.  32   
 ..                                                 ...  ..   
 273  Wilkinson, Miss Elizabeth Anne (alias Mrs. Eli...  29   
 274                       Williams, Mr. Charles Eugene  23   
 275                      Woodward, Mr. John Wesley[79]  32   
 276                                Wright, Miss Marion  26   
 277                              Yvois, Miss Henriette  24   
 
                              Hometown      Boarded  \
 0                              Russia    Cherbourg   
 1                              Russia    Cherbourg   
 2    San Ambrosio, Córdoba, Argentina  Southampton   
 3          Redruth, C