# Titanic Data Analysis and JSON Export
## Author: 
Javier Romero

## Description: 
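Analyze the Titanic passenger dataset: load and explore the CSV, compute descriptive statistics and missing-value counts, engineer family-size, is-alone, and age-group features, and export the result to JSON.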


### Step 1: Setting Up the Project

In [9]:
import pandas as pd
import numpy as np
import json
from pathlib import Path

# Project paths: the CSV input and the JSON output both live under one
# data folder, given relative to the directory the notebook runs from.
DATA_DIR = Path("data")
CSV_FILE = DATA_DIR.joinpath("titanic.csv")
JSON_FILE = DATA_DIR.joinpath("titanic_data.json")

# Ensure the data folder is present; exist_ok lets the cell be re-run safely.
DATA_DIR.mkdir(exist_ok=True)

# Report the configured locations, one message per line.
print(
    "Project setup complete!",
    f"Data directory: {DATA_DIR}",
    f"CSV file location: {CSV_FILE}",
    sep="\n",
)


Project setup complete!
Data directory: data
CSV file location: data/titanic.csv


### Step 2: Importing and Exploring the Data

In [10]:
# Read the passenger CSV (path configured in Step 1) into a DataFrame
df = pd.read_csv(CSV_FILE)

# Overview of what was loaded: dimensions, column names, and a small sample.
# Each entry is printed on its own, in the order listed.
overview = (
    f"\n{df.shape[0]} rows and {df.shape[1]} columns\n",
    f"Dataset loaded successfully! Shape: {df.shape}",
    f"\nColumns: {list(df.columns)}",
    f"\nFirst few rows:",
    df.head(),
)
for item in overview:
    print(item)



891 rows and 12 columns

Dataset loaded successfully! Shape: (891, 12)

Columns: ['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']

First few rows:
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S 

### Step 3: Calculating Descriptive Statistics


In [11]:
# Pull out the numeric part of the frame once and reuse it below
numeric_frame = df.select_dtypes(include="number")
numeric_cols = numeric_frame.columns.tolist()
print("Numeric columns:", numeric_cols)

# One row per numeric column, one column per statistic
stats = pd.DataFrame({
    "Mean": numeric_frame.mean(),
    "Median": numeric_frame.median(),
    "Std Dev": numeric_frame.std(),
})

print("\n--- Descriptive Statistics ---")
# Bare last expression so the notebook renders the table
stats


Numeric columns: ['PassengerId', 'Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']

--- Descriptive Statistics ---


Unnamed: 0,Mean,Median,Std Dev
PassengerId,446.0,446.0,257.353842
Survived,0.383838,0.0,0.486592
Pclass,2.308642,3.0,0.836071
Age,29.699118,28.0,14.526497
SibSp,0.523008,0.0,1.102743
Parch,0.381594,0.0,0.806057
Fare,32.204208,14.4542,49.693429


### Step 4: Identifying Missing Values

In [12]:
# Tally nulls once per column, then express the tally as a share of all rows
null_counts = df.isnull().sum()
missing = pd.DataFrame({
    "Missing Count": null_counts,
    "Missing %": (null_counts / len(df) * 100).round(2),
}).sort_values("Missing Count", ascending=False)  # worst-affected columns first

print("--- Missing Values by Column ---")
print(missing)

# Narrow the table down to the columns that actually have gaps
cols_with_missing = missing.loc[missing["Missing Count"] > 0]
print(f"\nColumns with missing data: {len(cols_with_missing)} out of {len(df.columns)}")
print(f"\nMost affected columns:")
for col, row in cols_with_missing.iterrows():
    # The count is cast to int for display; the percentage is shown as stored
    print(f"  - {col}: {int(row['Missing Count'])} missing ({row['Missing %']}%)")

--- Missing Values by Column ---
             Missing Count  Missing %
Cabin                  687      77.10
Age                    177      19.87
Embarked                 2       0.22
PassengerId              0       0.00
Survived                 0       0.00
Pclass                   0       0.00
Name                     0       0.00
Sex                      0       0.00
SibSp                    0       0.00
Parch                    0       0.00
Ticket                   0       0.00
Fare                     0       0.00

Columns with missing data: 3 out of 12

Most affected columns:
  - Cabin: 687 missing (77.1%)
  - Age: 177 missing (19.87%)
  - Embarked: 2 missing (0.22%)


### Step 5: Feature Engineering

In [14]:
# Derive the family-based features on a new frame so the loaded `df` is left as-is.
# assign() returns a new DataFrame; the keyword order matters because
# IsAlone is computed from the FamilySize column created just before it.
df_features = df.assign(
    # Feature 1: siblings/spouses + parents/children + the passenger
    FamilySize=lambda frame: frame['SibSp'] + frame['Parch'] + 1,
    # Feature 2: 1 when the passenger has no family aboard, else 0
    IsAlone=lambda frame: (frame['FamilySize'] == 1).astype(int),
)

# Spot-check each new feature against the columns it was built from
print(df_features[['SibSp', 'Parch', 'FamilySize']].head(10))
print(df_features[['FamilySize', 'IsAlone']].head(10))

# Feature 3: Age Groups
def categorize_age(age):
    """Map an age to its group label.

    Returns 'Unknown' for a missing age, otherwise 'Child' (< 18),
    'Young Adult' (< 30), 'Adult' (< 50) or 'Senior' (50 and above).
    """
    if pd.isna(age):
        return 'Unknown'

    # Exclusive upper bounds in ascending order; the first match wins
    upper_bounds = ((18, 'Child'), (30, 'Young Adult'), (50, 'Adult'))
    for limit, label in upper_bounds:
        if age < limit:
            return label
    return 'Senior'

# Label every passenger with an age bucket (missing ages become 'Unknown')
df_features['AgeGroup'] = df_features['Age'].map(categorize_age)
print(df_features[['Age', 'AgeGroup']].head(10))

divider = "=" * 50

# Compare the engineered features across the two survival outcomes
print("\n" + divider)
print("FEATURE ANALYSIS: SURVIVED vs NOT SURVIVED")
print(divider)

# Family size summary per outcome (0 = did not survive, 1 = survived)
print("\nFamily Size by Survival:")
family_survival = (
    df_features
    .groupby('Survived')['FamilySize']
    .agg(['mean', 'median', 'std'])
)
print(family_survival)

# Side-by-side descriptive comparison of each feature. These are plain means
# and proportions; no significance test is computed here.
print("\n" + divider)
print("FEATURE DIFFERENTIATION ANALYSIS")
print(divider)

survived = df_features.loc[df_features['Survived'].eq(1)]
not_survived = df_features.loc[df_features['Survived'].eq(0)]

print("\nFamily Size:")
print(f"  Survived mean: {survived['FamilySize'].mean():.2f}")
print(f"  Not Survived mean: {not_survived['FamilySize'].mean():.2f}")
print(f"  Difference: {abs(survived['FamilySize'].mean() - not_survived['FamilySize'].mean()):.2f}")

# IsAlone is 0/1, so its mean is the share of passengers without family aboard
print("\nIs Alone:")
print(f"  Survived alone %: {survived['IsAlone'].mean():.2%}")
print(f"  Not Survived alone %: {not_survived['IsAlone'].mean():.2%}")

# Share of each age bucket within each outcome
print("\nAge Group:")
print(f"  Survived distribution:\n{survived['AgeGroup'].value_counts(normalize=True).round(3)}")
print(f"  Not Survived distribution:\n{not_survived['AgeGroup'].value_counts(normalize=True).round(3)}")

   SibSp  Parch  FamilySize
0      1      0           2
1      1      0           2
2      0      0           1
3      1      0           2
4      0      0           1
5      0      0           1
6      0      0           1
7      3      1           5
8      0      2           3
9      1      0           2
   FamilySize  IsAlone
0           2        0
1           2        0
2           1        1
3           2        0
4           1        1
5           1        1
6           1        1
7           5        0
8           3        0
9           2        0
    Age     AgeGroup
0  22.0  Young Adult
1  38.0        Adult
2  26.0  Young Adult
3  35.0        Adult
4  35.0        Adult
5   NaN      Unknown
6  54.0       Senior
7   2.0        Child
8  27.0  Young Adult
9  14.0        Child

FEATURE ANALYSIS: SURVIVED vs NOT SURVIVED

Family Size by Survival:
              mean  median       std
Survived                            
0         1.883424     1.0  1.830669
1         1.938596     2.0 

### Step 6: Creating a Data Export Class

In [23]:
from datetime import datetime

class Passenger:
    """
    A single passenger record with values normalized to plain Python types.

    Every constructor argument is converted to int, float or str as
    appropriate; a missing value (as judged by pd.notna) is stored as None.
    """

    # Attribute names in the order they appear in the exported dictionary
    _FIELDS = (
        'passenger_id', 'name', 'age', 'sex', 'survived', 'pclass',
        'fare', 'embarked', 'family_size', 'is_alone', 'age_group',
    )

    def __init__(self, passenger_id, name, age, sex, survived, pclass,
                 fare, embarked=None, family_size=None, is_alone=None, age_group=None):
        def _clean(value, caster):
            # Apply the caster only to present values; missing ones become None
            return caster(value) if pd.notna(value) else None

        self.passenger_id = _clean(passenger_id, int)
        self.name = _clean(name, str)
        self.age = _clean(age, float)
        self.sex = _clean(sex, str)
        self.survived = _clean(survived, int)
        self.pclass = _clean(pclass, int)
        self.fare = _clean(fare, float)
        self.embarked = _clean(embarked, str)
        self.family_size = _clean(family_size, int)
        self.is_alone = _clean(is_alone, int)
        self.age_group = _clean(age_group, str)

    def to_dict(self):
        """Return the passenger's fields as a dictionary ready for JSON serialization."""
        return {field: getattr(self, field) for field in self._FIELDS}


class TitanicDataset:
    """
    Wraps a Titanic DataFrame as a list of Passenger objects with JSON export.

    Attributes:
        dataframe: The DataFrame passed to the constructor (held by reference).
        passengers: One Passenger per dataframe row, in row order.
    """
    def __init__(self, dataframe):
        self.dataframe = dataframe
        self.passengers = []
        self._create_passengers()

    def _create_passengers(self):
        """Create one Passenger per dataframe row.

        Columns absent from the dataframe are passed as None; a missing
        'PassengerId' column falls back to the row's index label.
        """
        for idx, row in self.dataframe.iterrows():
            passenger = Passenger(
                passenger_id=row.get('PassengerId', idx),
                name=row.get('Name', None),
                age=row.get('Age', None),
                sex=row.get('Sex', None),
                survived=row.get('Survived', None),
                pclass=row.get('Pclass', None),
                fare=row.get('Fare', None),
                embarked=row.get('Embarked', None),
                family_size=row.get('FamilySize', None),
                is_alone=row.get('IsAlone', None),
                age_group=row.get('AgeGroup', None)
            )
            self.passengers.append(passenger)

    def _survival_rate(self):
        """Return the mean of 'Survived' rounded to 4 places, or None.

        None is returned when the column is absent (matching how
        _create_passengers tolerates absent columns) or when the mean is
        NaN, e.g. for an empty frame. A NaN would otherwise be written as
        a bare NaN token, which strict JSON parsers reject.
        """
        if 'Survived' not in self.dataframe.columns:
            return None
        rate = self.dataframe['Survived'].mean()
        if pd.isna(rate):
            return None
        # Cast to a built-in float so the value is a plain JSON number
        return float(round(rate, 4))

    def to_json(self, filename='titanic_data.json'):
        """Export the dataset to a JSON file.

        Args:
            filename: Path of the file to write (overwritten if it exists).

        Returns:
            The exported dictionary, with a 'metadata' section and a
            'passengers' list of per-passenger dictionaries.
        """
        data = {
            'metadata': {
                'dataset_name': 'Titanic Passenger Dataset',
                'export_date': datetime.now().isoformat(),
                'total_passengers': len(self.passengers),
                'survival_rate': self._survival_rate()
            },
            'passengers': [p.to_dict() for p in self.passengers]
        }

        # Explicit encoding keeps the written file independent of the platform default
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2)

        print(f"Data exported to {filename}")
        return data

    def get_summary_stats(self):
        """Return passenger counts and average age/fare.

        Averages are rounded to 2 places, skip passengers whose value is
        None, and are None when no passenger has a value.
        """
        survived_count = sum(1 for p in self.passengers if p.survived == 1)
        not_survived_count = sum(1 for p in self.passengers if p.survived == 0)

        ages = [p.age for p in self.passengers if p.age is not None]
        fares = [p.fare for p in self.passengers if p.fare is not None]

        return {
            'total_passengers': len(self.passengers),
            'survived': survived_count,
            'did_not_survive': not_survived_count,
            'average_age': round(sum(ages) / len(ages), 2) if ages else None,
            'average_fare': round(sum(fares) / len(fares), 2) if fares else None
        }


print("Passenger and TitanicDataset classes defined.")

# Build the dataset and export it. This only runs when the feature-engineering
# step has left a non-empty `df_features` in the current kernel session.
if 'df_features' in locals() and not df_features.empty:
    # Wrap the engineered frame in a TitanicDataset
    dataset = TitanicDataset(df_features)

    # Print basic information
    print(f"Dataset created with {len(dataset.passengers)} passengers")

    # Get and display summary statistics
    summary = dataset.get_summary_stats()
    print("\n" + "="*50)
    print("SUMMARY STATISTICS")
    print("="*50)
    for key, value in summary.items():
        print(f"  {key}: {value}")

    # Export to the JSON path configured in Step 1 (relative to the working directory)
    export_data = dataset.to_json(str(JSON_FILE))

    # Preview first passenger
    print("\nFirst passenger preview:")
    print(json.dumps(export_data['passengers'][0], indent=2))
else:
    # Say so explicitly instead of skipping in silence: with no export, the
    # validation step would read a stale or missing file with no hint as to why.
    print("WARNING: df_features is missing or empty; nothing was exported. Run Step 5 first.")

Passenger and TitanicDataset classes defined.
Dataset created with 891 passengers

SUMMARY STATISTICS
  total_passengers: 891
  survived: 342
  did_not_survive: 549
  average_age: 29.7
  average_fare: 32.2
Data exported to data/titanic_data.json

First passenger preview:
{
  "passenger_id": 1,
  "name": "Braund, Mr. Owen Harris",
  "age": 22.0,
  "sex": "male",
  "survived": 0,
  "pclass": 3,
  "fare": 7.25,
  "embarked": "S",
  "family_size": 2,
  "is_alone": 0,
  "age_group": "Young Adult"
}


### Step 7: Testing and Validation

In [24]:
# Load the exported JSON file back in to confirm it round-trips
with open(JSON_FILE, 'r', encoding='utf-8') as f:
    loaded_data = json.load(f)

print("JSON file loaded successfully!")
print(f"Top-level keys: {list(loaded_data.keys())}")

# Structural checks: fail loudly here rather than relying on eyeballing the printout
assert set(loaded_data) == {'metadata', 'passengers'}, "unexpected top-level keys"
assert loaded_data['passengers'], "export contains no passengers"
assert loaded_data['metadata']['total_passengers'] == len(loaded_data['passengers']), \
    "metadata total_passengers does not match the number of exported passengers"
expected_fields = set(loaded_data['passengers'][0])
assert all(set(p) == expected_fields for p in loaded_data['passengers']), \
    "passengers do not all share the same fields"

# Inspect metadata
print("\n" + "="*50)
print("METADATA")
print("="*50)
for key, value in loaded_data['metadata'].items():
    print(f"  {key}: {value}")

# Inspect passengers structure
print("\n" + "="*50)
print("PASSENGERS")
print("="*50)
print(f"Total passengers in JSON: {len(loaded_data['passengers'])}")
print(f"Fields per passenger: {list(loaded_data['passengers'][0].keys())}")
print("\nFirst passenger:")
print(json.dumps(loaded_data['passengers'][0], indent=2))

JSON file loaded successfully!
Top-level keys: ['metadata', 'passengers']

METADATA
  dataset_name: Titanic Passenger Dataset
  export_date: 2026-02-02T22:33:58.282904
  total_passengers: 891
  survival_rate: 0.3838

PASSENGERS
Total passengers in JSON: 891
Fields per passenger: ['passenger_id', 'name', 'age', 'sex', 'survived', 'pclass', 'fare', 'embarked', 'family_size', 'is_alone', 'age_group']

First passenger:
{
  "passenger_id": 1,
  "name": "Braund, Mr. Owen Harris",
  "age": 22.0,
  "sex": "male",
  "survived": 0,
  "pclass": 3,
  "fare": 7.25,
  "embarked": "S",
  "family_size": 2,
  "is_alone": 0,
  "age_group": "Young Adult"
}
