## ANA 500 - Mini Project 1 - Titanic
### Jeremy Krans

In [4]:
# 1 Acquire

import pandas as pd
import numpy as np

# Display: show up to 20 columns and allow 120 characters per printed line
pd.options.display.max_columns = 20
pd.options.display.width = 120

# Input files (relative paths)
TRAIN_PATH = "train.csv"
TEST_PATH  = "test.csv"

# Read both files with the same dtype overrides: the free-text / categorical
# columns are loaded with pandas' "string" dtype
dtype_map = dict.fromkeys(["Ticket", "Cabin", "Name", "Sex", "Embarked"], "string")
train, test = (pd.read_csv(csv_path, dtype=dtype_map) for csv_path in (TRAIN_PATH, TEST_PATH))

# Shapes
print("Train shape:", train.shape)
print("Test  shape:", test.shape, "\n")

# Column names
print("Train columns:", train.columns.tolist(), "\n")
print("Test  columns:", test.columns.tolist(), "\n")

# Schema: dtypes and non-null counts per column
print("Train info():")
train.info()
print("\nTest info():")
test.info()

# PassengerId is checked for duplicates in each file
print("\nPassengerId unique (train)?", train["PassengerId"].is_unique)
print("PassengerId unique (test)?",  test["PassengerId"].is_unique)

# Peek at the first five training rows
print("\nHead of train:")
print(train.head())

# Check missing
def missing_table(df, name="df"):
    """Summarise the columns of ``df`` that contain missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    name : str, default "df"
        Label used to name the index of the result (``"<name>_column"``).

    Returns
    -------
    pandas.DataFrame
        One row per column that has at least one missing value, ordered by
        descending missing count, with columns ``n_missing`` (count) and
        ``pct_missing`` (fraction of rows, rounded to 3 decimals).
    """
    null_counts = df.isna().sum().sort_values(ascending=False)
    summary = null_counts.to_frame("n_missing")
    summary["pct_missing"] = (null_counts / len(df)).round(3)
    summary = summary.rename_axis(f"{name}_column")
    # Keep only the columns that actually have gaps
    has_missing = summary["n_missing"] > 0
    return summary.loc[has_missing]

# Report the columns with missing values, first for train and then for test
for header, frame, label in (
    ("\nMissing values (train):", train, "train"),
    ("\nMissing values (test):", test, "test"),
):
    print(header)
    print(missing_table(frame, label))

# Data dictionary: one row per training column with its dtype, the number of
# non-null values and the number of distinct non-null values, sorted by name
data_dict = (
    pd.DataFrame({
        "column": train.columns,
        "dtype": train.dtypes.to_numpy(),
        "n_nonnull": train.notna().sum().to_numpy(),
        "n_unique": train.nunique(dropna=True).to_numpy(),
    })
    .sort_values("column")
    .reset_index(drop=True)
)

print("\nData dictionary (train):")
print(data_dict)

# Descriptive queries (skipped when the frame has no "Survived" column)
if "Survived" in train.columns:
    # Overall share of survivors
    surv_rate = train["Survived"].mean().round(3)
    print("\nOverall survival rate (train):", surv_rate)

    # Mean survival per Sex, highest rate first
    sex_rates = train.groupby("Sex")["Survived"].mean()
    print("\nSurvival rate by Sex (train):")
    print(sex_rates.round(3).sort_values(ascending=False))

    # Mean survival per Pclass, in class order
    pclass_rates = train.groupby("Pclass")["Survived"].mean()
    print("\nSurvival rate by Pclass (train):")
    print(pclass_rates.round(3))

# Category levels and counts (missing values are counted as their own level)
pclass_counts = train["Pclass"].value_counts(dropna=False)
print("\nPclass counts (train):")
print(pclass_counts.sort_index())

for header, column in (
    ("\nSex counts (train):", "Sex"),
    ("\nEmbarked counts (train):", "Embarked"),
):
    print(header)
    print(train[column].value_counts(dropna=False))


# Export the data dictionary
data_dict.to_csv("titanic_data_dictionary_train.csv", index=False)

# Summary tables keyed by the CSV file each one is written to.
# The survival tables need the "Survived" column, so they are only built when
# it is present (same guard as the descriptive queries above in this cell).
summary_tables = {}
if "Survived" in train.columns:
    summary_tables["survival_by_sex.csv"] = train.groupby("Sex")["Survived"].mean().reset_index()
    summary_tables["survival_by_pclass.csv"] = train.groupby("Pclass")["Survived"].mean().reset_index()
summary_tables["missing_train.csv"] = missing_table(train, "train").reset_index()
summary_tables["missing_test.csv"] = missing_table(test, "test").reset_index()

# Write every summary table to disk; previously the dict was built but never
# saved, so only the data dictionary was exported.
for filename, table in summary_tables.items():
    table.to_csv(filename, index=False)



Train shape: (891, 12)
Test  shape: (418, 11) 

Train columns: ['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'] 

Test  columns: ['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'] 

Train info():
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    string 
 4   Sex          891 non-null    string 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    string 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    string 
 11  Embarked     889 non-null 

In [3]:
# 2 Prepare

# Fill values are computed from the training data only and then reused for the
# test data, so both frames are imputed the same way and no test-set statistics
# enter the preparation.
embarked_fill = train["Embarked"].mode()[0]  # first (most frequent) Embarked value
age_fill = train["Age"].median()
fare_fill = train["Fare"].median()

# Apply the same preparation to both frames. Previously only `train` was
# cleaned, leaving `test` without imputation or the helper columns.
# NOTE(review): the recorded info() output shows no missing Fare in train, so
# the Fare fill presumably matters for test only - confirm against the
# "Missing values (test)" report from the Acquire cell.
for frame in (train, test):
    # Handle missing values
    frame["Embarked"] = frame["Embarked"].fillna(embarked_fill)
    frame["Age"] = frame["Age"].fillna(age_fill)
    frame["Fare"] = frame["Fare"].fillna(fare_fill)

    # Create simple helper columns
    frame["FamilySize"] = frame["SibSp"] + frame["Parch"] + 1  # +1 counts the passenger
    frame["IsAlone"] = (frame["FamilySize"] == 1).astype(int)

    # Invariant: the imputed columns must be complete afterwards
    assert not frame[["Embarked", "Age", "Fare"]].isna().any().any()

# Verify cleaning
print("Missing values after cleaning:")
print(train.isnull().sum())

# Exploration of patterns
print("\nAverage survival rate by Sex:")
print(train.groupby("Sex")["Survived"].mean().round(2))

print("\nAverage survival rate by Pclass:")
print(train.groupby("Pclass")["Survived"].mean().round(2))

print("\nAverage survival rate by IsAlone (1 = alone):")
print(train.groupby("IsAlone")["Survived"].mean().round(2))


Missing values after cleaning:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
FamilySize       0
IsAlone          0
dtype: int64

Average survival rate by Sex:
Sex
female    0.74
male      0.19
Name: Survived, dtype: float64

Average survival rate by Pclass:
Pclass
1    0.63
2    0.47
3    0.24
Name: Survived, dtype: float64

Average survival rate by IsAlone (1 = alone):
IsAlone
0    0.51
1    0.30
Name: Survived, dtype: float64
