In [60]:
import pandas as pd
import numpy as np

# Read the labelled training split and the test split from ./data.
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ("./data/train.csv", "./data/test.csv")
)

In [61]:
# Summary statistics of the training split (the Age count is lower than the
# row count, i.e. Age has missing values that need handling later).
train_df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [62]:
# Peek at the first rows to see the raw columns, including the free-text Name.
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [63]:
import re

def _extract_title(name: str) -> str:
    pattern = r", (.*?)\."
    match = re.search(pattern, name)
    if match:
        return match.group(1).strip()
    return "Unknown"

def _normalize_title(title: str) -> str:
    title_mapping = {
        "Mr": "Mr",
        "Mrs": "Mrs",
        "Miss": "Miss",
        "Master": "Master",
        "Dr": "Officer",
        "Rev": "Officer",
        "Col": "Officer",
        "Major": "Officer",
        "Mlle": "Miss",
        "Ms": "Miss",
        "Mme": "Mrs",
        "Capt": "Officer",
        "Sir": "Royalty",
        "Lady": "Royalty",
        "Don": "Royalty",
        "Jonkheer": "Royalty",
        "Dona": "Royalty"
    }
    return title_mapping.get(title, "Other")

# Derive a normalized "Title" feature from the Name column of both splits.
for frame in (train_df, test_df):
    frame["Title"] = frame["Name"].apply(
        lambda full_name: _normalize_title(_extract_title(full_name))
    )

train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Mr
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Mrs
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Mr


In [64]:
# Impute missing ages with the mean age of *training* passengers sharing the
# same title. The statistics come from the training split only and are reused
# for the test split, so no test information feeds the imputation.
train_avg_ages = {}
for title, title_rows in train_df.groupby("Title"):
    avg_age = title_rows["Age"].dropna().mean()
    print(f"Title: {title}, Count: {len(title_rows)}, Avg Age: {avg_age}")
    train_avg_ages[title] = avg_age

# Fallback for titles that never occur in train, or whose training ages are
# all missing (per-title mean is NaN). Computed before any filling so it
# reflects observed ages only.
overall_avg_age = train_df["Age"].mean()

for frame in (train_df, test_df):
    # Unknown titles map to NaN here and are caught by the second fillna.
    title_avg = frame["Title"].map(train_avg_ages)
    # Assign on the frame itself rather than on a groupby sub-frame, which
    # avoids chained-assignment warnings and a KeyError on unseen titles.
    frame["Age"] = frame["Age"].fillna(title_avg).fillna(overall_avg_age)


Title: Master, Count: 40, Avg Age: 4.574166666666667
Title: Miss, Count: 185, Avg Age: 21.845637583892618
Title: Mr, Count: 517, Avg Age: 32.368090452261306
Title: Mrs, Count: 126, Avg Age: 35.788990825688074
Title: Officer, Count: 18, Avg Age: 46.705882352941174
Title: Other, Count: 1, Avg Age: 33.0
Title: Royalty, Count: 4, Avg Age: 43.75


In [65]:
# Sanity check: count the ages still missing after imputation.
print(train_df['Age'].isna().sum()) # should be 0 now
print(test_df['Age'].isna().sum())  # should be 0 now

0
0


In [66]:
# Integer codes for the two categorical columns.
conversion = {"male": 0, "female": 1}
title_conversion = {
    title: code
    for code, title in enumerate(
        ["Mr", "Mrs", "Miss", "Master", "Officer", "Royalty", "Other"]
    )
}

# Apply the same encoding to both splits; values missing from a mapping
# become NaN.
for frame in (train_df, test_df):
    frame["Sex"] = frame["Sex"].map(conversion)
    frame["Title"] = frame["Title"].map(title_conversion)

print(train_df.head(3))
print(test_df.head(3))

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   

                                                Name  Sex   Age  SibSp  Parch  \
0                            Braund, Mr. Owen Harris    0  22.0      1      0   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...    1  38.0      1      0   
2                             Heikkinen, Miss. Laina    1  26.0      0      0   

             Ticket     Fare Cabin Embarked  Title  
0         A/5 21171   7.2500   NaN        S      0  
1          PC 17599  71.2833   C85        C      1  
2  STON/O2. 3101282   7.9250   NaN        S      2  
   PassengerId  Pclass                              Name  Sex   Age  SibSp  \
0          892       3                  Kelly, Mr. James    0  34.5      0   
1          893       3  Wilkes, Mrs. James (Ellen Needs)    1  47.0      1   
2          894       2         Myles, Mr. Thomas Francis    0  62.0      0   

   Pa

In [67]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Create train and valid sets

# one hot encode the categorical features
def get_wanted_cols(df):
    """List the feature columns to feed the model.

    Args:
        df: DataFrame whose column names are inspected.

    Returns:
        A new list starting with "Age", "Sex", "Fare", followed by every
        column of ``df`` whose name begins with "Pclass_" or "Title_",
        in column order.
    """
    dummy_cols = [col for col in df.columns if col.startswith(("Pclass_", "Title_"))]
    return ["Age", "Sex", "Fare"] + dummy_cols

def convert_bool_to_int(df):
    """Cast every bool-dtype column of ``df`` to int.

    The frame is modified in place; other columns are left untouched.

    Args:
        df: DataFrame to convert.

    Returns:
        The same ``df`` object, for convenience.
    """
    for col_name, col_dtype in df.dtypes.items():
        if col_dtype == bool:
            df[col_name] = df[col_name].astype(int)
    return df

# Index both splits by PassengerId so rows can be traced back to their split
# after they are stacked for joint one-hot encoding.
train_df = train_df.set_index("PassengerId")
test_df = test_df.set_index("PassengerId")

# Encode train and test together so both end up with the same dummy columns.
combined_df = pd.concat([train_df, test_df], sort=False)
combined_df = pd.get_dummies(combined_df, columns=["Pclass", "Title"], drop_first=True)

feature_cols = get_wanted_cols(combined_df)


def _select_features(passenger_ids):
    """Build the model input matrix for the passengers in ``passenger_ids``.

    Train and test must go through exactly the same transformation: the
    model learns split thresholds on log1p(Fare), so feeding it raw fares at
    prediction time would send rows down the wrong branches.

    Args:
        passenger_ids: Index of PassengerId values selecting rows of
            ``combined_df``.

    Returns:
        A new DataFrame with the feature columns, Fare replaced by
        log1p(Fare) and bool dummy columns cast to int.
    """
    # .copy() so the assignments below act on an independent frame rather
    # than on a slice of combined_df.
    features = combined_df.loc[combined_df.index.isin(passenger_ids), feature_cols].copy()
    features["Fare"] = np.log1p(features["Fare"])
    return convert_bool_to_int(features)


X = _select_features(train_df.index)
y = train_df["Survived"]

print(len(X), len(y))
print(X.head(3))

# NOTE(review): only Age was imputed earlier; if the test split has missing
# Fare values they stay NaN here — confirm the model tolerates that.
X_test = _select_features(test_df.index)

# Strategy 1: simple train-valid split
# X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)
# model1 = RandomForestClassifier(n_estimators=10, random_state=42)
# model1.fit(X_train, y_train)
# accuracy = model1.score(X_valid, y_valid)
# print(f"Validation Accuracy: {accuracy}")

# pred = model1.predict(X_test)
# test_df['Survived'] = pred
# submission = test_df[['PassengerId', 'Survived']]
# submission.to_csv("./data/submission1.csv", index=False)

# Strategy 2: random forest
# Bootstrapped, depth-limited forest; the out-of-bag score is used as the
# validation estimate in place of a held-out split.
rf_params = {
    "n_estimators": 100,
    "random_state": 42,
    "oob_score": True,
    "bootstrap": True,
    "max_depth": 5,
    # "max_leaf_nodes": 20,
}
model2 = RandomForestClassifier(**rf_params).fit(X, y)
print(f"OOB Score: {model2.oob_score_}")

# Predict on the test split and write PassengerId (the index) plus Survived.
pred = model2.predict(X_test)
test_df['Survived'] = pred
submission = test_df['Survived']
submission.to_csv("./data/submission2.csv", index=True)

891 891
              Age  Sex      Fare  Pclass_2  Pclass_3  Title_1  Title_2  \
PassengerId                                                              
1            22.0    0  2.110213         0         1        0        0   
2            38.0    1  4.280593         0         0        1        0   
3            26.0    1  2.188856         0         1        0        1   

             Title_3  Title_4  Title_5  Title_6  
PassengerId                                      
1                  0        0        0        0  
2                  0        0        0        0  
3                  0        0        0        0  
OOB Score: 0.8260381593714927
