# 0 Used Sources

- https://www.kaggle.com/code/kaichinihira/titanic-accuracy-0-83492-top-1-3
- https://andrewritchie05.medium.com/a-fresh-approach-to-the-titanic-dataset-1867118cb257
- https://www.kaggle.com/code/cdeotte/titanic-wcg-xgboost-0-84688/notebook#Summary-of-Titanic-models

# 1 Imports, Settings and Methods

In [167]:
import itertools
import warnings
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import GridSearchCV
import re

warnings.filterwarnings('ignore')

RANDOM_STATE = 58

# 2 Read data and EDA

In [168]:
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

df_train['Survived'] = np.where(df_train['Survived'] == 1, 0, 1)

df_train_len = len(df_train)

df = pd.concat([df_train, df_test], axis=0, ignore_index=True)

# 3 Feature Engineering

## 3.1 Sex

In [169]:
gender_mapping = {
    "Mr": "male",
    "Mrs": "female",
    "Miss": "female",
    "Mme": "female",
    "Ms": "female",
    "Lady": "female",
    "Mlle": "female",
    # "Dona": "female",
    "Col": "male",
    "Capt": "male",
    "Jonkheer": "male",
    "Master": "male",
    "Don": "male",
    "Rev": "male",
    "Dr": "male",
    "Sir": "male",
    "Major": "male",
}


def get_gender(title):
    if title in gender_mapping:
        return gender_mapping[title]
    else:
        return "male"


def extract_title_regex(name):
    pattern = r",\s*([A-Za-z]+)\.\s*"
    match = re.search(pattern, name)
    if match:
        return match.group(1)
    else:
        return "Missing"  # Handle cases with no title


def transform_sex(X):
    X["Title"] = X["Name"].apply(extract_title_regex)
    # print(X["Title"].value_counts())
    X["Sex"] = X["Title"].apply(get_gender)
    X.drop(["Title"], axis=1, inplace=True)
    return X

In [170]:
df = transform_sex(df)
df_train = transform_sex(df_train)
df_test = transform_sex(df_test)

## 3.2 Title Extraction and Refinement

In [171]:
df["Title"] = df["Name"].map(lambda x: x.split(", ")[1].split(". ")[0])
df["Title"].replace(["Mme", "Ms"], "Mrs", inplace=True)
df["Title"].replace(["Mlle"], "Miss", inplace=True)
df["Title"].replace(
    [
        "Capt",
        "Col",
        "Major",
        "Dr",
        "Rev",
        "Don",
        "Sir",
        "the Countess",
        "Lady",
        "Dona",
        "Jonkheer",
    ],
    "Rare",
    inplace=True,
)

## 3.3 Surname and Family Groups

In [172]:
df_train["Surname"] = df_train["Name"].map(lambda name: name.split(",")[0].strip())
df["Surname"] = df["Name"].map(lambda name: name.split(",")[0].strip())
df_train["FamilyGroup"] = df_train["Surname"].map(df["Surname"].value_counts())

In [173]:
Female_Child_Group = df_train.loc[
    (df_train["FamilyGroup"] >= 2)
    & ((df_train["Age"] <= 16) | (df_train["Sex"] == "female"))
]
Female_Child_Group = Female_Child_Group.groupby("Surname")["Survived"].mean()
Male_Adult_Group = df_train.loc[
    (df_train["FamilyGroup"] >= 2)
    & (df_train["Age"] > 16)
    & (df_train["Sex"] == "male")
]
Male_Adult_List = Male_Adult_Group.groupby("Surname")["Survived"].mean()

## 3.4 Dead and Survivor Lists

In [174]:
Dead_list = set(
    Female_Child_Group[Female_Child_Group.apply(lambda x: x == 1)].index
) | set(Male_Adult_List[Male_Adult_List.apply(lambda x: x == 1)].index)
Survived_list = set(
    Female_Child_Group[Female_Child_Group.apply(lambda x: x == 0)].index
) | set(Male_Adult_List[Male_Adult_List.apply(lambda x: x == 0)].index)

df["Dead_list"] = 0
df["Survived_list"] = 0

df.loc[df["Surname"].isin(Dead_list), "Dead_list"] = 1
df.loc[df["Surname"].isin(Survived_list), "Survived_list"] = 1

In [175]:
df["Last_name"] = df["Name"].apply(lambda x: x.split(",")[0])

## 3.5 Family Survival

In [176]:
df["Family_survival"] = 0.5
for grp, grp_df in df.groupby(["Last_name", "Fare"]):

    if len(grp_df) != 1:
        for index, row in grp_df.iterrows():
            smax = grp_df.drop(index)["Survived"].max()
            smin = grp_df.drop(index)["Survived"].min()
            passID = row["PassengerId"]

            if smax == 1.0:
                df.loc[df["PassengerId"] == passID, "Family_survival"] = 1
            elif smin == 0.0:
                df.loc[df["PassengerId"] == passID, "Family_survival"] = 0
for grp, grp_df in df.groupby("Ticket"):
    if len(grp_df) != 1:
        for ind, row in grp_df.iterrows():
            if (row["Family_survival"] == 0) | (row["Family_survival"] == 0.5):
                smax = grp_df.drop(ind)["Survived"].max()
                smin = grp_df.drop(ind)["Survived"].min()
                passID = row["PassengerId"]
                if smax == 1.0:
                    df.loc[df["PassengerId"] == passID, "Family_survival"] = 1
                elif smin == 0.0:
                    df.loc[df["PassengerId"] == passID, "Family_survival"] = 0

## 3.6 Ticket

In [177]:
num_ticket = df[df["Ticket"].str.match("[0-9]+")].copy()
num_ticket_index = num_ticket.index.values.tolist()
num_alpha_ticket = df.drop(num_ticket_index).copy()

num_ticket["Ticket"] = num_ticket["Ticket"].apply(lambda x: int(x))
num_ticket["Ticket_cat"] = 0
num_ticket.loc[
    (num_ticket["Ticket"] >= 100000) & (num_ticket["Ticket"] < 200000), "Ticket_cat"
] = 1
num_ticket.loc[
    (num_ticket["Ticket"] >= 200000) & (num_ticket["Ticket"] < 300000), "Ticket_cat"
] = 2
num_ticket.loc[(num_ticket["Ticket"] >= 300000), "Ticket_cat"] = 3

num_alpha_ticket["Ticket_cat"] = 4
num_alpha_ticket.loc[num_alpha_ticket["Ticket"].str.match("A.+"), "Ticket_cat"] = 5
num_alpha_ticket.loc[num_alpha_ticket["Ticket"].str.match("C.+"), "Ticket_cat"] = 6
num_alpha_ticket.loc[
    num_alpha_ticket["Ticket"].str.match("C\.*A\.*.+"), "Ticket_cat"
] = 7
num_alpha_ticket.loc[num_alpha_ticket["Ticket"].str.match("F\.C.+"), "Ticket_cat"] = 8
num_alpha_ticket.loc[num_alpha_ticket["Ticket"].str.match("PC.+"), "Ticket_cat"] = 9
num_alpha_ticket.loc[num_alpha_ticket["Ticket"].str.match("S\.+.+"), "Ticket_cat"] = 10
num_alpha_ticket.loc[num_alpha_ticket["Ticket"].str.match("SC.+"), "Ticket_cat"] = 11
num_alpha_ticket.loc[num_alpha_ticket["Ticket"].str.match("SOTON.+"), "Ticket_cat"] = 12
num_alpha_ticket.loc[num_alpha_ticket["Ticket"].str.match("STON.+"), "Ticket_cat"] = 13
num_alpha_ticket.loc[
    num_alpha_ticket["Ticket"].str.match("W\.*/C.+"), "Ticket_cat"
] = 14

df = pd.concat([num_ticket, num_alpha_ticket]).sort_values("PassengerId")

## 3.7 Family

In [178]:
df["Family"] = df["SibSp"] + df["Parch"] + 1
df["Alone"] = df["Family"].apply(lambda x: 1 if x == 1 else 0)
df["Family_small"] = df["Family"].apply(lambda x: 1 if (2 <= x and x < 5) else 0)
df["Family_middle"] = df["Family"].apply(lambda x: 1 if (5 <= x < 8) else 0)
df["Family_big"] = df["Family"].apply(lambda x: 1 if (8 <= x) else 0)

## 3.8 Fare

In [179]:
df["Fare"] = df["Fare"].fillna(df.query('Pclass==3 & Embarked=="S"')["Fare"].median())

filter_condition = (df["Pclass"] == 1) & (df["Fare"] == 0)
filtered_data = df[filter_condition]
pclass1_median_fare = df[df["Pclass"] == 1]["Fare"].median()
df.loc[filter_condition, "Fare"] = pclass1_median_fare
filter_condition = (df["Pclass"] == 2) & (df["Fare"] == 0)
filtered_data = df[filter_condition]
pclass1_median_fare = df[df["Pclass"] == 2]["Fare"].median()
df.loc[filter_condition, "Fare"] = pclass1_median_fare
filter_condition = (df["Pclass"] == 3) & (df["Fare"] == 0)
filtered_data = df[filter_condition]
pclass1_median_fare = df[df["Pclass"] == 3]["Fare"].median()
df.loc[filter_condition, "Fare"] = pclass1_median_fare
df["Fare_cat"] = pd.qcut(df["Fare"], 4, labels=False)

## 3.9 Cabin

In [180]:
df["Cabin"].fillna("n", inplace=True)
df["Cabin"] = df["Cabin"].str[0]

## 3.10 Embarked

In [181]:
df["Embarked"].fillna("S", inplace=True)

## 3.11 Dummies

In [182]:
df = pd.get_dummies(
    df,
    columns=[
        "Sex",
        "Pclass",
        "Title",
        "Ticket_cat",
        "Fare_cat",
        "Cabin",
        "Embarked",
        "Family_survival",
    ],
)

## 3.12 Predict Ages for NaN Values

In [183]:
df_predict = df[
    [
        "Age",
        "Alone",
        "Family_small",
        "Family_middle",
        "Family_big",
        "Sex_female",
        "Sex_male",
        "Pclass_1",
        "Pclass_2",
        "Pclass_3",
        "Title_Master",
        "Title_Miss",
        "Title_Mr",
        "Title_Mrs",
        "Title_Rare",
        "Ticket_cat_0",
        "Ticket_cat_1",
        "Ticket_cat_2",
        "Ticket_cat_3",
        "Ticket_cat_4",
        "Ticket_cat_5",
        "Ticket_cat_6",
        "Ticket_cat_7",
        "Ticket_cat_8",
        "Ticket_cat_9",
        "Ticket_cat_10",
        "Ticket_cat_11",
        "Ticket_cat_12",
        "Ticket_cat_13",
        "Ticket_cat_14",
        "Fare_cat_0",
        "Fare_cat_1",
        "Fare_cat_2",
        "Fare_cat_3",
        "Cabin_A",
        "Cabin_B",
        "Cabin_C",
        "Cabin_D",
        "Cabin_E",
        "Cabin_F",
        "Cabin_G",
        "Cabin_n",
        "Embarked_C",
        "Embarked_Q",
        "Embarked_S",
    ]
]

In [184]:
known_age = df_predict[df_predict.Age.notnull()].values
unknown_age = df_predict[df_predict.Age.isnull()].values

X = known_age[:, 1:]
y = known_age[:, 0]

rfr = RandomForestRegressor(random_state=RANDOM_STATE, n_estimators=100, n_jobs=-1)
rfr.fit(X, y)
predictedAges = rfr.predict(unknown_age[:, 1::])

df.loc[df.Age.isnull(), "Age"] = predictedAges
df["Age_cat"] = pd.qcut(df["Age"], q=5, labels=False, precision=1)
df = pd.get_dummies(df, columns=["Age_cat"])

## 3.13 Ethnicity

In [185]:
# Create a new df with the split string and then remove the comma from last name
name_df = df['Name'].str.split(" ", n = -1, expand = True)
name_df[0] = name_df[0].replace(',','', regex=True)
  
# Add the first and last name columns to the train set
df['lname']= name_df[0]
df['fname']= name_df[2]

df.head()

Unnamed: 0,PassengerId,Survived,Name,Age,SibSp,Parch,Ticket,Fare,Surname,Dead_list,...,Family_survival_0.0,Family_survival_0.5,Family_survival_1.0,Age_cat_0,Age_cat_1,Age_cat_2,Age_cat_3,Age_cat_4,lname,fname
0,1,1.0,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,Braund,1,...,False,True,False,False,True,False,False,False,Braund,Owen
1,2,0.0,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,Cumings,0,...,False,True,False,False,False,False,True,False,Cumings,John
2,3,0.0,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,Heikkinen,0,...,False,True,False,False,False,True,False,False,Heikkinen,Laina
3,4,0.0,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,Futrelle,1,...,False,False,True,False,False,False,True,False,Futrelle,Jacques
4,5,1.0,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,Allen,1,...,False,True,False,False,False,False,True,False,Allen,William


In [186]:
from ethnicolr import pred_wiki_name

df_race_pred = df.copy()

df_race_pred = pred_wiki_name(df_race_pred, 'lname', 'fname')
race_dict = {lname + ' ' + fname: race for lname, fname, race in zip(df_race_pred['lname'], df_race_pred['fname'], df_race_pred['race'])}

In [187]:
# Iterate over df and assign race based on lname + ' ' + fname
df['race'] = df['lname'] + ' ' + df['fname']
df['race'] = df['race'].map(race_dict)

# Check for NaN values in the race column
display(df['race'].value_counts())
display(df['race'].isna().sum())

race
GreaterEuropean,British                  775
GreaterEuropean,WestEuropean,Nordic      110
GreaterEuropean,Jewish                    94
GreaterEuropean,WestEuropean,French       72
GreaterEuropean,WestEuropean,Germanic     52
GreaterEuropean,EastEuropean              44
GreaterAfrican,Muslim                     42
GreaterEuropean,WestEuropean,Italian      36
GreaterEuropean,WestEuropean,Hispanic     35
GreaterAfrican,Africans                   18
Asian,IndianSubContinent                  14
Asian,GreaterEastAsian,EastAsian          13
Asian,GreaterEastAsian,Japanese            3
Name: count, dtype: int64

1

In [188]:
train = df.iloc[:df_train_len]
test = df.iloc[df_train_len:]

missing_values = test[~test['race'].isin(train['race'])]['race']
if not missing_values.empty:
    print("Values in test['race'] that don't appear in train['race']:")
    print(missing_values)
else:
    print("All values in test['race'] appear in train['race']")

survival_rate = train.groupby('race')['Survived'].mean()
display(survival_rate)



All values in test['race'] appear in train['race']


race
Asian,GreaterEastAsian,EastAsian         0.400000
Asian,GreaterEastAsian,Japanese          0.666667
Asian,IndianSubContinent                 0.750000
GreaterAfrican,Africans                  0.363636
GreaterAfrican,Muslim                    0.593750
GreaterEuropean,British                  0.574803
GreaterEuropean,EastEuropean             0.843750
GreaterEuropean,Jewish                   0.590164
GreaterEuropean,WestEuropean,French      0.581818
GreaterEuropean,WestEuropean,Germanic    0.687500
GreaterEuropean,WestEuropean,Hispanic    0.695652
GreaterEuropean,WestEuropean,Italian     0.677419
GreaterEuropean,WestEuropean,Nordic      0.800000
Name: Survived, dtype: float64

In [189]:
def map_race_survival(row):
    if row is np.nan:
        return "high"
    if survival_rate[row] <= 0.4:
        return "low"
    elif survival_rate[row] <= 0.6:
        return "medium"
    else:
        return "high"


# df["race"] = df["race"].apply(lambda x: map_race_survival(x))
# df["race"].value_counts()

In [190]:
race_groups = {
    'GreaterEuropean,British': 'Europe',
    'GreaterEuropean,WestEuropean,Nordic': 'Europe',
    'GreaterEuropean,Jewish': 'Europe',
    'GreaterEuropean,WestEuropean,French': 'Europe',
    'GreaterEuropean,WestEuropean,Germanic': 'Europe',
    'GreaterEuropean,EastEuropean': 'Europe',
    'GreaterAfrican,Muslim': 'Africa',
    'GreaterEuropean,WestEuropean,Italian': 'Europe',
    'GreaterEuropean,WestEuropean,Hispanic': 'Europe',
    'GreaterAfrican,Africans': 'Africa',
    'Asian,IndianSubContinent': 'Asia',
    'Asian,GreaterEastAsian,EastAsian': 'Asia',
    'Asian,GreaterEastAsian,Japanese': 'Asia',
}

# Apply mapping to create a new 'Continent' column 
df['race'] = df['race'].map(race_groups)

In [191]:
Ethnicity_columns = []

for ethnicity in df['race'].unique():
    if ethnicity is not np.nan:
        Ethnicity_columns.append(f"race_{ethnicity}")

Ethnicity_columns

['race_Europe', 'race_Africa', 'race_Asia']

In [192]:
df = pd.get_dummies(
    df,
    columns=[
        "race",
    ],
)

In [193]:
df.drop(["Name", "lname", "fname", "Last_name", "Ticket", "Cabin_T"], axis=1, inplace=True)

## 3.14 Create combination columns

In [194]:
Sex_columns = ["Sex_male", "Sex_female"]
Pclass_columns = ["Pclass_1", "Pclass_2", "Pclass_3"]
Age_columns = ["Age_cat_0", "Age_cat_1", "Age_cat_2", "Age_cat_3", "Age_cat_4"]
Family_columns = ["Alone", "Family_small", "Family_middle", "Family_big"]
Survived_columns = ["Survived_list"]
Dead_columns = ["Dead_list"]


def AND(data_frame, columns_1, columns_2):
    new_columns = {}
    combinations = list(itertools.product(columns_1, columns_2))
    for combo in combinations:
        new_column_name = " AND ".join(combo)
        new_columns[new_column_name] = (
            data_frame[combo[0]].fillna(0) & data_frame[combo[1]].fillna(0)
        ).astype(int)
    return pd.DataFrame(new_columns)


df = pd.concat([df, AND(df, Sex_columns, Family_columns)], axis=1)
df = pd.concat([df, AND(df, Pclass_columns, Sex_columns)], axis=1)
df = pd.concat([df, AND(df, Age_columns, Sex_columns)], axis=1)
df = pd.concat([df, AND(df, Ethnicity_columns, Sex_columns)], axis=1)

In [195]:
display(df.columns)

Index(['PassengerId', 'Survived', 'Age', 'SibSp', 'Parch', 'Fare', 'Surname',
       'Dead_list', 'Survived_list', 'Family', 'Alone', 'Family_small',
       'Family_middle', 'Family_big', 'Sex_female', 'Sex_male', 'Pclass_1',
       'Pclass_2', 'Pclass_3', 'Title_Master', 'Title_Miss', 'Title_Mr',
       'Title_Mrs', 'Title_Rare', 'Ticket_cat_0', 'Ticket_cat_1',
       'Ticket_cat_2', 'Ticket_cat_3', 'Ticket_cat_4', 'Ticket_cat_5',
       'Ticket_cat_6', 'Ticket_cat_7', 'Ticket_cat_8', 'Ticket_cat_9',
       'Ticket_cat_10', 'Ticket_cat_11', 'Ticket_cat_12', 'Ticket_cat_13',
       'Ticket_cat_14', 'Fare_cat_0', 'Fare_cat_1', 'Fare_cat_2', 'Fare_cat_3',
       'Cabin_A', 'Cabin_B', 'Cabin_C', 'Cabin_D', 'Cabin_E', 'Cabin_F',
       'Cabin_G', 'Cabin_n', 'Embarked_C', 'Embarked_Q', 'Embarked_S',
       'Family_survival_0.0', 'Family_survival_0.5', 'Family_survival_1.0',
       'Age_cat_0', 'Age_cat_1', 'Age_cat_2', 'Age_cat_3', 'Age_cat_4',
       'race_Africa', 'race_Asia', 'race_Europ

# 4 Data Preparation for Modelling

In [196]:
df_predict = df[
    [
        "Survived",
        "Dead_list",
        "Survived_list",
        "Alone",
        "Family_small",
        "Family_middle",
        "Family_big",
        "Sex_female",
        "Sex_male",
        "Pclass_1",
        "Pclass_2",
        "Pclass_3",
        "Ticket_cat_0",
        "Ticket_cat_1",
        "Ticket_cat_2",
        "Ticket_cat_3",
        "Ticket_cat_4",
        "Ticket_cat_5",
        "Ticket_cat_6",
        "Ticket_cat_7",
        "Ticket_cat_8",
        "Ticket_cat_9",
        "Ticket_cat_10",
        "Ticket_cat_11",
        "Ticket_cat_12",
        "Ticket_cat_13",
        "Ticket_cat_14",
        "Cabin_A",
        "Cabin_B",
        "Cabin_C",
        "Cabin_D",
        "Cabin_E",
        "Cabin_F",
        "Cabin_G",
        "Cabin_n",
        "Family_survival_0.0",
        "Family_survival_0.5",
        "Family_survival_1.0",
        "Age_cat_0",
        "Age_cat_1",
        "Age_cat_2",
        "Age_cat_3",
        "Age_cat_4",
        "Sex_male AND Alone",
        "Sex_male AND Family_small",
        "Sex_male AND Family_middle",
        "Sex_male AND Family_big",
        "Sex_female AND Alone",
        "Sex_female AND Family_small",
        "Sex_female AND Family_middle",
        "Sex_female AND Family_big",
        "Pclass_1 AND Sex_male",
        "Pclass_1 AND Sex_female",
        "Pclass_2 AND Sex_male",
        "Pclass_2 AND Sex_female",
        "Pclass_3 AND Sex_male",
        "Pclass_3 AND Sex_female",
        "Age_cat_0 AND Sex_male",
        "Age_cat_0 AND Sex_female",
        "Age_cat_1 AND Sex_male",
        "Age_cat_1 AND Sex_female",
        "Age_cat_2 AND Sex_male",
        "Age_cat_2 AND Sex_female",
        "Age_cat_3 AND Sex_male",
        "Age_cat_3 AND Sex_female",
        "Age_cat_4 AND Sex_male",
        "Age_cat_4 AND Sex_female",
        "race_Europe AND Sex_male",
        "race_Europe AND Sex_female",
        "race_Africa AND Sex_male",
        "race_Africa AND Sex_female",
        "race_Asia AND Sex_male",
        "race_Asia AND Sex_female",
    ]
]
train = df_predict.iloc[:df_train_len]
test = df_predict.iloc[df_train_len:]
X_train = train.drop(["Survived"], axis=1)
y_train = train["Survived"]
X_test = test.drop(["Survived"], axis=1)

df_test.index = X_test.index

In [197]:
"""def retrain_with_confident_predictions(X_train, y_train, X_test):
    confident_survivors = X_test[(X_test["Survived_list"] == True) & (X_test["Dead_list"] == False)]

    confident_nonsurvivors = X_test[(X_test["Dead_list"] == True) & (X_test["Survived_list"] == False)]

    if not confident_survivors.empty:
        X_test = X_test.drop(confident_survivors.index)
        X_train = pd.concat([X_train, confident_survivors])
        y_train = pd.concat([y_train, pd.Series([1] * len(confident_survivors))])

    if not confident_nonsurvivors.empty:
        X_test = X_test.drop(confident_nonsurvivors.index)
        X_train = pd.concat([X_train, confident_nonsurvivors])
        y_train = pd.concat([y_train, pd.Series([0] * len(confident_nonsurvivors))])

    return X_train, y_train, X_test, confident_survivors, confident_nonsurvivors


X_train, y_train, X_test, confident_survivors, confident_nonsurvivors = retrain_with_confident_predictions(X_train.copy(), y_train.copy(), X_test.copy())"""

'def retrain_with_confident_predictions(X_train, y_train, X_test):\n    confident_survivors = X_test[(X_test["Survived_list"] == True) & (X_test["Dead_list"] == False)]\n\n    confident_nonsurvivors = X_test[(X_test["Dead_list"] == True) & (X_test["Survived_list"] == False)]\n\n    if not confident_survivors.empty:\n        X_test = X_test.drop(confident_survivors.index)\n        X_train = pd.concat([X_train, confident_survivors])\n        y_train = pd.concat([y_train, pd.Series([1] * len(confident_survivors))])\n\n    if not confident_nonsurvivors.empty:\n        X_test = X_test.drop(confident_nonsurvivors.index)\n        X_train = pd.concat([X_train, confident_nonsurvivors])\n        y_train = pd.concat([y_train, pd.Series([0] * len(confident_nonsurvivors))])\n\n    return X_train, y_train, X_test, confident_survivors, confident_nonsurvivors\n\n\nX_train, y_train, X_test, confident_survivors, confident_nonsurvivors = retrain_with_confident_predictions(X_train.copy(), y_train.copy(), 

# 5 Modelling

In [198]:
"""import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Standardize the data (important for PCA)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create the PCA object 
pca = PCA(n_components=0.95)  # Keep 95% of the explained variance

# Fit and transform the training data
X_train_pca = pca.fit_transform(X_train_scaled)

# Transform the test data
X_test_pca = pca.transform(X_test_scaled)

X_train = X_train_pca
X_test = X_test_pca"""

'import pandas as pd\nfrom sklearn.decomposition import PCA\nfrom sklearn.preprocessing import StandardScaler\n\n# Standardize the data (important for PCA)\nscaler = StandardScaler()\nX_train_scaled = scaler.fit_transform(X_train)\nX_test_scaled = scaler.transform(X_test)\n\n# Create the PCA object \npca = PCA(n_components=0.95)  # Keep 95% of the explained variance\n\n# Fit and transform the training data\nX_train_pca = pca.fit_transform(X_train_scaled)\n\n# Transform the test data\nX_test_pca = pca.transform(X_test_scaled)\n\nX_train = X_train_pca\nX_test = X_test_pca'

## 5.1 Random Forest

In [199]:
param_grid = {
    "n_estimators": [75, 80, 85, 90, 95, 100],
    "max_depth": [6, 7, 8, 9],
    "min_samples_leaf": [0.001, 0.01, 0.1],
    "min_samples_split": [0.001, 0.01, 0.1],
    "criterion": ["entropy", "gini"],
}



rfc_gs = GridSearchCV(
    RandomForestClassifier(n_jobs=-1, random_state=RANDOM_STATE),
    param_grid,
    cv=5,
    verbose=1,
)


rfc_gs.fit(X_train, y_train)



print("Best Parameters: {}".format(rfc_gs.best_params_))
print("CV Score: {}".format(round(rfc_gs.best_score_, 3)))


rfc_pred = rfc_gs.predict_proba(X_test)[:, 1]
df_test["Survived"] = np.where(rfc_pred >= 0.42, 0, 1)



Fitting 5 folds for each of 432 candidates, totalling 2160 fits


Best Parameters: {'criterion': 'gini', 'max_depth': 8, 'min_samples_leaf': 0.001, 'min_samples_split': 0.01, 'n_estimators': 85}
CV Score: 0.864


In [200]:
"""# Reconstruct df_test 
df_test["Survived_temp"] = -1  # Placeholder

# Update with confident predictions (Example - adjust based on your logic)
df_test.loc[confident_survivors.index, "Survived_temp"] = 1
df_test.loc[confident_nonsurvivors.index, "Survived_temp"] = 0

# Update with regular predictions
df_test.loc[df_test['Survived_temp'] == -1, 'Survived_temp'] = np.where(rfc_pred >= 0.42, 0, 1)

# Replace column
df_test["Survived"] = df_test["Survived_temp"]
df_test = df_test.drop('Survived_temp', axis=1)  # Drop temporary column"""

'# Reconstruct df_test \ndf_test["Survived_temp"] = -1  # Placeholder\n\n# Update with confident predictions (Example - adjust based on your logic)\ndf_test.loc[confident_survivors.index, "Survived_temp"] = 1\ndf_test.loc[confident_nonsurvivors.index, "Survived_temp"] = 0\n\n# Update with regular predictions\ndf_test.loc[df_test[\'Survived_temp\'] == -1, \'Survived_temp\'] = np.where(rfc_pred >= 0.42, 0, 1)\n\n# Replace column\ndf_test["Survived"] = df_test["Survived_temp"]\ndf_test = df_test.drop(\'Survived_temp\', axis=1)  # Drop temporary column'

# 6 Submission File

In [201]:
submission = df_test[["PassengerId", "Survived"]]
submission.to_csv(
    "submission_test.csv",
    index=False,
)

# 7 Test the labels

In [202]:
Xt = pd.read_csv('test.csv', index_col=0)
titanic_full = pd.read_csv('titanic_full.csv', skipfooter=1, engine='python')
titanic_full.columns = titanic_full.columns.str.capitalize()

# fill up survived column in test data based on name
Xt['Survived'] = np.NaN
for i in range(len(Xt)):
    for j in range(len(titanic_full)):
        if Xt['Name'][892+i].replace('"', '') == titanic_full['Name'][j].replace('"', ''):
            Xt['Survived'][892+i] = titanic_full['Survived'][j]
            
accuracy = (np.where(Xt['Survived'] == 1, 1, 0) == df_test['Survived']).sum() / len(Xt)
accuracy

0.8133971291866029