In [9]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.impute import KNNImputer

In [10]:
# Load data
# The three CSVs are read from the current working directory; a Kaggle run
# would prefix them with the dataset path "/kaggle/input/titanic".
train_data, test_data, submission_data = (
    pd.read_csv(csv_name)
    for csv_name in ("train.csv", "test.csv", "gender_submission.csv")
)

In [11]:
# Rename columns to snake_case and drop unneeded columns
column_renames = {
    "PassengerId": "passenger_id",
    "Survived": "survived",
    "Pclass": "p_class",
    "Name": "name",
    "Sex": "sex",
    "Age": "age",
    "SibSp": "sib_sp",
    "Parch": "parch",
    "Ticket": "ticket",
    "Fare": "fare",
    "Cabin": "cabin",
    "Embarked": "embarked",
}
unneeded_columns = ["name", "ticket", "cabin", "passenger_id"]

for df in (train_data, test_data):
    # rename() silently skips labels that are absent, so it is already safe to repeat.
    df.rename(columns=column_renames, inplace=True)
    # errors="ignore" keeps this cell re-runnable: the frames are modified in
    # place, so on a second run the columns are already gone and a plain
    # drop() would raise KeyError.
    df.drop(columns=unneeded_columns, errors="ignore", inplace=True)


In [12]:
# Clean data function
def clean_data(train, test, encoder=None, knn_imputer=None):
    """Label-encode, impute and one-hot encode the train and test frames.

    Both frames are copied first, so the frames passed in are not modified.

    Parameters
    ----------
    train : pandas.DataFrame
        Training frame containing the columns "sex", "embarked", "age",
        "fare", "p_class", "sib_sp" and "parch". Other columns (for example
        "survived") are carried through unchanged.
    test : pandas.DataFrame
        Test frame containing the same feature columns.
    encoder : optional
        Object with ``fit_transform`` / ``transform`` used to label-encode
        "sex" and "embarked". Defaults to the notebook-level ``le``.
    knn_imputer : optional
        Object with ``fit_transform`` / ``transform`` used on
        ["age", "fare", "embarked"]. Defaults to the notebook-level
        ``imputer``.

    Returns
    -------
    tuple
        ``(train, test)`` as new frames. Apart from "survived", both frames
        end up with the same set of columns: a dummy column present in only
        one frame is added to the other filled with ``False``.
    """
    # Fall back to the notebook-level objects only when nothing is passed, so
    # existing `clean_data(train_data, test_data)` calls keep working.
    if encoder is None:
        encoder = le
    if knn_imputer is None:
        knn_imputer = imputer

    # Work on copies so re-running the calling cell starts from the same input.
    train, test = train.copy(), test.copy()

    # NOTE(review): casting to str gives missing ports a string label of their
    # own, which appears to be why an extra "embarked_3" dummy shows up for the
    # training data. It also means "embarked" has no NaN left for the imputer
    # to fill -- confirm that this is intended.
    for df in (train, test):
        df["embarked"] = df["embarked"].astype(str)

    # The encoder is refit for each column: fitted on train, reused on test.
    for column in ("sex", "embarked"):
        train[column] = encoder.fit_transform(train[column].values)
        test[column] = encoder.transform(test[column].values)

    # Fit the imputer on the training features only and apply it to both frames.
    imputed_columns = ["age", "fare", "embarked"]
    train[imputed_columns] = knn_imputer.fit_transform(train[imputed_columns])
    test[imputed_columns] = knn_imputer.transform(test[imputed_columns])

    for df in (train, test):
        df["age"] = df["age"].astype(int)  # truncates fractional ages
        df["embarked"] = df["embarked"].astype(int)

    dummy_columns = ["p_class", "sib_sp", "parch", "embarked"]
    train = pd.get_dummies(train, columns=dummy_columns)
    test = pd.get_dummies(test, columns=dummy_columns)

    # The frames are one-hot encoded separately, so a category seen in only one
    # of them yields a column the other lacks. Add each missing column as
    # all-False instead of patching specific names, and never overwrite a
    # column that already exists.
    for column in [c for c in test.columns if c not in train.columns]:
        train[column] = False
    for column in [c for c in train.columns if c not in test.columns]:
        if column != "survived":  # the label exists only in the training frame
            test[column] = False

    return train, test


In [13]:
# Build the preprocessing objects that clean_data() picks up from the notebook
# namespace, then run the cleaning step on both frames.
imputer = KNNImputer(n_neighbors=40)
le = LabelEncoder()
train_df, test_df = clean_data(train=train_data, test=test_data)


In [14]:
# Train-validation split
# Column alignment between the two frames (including 'parch_9') is handled
# inside clean_data(), so the column is not overwritten again here.
X_full = train_df.drop(columns=["survived"])
y_full = train_df["survived"]

# Hold out 20% of the training rows; the fixed seed keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X_full, y_full, test_size=0.2, random_state=42
)


In [15]:
# Initialize and fit the first XGBClassifier
# NOTE: xgb_1, xgb_2 and xgb_3 are configured with identical hyperparameters
# and seed and are fitted on the same data, so they are expected to produce
# the same predictions.
xgb_1 = XGBClassifier(
    learning_rate=0.01,
    n_estimators=5000,  # upper bound on boosting rounds
    early_stopping_rounds=50,  # stop once the eval_set metric stalls for 50 rounds
    max_depth=3,
    min_child_weight=1,
    gamma=0.5,
    colsample_bytree=0.6,
    subsample=0.8,
    reg_alpha=1e-5,
    objective='binary:logistic',
    n_jobs=-1,  # documented scikit-learn-API parameter; replaces the legacy `nthread`
    scale_pos_weight=1,
    random_state=42
)
# The validation split is passed as eval_set so early stopping can monitor it.
xgb_1.fit(X_train, y_train, eval_set=[(X_val, y_val)])


[0]	validation_0-logloss:0.67624
[1]	validation_0-logloss:0.67462
[2]	validation_0-logloss:0.67051
[3]	validation_0-logloss:0.66883
[4]	validation_0-logloss:0.66475
[5]	validation_0-logloss:0.66346
[6]	validation_0-logloss:0.66196
[7]	validation_0-logloss:0.66053
[8]	validation_0-logloss:0.65902
[9]	validation_0-logloss:0.65543
[10]	validation_0-logloss:0.65197
[11]	validation_0-logloss:0.64867
[12]	validation_0-logloss:0.64481
[13]	validation_0-logloss:0.64383
[14]	validation_0-logloss:0.64067
[15]	validation_0-logloss:0.63742
[16]	validation_0-logloss:0.63590
[17]	validation_0-logloss:0.63458
[18]	validation_0-logloss:0.63302
[19]	validation_0-logloss:0.63150
[20]	validation_0-logloss:0.62807
[21]	validation_0-logloss:0.62520
[22]	validation_0-logloss:0.62194
[23]	validation_0-logloss:0.61937
[24]	validation_0-logloss:0.61626
[25]	validation_0-logloss:0.61339
[26]	validation_0-logloss:0.61245
[27]	validation_0-logloss:0.60947
[28]	validation_0-logloss:0.60687
[29]	validation_0-loglos

[169]	validation_0-logloss:0.46509
[170]	validation_0-logloss:0.46492
[171]	validation_0-logloss:0.46426
[172]	validation_0-logloss:0.46398
[173]	validation_0-logloss:0.46329
[174]	validation_0-logloss:0.46311
[175]	validation_0-logloss:0.46268
[176]	validation_0-logloss:0.46217
[177]	validation_0-logloss:0.46173
[178]	validation_0-logloss:0.46124
[179]	validation_0-logloss:0.46122
[180]	validation_0-logloss:0.46067
[181]	validation_0-logloss:0.46010
[182]	validation_0-logloss:0.45983
[183]	validation_0-logloss:0.45920
[184]	validation_0-logloss:0.45868
[185]	validation_0-logloss:0.45820
[186]	validation_0-logloss:0.45820
[187]	validation_0-logloss:0.45751
[188]	validation_0-logloss:0.45685
[189]	validation_0-logloss:0.45627
[190]	validation_0-logloss:0.45617
[191]	validation_0-logloss:0.45600
[192]	validation_0-logloss:0.45596
[193]	validation_0-logloss:0.45565
[194]	validation_0-logloss:0.45525
[195]	validation_0-logloss:0.45457
[196]	validation_0-logloss:0.45406
[197]	validation_0-l

In [16]:
# Initialize and fit the second XGBClassifier
# NOTE: xgb_1, xgb_2 and xgb_3 are configured with identical hyperparameters
# and seed and are fitted on the same data, so they are expected to produce
# the same predictions.
xgb_2 = XGBClassifier(
    learning_rate=0.01,
    n_estimators=5000,  # upper bound on boosting rounds
    early_stopping_rounds=50,  # stop once the eval_set metric stalls for 50 rounds
    max_depth=3,
    min_child_weight=1,
    gamma=0.5,
    colsample_bytree=0.6,
    subsample=0.8,
    reg_alpha=1e-5,
    objective='binary:logistic',
    n_jobs=-1,  # documented scikit-learn-API parameter; replaces the legacy `nthread`
    scale_pos_weight=1,
    random_state=42
)
# The validation split is passed as eval_set so early stopping can monitor it.
xgb_2.fit(X_train, y_train, eval_set=[(X_val, y_val)])


[0]	validation_0-logloss:0.67624
[1]	validation_0-logloss:0.67462
[2]	validation_0-logloss:0.67051
[3]	validation_0-logloss:0.66883
[4]	validation_0-logloss:0.66475
[5]	validation_0-logloss:0.66346
[6]	validation_0-logloss:0.66196
[7]	validation_0-logloss:0.66053
[8]	validation_0-logloss:0.65902
[9]	validation_0-logloss:0.65543
[10]	validation_0-logloss:0.65197
[11]	validation_0-logloss:0.64867
[12]	validation_0-logloss:0.64481
[13]	validation_0-logloss:0.64383
[14]	validation_0-logloss:0.64067
[15]	validation_0-logloss:0.63742
[16]	validation_0-logloss:0.63590
[17]	validation_0-logloss:0.63458
[18]	validation_0-logloss:0.63302
[19]	validation_0-logloss:0.63150
[20]	validation_0-logloss:0.62807
[21]	validation_0-logloss:0.62520
[22]	validation_0-logloss:0.62194
[23]	validation_0-logloss:0.61937
[24]	validation_0-logloss:0.61626
[25]	validation_0-logloss:0.61339
[26]	validation_0-logloss:0.61245
[27]	validation_0-logloss:0.60947
[28]	validation_0-logloss:0.60687
[29]	validation_0-loglos

[47]	validation_0-logloss:0.57459
[48]	validation_0-logloss:0.57238
[49]	validation_0-logloss:0.57156
[50]	validation_0-logloss:0.56946
[51]	validation_0-logloss:0.56744
[52]	validation_0-logloss:0.56673
[53]	validation_0-logloss:0.56445
[54]	validation_0-logloss:0.56260
[55]	validation_0-logloss:0.56192
[56]	validation_0-logloss:0.56143
[57]	validation_0-logloss:0.55916
[58]	validation_0-logloss:0.55734
[59]	validation_0-logloss:0.55659
[60]	validation_0-logloss:0.55588
[61]	validation_0-logloss:0.55509
[62]	validation_0-logloss:0.55416
[63]	validation_0-logloss:0.55215
[64]	validation_0-logloss:0.55151
[65]	validation_0-logloss:0.54968
[66]	validation_0-logloss:0.54791
[67]	validation_0-logloss:0.54718
[68]	validation_0-logloss:0.54626
[69]	validation_0-logloss:0.54578
[70]	validation_0-logloss:0.54519
[71]	validation_0-logloss:0.54326
[72]	validation_0-logloss:0.54162
[73]	validation_0-logloss:0.53971
[74]	validation_0-logloss:0.53799
[75]	validation_0-logloss:0.53636
[76]	validatio

In [17]:
# Initialize and fit the third XGBClassifier
xgb_3 = XGBClassifier(
    learning_rate=0.01,
    n_estimators=5000,
    early_stopping_rounds=50,
    max_depth=3,
    min_child_weight=1,
    gamma=0.5,
    colsample_bytree=0.6,
    subsample=0.8,
    reg_alpha=1e-5,
    objective='binary:logistic',
    nthread=-1,
    scale_pos_weight=1,
    random_state=42
)
xgb_3.fit(X_train, y_train, eval_set=[(X_val, y_val)])


[0]	validation_0-logloss:0.67624
[1]	validation_0-logloss:0.67462
[2]	validation_0-logloss:0.67051
[3]	validation_0-logloss:0.66883
[4]	validation_0-logloss:0.66475
[5]	validation_0-logloss:0.66346
[6]	validation_0-logloss:0.66196
[7]	validation_0-logloss:0.66053
[8]	validation_0-logloss:0.65902
[9]	validation_0-logloss:0.65543
[10]	validation_0-logloss:0.65197
[11]	validation_0-logloss:0.64867
[12]	validation_0-logloss:0.64481
[13]	validation_0-logloss:0.64383
[14]	validation_0-logloss:0.64067
[15]	validation_0-logloss:0.63742
[16]	validation_0-logloss:0.63590
[17]	validation_0-logloss:0.63458
[18]	validation_0-logloss:0.63302
[19]	validation_0-logloss:0.63150
[20]	validation_0-logloss:0.62807
[21]	validation_0-logloss:0.62520
[22]	validation_0-logloss:0.62194
[23]	validation_0-logloss:0.61937
[24]	validation_0-logloss:0.61626
[25]	validation_0-logloss:0.61339


[26]	validation_0-logloss:0.61245
[27]	validation_0-logloss:0.60947
[28]	validation_0-logloss:0.60687
[29]	validation_0-logloss:0.60617
[30]	validation_0-logloss:0.60358
[31]	validation_0-logloss:0.60075
[32]	validation_0-logloss:0.59795
[33]	validation_0-logloss:0.59565
[34]	validation_0-logloss:0.59465
[35]	validation_0-logloss:0.59350
[36]	validation_0-logloss:0.59262
[37]	validation_0-logloss:0.59011
[38]	validation_0-logloss:0.58886
[39]	validation_0-logloss:0.58798
[40]	validation_0-logloss:0.58578
[41]	validation_0-logloss:0.58323
[42]	validation_0-logloss:0.58217
[43]	validation_0-logloss:0.58120
[44]	validation_0-logloss:0.58020
[45]	validation_0-logloss:0.57897
[46]	validation_0-logloss:0.57684
[47]	validation_0-logloss:0.57459
[48]	validation_0-logloss:0.57238
[49]	validation_0-logloss:0.57156
[50]	validation_0-logloss:0.56946
[51]	validation_0-logloss:0.56744
[52]	validation_0-logloss:0.56673
[53]	validation_0-logloss:0.56445
[54]	validation_0-logloss:0.56260
[55]	validatio

In [18]:
# Predict on the test set with each fitted model.
# The test frame is first restricted to, and ordered like, the training
# feature columns so it matches what the models were fitted on.
test_df = test_df.loc[:, X_train.columns]
predictions_1, predictions_2, predictions_3 = (
    fitted_model.predict(test_df) for fitted_model in (xgb_1, xgb_2, xgb_3)
)


In [19]:
# Attach one prediction column per model to the submission frame.
model_predictions = {
    "Survived_1": predictions_1,
    "Survived_2": predictions_2,
    "Survived_3": predictions_3,
}
for column_name, predicted_labels in model_predictions.items():
    submission_data[column_name] = predicted_labels


In [20]:
# Write the submission frame, including the per-model prediction columns,
# to disk without the index column.
submission_data.to_csv(path_or_buf="submission_test.csv", index=False)

In [21]:
# Compare each model's predictions with the "Survived" column of
# gender_submission.csv (accuracy_score comes from the notebook's import cell).
# NOTE: gender_submission.csv is Kaggle's sample submission, which predicts
# survival from sex alone; it is not the true test labels. The scores below
# therefore measure agreement with that baseline, not real test accuracy.
ground_truth = pd.read_csv("gender_submission.csv")["Survived"]

# Generate accuracy scores for each model
accuracy_1 = accuracy_score(ground_truth, predictions_1)
accuracy_2 = accuracy_score(ground_truth, predictions_2)
accuracy_3 = accuracy_score(ground_truth, predictions_3)

# Display the accuracy scores
for model_number, score in enumerate((accuracy_1, accuracy_2, accuracy_3), start=1):
    print(f"Accuracy for Model {model_number}:", score)


Accuracy for Model 1: 0.9138755980861244
Accuracy for Model 2: 0.9138755980861244
Accuracy for Model 3: 0.9138755980861244
