In [1]:
import pandas as pd

In [2]:
train = pd.read_csv("/content/drive/MyDrive/Data files/Special data files/train.csv")
test = pd.read_csv("/content/drive/MyDrive/Data files/Special data files/test.csv")

In [3]:
x = train.drop(columns = "Survived")
y = train["Survived"]

In [4]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.3,random_state=0)

In [5]:
P_ID_test = test["PassengerId"]

In [6]:
# preprcoessing pipeline
from sklearn.experimental import enable_iterative_imputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import IterativeImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [7]:
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import IterativeImputer

# 1. Feature Engineering

class FeatureEngineer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()

        # FamilySize
        X["FamilySize"] = X["SibSp"] + X["Parch"]

        # Title from Name
        X["Title"] = X["Name"].str.extract("([A-Za-z]+)\.", expand=False)

        # FareBin
        X["FareBin"] = pd.cut(
            X["Fare"],
            bins=[0, 14.45, 50, 100, 350],
            labels=["Very_Low", "Low", "Medium", "High"]
        )

        # Drop unnecessary columns
        X.drop(["Name", "SibSp", "Parch", "Cabin","PassengerId"], axis=1, inplace=True)

        return X


# 2. Column Lists

encode_features = ["Embarked", "Sex", "Title", "FareBin", "Ticket"]
impute_features = ["Age"]


# 3. Pipelines

encoding_pipeline = Pipeline(steps=[
    ("encode", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1))
])

imputing_pipeline = Pipeline(steps=[
    ("impute", IterativeImputer(random_state=42))
])

preprocessor = ColumnTransformer(transformers=[
    ("encode", encoding_pipeline, encode_features),
    ("impute", imputing_pipeline, impute_features)
], remainder="passthrough")  # Pass through other columns like 'Fare', 'FamilySize'


# 4. Full Pipeline (No model)

preprocessing_pipeline = Pipeline(steps=[
    ("feature_engineering", FeatureEngineer()),
    ("preprocessing", preprocessor)
])


In [8]:
x_train_processed = preprocessing_pipeline.fit_transform(x_train)
x_test_processed = preprocessing_pipeline.transform(x_test)
test_preprocessed = preprocessing_pipeline.transform(test)

In [9]:
full_train = preprocessing_pipeline.transform(x)

In [10]:
from xgboost import XGBClassifier

In [11]:
from sklearn.metrics import accuracy_score as accuracy

In [None]:
m1 = XGBClassifier()
m1.fit(x_train_processed,y_train)

print(f"cv accuracy: {accuracy(y_test, m1.predict(x_test_processed)) * 100:.3f}%")
print(f"test accuracy: {accuracy(y_train, m1.predict(x_train_processed)) * 100:.3f}%")  #cv accuracy: 77.239% test accuracy: 99.518%

cv accuracy: 77.239%
test accuracy: 99.518%


In [None]:
m2 = XGBClassifier(n_estimators=320, max_depth=3, subsample=0.76,colsample_bytree=0.7,learning_rate=0.0183, reg_alpha = 3)
m2.fit(x_train_processed, y_train)

print(f"cv accuracy: {accuracy(y_test, m2.predict(x_test_processed)) * 100:.3f}%")
print(f"train accuracy: {accuracy(y_train, m2.predict(x_train_processed)) * 100:.3f}%")

# cv accuracy: 81.343%
# train accuracy: 84.751%
# this is using the hyperparameters of my best catboost model, now i'll predict on leaderboard


cv accuracy: 81.343%
test accuracy: 84.751%


In [None]:
predictions = (m2.fit(full_train,y)).predict(test_preprocessed)
rm2 = pd.DataFrame({"PassengerId":P_ID_test, "Survived":predictions})
rm2.to_csv("xgboost tuned on best catboost hyperparameters.csv",header=True,index=False)
accuracy(y,(m2.fit(full_train,y)).predict(full_train)) #79.425% accuracy on leaderbaord! 85.297% train

0.8529741863075196

In [None]:
data = pd.concat([x,test],ignore_index=True, axis=0)

In [None]:
preprocessed_data = preprocessing_pipeline.fit_transform(data)

In [None]:
tr = preprocessed_data[:891]
te = preprocessed_data[891:]


In [None]:
# now using same xgb model but on data leaked dataset
predictions = (m2.fit(tr,y)).predict(te)
rm2_3 = pd.DataFrame({"PassengerId":P_ID_test, "Survived":predictions})
rm2_3.to_csv("data leaked xgboos tuned on best catboost hyperparameters.csv",header=True,index=False)
accuracy(y,(m2.fit(tr,y)).predict(tr))  #77.990 on leaderbaord, 85.297 train

0.8529741863075196

In [None]:
m3 = XGBClassifier(n_estimators=300, max_depth=3, subsample=0.76, colsample_bytree=0.7,learning_rate=0.018, reg_alpha = 4)

predictions = (m3.fit(full_train,y)).predict(test_preprocessed)
rm3 = pd.DataFrame({"PassengerId":P_ID_test, "Survived":predictions})
rm3.to_csv("tuned xgboost.csv",header=True,index=False)
accuracy(y,(m3.fit(full_train,y)).predict(full_train)) #84.736 on train, 79.186 leaderboard

0.8473625140291807

In [None]:
m3 = XGBClassifier(n_estimators=320, max_depth=3, subsample=0.8, colsample_bytree=0.8,learning_rate=0.0181, reg_alpha = 4)

predictions = (m3.fit(full_train,y)).predict(test_preprocessed)
rm3 = pd.DataFrame({"PassengerId":P_ID_test, "Survived":predictions})
rm3.to_csv("tuned xgboost1.csv",header=True,index=False)
accuracy(y,(m3.fit(full_train,y)).predict(full_train)) #84.84 on train, 78.468 leaderboard

0.8484848484848485

In [None]:
m3 = XGBClassifier(n_estimators=240, max_depth=4, subsample=0.8, colsample_bytree=0.8,learning_rate=0.0199, reg_alpha = 5)

predictions = (m3.fit(full_train,y)).predict(test_preprocessed)
rm3 = pd.DataFrame({"PassengerId":P_ID_test, "Survived":predictions})
rm3.to_csv("tuned xgboost2.csv",header=True,index=False)
accuracy(y,(m3.fit(full_train,y)).predict(full_train))  #85.74 on train, 78.708 leaderboard

0.8574635241301908

In [None]:
full_data = pd.concat([x,test],axis=0,ignore_index=True)
processed_full_data = preprocessing_pipeline.fit_transform(full_data)
leak_train = processed_full_data[:891]
leak_test = processed_full_data[891:]

In [None]:
m = XGBClassifier(n_estimators=320, max_depth=3, subsample=0.76,colsample_bytree=0.7,learning_rate=0.0183, reg_alpha = 3)

predictions = (m.fit(leak_train,y)).predict(leak_test)
mn = pd.DataFrame({"PassengerId":P_ID_test, "Survived":predictions})
mn.to_csv("leaked xgboost.csv",header=True,index=False)
accuracy(y,(m.fit(leak_train,y)).predict(leak_train))  #train: 89.297, 77.990 leaderboard

0.8529741863075196

In [12]:
w1 = XGBClassifier(n_estimators=350, max_depth=3, subsample=0.76,colsample_bytree=0.7,learning_rate=0.0180, reg_alpha = 5)

predictions = (w1.fit(full_train,y)).predict(test_preprocessed)
rm2 = pd.DataFrame({"PassengerId":P_ID_test, "Survived":predictions})
rm2.to_csv("xgboost tuned on best catboost hyperparameters.csv",header=True,index=False)
accuracy(y,(w1.fit(full_train,y)).predict(full_train))  #train: 84.511, 78.708 on leaderboard

0.8451178451178452

In [13]:
w1 = XGBClassifier(n_estimators=350, max_depth=3, subsample=0.76,colsample_bytree=0.7,learning_rate=0.0178, reg_lambda = 5)

predictions = (w1.fit(full_train,y)).predict(test_preprocessed)
rm2 = pd.DataFrame({"PassengerId":P_ID_test, "Survived":predictions})
rm2.to_csv("xgboost tuned on best catboost hyperparameters.csv",header=True,index=False)
accuracy(y,(w1.fit(full_train,y)).predict(full_train)) #train: 85.634, 79.186 on leaderboard

0.856341189674523

In [14]:
w1 = XGBClassifier(n_estimators=500, max_depth=3, subsample=0.76,colsample_bytree=0.7,learning_rate=0.01, reg_lambda = 7.5)

predictions = (w1.fit(full_train,y)).predict(test_preprocessed)
rm2 = pd.DataFrame({"PassengerId":P_ID_test, "Survived":predictions})
rm2.to_csv("xgboost tuned on best catboost hyperparameters.csv",header=True,index=False)
accuracy(y,(w1.fit(full_train,y)).predict(full_train)) #train: 84.96, 79.186 on leaderboard

0.8496071829405163

In [15]:
w1 = XGBClassifier(n_estimators=600, max_depth=4, subsample=1,colsample_bytree=1,learning_rate=0.01, reg_lambda = 10)

predictions = (w1.fit(full_train,y)).predict(test_preprocessed)
rm2 = pd.DataFrame({"PassengerId":P_ID_test, "Survived":predictions})
rm2.to_csv("xgboost tuned on best catboost hyperparameters.csv",header=True,index=False)
accuracy(y,(w1.fit(full_train,y)).predict(full_train)) #train: 87.54, 76.076 on leaderboard

0.8754208754208754

In [16]:
w1 = XGBClassifier(n_estimators=200, max_depth=2, subsample=0.65,colsample_bytree=0.75,learning_rate=0.02, reg_lambda = 4)

predictions = (w1.fit(full_train,y)).predict(test_preprocessed)
rm2 = pd.DataFrame({"PassengerId":P_ID_test, "Survived":predictions})
rm2.to_csv("xgboost tuned on best catboost hyperparameters.csv",header=True,index=False)
accuracy(y,(w1.fit(full_train,y)).predict(full_train)) #train: 82.94, 78.708 on leaderboard

0.8294051627384961

In [17]:
w1 = XGBClassifier(n_estimators=245, max_depth=2, subsample=0.65,colsample_bytree=0.75,learning_rate=0.02, reg_lambda = 4)

predictions = (w1.fit(full_train,y)).predict(test_preprocessed)
rm2 = pd.DataFrame({"PassengerId":P_ID_test, "Survived":predictions})
rm2.to_csv("xgboost tuned on best catboost hyperparameters.csv",header=True,index=False)
accuracy(y,(w1.fit(full_train,y)).predict(full_train)) #train: 82.94, 78.947 on leaderboard

0.8294051627384961

In [18]:
w1 = XGBClassifier(n_estimators=180, max_depth=2, subsample=0.65,colsample_bytree=0.75,learning_rate=0.02, reg_lambda = 4)

predictions = (w1.fit(full_train,y)).predict(test_preprocessed)
rm2 = pd.DataFrame({"PassengerId":P_ID_test, "Survived":predictions})
rm2.to_csv("xgboost tuned on best catboost hyperparameters.csv",header=True,index=False)
accuracy(y,(w1.fit(full_train,y)).predict(full_train)) #train: 82.49, 78.947 on leaderboard

0.8249158249158249

In [21]:
w1 = XGBClassifier(n_estimators=500, max_depth=2, subsample=0.45,colsample_bytree=0.45,learning_rate=0.01, reg_lambda = 6.5)

predictions = (w1.fit(full_train,y)).predict(test_preprocessed)
rm2 = pd.DataFrame({"PassengerId":P_ID_test, "Survived":predictions})
rm2.to_csv("xgboost tuned on best catboost hyperparameters.csv",header=True,index=False)
accuracy(y,(w1.fit(full_train,y)).predict(full_train)) #train: 82.267, 78.708 on leaderboard

0.8226711560044894

In [22]:
w1 = XGBClassifier(n_estimators=180, max_depth=3, subsample=0.65,colsample_bytree=0.75,learning_rate=0.02, reg_lambda = 4)

predictions = (w1.fit(full_train,y)).predict(test_preprocessed)
rm2 = pd.DataFrame({"PassengerId":P_ID_test, "Survived":predictions})
rm2.to_csv("xgboost tuned on best catboost hyperparameters.csv",header=True,index=False)
accuracy(y,(w1.fit(full_train,y)).predict(full_train)) #train: 84.48, 78.947 on leaderboard

0.8484848484848485

In [23]:
w1 = XGBClassifier(n_estimators=300, max_depth=3, subsample=0.65,colsample_bytree=0.75,learning_rate=0.02, reg_lambda = 4)

predictions = (w1.fit(full_train,y)).predict(test_preprocessed)
rm2 = pd.DataFrame({"PassengerId":P_ID_test, "Survived":predictions})
rm2.to_csv("xgboost tuned on best catboost hyperparameters.csv",header=True,index=False)
accuracy(y,(w1.fit(full_train,y)).predict(full_train)) #train: 86.41, 78.947 on leaderboard

0.8641975308641975

In [24]:
w1 = XGBClassifier(n_estimators=1000, max_depth=3, subsample=0.8,colsample_bytree=0.75,learning_rate=0.02, reg_alpha = 6)

predictions = (w1.fit(full_train,y)).predict(test_preprocessed)
rm2 = pd.DataFrame({"PassengerId":P_ID_test, "Survived":predictions})
rm2.to_csv("xgboost tuned on best catboost hyperparameters.csv",header=True,index=False)
accuracy(y,(w1.fit(full_train,y)).predict(full_train)) #train: 85.40, 78.947 on leaderboard

0.8540965207631874