In [None]:
# Upload files from the local machine into the Colab runtime; the value
# returned by the upload call is kept in `uploaded`.
from google.colab import files
uploaded = files.upload()

In [None]:
!pip install "mxnet<2.0.0"
!pip install autogluon

In [None]:
# Import check: the message below is only printed if the AutoGluon import succeeds.
from autogluon.tabular import TabularPredictor
print("Successfully imported TabularPredictor!")

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the Titanic data into the training and test frames.
# NOTE(review): both frames are read from the same file, so `test` holds the same
# rows as `train` — confirm whether a separate test CSV was intended here.
train = pd.read_csv("Titanic_Dataset.csv")
test = pd.read_csv("Titanic_Dataset.csv")

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
def add_salutations(dataset):
    """Add a ``Salutations`` column holding the title parsed from ``Name``.

    The title is the text between the first ", " and the next "." of each
    name, e.g. "Braund, Mr. Owen Harris" -> "Mr". The text is taken verbatim,
    so "Rothes, the Countess. of (...)" yields "the Countess".

    The frame is modified in place and the same object is returned.
    """
    # Everything after the surname separator.
    after_surname = dataset["Name"].str.split(", ", expand=True)[1]
    # Keep only what precedes the first period.
    title = after_surname.str.split(".", expand=True)[0]
    dataset["Salutations"] = title
    return dataset

def replace_rare_titles(dataset):
    """Normalise the values of the ``Salutations`` column.

    * "Mlle" and "Ms" become "Miss"; "Mme" becomes "Mrs".
    * Uncommon titles are collapsed into the single category "Rare".

    The title parser in this file keeps the text before the first period
    verbatim, so "Rothes, the Countess. of (...)" arrives here as
    "the Countess". That spelling is listed alongside "Countess" so the
    title is actually mapped to "Rare" instead of slipping through.

    The frame is modified in place and the same object is returned.
    """
    rare_titles = [
        "Lady", "Countess", "the Countess", "Capt", "Col", "Don", "Dr",
        "Major", "Rev", "Sir", "Dona", "Jonkheer",
    ]
    # One mapping for all substitutions; targets never appear as keys, so a
    # single pass is equivalent to applying the replacements one by one.
    title_map = {"Mlle": "Miss", "Ms": "Miss", "Mme": "Mrs"}
    title_map.update(dict.fromkeys(rare_titles, "Rare"))

    dataset["Salutations"] = dataset["Salutations"].replace(title_map)
    return dataset

def calculate_family_size(dataset):
    """Add a ``Family_size`` column: ``SibSp`` + ``Parch`` + 1.

    The row-wise sum is used for the two columns and one is added to it.
    The frame is modified in place and the same object is returned.
    """
    relatives_aboard = dataset[["SibSp", "Parch"]].sum(axis=1)
    dataset["Family_size"] = relatives_aboard + 1
    return dataset

def apply_feature_engineering(dataset):
    """Run every feature-engineering step on ``dataset`` and return the result.

    Steps, in order: parse salutations, normalise/collapse titles, compute
    family size. Each step receives the output of the previous one.
    """
    steps = (add_salutations, replace_rare_titles, calculate_family_size)
    for step in steps:
        dataset = step(dataset)
    return dataset

# Run the same feature engineering on both frames so they end up with the same
# derived columns (Salutations, Family_size).
train = apply_feature_engineering(train)
test = apply_feature_engineering(test)

In [None]:
# Baseline run: predict "Survived", scored by accuracy, using the
# "best_quality" preset with a time limit of 600.
predictor = TabularPredictor(label="Survived", eval_metric="accuracy")
predictor = predictor.fit(train, presets="best_quality", time_limit=600)

In [None]:
# AutoGluon model keys: 'GBM' = LightGBM, 'CAT' = CatBoost, 'XGB' = XGBoost
hyperparameters = dict(
    GBM=dict(num_boost_round=150, learning_rate=0.05),  # LightGBM
    CAT=dict(iterations=500, learning_rate=0.03),  # CatBoost
    XGB=dict(n_estimators=200, max_depth=4),  # XGBoost
)

# Fit with the custom hyperparameters defined above
predictor_tuned = TabularPredictor(label="Survived", eval_metric="accuracy")
predictor_tuned = predictor_tuned.fit(
    train_data=train,
    hyperparameters=hyperparameters,
    time_limit=600,
)

# Summary of the models fitted with those hyperparameters
predictor_tuned.fit_summary()

In [None]:
# Train the model with ensembling (bagging + stacking).
# eval_metric is stated explicitly so this predictor is scored the same way as
# every other predictor in this notebook rather than relying on the default.
predictor_ensemble = TabularPredictor(label="Survived", eval_metric="accuracy").fit(
    train_data=train,
    hyperparameters=hyperparameters,
    num_bag_folds=5,
    num_stack_levels=2,
    time_limit=1800
)

# Last expression of the cell: show the leaderboard table as the cell output
predictor_ensemble.leaderboard(silent=True)

In [None]:
# Take the first three model names listed in the ensemble leaderboard
leaderboard = predictor_ensemble.leaderboard(silent=True)
top_models = leaderboard["model"].head(3).tolist()

# Call refit_full once per selected model
for model in top_models:
    predictor_ensemble.refit_full(model=model)

In [None]:
# Predict on the test frame with the ensemble predictor
predictions = predictor_ensemble.predict(test)

# Take the ids from the frame that was actually scored instead of re-reading
# the CSV: this avoids a redundant file read and guarantees ids and predictions
# come from the same rows even if `test` is later loaded from a different file.
submission = pd.DataFrame({
    "PassengerId": test["PassengerId"],
    "Survived": predictions
})

submission.to_csv("submission.csv", index=False)

In [None]:
!pip install optuna

In [None]:
!pip install -q tqdm

In [None]:
import optuna
from sklearn.model_selection import train_test_split
from autogluon.tabular import TabularPredictor
from optuna.visualization import (
    plot_optimization_history,
    plot_param_importances,
    plot_parallel_coordinate,
    plot_slice
)

from tqdm.notebook import tqdm

#Progress bar callback class

class TQDMCallback:
    """Callback that advances a tqdm progress bar by one step per call.

    An instance is passed in the ``callbacks`` list of ``study.optimize``.
    """

    def __init__(self, n_trials):
        """Create a bar labelled "Optuna Trials" whose total is ``n_trials``."""
        self.pbar = tqdm(desc="Optuna Trials", total=n_trials)

    def __call__(self, study, trial):
        """Advance the bar by one; ``study`` and ``trial`` are not used."""
        self.pbar.update(n=1)


def objective(trial):
    """Optuna objective: fit a bagged/stacked ensemble and score it on a hold-out.

    Parameters
    ----------
    trial
        Optuna trial used to sample the GBM / CAT / XGB hyperparameters.

    Returns
    -------
    The 'accuracy' entry of ``predictor.evaluate`` on the validation split.
    """
    # 80/20 split of the module-level `train` frame with a fixed seed
    fit_frame, holdout_frame = train_test_split(train, test_size=0.2, random_state=42)

    # Sample the search space (parameter names and sampling order are fixed)
    gbm_rounds = trial.suggest_int("gbm_num_boost_round", 100, 300)
    gbm_lr = trial.suggest_float("gbm_learning_rate", 0.01, 0.1, log=True)
    cat_iterations = trial.suggest_int("cat_iterations", 100, 700)
    cat_lr = trial.suggest_float("cat_learning_rate", 0.01, 0.1, log=True)
    xgb_estimators = trial.suggest_int("xgb_n_estimators", 100, 300)
    xgb_depth = trial.suggest_int("xgb_max_depth", 3, 10)

    sampled = {
        "GBM": {"num_boost_round": gbm_rounds, "learning_rate": gbm_lr},
        "CAT": {"iterations": cat_iterations, "learning_rate": cat_lr},
        "XGB": {"n_estimators": xgb_estimators, "max_depth": xgb_depth},
    }

    # Fit the ensemble (bagging + stacking) on the fitting split only
    trial_predictor = TabularPredictor(label="Survived", eval_metric="accuracy")
    trial_predictor = trial_predictor.fit(
        train_data=fit_frame,
        hyperparameters=sampled,
        num_bag_folds=5,
        num_stack_levels=2,
        time_limit=1800,
        verbosity=0,
    )

    # Score on the held-out rows and hand the accuracy back to Optuna
    scores = trial_predictor.evaluate(holdout_frame, silent=True)
    return scores['accuracy']

n_trials = 10

# Create or load the Optuna study with SQLite storage, so the trials are written
# to titanic_opt.db and a later run can continue from them.
study = optuna.create_study(
    direction="maximize",
    study_name="titanic_opt",
    storage="sqlite:///titanic_opt.db",
    load_if_exists=True
)

# Run the search ONCE, on the SQLite-backed study, with the progress bar attached.
# The study must not be re-created here: doing so swaps in an in-memory study and
# leaves titanic_opt.db without any trials. A second optimize() call would also
# double the number of trials beyond n_trials.
progress_callback = TQDMCallback(n_trials)
try:
    study.optimize(objective, n_trials=n_trials, callbacks=[progress_callback])
finally:
    # Close the bar even if the search is interrupted
    progress_callback.pbar.close()

print("Best trial:")
print(study.best_trial.params)

# Plot visualizations. Each figure object is kept in its variable (.show()
# returns None, so it is called separately).
fig1 = plot_optimization_history(study)
fig1.show()

fig2 = plot_param_importances(study)
fig2.show()

fig3 = plot_parallel_coordinate(study)
fig3.show()

fig4 = plot_slice(study)
fig4.show()


In [None]:
#final model training

from autogluon.tabular import TabularPredictor
import pandas as pd

# Hyperparameters fixed to the best values from the Optuna study
final_hyperparameters = dict(
    GBM=dict(num_boost_round=296, learning_rate=0.06317323932761781),
    CAT=dict(iterations=656, learning_rate=0.042104502245723874),
    XGB=dict(n_estimators=143, max_depth=9),
)

# Retrain on the full training frame with bagging and stacking
predictor = TabularPredictor(label="Survived", eval_metric="accuracy")
predictor = predictor.fit(
    train_data=train,  # full training set
    hyperparameters=final_hyperparameters,
    num_bag_folds=5,
    num_stack_levels=2,
    verbosity=2,
)

# Predict on the test frame
predictions = predictor.predict(test)

# Submission frame: passenger ids next to the predicted labels
# (assumes test has a 'PassengerId' column)
submission_columns = {"PassengerId": test["PassengerId"], "Survived": predictions}
submission = pd.DataFrame(submission_columns)
del submission_columns

# Write the CSV and confirm
submission.to_csv("submission.csv", index=False)
print("submission.csv file has been created!")


In [None]:
# Best parameters from Optuna, kept for reference only. DO NOT RUN THIS CELL.
# The keys are the parameter names sampled in `objective`; the values are the
# same numbers that are hard-coded in `final_hyperparameters`.

best_params = {
    'gbm_num_boost_round': 296,
    'gbm_learning_rate': 0.06317323932761781,
    'cat_iterations': 656,
    'cat_learning_rate': 0.042104502245723874,
    'xgb_n_estimators': 143,
    'xgb_max_depth': 9
}

In [None]:
# Zip the AutogluonModels directory and download the archive through Colab

!zip -r AutogluonModels.zip AutogluonModels
from google.colab import files
files.download("AutogluonModels.zip")

In [None]:
# Mount Google Drive and copy titanic_opt.db (the Optuna SQLite file) into MyDrive

from google.colab import drive
drive.mount('/content/drive')

!cp titanic_opt.db /content/drive/MyDrive/