## Setup

In [14]:
import os
import sys
import pandas as pd
import numpy as np
from dotenv import load_dotenv

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error

sys.path.append("src")
from mlops_project.utils.load_from_s3 import S3Loader

In [31]:
load_dotenv()

# Both variables are mandatory. Read them up front and fail with one readable
# error instead of an AttributeError from calling `.replace` on None below.
bucket = os.getenv("S3_BUCKET_NAME")
csv_filename = os.getenv("CSV_FILENAME")
missing_env = [
    var_name
    for var_name, var_value in (("S3_BUCKET_NAME", bucket), ("CSV_FILENAME", csv_filename))
    if not var_value
]
if missing_env:
    raise RuntimeError(
        f"Missing required environment variable(s): {', '.join(missing_env)}. "
        "Set them in the environment or in the .env file read by load_dotenv()."
    )

# Derive the processed file name by replacing `.csv` with `_processed.csv`.
filename = csv_filename.replace(".csv", "_processed.csv")
key = f"datasets/{filename}"

# Experiment configuration.
TASK_TYPE = "classification"  # or "regression"
target = "Survived"
id_column = "PassengerId"
seed = 42

# Project helper constructed with the bucket name.
# NOTE(review): presumably wraps an S3 client scoped to `bucket` — confirm in S3Loader.
s3loader = S3Loader(bucket)

## Dataset Preparation

In [24]:
# Fetch the processed CSV stored under `key` through the project's S3Loader helper.
# NOTE(review): the result is used as a pandas DataFrame below — confirm against S3Loader.
df = s3loader.load_csv_from_s3(key)

📄 Plain CSV detected


In [26]:
# Optionally promote `id_column` to the index. This only happens when the
# column exists, holds unique values, and has no missing entries; otherwise
# the frame is left untouched and the reason is printed.
if id_column and id_column in df.columns:
    id_values = df[id_column]

    if not id_values.is_unique:
        print(f"⚠️ Column '{id_column}' is not unique – skipping index set.")
    elif id_values.isna().any():
        print(f"⚠️ Column '{id_column}' contains missing values – skipping index set.")
    else:
        # Rebind instead of mutating in place: `inplace=True` is discouraged
        # in pandas and makes the cell harder to reason about on re-runs.
        df = df.set_index(id_column)
        print(f"📎 Restored '{id_column}' as index.")

📎 Restored 'PassengerId' as index.


In [27]:
# Preview the first rows to check the index and the feature columns after loading.
df.head()

Unnamed: 0_level_0,Survived,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Ticket_110152,Ticket_110413,Ticket_110465,...,Cabin_F38,Cabin_F4,Cabin_G6,Cabin_T,Embarked_C,Embarked_Q,Embarked_S,Pclass_1,Pclass_2,Pclass_3
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,-0.565736,0.432793,-0.473674,-0.502445,False,True,False,False,False,...,False,False,False,False,False,False,True,False,False,True
2,1,0.663861,0.432793,-0.473674,0.786845,True,False,False,False,False,...,False,False,False,False,True,False,False,True,False,False
3,1,-0.258337,-0.474545,-0.473674,-0.488854,True,False,False,False,False,...,False,False,False,False,False,False,True,False,False,True
4,1,0.433312,0.432793,-0.473674,0.42073,True,False,False,False,False,...,False,False,False,False,False,False,True,True,False,False
5,0,0.433312,-0.474545,-0.473674,-0.486337,False,True,False,False,False,...,False,False,False,False,False,False,True,False,False,True


In [28]:
# Separate the label column from the remaining feature columns.
y = df[target]
X = df.drop(target, axis="columns")

In [32]:
# Stratify on the label only for classification; regression targets are
# split without stratification.
if TASK_TYPE == "classification":
    stratify = y
else:
    stratify = None

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=stratify,
    random_state=seed,
)

## Model Comparison

In [44]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.dummy import DummyClassifier, DummyRegressor

In [38]:
# Model Initialisation
def _rmse(y_true, y_pred):
    """Return the root mean squared error of `y_pred` against `y_true`.

    Used as the regression hold-out metric so it matches the
    "neg_root_mean_squared_error" scorer used for cross-validated tuning
    (same quantity, opposite sign).
    """
    return float(np.sqrt(mean_squared_error(y_true, y_pred)))


# For each task type define:
#   models      – candidate estimators keyed by display name
#   scoring     – scorer name passed to GridSearchCV
#   metric_func – callable(y_true, y_pred) used for hold-out evaluation
if TASK_TYPE == "classification":
    models = {
        "Baseline Model": DummyClassifier(strategy="most_frequent", random_state=seed),
        "RandomForest": RandomForestClassifier(random_state=seed),
        # random_state makes the shuffling solvers (saga / liblinear) reproducible
        # when they are tried during hyperparameter tuning.
        "LogisticRegression": LogisticRegression(max_iter=1000, random_state=seed),
    }
    scoring = "accuracy"
    metric_func = accuracy_score

else:
    models = {
        "Baseline Model": DummyRegressor(strategy="mean"),
        "RandomForest": RandomForestRegressor(random_state=seed),
        "LinearRegression": LinearRegression(),
    }
    scoring = "neg_root_mean_squared_error"
    # Report RMSE (not MSE) so hold-out scores are on the same scale as the
    # cross-validation scorer above; lower is still better.
    metric_func = _rmse

In [39]:
# Fit every candidate on the training split and score it on the test split.
# NOTE(review): scores come from the held-out test split, so any model choice
# made from this ranking is optimistic — consider cross-validating on the
# training split instead.
score_by_model = {}
for model_name, estimator in models.items():
    estimator.fit(X_train, y_train)
    score_by_model[model_name] = metric_func(y_test, estimator.predict(X_test))

results = [
    {"Model": model_name, "Score": model_score}
    for model_name, model_score in score_by_model.items()
]

# Best model first: ascending for regression (error metric), descending otherwise.
lower_is_better = TASK_TYPE == "regression"
results_df = pd.DataFrame(results).sort_values(by="Score", ascending=lower_is_better)
results_df

Unnamed: 0,Model,Score
2,LogisticRegression,0.821229
1,RandomForest,0.810056
0,Baseline,0.614525


## Model Optimisation

In [49]:
# The comparison table is sorted best-first, so its top row names the winner.
best_model_name = results_df["Model"].iloc[0]
print(f"🏆 Best model: {best_model_name}")

# Look up the estimator instance that produced that score.
selected_model = models[best_model_name]

🏆 Best model: LogisticRegression


In [58]:
# Tune Hyperparameters from Best Model
param_grid = {
    "C": [0.01, 0.1, 1.0, 10.0, 100.0],
    "solver": ["lbfgs", "liblinear", "saga"],
    "max_iter": [100, 300, 1000]
}


In [60]:
# Run a 3-fold grid search when a search space is defined; otherwise keep
# the already-selected estimator as the "optimized" one.
if not param_grid:
    print(f"ℹ️ No hyperparameters to tune for {best_model_name}")
    optimized_model = selected_model
else:
    print(f"🔧 Running GridSearchCV for {best_model_name}")
    grid = GridSearchCV(
        estimator=selected_model,
        param_grid=param_grid,
        scoring=scoring,
        cv=3,
    )
    grid.fit(X_train, y_train)
    optimized_model = grid.best_estimator_
    print(f"✅ Best params: {grid.best_params_}")

🔧 Running GridSearchCV for LogisticRegression




✅ Best params: {'C': 100.0, 'max_iter': 100, 'solver': 'lbfgs'}


In [61]:
# Refit the untuned estimator on the training split, then predict the test
# split with both the untuned and the tuned (or fallback) estimator.
selected_model.fit(X_train, y_train)
y_pred_base = selected_model.predict(X_test)
y_pred_tuned = optimized_model.predict(X_test)

# Score both prediction sets with the same hold-out metric.
score_base, score_tuned = (
    metric_func(y_test, predictions) for predictions in (y_pred_base, y_pred_tuned)
)

print(f"📊 {best_model_name} score BEFORE tuning: {score_base:.4f}")
print(f"📈 {best_model_name} score AFTER tuning:  {score_tuned:.4f}")

📊 LogisticRegression score BEFORE tuning: 0.8212
📈 LogisticRegression score AFTER tuning:  0.8101


In [70]:
# Select final model based on performance
if TASK_TYPE == "classification":
    keep_tuned = score_tuned > score_base
else:  # regression → lower score is better (e.g. RMSE)
    keep_tuned = score_tuned < score_base

final_model = optimized_model if keep_tuned else selected_model
print(f"✅ Final model selected: {'tuned' if keep_tuned else 'base'} version of {best_model_name}")

✅ Final model selected: base version of LogisticRegression


In [71]:
# Display the chosen estimator so its configuration is visible before it is saved.
final_model

## Save Model -> S3

In [76]:
import pickle
from io import BytesIO
import boto3

In [77]:
# Serialize final model
# Pickle the chosen estimator into an in-memory byte buffer.
model_bytes = pickle.dumps(final_model)
buffer = BytesIO(model_bytes)

# Ensure the read position is at the start of the buffer.
buffer.seek(0)

0

In [78]:
# Define destination key
# The object goes under the `models/` prefix and is named after the winning
# model (lower-cased) with a `_final.pkl` suffix.
model_key = f"models/{best_model_name.lower()}_final.pkl"

In [79]:
# Upload the pickled model bytes to the configured bucket under `model_key`.
s3 = boto3.client("s3")
upload_request = {
    "Bucket": bucket,
    "Key": model_key,
    "Body": buffer.getvalue(),
}
s3.put_object(**upload_request)

{'ResponseMetadata': {'RequestId': 'QHDBB3KF40DA9XTK',
  'HostId': 'fsPnFVAc9mA2tA+6Q/7D2BHc80I4Q5UvWVkRaTGD+eowP6YAK6LqNP+8mbScr4QdjPWbfeVE6gM=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'fsPnFVAc9mA2tA+6Q/7D2BHc80I4Q5UvWVkRaTGD+eowP6YAK6LqNP+8mbScr4QdjPWbfeVE6gM=',
   'x-amz-request-id': 'QHDBB3KF40DA9XTK',
   'date': 'Mon, 21 Apr 2025 11:18:17 GMT',
   'x-amz-version-id': 'WcMO1dY4ywF1vbGBthyTqutLu1pkQX1u',
   'x-amz-server-side-encryption': 'AES256',
   'etag': '"8ad061e4dfae74fc2c4cd294d05749b3"',
   'x-amz-checksum-crc32': 'WJ9kow==',
   'x-amz-checksum-type': 'FULL_OBJECT',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"8ad061e4dfae74fc2c4cd294d05749b3"',
 'ChecksumCRC32': 'WJ9kow==',
 'ChecksumType': 'FULL_OBJECT',
 'ServerSideEncryption': 'AES256',
 'VersionId': 'WcMO1dY4ywF1vbGBthyTqutLu1pkQX1u'}