## Setup

In [1]:
import os
import sys
import pandas as pd
import numpy as np
from dotenv import load_dotenv

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error

sys.path.append("src")
from mlops_project.utils.load_from_s3 import S3Loader

In [2]:
load_dotenv()

bucket = os.getenv("S3_BUCKET_NAME")
filename = os.getenv("CSV_FILENAME")
key = f"datasets/{filename}_processed.csv"

TASK_TYPE = "regression"  # regression or classification
target = "median_house_value"
id_column = None # None if no index column
seed = 42

s3loader = S3Loader(bucket)

 ## Dataset preparation

In [3]:
df = s3loader.load_csv_from_s3(key)

📄 Plain CSV detected


In [4]:
# Optionally set a column as index if it exists, is unique, and has no missing values
if id_column and id_column in df.columns:
    is_unique = df[id_column].is_unique
    has_no_nan = df[id_column].isna().sum() == 0

    if is_unique and has_no_nan:
        df.set_index(id_column, inplace=True)
        print(f"📎 Restored '{id_column}' as index.")
    elif not is_unique:
        print(f"⚠️ Column '{id_column}' is not unique – skipping index set.")
    elif not has_no_nan:
        print(f"⚠️ Column '{id_column}' contains missing values – skipping index set.")

In [5]:
df.head()

Unnamed: 0.1,Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,0,-1.327835,1.052548,0.982143,-0.804819,-0.972476,-0.974429,-0.977033,2.344766,452600.0,False,False,False,True,False
1,1,-1.322844,1.043185,-0.607019,2.04589,1.357143,0.861439,1.669961,2.332238,358500.0,False,False,False,True,False
2,2,-1.332827,1.038503,1.856182,-0.535746,-0.827024,-0.820777,-0.843637,1.782699,352100.0,False,False,False,True,False
3,3,-1.337818,1.038503,1.856182,-0.624215,-0.719723,-0.766028,-0.733781,0.932968,341300.0,False,False,False,True,False
4,4,-1.337818,1.038503,1.856182,-0.462404,-0.612423,-0.759847,-0.629157,-0.012881,342200.0,False,False,False,True,False


In [6]:
X = df.drop(columns=[target])
y = df[target]

In [7]:
stratify = y if TASK_TYPE == "classification" else None
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=seed, stratify=stratify
)

## Model Comparison

In [8]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.dummy import DummyClassifier, DummyRegressor

In [9]:
# Model Initialisation
models = {}

if TASK_TYPE == "classification":
    models = {
        "Baseline Model Classifier": DummyClassifier(strategy="most_frequent", random_state=seed),
        "RandomForestClassifier": RandomForestClassifier(random_state=seed),
        "LogisticRegression": LogisticRegression(max_iter=1000)
    }
    scoring = "accuracy"
    metric_func = accuracy_score

else:
    models = {
        "Baseline Model Regressor": DummyRegressor(strategy="mean"),
        "RandomForestRegressor": RandomForestRegressor(random_state=seed),
        "LinearRegression": LinearRegression()
    }
    scoring = "neg_root_mean_squared_error"
    metric_func = mean_squared_error

In [10]:
results = []

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    score = metric_func(y_test, y_pred)
    results.append({"Model": name, "Score": score})

results_df = pd.DataFrame(results).sort_values(by="Score", ascending=(TASK_TYPE=="regression"))
results_df

Unnamed: 0,Model,Score
1,RandomForestRegressor,2207782000.0
2,LinearRegression,4898860000.0
0,Baseline Model Regressor,13106960000.0


## Model Optimisation

In [11]:
best_model_name = results_df.iloc[0]["Model"]
print(f"🏆 Best model: {best_model_name}")
selected_model = models[best_model_name]

🏆 Best model: RandomForestRegressor


In [12]:
# Tune Hyperparameters from Best Model
param_grid = {
    "n_estimators": [100, 200, 300],         # Number of trees in the forest
    "max_depth": [None, 10, 20],        # Minimum samples in a leaf node
    "max_features": ["sqrt", "log2"],          # Use bootstrapped samples or not
}


In [13]:
if param_grid:
    print(f"🔧 Running GridSearchCV for {best_model_name}")
    grid = GridSearchCV(selected_model, param_grid, scoring=scoring, cv=3)
    grid.fit(X_train, y_train)
    optimized_model = grid.best_estimator_
    print(f"✅ Best params: {grid.best_params_}")
else:
    print(f"ℹ️ No hyperparameters to tune for {best_model_name}")
    optimized_model = selected_model

🔧 Running GridSearchCV for RandomForestRegressor
✅ Best params: {'max_depth': None, 'max_features': 'sqrt', 'n_estimators': 300}


In [14]:
# Evaluate base model (before tuning)
y_pred_base = selected_model.predict(X_test)
score_base = metric_func(y_test, y_pred_base)

# Evaluate tuned model (after GridSearchCV or fallback)
y_pred_tuned = optimized_model.predict(X_test)
score_tuned = metric_func(y_test, y_pred_tuned)

print(f"📊 {best_model_name} score BEFORE tuning: {score_base:.4f}")
print(f"📈 {best_model_name} score AFTER tuning:  {score_tuned:.4f}")

📊 RandomForestRegressor score BEFORE tuning: 2207781615.1166
📈 RandomForestRegressor score AFTER tuning:  2218724672.3538


In [15]:
# Select final model based on performance
if TASK_TYPE == "classification":
    keep_tuned = score_tuned > score_base
else:  # regression → lower score is better (e.g. RMSE)
    keep_tuned = score_tuned < score_base

final_model = optimized_model if keep_tuned else selected_model
print(f"✅ Final model selected: {'tuned' if keep_tuned else 'base'} version of {best_model_name}")

✅ Final model selected: base version of RandomForestRegressor


In [16]:
final_model

## Save Model -> S3

In [17]:
import pickle
from io import BytesIO
import boto3

In [18]:
# Serialize final model
buffer = BytesIO()
pickle.dump(final_model, buffer)
buffer.seek(0)

0

In [19]:
# Define destination key
model_key = f"models/{best_model_name.lower()}_final.pkl"

In [20]:
# Upload to S3
s3 = boto3.client("s3")
s3.put_object(Bucket=bucket, Key=model_key, Body=buffer.getvalue())

{'ResponseMetadata': {'RequestId': '45SFGNSJ201XX4EE',
  'HostId': 'vPBaD3FlT/oWfUaJaDKgUAi3jV2Bs60TQBjwZTxJikf9xIr9Hz46jrQI7YVGFLNZM4GgzGy4sHDt3Gg8z8S/ucKc+q1fMFRVf5XZmvo3DuI=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'vPBaD3FlT/oWfUaJaDKgUAi3jV2Bs60TQBjwZTxJikf9xIr9Hz46jrQI7YVGFLNZM4GgzGy4sHDt3Gg8z8S/ucKc+q1fMFRVf5XZmvo3DuI=',
   'x-amz-request-id': '45SFGNSJ201XX4EE',
   'date': 'Mon, 21 Apr 2025 13:22:36 GMT',
   'x-amz-version-id': '7RnnpOpd64uXuIhfofxtWtIeOP4nwyJJ',
   'x-amz-server-side-encryption': 'AES256',
   'etag': '"f8689668d4c066a25f8c93e20916310f"',
   'x-amz-checksum-crc32': '/BxwYw==',
   'x-amz-checksum-type': 'FULL_OBJECT',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"f8689668d4c066a25f8c93e20916310f"',
 'ChecksumCRC32': '/BxwYw==',
 'ChecksumType': 'FULL_OBJECT',
 'ServerSideEncryption': 'AES256',
 'VersionId': '7RnnpOpd64uXuIhfofxtWtIeOP4nwyJJ'}