## Setup

In [1]:
import os
import sys
import pandas as pd
import boto3
import numpy as np
from sklearn.preprocessing import StandardScaler
from io import StringIO
from dotenv import load_dotenv

# Make the project package importable when it is not installed in the environment.
# NOTE(review): this entry is relative to the working directory, whereas the config
# file in the next cell is loaded from "../src/..." — confirm whether "../src" was
# intended here.
sys.path.append("src")
from mlops_project.utils.s3_handler import S3Handler
from mlops_project.config.config_loader import load_config


In [4]:
# Load environment variables (from .env, if present) and the project configuration.
load_dotenv()
config = load_config("../src/mlops_project/config/dev.yaml")

# Fail fast on missing settings: os.getenv() silently returns None, which would
# otherwise produce S3 keys such as "datasets/None_raw.csv" further down.
_required_env = ("S3_BUCKET_NAME", "CSV_FILENAME", "CSV_URL")
_missing_env = [name for name in _required_env if not os.getenv(name)]
if _missing_env:
    raise EnvironmentError(
        f"Missing required environment variables: {', '.join(_missing_env)}"
    )

bucket = os.environ["S3_BUCKET_NAME"]
filename = os.environ["CSV_FILENAME"]
csv_url = os.environ["CSV_URL"]

# S3 object keys for the raw and the processed dataset.
csv_raw_key = f"datasets/{filename}_raw.csv"
csv_processed_key = f"datasets/{filename}_processed.csv"

TASK_TYPE = config['type']  # regression or classification
target = config['target']
id_column = config['id_column']  # None if no index column

In [5]:
# Fetch the CSV from `csv_url` into the bucket, then read the raw copy back as a DataFrame.
# NOTE(review): the upload call receives `filename` rather than `csv_raw_key`; the
# recorded output shows it writing to datasets/<filename>_raw.csv, which matches
# `csv_raw_key` — confirm the helper keeps that naming in sync.
s3handler = S3Handler(bucket)
s3handler.upload_csv_from_url_to_s3(csv_url, filename)
df = s3handler.load_csv_from_s3(csv_raw_key)
print(f"✅ Loaded dataset with shape: {df.shape}")

✅ CSV uploaded to s3://athos-mlops0-bucket/datasets/titanic_raw.csv
🌀 GZIP compression detected
✅ Loaded dataset with shape: (891, 12)


## Initial Cleanup

In [6]:
# Report the (rows, columns) shape before any cleanup, as a baseline for the steps below
print(f"🔍 Initial shape: {df.shape}")

🔍 Initial shape: (891, 12)


In [7]:
# Keep the first occurrence of each row and discard later exact copies
df = df[~df.duplicated(keep="first")]
print(f"🧹 After removing duplicates: {df.shape}")

🧹 After removing duplicates: (891, 12)


In [8]:
# Remove columns in which every single value is missing
empty_cols = df.columns[df.isna().all(axis=0)]
df = df.drop(columns=empty_cols)
print(f"🗑️ Dropped empty columns: {list(empty_cols)}")

🗑️ Dropped empty columns: []


In [9]:
# Remove columns carrying at most one distinct non-null value
constant_cols = df.columns[df.nunique() <= 1].tolist()
df = df.drop(columns=constant_cols)
print(f"🧺 Dropped constant columns: {constant_cols}")

🧺 Dropped constant columns: []


In [10]:
# Set the configured ID column as index, but only when it is present, unique and
# free of missing values. Every other outcome is reported instead of being skipped
# silently, so a typo in the configuration does not go unnoticed.
if id_column:
    if id_column in df.columns:
        is_unique = df[id_column].is_unique
        has_no_nan = not df[id_column].isna().any()

        if is_unique and has_no_nan:
            # Rebind instead of mutating in place so the cell has no hidden side effects.
            df = df.set_index(id_column)
            print(f"📎 Set '{id_column}' as index.")
        elif not is_unique:
            print(f"⚠️ ID column '{id_column}' is not unique – not set as index.")
        else:
            print(f"⚠️ ID column '{id_column}' contains missing values – not set as index.")
    elif df.index.name == id_column:
        # Happens when this cell is re-run after the index was already set.
        print(f"ℹ️ '{id_column}' is already the index.")
    else:
        print(f"⚠️ ID column '{id_column}' not found in dataset – index left unchanged.")

📎 Set 'PassengerId' as index.


In [11]:
# Drop identifier-like columns (as many distinct values as rows).
# The target and float-valued columns are never candidates: a continuous measurement
# (or a regression target) is naturally all-distinct, which does not make it an ID.
n_rows = len(df)
unique_cols = [
    col
    for col in df.columns
    if col != target
    and not pd.api.types.is_float_dtype(df[col])
    and df[col].nunique() == n_rows
]
df = df.drop(columns=unique_cols)
print(f"🚮 Dropped totally unique columns: {unique_cols}")

🚮 Dropped totally unique columns: ['Name']


In [12]:
# Report the shape once all cleanup steps above have run
print(f"✅ Cleaned shape: {df.shape}")

✅ Cleaned shape: (891, 10)


## Handling Missing Values (numerical features)

In [13]:
# Column labels of every numeric-dtype column
num_cols = df.select_dtypes("number").columns

In [14]:
# Tally NaNs per numeric column and keep only the columns that have any
missing_num = df[num_cols].isna().sum().loc[lambda counts: counts > 0]

print("🔍 Numerical columns with missing values:")
print(missing_num)

🔍 Numerical columns with missing values:
Age    177
dtype: int64


In [15]:
# Impute every incomplete numeric column with that column's own median
medians = df[missing_num.index].median()
for col, median_value in medians.items():
    df[col] = df[col].fillna(median_value)
    print(f"🧪 Filled NaNs in '{col}' with median: {median_value:.2f}")

🧪 Filled NaNs in 'Age' with median: 28.00


## Handling Missing Values (categorical features)

In [16]:
# Column labels of every object- or category-dtype column
cat_cols = df.select_dtypes(include=("object", "category")).columns

In [17]:
# Tally NaNs per categorical column and keep only the columns that have any
missing_cat = df[cat_cols].isna().sum().loc[lambda counts: counts > 0]

print("🔍 Categorical columns with missing values:")
print(missing_cat)

🔍 Categorical columns with missing values:
Cabin       687
Embarked      2
dtype: int64


In [18]:
# Impute every incomplete categorical column with its most frequent value.
# NOTE(review): the mode is used regardless of how much of the column is missing;
# in the recorded run 'Cabin' had 687 of 891 values missing — consider a dedicated
# "missing" category for such columns instead.
fill_values = {col: df[col].mode().iloc[0] for col in missing_cat.index}
for col, most_common in fill_values.items():
    df[col] = df[col].fillna(most_common)
    print(f"📌 Filled NaNs in '{col}' with most frequent value: '{most_common}'")

📌 Filled NaNs in 'Cabin' with most frequent value: 'B96 B98'
📌 Filled NaNs in 'Embarked' with most frequent value: 'S'


## Standardisation

In [19]:
# Numeric feature columns, i.e. every numeric column except the target
num_cols = [col for col in df.select_dtypes(include=["number"]) if col != target]

In [20]:
# Numeric features with at most 5 distinct values are handled as categories later on
discrete_as_cat = (
    df[num_cols]
    .nunique()
    .loc[lambda n_levels: n_levels <= 5]
    .index
    .tolist()
)

In [21]:
# Columns to standardize: numeric features that were not flagged as discrete
scale_cols = list(filter(lambda col: col not in discrete_as_cat, num_cols))

In [22]:
# Show how the numeric features were split between encoding and standardization
print(f"🧠 Treating as categorical (discrete numeric): {discrete_as_cat}")
print(f"📏 Standardizing columns: {scale_cols}")

🧠 Treating as categorical (discrete numeric): ['Pclass']
📏 Standardizing columns: ['Age', 'SibSp', 'Parch', 'Fare']


In [23]:
# Fit a StandardScaler on the selected columns, then transform those same columns.
# NOTE(review): the scaler is fitted on the whole dataset and no train/test split is
# visible in this notebook — confirm this does not leak statistics into evaluation.
scaler = StandardScaler()
scaler.fit(df[scale_cols])
scaled_data = scaler.transform(df[scale_cols])

In [24]:
# Overwrite the original columns in `df` with their standardized values
df[scale_cols] = scaled_data

In [25]:
# Report how many columns were standardized
print(f"✅ Standardized {len(scale_cols)} columns.")

✅ Standardized 4 columns.


## Encoding

In [26]:
# Object/category feature columns, leaving the target out
base_cat_cols = [
    col
    for col in df.select_dtypes(include=["object", "category"])
    if col != target
]

In [27]:
# Columns to one-hot encode: true categoricals first, then the discrete numeric ones
cat_cols = [*base_cat_cols, *discrete_as_cat]

In [28]:
# List the columns that are about to be one-hot encoded
print(f"🎯 One-hot encoding on columns: {cat_cols}")

🎯 One-hot encoding on columns: ['Sex', 'Ticket', 'Cabin', 'Embarked', 'Pclass']


In [29]:
# One-hot encode the selected columns into a new frame; `df` itself is not reassigned.
# NOTE(review): in the recorded run the list includes 'Ticket' and 'Cabin', and the
# frame grows to 841 columns for 891 rows — consider grouping or dropping such
# high-cardinality columns before encoding.
df_encoded = pd.get_dummies(df, columns=cat_cols)

In [30]:
# Report the shape of the encoded frame
print(f"✅ Shape after encoding: {df_encoded.shape}")

✅ Shape after encoding: (891, 841)


In [31]:
# Display the encoded frame as the cell output (the recorded output below is truncated)
df_encoded

Unnamed: 0_level_0,Survived,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Ticket_110152,Ticket_110413,Ticket_110465,...,Cabin_F38,Cabin_F4,Cabin_G6,Cabin_T,Embarked_C,Embarked_Q,Embarked_S,Pclass_1,Pclass_2,Pclass_3
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,-0.565736,0.432793,-0.473674,-0.502445,False,True,False,False,False,...,False,False,False,False,False,False,True,False,False,True
2,1,0.663861,0.432793,-0.473674,0.786845,True,False,False,False,False,...,False,False,False,False,True,False,False,True,False,False
3,1,-0.258337,-0.474545,-0.473674,-0.488854,True,False,False,False,False,...,False,False,False,False,False,False,True,False,False,True
4,1,0.433312,0.432793,-0.473674,0.420730,True,False,False,False,False,...,False,False,False,False,False,False,True,True,False,False
5,0,0.433312,-0.474545,-0.473674,-0.486337,False,True,False,False,False,...,False,False,False,False,False,False,True,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
887,0,-0.181487,-0.474545,-0.473674,-0.386671,False,True,False,False,False,...,False,False,False,False,False,False,True,False,True,False
888,1,-0.796286,-0.474545,-0.473674,-0.044381,True,False,False,False,False,...,False,False,False,False,False,False,True,True,False,False
889,0,-0.104637,0.432793,2.008933,-0.176263,True,False,False,False,False,...,False,False,False,False,False,False,True,False,False,True
890,1,-0.258337,-0.474545,-0.473674,-0.044381,False,True,False,False,False,...,False,False,False,False,True,False,False,True,False,False


## Export

In [32]:
# Create an S3 client and serialize the encoded frame to CSV in an in-memory text buffer.
# index=True writes the DataFrame index as the first CSV column.
s3 = boto3.client("s3")
csv_buffer = StringIO()
df_encoded.to_csv(csv_buffer, index=True)

In [33]:
# Upload the serialized CSV to S3 under the processed-dataset key.
# Uses the `bucket` value resolved in the setup cell instead of re-reading the
# environment, and passes explicit UTF-8 bytes, which is what put_object documents
# for Body (bytes or a file-like object).
s3.put_object(
    Bucket=bucket,
    Key=csv_processed_key,
    Body=csv_buffer.getvalue().encode("utf-8"),
)

print(f"✅ Saved processed dataset to s3://{bucket}/{csv_processed_key}")

✅ Saved processed dataset to s3://athos-mlops0-bucket/datasets/titanic_processed.csv
