## Setup

In [1]:
import os
import sys
import pandas as pd
import boto3
import numpy as np
from sklearn.preprocessing import StandardScaler
from io import StringIO
from dotenv import load_dotenv

sys.path.append("src")
from mlops_project.utils.s3_handler import S3Handler
from mlops_project.config.config_loader import load_config


In [2]:
load_dotenv()
config = load_config("../src/mlops_project/config/dev.yaml")

bucket = os.getenv("S3_BUCKET_NAME")
filename = os.getenv("CSV_FILENAME")
csv_url = os.getenv("CSV_URL")

csv_raw_key = f"datasets/{filename}_raw.csv"
csv_processed_key = f"datasets/{filename}_processed.csv"

TASK_TYPE = config['type']  # regression or classification
target = config['target']
id_column = config['id_column'] # None if no index column

In [3]:
s3handler = S3Handler(bucket)
s3handler.upload_csv_from_url_to_s3(csv_url, filename)
df = s3handler.load_csv_from_s3(csv_raw_key)
print(f"✅ Loaded dataset with shape: {df.shape}")

✅ CSV uploaded to s3://athos-mlops0-bucket/datasets/iris# do not put .csv extension_raw.csv
🌀 GZIP compression detected
✅ Loaded dataset with shape: (150, 5)


## Initial Cleanup

In [4]:
# Initial dataset shape
print(f"🔍 Initial shape: {df.shape}")

🔍 Initial shape: (150, 5)


In [5]:
# Drop duplicate rows
df = df.drop_duplicates()
print(f"🧹 After removing duplicates: {df.shape}")

🧹 After removing duplicates: (147, 5)


In [6]:
# Drop empty columns (100% missing values)
empty_cols = df.columns[df.isna().sum() == len(df)]
df = df.drop(columns=empty_cols)
print(f"🗑️ Dropped empty columns: {list(empty_cols)}")

🗑️ Dropped empty columns: []


In [7]:
# Drop constant columns (only one unique value)
constant_cols = [col for col in df.columns if df[col].nunique() <= 1]
df = df.drop(columns=constant_cols)
print(f"🧺 Dropped constant columns: {constant_cols}")

🧺 Dropped constant columns: []


In [8]:
# Set ID column as index (if defined, unique, and non-null)
if id_column and id_column in df.columns:
    is_unique = df[id_column].is_unique
    has_no_nan = df[id_column].isna().sum() == 0

    if is_unique and has_no_nan:
        df.set_index(id_column, inplace=True)
        print(f"📎 Set '{id_column}' as index.")
    elif not is_unique:
        print(f"⚠️ ID column '{id_column}' is not unique – not set as index.")
    elif not has_no_nan:
        print(f"⚠️ ID column '{id_column}' contains missing values – not set as index.")

In [9]:
# Drop unique columns (as many unique values as rows)
unique_cols = [col for col in df.columns if df[col].nunique() == len(df)]
df = df.drop(columns=unique_cols)
print(f"🚮 Dropped totally unique columns: {unique_cols}")

🚮 Dropped totally unique columns: []


In [10]:
print(f"✅ Cleaned shape: {df.shape}")

✅ Cleaned shape: (147, 5)


## Handling Missing Values (numerical features)

In [11]:
# Select numerical columns
num_cols = df.select_dtypes(include=["number"]).columns

In [12]:
# Count missing values in numerical columns
missing_num = df[num_cols].isna().sum()
missing_num = missing_num[missing_num > 0]

print("🔍 Numerical columns with missing values:")
print(missing_num)

🔍 Numerical columns with missing values:
Series([], dtype: int64)


In [13]:
# Fill missing values with the median of each column
for col in missing_num.index:
    median_value = df[col].median()
    df[col] = df[col].fillna(median_value)
    print(f"🧪 Filled NaNs in '{col}' with median: {median_value:.2f}")

## Handling Missing Values (categorical features)

In [14]:
# Select categorical columns
cat_cols = df.select_dtypes(include=["object", "category"]).columns

In [15]:
# Count missing values in categorical columns
missing_cat = df[cat_cols].isna().sum()
missing_cat = missing_cat[missing_cat > 0]

print("🔍 Categorical columns with missing values:")
print(missing_cat)

🔍 Categorical columns with missing values:
Series([], dtype: int64)


In [16]:
# Fill missing values with the most frequent value (mode)
for col in missing_cat.index:
    most_common = df[col].mode()[0]
    df[col] = df[col].fillna(most_common)
    print(f"📌 Filled NaNs in '{col}' with most frequent value: '{most_common}'")

## Standardisation

In [17]:
# Select numerical columns
num_cols = df.select_dtypes(include=["number"]).columns
num_cols = [col for col in num_cols if col != target]

In [18]:
# Identify numeric columns with few unique values (discrete-like)
discrete_as_cat = [col for col in num_cols if df[col].nunique() <= 5]

In [19]:
# Final list of columns to scale = numeric columns - discrete ones
scale_cols = [col for col in num_cols if col not in discrete_as_cat]

In [20]:
print(f"🧠 Treating as categorical (discrete numeric): {discrete_as_cat}")
print(f"📏 Standardizing columns: {scale_cols}")

🧠 Treating as categorical (discrete numeric): []
📏 Standardizing columns: ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']


In [21]:
# Apply StandardScaler only on selected columns
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df[scale_cols])

In [22]:
# Replace in DataFrame
df[scale_cols] = scaled_data

In [23]:
print(f"✅ Standardized {len(scale_cols)} columns.")

✅ Standardized 4 columns.


## Encoding

In [24]:
# Get original categorical columns (object or category type)
base_cat_cols = df.select_dtypes(include=["object", "category"]).columns.tolist()
base_cat_cols = [col for col in base_cat_cols if col != target]

In [25]:
# Add discrete numeric columns identified as categorical
cat_cols = base_cat_cols + discrete_as_cat

In [26]:
print(f"🎯 One-hot encoding on columns: {cat_cols}")

🎯 One-hot encoding on columns: []


In [27]:
df_encoded = pd.get_dummies(df, columns=cat_cols)

In [28]:
print(f"✅ Shape after encoding: {df_encoded.shape}")

✅ Shape after encoding: (147, 5)


In [29]:
df_encoded

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,-0.915509,1.019971,-1.357737,-1.335700,setosa
1,-1.157560,-0.128082,-1.357737,-1.335700,setosa
2,-1.399610,0.331139,-1.414778,-1.335700,setosa
3,-1.520635,0.101529,-1.300696,-1.335700,setosa
4,-1.036535,1.249582,-1.357737,-1.335700,setosa
...,...,...,...,...,...
145,1.020892,-0.128082,0.809831,1.444682,virginica
146,0.536792,-1.276136,0.695748,0.915085,virginica
147,0.778842,-0.128082,0.809831,1.047484,virginica
148,0.415766,0.790361,0.923913,1.444682,virginica


## Export

In [32]:
s3 = boto3.client("s3")
csv_buffer = StringIO()
df_encoded.to_csv(csv_buffer, index=True)

In [33]:
# Upload to S3
s3.put_object(
    Bucket=os.getenv("S3_BUCKET_NAME"),
    Key=csv_processed_key,
    Body=csv_buffer.getvalue()
)

print(f"✅ Saved processed dataset to s3://{os.getenv('S3_BUCKET_NAME')}/{csv_processed_key}")

✅ Saved processed dataset to s3://athos-mlops0-bucket/datasets/titanic_processed.csv
