## Setup

In [48]:
import os
import sys
import pandas as pd
import boto3
import numpy as np
from sklearn.preprocessing import StandardScaler
from io import StringIO
from dotenv import load_dotenv

# Make the project package under src/ importable
sys.path.append("src")
from mlops_project.utils.load_csv import read_csv_from_s3

In [None]:
# Load environment variables (python-dotenv reads a .env file when one is
# found) so the S3 location is configured outside the notebook.
load_dotenv()

bucket = os.getenv("S3_BUCKET_NAME")
filename = os.getenv("CSV_FILENAME")

# Fail fast with an explicit message: os.getenv returns None for an unset
# variable, which would otherwise build the literal key "datasets/None" and
# only surface later as a confusing error far from the real cause.
missing_env = [
    name
    for name, value in (("S3_BUCKET_NAME", bucket), ("CSV_FILENAME", filename))
    if not value
]
if missing_env:
    raise EnvironmentError(
        f"Missing required environment variable(s): {', '.join(missing_env)}"
    )

# Raw datasets are read from the "datasets/" prefix of the bucket.
s3_key = f"datasets/{filename}"

df = read_csv_from_s3(bucket, s3_key)
print(f"‚úÖ Loaded dataset with shape: {df.shape}")

## Initial Cleanup

In [5]:
# Report the dataset shape before any cleanup step runs, as a baseline for
# the shapes printed by the following cells.
print(f"üîç Initial shape: {df.shape}")

üîç Initial shape: (891, 12)


In [6]:
# Remove rows that are exact copies of an earlier row; the first occurrence
# of each duplicated row is the one that is kept.
df = df.drop_duplicates(keep="first")
print(f"üßπ After removing duplicates: {df.shape}")

üßπ After removing duplicates: (891, 12)


In [7]:
# Columns in which every value is missing carry no information: find them
# with a per-column "all NaN" mask and remove them.
all_missing_mask = df.isna().all(axis=0)
empty_cols = df.columns[all_missing_mask]
df = df.drop(columns=empty_cols)
print(f"üóëÔ∏è Dropped empty columns: {list(empty_cols)}")

üóëÔ∏è Dropped empty columns: []


In [8]:
# A column with at most one distinct non-missing value cannot distinguish
# rows, so remove it. (nunique() ignores NaN by default.)
distinct_counts = df.nunique()
constant_cols = distinct_counts[distinct_counts <= 1].index.tolist()
df = df.drop(columns=constant_cols)
print(f"üß∫ Dropped constant columns: {constant_cols}")

üß∫ Dropped constant columns: []


In [21]:
# Drop identifier-like columns: those with a different value on every row
# (e.g. an id or a free-text name), which cannot generalize to new rows.
#
# Float columns are deliberately excluded from this check: a continuous
# measurement can legitimately take a distinct value on every row, and
# dropping it would silently discard a real feature.
n_rows = len(df)
unique_cols = [
    col
    for col in df.columns
    if not pd.api.types.is_float_dtype(df[col]) and df[col].nunique() == n_rows
]
df = df.drop(columns=unique_cols)
print(f"üöÆ Dropped totally unique columns: {unique_cols}")

üöÆ Dropped totally unique columns: ['PassengerId', 'Name']


In [22]:
# Report the shape once the cleanup steps above have run.
print(f"‚úÖ Cleaned shape: {df.shape}")

‚úÖ Cleaned shape: (891, 10)


## Handling Missing Values (numerical features)

In [23]:
# Names of every column with a numeric dtype.
numeric_frame = df.select_dtypes(include="number")
num_cols = numeric_frame.columns

In [24]:
# Per-column NaN counts for the numeric columns, keeping only the columns
# that actually contain at least one missing value.
missing_num = (
    df[num_cols]
    .isna()
    .sum()
    .loc[lambda counts: counts > 0]
)

print("üîç Numerical columns with missing values:")
print(missing_num)

üîç Numerical columns with missing values:
Series([], dtype: int64)


In [25]:
# Impute each numeric column that has gaps with that column's own median.
for col in missing_num.index:
    column = df[col]
    median_value = column.median()
    df[col] = column.fillna(median_value)
    print(f"üß™ Filled NaNs in '{col}' with median: {median_value:.2f}")

## Handling Missing Values (categorical features)

In [26]:
# Names of every column stored as text (object) or as a pandas categorical.
categorical_dtypes = ["object", "category"]
cat_cols = df.select_dtypes(include=categorical_dtypes).columns

In [27]:
# Per-column NaN counts for the categorical columns, restricted to the
# columns that have at least one missing value.
cat_nan_counts = df[cat_cols].isna().sum()
missing_cat = cat_nan_counts.loc[cat_nan_counts.gt(0)]

print("üîç Categorical columns with missing values:")
print(missing_cat)

üîç Categorical columns with missing values:
Series([], dtype: int64)


In [28]:
# Impute each categorical column that has gaps with its most frequent value.
# mode() can return several values on a tie; the first one is used.
for col in missing_cat.index:
    column = df[col]
    most_common = column.mode()[0]
    df[col] = column.fillna(most_common)
    print(f"üìå Filled NaNs in '{col}' with most frequent value: '{most_common}'")

## Standardisation

In [31]:
# Re-derive the numeric column names from the current state of df.
num_cols = df.select_dtypes(include=np.number).columns

In [32]:
# Numeric columns with at most 5 distinct values are treated as categorical
# codes rather than as quantities to be scaled.
# NOTE(review): the recorded output of this notebook shows 'Survived' being
# selected here, so the label column is later one-hot encoded together with
# the features - confirm that this is intended.
level_counts = df[num_cols].nunique()
discrete_as_cat = level_counts[level_counts <= 5].index.tolist()

In [33]:
# Columns to standardize: the numeric columns that were not classified as
# discrete, kept in their original column order.
discrete_lookup = set(discrete_as_cat)
scale_cols = [col for col in num_cols if col not in discrete_lookup]

In [34]:
print(f"üß† Treating as categorical (discrete numeric): {discrete_as_cat}")
print(f"üìè Standardizing columns: {scale_cols}")

üß† Treating as categorical (discrete numeric): ['Survived', 'Pclass']
üìè Standardizing columns: ['Age', 'SibSp', 'Parch', 'Fare']


In [37]:
# Standardize only the continuous columns: learn the per-column statistics
# first, then apply them to the same data.
# NOTE(review): the scaler is fit on every row of df; if a train/test split
# happens downstream, confirm that fitting on the full dataset is intended.
scaler = StandardScaler()
scaler.fit(df[scale_cols])
scaled_data = scaler.transform(df[scale_cols])

In [38]:
# Write the standardized values back into the DataFrame. `scaled_data` was
# computed from df[scale_cols], so its columns line up with `scale_cols`.
# After this cell df no longer holds the unscaled values of these columns.
df[scale_cols] = scaled_data

In [39]:
# Report how many columns were standardized.
print(f"‚úÖ Standardized {len(scale_cols)} columns.")

‚úÖ Standardized 4 columns.


## Encoding

In [40]:
# Columns that are categorical by dtype (text or pandas categorical), as a
# plain Python list so it can be concatenated with other lists.
categorical_frame = df.select_dtypes(include=["object", "category"])
base_cat_cols = list(categorical_frame.columns)

In [41]:
# Full list of columns to encode: the dtype-based categoricals followed by
# the low-cardinality numeric columns identified earlier.
cat_cols = [*base_cat_cols, *discrete_as_cat]

In [42]:
print(f"üéØ One-hot encoding on columns: {cat_cols}")

üéØ One-hot encoding on columns: ['Sex', 'Ticket', 'Cabin', 'Embarked', 'Survived', 'Pclass']


In [45]:
# Replace every column listed in cat_cols with one indicator column per
# distinct value; all other columns pass through unchanged.
# NOTE(review): the recorded run encodes 'Ticket' and 'Cabin' as well and
# ends with 842 columns for 891 rows - consider whether such high-cardinality
# columns should be one-hot encoded at all.
df_encoded = pd.get_dummies(data=df, columns=cat_cols)

In [46]:
# Report the shape of the encoded frame.
print(f"‚úÖ Shape after encoding: {df_encoded.shape}")

‚úÖ Shape after encoding: (891, 842)


## Export

In [49]:
s3 = boto3.client("s3")

# Derive the output key from the input file name by replacing only its final
# extension. A plain str.replace('.csv', ...) leaves the name untouched when
# it has no lowercase ".csv" (so the export would overwrite the raw dataset
# key) and rewrites every ".csv" occurrence when there are several.
filename_stem, _ = os.path.splitext(filename)
output_key = f"datasets/{filename_stem}_processed.csv"

In [50]:
# Serialize the encoded frame to CSV text in memory (no index column), so it
# can be uploaded without writing a temporary file.
csv_buffer = StringIO()
df_encoded.to_csv(path_or_buf=csv_buffer, index=False)

In [51]:
# Send the in-memory CSV text to the configured bucket under output_key.
upload_request = {
    "Bucket": os.getenv("S3_BUCKET_NAME"),
    "Key": output_key,
    "Body": csv_buffer.getvalue(),
}
s3.put_object(**upload_request)

print(f"‚úÖ Saved processed dataset to s3://{os.getenv('S3_BUCKET_NAME')}/{output_key}")

‚úÖ Saved processed dataset to s3://athos-mlops0-bucket/datasets/titanic_processed.csv
