In [1]:
import pandas as pd
import numpy as np
from category_encoders import TargetEncoder
from sklearn.preprocessing import StandardScaler
import joblib
import os

In [2]:
# --- File locations (relative paths, resolved against the working directory) ---

# Input: the raw CSV this notebook reads.
RAW_DATA_PATH = "../data/raw/downloaded_file.csv"

# Outputs: the processed dataset and the fitted preprocessing objects.
CLEAN_DATA_PATH = "../data/processed/clean_train.csv"
ENCODER_SAVE_PATH = "../models/encoders.pkl"


In [3]:
# Load the raw CSV at RAW_DATA_PATH into the working DataFrame `df`.
df = pd.read_csv(RAW_DATA_PATH)
# Last expression of the cell: displays the (n_rows, n_columns) tuple.
df.shape

(850, 13)

In [4]:
# Preview the first rows of the raw data before any transformation.
df.head()

Unnamed: 0,City/District,Waste Type,Waste Generated (Tons/Day),Recycling Rate (%),Population Density (People/km²),Municipal Efficiency Score (1-10),Disposal Method,Cost of Waste Management (₹/Ton),Awareness Campaigns Count,Landfill Name,"Landfill Location (Lat, Long)",Landfill Capacity (Tons),Year
0,Mumbai,Plastic,6610,68,11191,9,Composting,3056,14,Mumbai Landfill,"22.4265, 77.4931",45575,2019
1,Mumbai,Organic,1181,56,11191,5,Composting,2778,12,Mumbai Landfill,"22.4265, 77.4931",45575,2019
2,Mumbai,E-Waste,8162,53,11191,8,Incineration,3390,13,Mumbai Landfill,"22.4265, 77.4931",45575,2019
3,Mumbai,Construction,8929,56,11191,5,Landfill,1498,14,Mumbai Landfill,"22.4265, 77.4931",45575,2019
4,Mumbai,Hazardous,5032,44,11191,7,Recycling,2221,16,Mumbai Landfill,"22.4265, 77.4931",45575,2019


In [5]:
# Split the "lat, long" text column into two numeric coordinate columns.
# Any surrounding parentheses are removed first; the comma separates the parts.
lat_lon_parts = (
    df['Landfill Location (Lat, Long)'].str.strip("()").str.split(",", expand=True)
)
df[['Landfill Lat', 'Landfill Lon']] = lat_lon_parts.astype(float)

In [6]:
# Drop columns that are not used as features: the landfill name, and the raw
# location string, which has already been split into 'Landfill Lat' / 'Landfill Lon'.
# Reassign instead of using inplace=True so the frame is not mutated behind
# earlier cell outputs; a missing column still raises KeyError on purpose.
df = df.drop(columns=['Landfill Name', 'Landfill Location (Lat, Long)'])

In [7]:
# Column roles used by the preprocessing cells below.

# Column to be predicted; it is not part of any feature list.
target = "Recycling Rate (%)"

# Low-cardinality categoricals -> one-hot encoding.
categorical_low = ["Waste Type", "Disposal Method"]

# High-cardinality categoricals -> target encoding.
categorical_high = ["City/District"]

# Numeric features to standardize. The list order is the column order passed
# to the scaler, so it must stay stable.
numerical_cols = [
    "Waste Generated (Tons/Day)",
    "Population Density (People/km²)",
    "Municipal Efficiency Score (1-10)",
    "Cost of Waste Management (₹/Ton)",
    "Awareness Campaigns Count",
    "Landfill Capacity (Tons)",
    "Year",
] + [
    # Coordinates derived from the landfill location column.
    "Landfill Lat",
    "Landfill Lon",
]

In [None]:
# Target-encode the high-cardinality categorical columns: fit the encoder on
# those columns against the target and overwrite them with the encoded values.
# The fitted encoder object is kept in `target_enc` so it can be saved later.
target_enc = TargetEncoder(cols=categorical_high)
encoded_high = target_enc.fit_transform(df[categorical_high], df[target])
df[categorical_high] = encoded_high

In [10]:
# Expand each low-cardinality categorical column into indicator columns.
# drop_first=True omits the first level of every column, so k levels yield
# k-1 indicator columns.
df = pd.get_dummies(data=df, columns=categorical_low, drop_first=True)

In [11]:
# Standardize the numeric feature columns. The scaler is fitted on these
# columns first and then applied to the same data; the fitted object stays in
# `scaler` so it can be saved later.
scaler = StandardScaler().fit(df[numerical_cols])
df[numerical_cols] = scaler.transform(df[numerical_cols])

In [12]:
# Save the fitted target encoder and scaler for inference.
# joblib.dump does not create missing parent folders, so make sure the
# destination directory exists before writing (e.g. on a fresh checkout).
encoder_dir = os.path.dirname(ENCODER_SAVE_PATH)
if encoder_dir:  # dirname is '' for a bare filename, and os.makedirs('') raises
    os.makedirs(encoder_dir, exist_ok=True)
# NOTE(review): only the target encoder and the scaler are persisted here; the
# indicator columns created by pd.get_dummies are not, so inference code has
# to reproduce that column layout separately — confirm this is handled.
joblib.dump({'target_enc': target_enc, 'scaler': scaler}, ENCODER_SAVE_PATH)

['../models/encoders.pkl']

In [13]:
# Save the cleaned dataset as CSV, without the row index.
# DataFrame.to_csv fails if the target folder does not exist, so create it first.
clean_dir = os.path.dirname(CLEAN_DATA_PATH)
if clean_dir:  # dirname is '' for a bare filename, and os.makedirs('') raises
    os.makedirs(clean_dir, exist_ok=True)
df.to_csv(CLEAN_DATA_PATH, index=False)

In [15]:
# Shape of the processed frame, i.e. after encoding and scaling.
df.shape

(850, 18)

In [16]:
# Preview the first rows of the processed frame to check the transformations.
df.head()

Unnamed: 0,City/District,Waste Generated (Tons/Day),Recycling Rate (%),Population Density (People/km²),Municipal Efficiency Score (1-10),Cost of Waste Management (₹/Ton),Awareness Campaigns Count,Landfill Capacity (Tons),Year,Landfill Lat,Landfill Lon,Waste Type_E-Waste,Waste Type_Hazardous,Waste Type_Organic,Waste Type_Plastic,Disposal Method_Incineration,Disposal Method_Landfill,Disposal Method_Recycling
0,56.929277,0.483872,68,-0.34686,0.929612,0.217581,0.674989,-0.688562,-1.414214,0.086246,-0.793881,False,False,False,True,False,False,False
1,56.929277,-1.465258,56,-0.34686,-1.394418,-0.00036,0.345348,-0.688562,-1.414214,0.086246,-0.793881,False,False,True,False,False,False,False
2,56.929277,1.041074,53,-0.34686,0.348604,0.479424,0.510168,-0.688562,-1.414214,0.086246,-0.793881,True,False,False,False,True,False,False
3,56.929277,1.316444,56,-0.34686,-1.394418,-1.003829,0.674989,-0.688562,-1.414214,0.086246,-0.793881,False,False,False,False,False,True,False
4,56.929277,-0.082665,44,-0.34686,-0.232403,-0.437026,1.00463,-0.688562,-1.414214,0.086246,-0.793881,False,True,False,False,False,False,True
