In [1]:
import pandas as pd
import numpy as np
from category_encoders import TargetEncoder
from sklearn.preprocessing import StandardScaler
import joblib

In [2]:
# Paths (all relative to the notebook's working directory)
RAW_DATA_PATH = "../data/raw/waste_data.csv"  # raw input CSV loaded into df
ENCODER_SAVE_PATH = "../models/encoders.pkl"  # joblib bundle: fitted target encoder, scaler, feature names
CLEAN_DATA_PATH = "../data/processed/clean_train.csv"  # destination of the preprocessed frame


In [3]:
# Load the raw dataset and display its (rows, columns) dimensions.
df = pd.read_csv(RAW_DATA_PATH)
df.shape

(850, 13)

In [4]:
# Preview the first rows of the raw data.
df.head()

Unnamed: 0,City/District,Waste Type,Waste Generated (Tons/Day),Recycling Rate (%),Population Density (People/km²),Municipal Efficiency Score (1-10),Disposal Method,Cost of Waste Management (₹/Ton),Awareness Campaigns Count,Landfill Name,"Landfill Location (Lat, Long)",Landfill Capacity (Tons),Year
0,Mumbai,Plastic,6610,68,11191,9,Composting,3056,14,Mumbai Landfill,"22.4265, 77.4931",45575,2019
1,Mumbai,Organic,1181,56,11191,5,Composting,2778,12,Mumbai Landfill,"22.4265, 77.4931",45575,2019
2,Mumbai,E-Waste,8162,53,11191,8,Incineration,3390,13,Mumbai Landfill,"22.4265, 77.4931",45575,2019
3,Mumbai,Construction,8929,56,11191,5,Landfill,1498,14,Mumbai Landfill,"22.4265, 77.4931",45575,2019
4,Mumbai,Hazardous,5032,44,11191,7,Recycling,2221,16,Mumbai Landfill,"22.4265, 77.4931",45575,2019


In [5]:
# Re-display the dimensions; nothing has modified df since it was loaded.
df.shape

(850, 13)

In [6]:
# Count missing values per column.
df.isna().sum()

City/District                        0
Waste Type                           0
Waste Generated (Tons/Day)           0
Recycling Rate (%)                   0
Population Density (People/km²)      0
Municipal Efficiency Score (1-10)    0
Disposal Method                      0
Cost of Waste Management (₹/Ton)     0
Awareness Campaigns Count            0
Landfill Name                        0
Landfill Location (Lat, Long)        0
Landfill Capacity (Tons)             0
Year                                 0
dtype: int64

In [7]:
# Print column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 850 entries, 0 to 849
Data columns (total 13 columns):
 #   Column                             Non-Null Count  Dtype 
---  ------                             --------------  ----- 
 0   City/District                      850 non-null    object
 1   Waste Type                         850 non-null    object
 2   Waste Generated (Tons/Day)         850 non-null    int64 
 3   Recycling Rate (%)                 850 non-null    int64 
 4   Population Density (People/km²)    850 non-null    int64 
 5   Municipal Efficiency Score (1-10)  850 non-null    int64 
 6   Disposal Method                    850 non-null    object
 7   Cost of Waste Management (₹/Ton)   850 non-null    int64 
 8   Awareness Campaigns Count          850 non-null    int64 
 9   Landfill Name                      850 non-null    object
 10  Landfill Location (Lat, Long)      850 non-null    object
 11  Landfill Capacity (Tons)           850 non-null    int64 
 12  Year                               850 non-null    int64 

In [8]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

Unnamed: 0,Waste Generated (Tons/Day),Recycling Rate (%),Population Density (People/km²),Municipal Efficiency Score (1-10),Cost of Waste Management (₹/Ton),Awareness Campaigns Count,Landfill Capacity (Tons),Year
count,850.0,850.0,850.0,850.0,850.0,850.0,850.0,850.0
mean,5262.249412,57.076471,13489.705882,7.4,2778.458824,9.904706,58934.617647,2021.0
std,2786.984735,16.129994,6631.081494,1.722162,1276.32563,6.070772,19413.627292,1.415046
min,511.0,30.0,2335.0,5.0,503.0,0.0,22690.0,2019.0
25%,2865.75,43.0,7927.0,6.0,1647.5,5.0,45575.0,2020.0
50%,5283.0,56.0,12579.5,7.0,2853.0,10.0,61038.5,2021.0
75%,7757.25,71.0,19087.0,9.0,3855.0,15.0,71127.0,2022.0
max,9980.0,85.0,24032.0,10.0,4999.0,20.0,98646.0,2023.0


In [9]:
# Drop unused columns.
# Reassign instead of mutating in place, and tolerate columns that are already
# gone, so re-running this cell does not raise a KeyError.
df = df.drop(columns=['Landfill Name', 'Landfill Location (Lat, Long)'], errors='ignore')

In [10]:
# Show each remaining column's dtype to separate object (categorical) from numeric columns.
df.dtypes

City/District                        object
Waste Type                           object
Waste Generated (Tons/Day)            int64
Recycling Rate (%)                    int64
Population Density (People/km²)       int64
Municipal Efficiency Score (1-10)     int64
Disposal Method                      object
Cost of Waste Management (₹/Ton)      int64
Awareness Campaigns Count             int64
Landfill Capacity (Tons)              int64
Year                                  int64
dtype: object

In [11]:
# Number of distinct City/District values.
len(df["City/District"].unique())

34

In [12]:
# Number of distinct Waste Type values.
len(df["Waste Type"].unique())

5

In [13]:
# Number of distinct Disposal Method values.
len(df["Disposal Method"].unique())

4

In [14]:
# Column roles used by the preprocessing cells.
target = "Recycling Rate (%)"  # value to predict; kept out of every feature list

# Categorical features, split by cardinality (34 levels vs. 5 and 4 levels).
categorical_high = ["City/District"]  # Target encoding
categorical_low = ["Waste Type", "Disposal Method"]  # One-hot encoding

# Numeric features that get standardized.
numerical_cols = [
    "Waste Generated (Tons/Day)", "Population Density (People/km²)",
    "Municipal Efficiency Score (1-10)", "Cost of Waste Management (₹/Ton)",
    "Awareness Campaigns Count", "Landfill Capacity (Tons)",
    "Year",
]

In [15]:
# Target Encoding for high-cardinality categorical features
# Overwrites the City/District labels with numeric values produced by the
# encoder from the target column; the fitted encoder is kept for inference.
# NOTE(review): the encoder is fit on every row of df, target included. If a
# validation/test split is later taken from this same data, the encoded column
# will already reflect those rows' targets — confirm this is intended.
target_enc = TargetEncoder(cols=categorical_high)
df[categorical_high] = target_enc.fit_transform(df[categorical_high], df[target])

In [16]:
# One-Hot Encoding for low-cardinality categorical features
# drop_first=True omits the first level of each column, so no
# 'Waste Type_Construction' or 'Disposal Method_Composting' indicator is produced.
# NOTE(review): get_dummies derives its columns from the values present in df;
# inference code presumably has to re-align to the saved feature names — confirm.
df = pd.get_dummies(df, columns=categorical_low, drop_first=True)

In [17]:
# Standardize numerical features
# Fit the scaler on the numeric columns, then overwrite those columns with the
# standardized values; the fitted scaler is kept so it can be saved for inference.
# NOTE: re-running this cell alone would refit the scaler on already-scaled values.
scaler = StandardScaler().fit(df[numerical_cols])
df[numerical_cols] = scaler.transform(df[numerical_cols])

In [18]:
# Feature columns = every column except the prediction target, in DataFrame order.
# Uses the `target` variable rather than repeating the column name, so the
# feature list cannot drift from the target definition.
final_columns = [col for col in df.columns if col != target]
final_columns

['City/District',
 'Waste Generated (Tons/Day)',
 'Population Density (People/km²)',
 'Municipal Efficiency Score (1-10)',
 'Cost of Waste Management (₹/Ton)',
 'Awareness Campaigns Count',
 'Landfill Capacity (Tons)',
 'Year',
 'Waste Type_E-Waste',
 'Waste Type_Hazardous',
 'Waste Type_Organic',
 'Waste Type_Plastic',
 'Disposal Method_Incineration',
 'Disposal Method_Landfill',
 'Disposal Method_Recycling']

In [19]:
# Save encoders for inference
# One joblib file bundles the fitted target encoder, the fitted scaler and the
# ordered feature names.
joblib.dump(
    {
        'target_enc': target_enc,
        'scaler': scaler,
        "feature_names": final_columns,
    },
    ENCODER_SAVE_PATH,
)

['../models/encoders.pkl']

In [20]:
# Save cleaned dataset
# index=False keeps the RangeIndex out of the CSV; the target column is still included.
df.to_csv(CLEAN_DATA_PATH, index=False)

In [21]:
# Dimensions of the preprocessed frame (features plus the target column).
df.shape

(850, 16)

In [22]:
# Preview the encoded and standardized rows.
df.head()

Unnamed: 0,City/District,Waste Generated (Tons/Day),Recycling Rate (%),Population Density (People/km²),Municipal Efficiency Score (1-10),Cost of Waste Management (₹/Ton),Awareness Campaigns Count,Landfill Capacity (Tons),Year,Waste Type_E-Waste,Waste Type_Hazardous,Waste Type_Organic,Waste Type_Plastic,Disposal Method_Incineration,Disposal Method_Landfill,Disposal Method_Recycling
0,56.929277,0.483872,68,-0.34686,0.929612,0.217581,0.674989,-0.688562,-1.414214,False,False,False,True,False,False,False
1,56.929277,-1.465258,56,-0.34686,-1.394418,-0.00036,0.345348,-0.688562,-1.414214,False,False,True,False,False,False,False
2,56.929277,1.041074,53,-0.34686,0.348604,0.479424,0.510168,-0.688562,-1.414214,True,False,False,False,True,False,False
3,56.929277,1.316444,56,-0.34686,-1.394418,-1.003829,0.674989,-0.688562,-1.414214,False,False,False,False,False,True,False
4,56.929277,-0.082665,44,-0.34686,-0.232403,-0.437026,1.00463,-0.688562,-1.414214,False,True,False,False,False,False,True
