In [10]:
# 1. IMPORT LIBRARIES
# ============================================
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [13]:
# 2. LOAD RAW DATA
# ============================================
# The source CSV location is defined exactly once and passed to read_csv,
# so changing the dataset only requires editing this one variable.
file_path = "https://raw.githubusercontent.com/MasteriNeuron/datasets/refs/heads/main/Waste_Management_and_Recycling_India.csv"
df = pd.read_csv(file_path)

# Report the frame's dimensions, then preview the first rows as the cell output.
print(f"Dataset shape: {df.shape}")
df.head()

Dataset shape: (850, 13)


Unnamed: 0,City/District,Waste Type,Waste Generated (Tons/Day),Recycling Rate (%),Population Density (People/km²),Municipal Efficiency Score (1-10),Disposal Method,Cost of Waste Management (₹/Ton),Awareness Campaigns Count,Landfill Name,"Landfill Location (Lat, Long)",Landfill Capacity (Tons),Year
0,Mumbai,Plastic,6610,68,11191,9,Composting,3056,14,Mumbai Landfill,"22.4265, 77.4931",45575,2019
1,Mumbai,Organic,1181,56,11191,5,Composting,2778,12,Mumbai Landfill,"22.4265, 77.4931",45575,2019
2,Mumbai,E-Waste,8162,53,11191,8,Incineration,3390,13,Mumbai Landfill,"22.4265, 77.4931",45575,2019
3,Mumbai,Construction,8929,56,11191,5,Landfill,1498,14,Mumbai Landfill,"22.4265, 77.4931",45575,2019
4,Mumbai,Hazardous,5032,44,11191,7,Recycling,2221,16,Mumbai Landfill,"22.4265, 77.4931",45575,2019


In [14]:
# 3. DROP USELESS TEXT COLUMNS
# ============================================
# Remove the landfill name column when it is present; errors='ignore' makes
# this a no-op if the column is already gone (e.g. the cell is re-run).
df = df.drop(columns=['Landfill Name'], errors='ignore')


In [15]:
# 4. SPLIT LAT/LONG
# ============================================
# The landfill location is a single comma-separated text column. Split it on
# the comma into two float columns and discard the combined text column.
location_col = 'Landfill Location (Lat, Long)'
if location_col in df.columns:
    lat_lon = df[location_col].str.split(",", expand=True)
    df = df.assign(
        Latitude=lat_lon[0].astype(float),
        Longitude=lat_lon[1].astype(float),
    ).drop(columns=[location_col])

In [16]:
# 5. HANDLE CATEGORICAL COLUMNS AUTOMATICALLY
# ============================================
# Gather every object-dtype column and one-hot encode it; drop_first=True
# omits the first level of each encoded column.
cat_cols = list(df.select_dtypes(include=['object']).columns)

if len(cat_cols) > 0:
    print(f"Encoding categorical columns: {cat_cols}")
    df = pd.get_dummies(df, columns=cat_cols, drop_first=True)

Encoding categorical columns: ['City/District', 'Waste Type', 'Disposal Method']


In [17]:
# 6. HANDLE MISSING VALUES
# ============================================
# Replace every remaining missing entry in the frame with 0.
df = df.fillna(value=0)


In [19]:
# 7. SAVE PROCESSED DATA
# ============================================
processed_path = "../data/processed/waste_management_processed.csv"
# to_csv does not create missing folders, so make sure the target directory
# exists before writing (this is a no-op when it is already there).
Path(processed_path).parent.mkdir(parents=True, exist_ok=True)
# index=False keeps the row index out of the written file.
df.to_csv(processed_path, index=False)
print(f"\nProcessed dataset saved to {processed_path}")
print(f"Processed shape: {df.shape}")


Processed dataset saved to ../data/processed/waste_management_processed.csv
Processed shape: (850, 50)


In [20]:
# 8. FINAL CHECK
# ============================================
# List any columns that still have object dtype after preprocessing.
remaining_object_cols = df.select_dtypes(include=['object']).columns.tolist()
print("\nRemaining object columns:", remaining_object_cols)


Remaining object columns: []
