In [3]:
# 1. IMPORT LIBRARIES
# ============================================
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
import joblib


In [4]:
file_path = "../data/processed/waste_management_processed.csv"
df = pd.read_csv(file_path)

print(f"Dataset shape: {df.shape}")
df.head()

Dataset shape: (850, 50)


Unnamed: 0,Waste Generated (Tons/Day),Recycling Rate (%),Population Density (People/km²),Municipal Efficiency Score (1-10),Cost of Waste Management (₹/Ton),Awareness Campaigns Count,Landfill Capacity (Tons),Year,Latitude,Longitude,...,City/District_Vadodara,City/District_Varanasi,City/District_Visakhapatnam,Waste Type_E-Waste,Waste Type_Hazardous,Waste Type_Organic,Waste Type_Plastic,Disposal Method_Incineration,Disposal Method_Landfill,Disposal Method_Recycling
0,6610,68,11191,9,3056,14,45575,2019,22.4265,77.4931,...,False,False,False,False,False,False,True,False,False,False
1,1181,56,11191,5,2778,12,45575,2019,22.4265,77.4931,...,False,False,False,False,False,True,False,False,False,False
2,8162,53,11191,8,3390,13,45575,2019,22.4265,77.4931,...,False,False,False,True,False,False,False,True,False,False
3,8929,56,11191,5,1498,14,45575,2019,22.4265,77.4931,...,False,False,False,False,False,False,False,False,True,False
4,5032,44,11191,7,2221,16,45575,2019,22.4265,77.4931,...,False,False,False,False,True,False,False,False,False,True


In [5]:
# 3. SPLIT FEATURES & TARGET
# ============================================
target = "Recycling Rate (%)"
X = df.drop(columns=[target])
y = df[target]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")


Train shape: (680, 49), Test shape: (170, 49)


In [6]:
# 4. DEFINE MODELS
# ============================================
models = {
    "LinearRegression": LinearRegression(),
    "RandomForest": RandomForestRegressor(n_estimators=200, random_state=42),
    "XGBoost": XGBRegressor(n_estimators=200, learning_rate=0.1, random_state=42),
    "LightGBM": LGBMRegressor(n_estimators=200, learning_rate=0.1, random_state=42)
}

In [8]:
# 5. TRAIN & EVALUATE MODELS
# ============================================
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    results[name] = rmse
    print(f"{name} RMSE: {rmse:.4f}")

LinearRegression RMSE: 17.3203
RandomForest RMSE: 17.3758
XGBoost RMSE: 19.3053
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000216 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 679
[LightGBM] [Info] Number of data points in the train set: 680, number of used features: 35
[LightGBM] [Info] Start training from score 57.329412
LightGBM RMSE: 19.3761


In [9]:
# 6. PICK BEST MODEL
# ============================================
best_model_name = min(results, key=results.get)
best_model = models[best_model_name]
print(f"\nBest Model: {best_model_name} with RMSE {results[best_model_name]:.4f}")



Best Model: LinearRegression with RMSE 17.3203


In [10]:
# 7. SAVE BEST MODEL
# ============================================
joblib.dump(best_model, "../models/trained_model.pkl")
print("Best model saved to ../models/trained_model.pkl")


Best model saved to ../models/trained_model.pkl


In [11]:
# 8. SAVE PREDICTIONS
# ============================================
y_pred_final = best_model.predict(X_test)
predictions_df = pd.DataFrame({
    "Actual": y_test,
    "Predicted": y_pred_final
})
predictions_df.to_csv("../predictions.csv", index=False)
print("Predictions saved to ../predictions.csv")

Predictions saved to ../predictions.csv


In [12]:
import json
with open("metrics.json", "w") as f:
    json.dump(results, f)
print("Metrics saved to metrics.json")


Metrics saved to metrics.json
