In [16]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


In [17]:
df = pd.read_csv("../dataset/carbon_emission_final_engineered.csv")
print("Original Shape:", df.shape)

# Helper/classification columns that must not reach the model's inputs.
drop_cols = [
    'co2_per_km', 'co2_per_litre', 'efficiency_score',
    'high_emitter', 'low_efficiency', 'co2_tier',
]

# errors="ignore" skips any listed name that is absent from the CSV,
# so only the columns actually present are removed.
df = df.drop(columns=drop_cols, errors="ignore")


Original Shape: (2109, 27)


In [18]:
def add_more_real_world_noise(df, seed=42):
    """Return a noised copy of ``df``; the input frame is left untouched.

    Multiplicative Gaussian noise N(1, sigma) is applied to the speed, load
    factor, fuel-efficiency, energy-per-km, fuel-per-km and total-energy
    columns, a uniform integer offset in [-2, 2] is added to ``no_of_stop``,
    and the target ``c02_emission_kg`` receives N(1, 0.05) noise. Afterwards
    ``load_factor`` is clipped to [0.1, 1.0], ``no_of_stop`` to >= 0 and
    ``adjusted_fuel_efficiency_in_km_litre`` to >= 1.0.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain every column named above; a missing one raises KeyError.
    seed : int, default 42
        Seed for the private random stream. The default reproduces the
        numbers previously obtained via ``np.random.seed(42)``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index and columns as ``df``.

    Notes
    -----
    * The target column is noised too, so any metric computed downstream is
      measured against a perturbed target.
    * NOTE(review): columns not listed above are left as-is. If any of them
      were derived from the noised columns they are not recomputed here --
      confirm that is intended.
    """
    # A private RandomState yields the same stream as the legacy
    # np.random.seed(seed) + np.random.* calls, but does not reseed NumPy's
    # global generator behind the caller's back.
    rng = np.random.RandomState(seed)
    n_rows = len(df)
    noisy_df = df.copy()

    def scale(column, sigma):
        # Multiply one column by per-row factors drawn from N(1, sigma).
        noisy_df[column] = noisy_df[column] * rng.normal(1.0, sigma, size=n_rows)

    # The draw order below is significant: reordering these calls changes
    # which random numbers each column receives.
    scale('average_speed_in_km_per_hr', 0.12)
    noisy_df['no_of_stop'] = noisy_df['no_of_stop'] + rng.randint(-2, 3, size=n_rows)
    scale('load_factor', 0.08)
    scale('adjusted_fuel_efficiency_in_km_litre', 0.18)
    scale('adjusted_energy_kwh_per_km', 0.18)
    scale('fuel_per_km', 0.12)
    scale('total_energy_kwh', 0.10)

    # Inject slight noise into the target as well.
    scale('c02_emission_kg', 0.05)

    # Clamp to realistic bounds.
    noisy_df['load_factor'] = noisy_df['load_factor'].clip(0.1, 1.0)
    noisy_df['no_of_stop'] = noisy_df['no_of_stop'].clip(lower=0)
    noisy_df['adjusted_fuel_efficiency_in_km_litre'] = (
        noisy_df['adjusted_fuel_efficiency_in_km_litre'].clip(lower=1.0)
    )

    return noisy_df


# Apply the function
# NOTE: this rebinds `df` to the noised copy, so re-running this cell applies
# the same seeded noise a second time on top of already-noised values.
# Reload `df` from the CSV before re-running.
df = add_more_real_world_noise(df)


In [19]:
# Column the model is trained to predict.
target = 'c02_emission_kg'

# Model inputs. The order of this list is the column order of X.
features = [
    # quantitative columns
    'distance_in_km_per_route',
    'vehicle_age_in_years',
    'load_factor',
    'no_of_stop',
    'average_speed_in_km_per_hr',
    'adjusted_fuel_efficiency_in_km_litre',
    'adjusted_energy_kwh_per_km',
    'fuel_per_km',
    'total_energy_kwh',
    'speed_per_stop',
    'load_utilization',
    # label-like columns
    'vehicle_type',
    'fuel_type',
    'traffic_condition',
    'engine_norm_type',
    'logistics_partner',
    'mode',
]

X, y = df[features], df[target]


In [20]:
# Preprocessing: label-like columns are one-hot encoded; every other column
# of X is standardised.
categorical = [
    'vehicle_type', 'fuel_type', 'traffic_condition',
    'engine_norm_type', 'logistics_partner', 'mode',
]
numerical = [name for name in X.columns if name not in categorical]

numeric_step = ("num", StandardScaler(), numerical)
# handle_unknown='ignore': categories unseen during fit encode as all zeros
# instead of raising at predict time.
categorical_step = ("cat", OneHotEncoder(handle_unknown='ignore'), categorical)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

# Preprocessing and the regressor are bundled into a single estimator.
forest = RandomForestRegressor(n_estimators=100, random_state=42)
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", forest),
])


In [21]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# fit() returns the fitted pipeline, so prediction on the hold-out rows can
# be chained directly.
y_pred = pipeline.fit(X_train, y_train).predict(X_test)


In [22]:
# Evaluate the hold-out predictions.
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mse)  # root of the MSE, same units as the target
r2 = r2_score(y_test, y_pred)

for report_line in (
    f"MAE: {mae:.2f}",
    f"RMSE: {rmse:.2f}",
    f"R² Score: {r2:.4f}",
):
    print(report_line)


MAE: 21.71
RMSE: 74.34
R² Score: 0.9424
