In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import zscore

In [None]:
# Load the previously cleaned dataset (a CSV file) into the working DataFrame.
df_cleaned = pd.read_csv("../dataset/carbon_emission_cleaned.csv")

DATASET OVERVIEW

In [None]:
# High-level overview: size, column groups, and columns that contain nulls.
n_records, n_features = df_cleaned.shape
print("Dataset Overview")
print(f"Total records: {n_records}")
print(f"Total features: {n_features}")
# Text columns read from CSV have dtype `object` until they are explicitly cast,
# so `object` is included here; selecting only `category` would list nothing
# when this cell runs right after loading.
print("\nCategorical columns:", list(df_cleaned.select_dtypes(include=['object', 'category']).columns))
print("Numerical columns:", list(df_cleaned.select_dtypes(include='number').columns))
# Count nulls once and reuse the result for the filter.
null_counts = df_cleaned.isnull().sum()
print("\nMissing values:\n", null_counts[null_counts > 0])


MISSING VALUE HEATMAP

In [None]:
# Heatmap of the null mask: every highlighted cell is a missing entry.
null_mask = df_cleaned.isnull()
plt.figure(figsize=(12, 6))
ax = sns.heatmap(null_mask, cbar=False, cmap="Reds", yticklabels=False)
ax.set_title("Missing Values - Before Handling")
plt.tight_layout()
plt.show()


In [None]:
# Per-column missing-value count and percentage, shown only for affected columns.
missing_count = df_cleaned.isnull().sum()
missing_percent = missing_count / len(df_cleaned) * 100
missing_df = pd.DataFrame({'Missing Count': missing_count, 'Missing %': missing_percent})
has_missing = missing_df['Missing Count'] > 0
print("Missing Value Overview:\n", missing_df[has_missing].sort_values('Missing %', ascending=False))




In [None]:
# For numerical columns: fill missing values with the column median (robust to outliers).
num_cols = df_cleaned.select_dtypes(include=['float64', 'int64']).columns
for col in num_cols:
    if df_cleaned[col].isnull().any():
        median_val = df_cleaned[col].median()
        # Assign the filled column back rather than calling
        # `df_cleaned[col].fillna(..., inplace=True)`: the chained in-place form
        # can operate on a temporary object (deprecated in pandas 2.x, and a
        # no-op under Copy-on-Write), leaving df_cleaned unchanged.
        df_cleaned[col] = df_cleaned[col].fillna(median_val)

In [None]:
# Label-like columns are stored with the pandas `category` dtype.
categorical_cols = [
    'mode',
    'logistics_partner',
    'vehicle_type',
    'fuel_type',
    'traffic_condition',
    'engine_norm_type',
]
for col in categorical_cols:
    as_category = df_cleaned[col].astype('category')
    df_cleaned[col] = as_category

In [None]:
# For categorical columns: fill missing values with the most frequent category.
cat_cols = df_cleaned.select_dtypes(include='category').columns
for col in cat_cols:
    if df_cleaned[col].isnull().any():
        modes = df_cleaned[col].mode()
        if modes.empty:
            # Column has no non-null values, so there is no mode to fill with.
            continue
        mode_val = modes[0]
        # Assign back instead of a chained `fillna(..., inplace=True)`, which can
        # act on a temporary object (deprecated in pandas 2.x, no-op under
        # Copy-on-Write) and leave df_cleaned unchanged.
        df_cleaned[col] = df_cleaned[col].fillna(mode_val)

In [None]:
# Confirm the imputation: list any column that still contains nulls.
remaining_nulls = df_cleaned.isnull().sum()
print("\nRemaining missing values after handling:")
print(remaining_nulls[remaining_nulls > 0])

In [None]:
# Same null-mask heatmap as before imputation, redrawn for comparison.
null_mask_after = df_cleaned.isnull()
plt.figure(figsize=(12, 6))
ax = sns.heatmap(null_mask_after, cbar=False, cmap="Greens", yticklabels=False)
ax.set_title("Missing Values - After Handling")
plt.tight_layout()
plt.show()


In [None]:
# Structural summary of the frame: dimensions and per-column dtypes.
dataset_shape = df_cleaned.shape
print("Shape of dataset:", dataset_shape)
column_dtypes = df_cleaned.dtypes
print("Data types:\n", column_dtypes)

VISUALIZING DATA

In [None]:
# Share of records per fuel type.
fuel_type_counts = df_cleaned['fuel_type'].value_counts()
ax = fuel_type_counts.plot.pie(autopct='%1.1f%%', figsize=(6,6), title='Fuel Type Distribution')
ax.set_ylabel("")  # drop the default axis label that pandas adds to pie charts
plt.tight_layout()
plt.show()


In [None]:
# Record count per vehicle type, bars ordered by frequency.
vehicle_order = df_cleaned['vehicle_type'].value_counts().index
plt.figure(figsize=(8, 5))
ax = sns.countplot(data=df_cleaned, x='vehicle_type', order=vehicle_order, palette='Set3')
ax.set_title("Vehicle Type Distribution")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


In [None]:
# Share of records per transport mode.
plt.figure(figsize=(6, 6))
mode_counts = df_cleaned['mode'].value_counts()
ax = mode_counts.plot.pie(autopct='%1.1f%%', startangle=140, colors=sns.color_palette('pastel'))
ax.set_title("Transport Mode Distribution")
ax.set_ylabel("")
plt.tight_layout()
plt.show()


In [None]:
# Record count per logistics partner, bars ordered by frequency.
partner_order = df_cleaned['logistics_partner'].value_counts().index
plt.figure(figsize=(8, 5))
ax = sns.countplot(data=df_cleaned, x='logistics_partner', order=partner_order, palette='coolwarm')
ax.set_title("Logistics Partner Count")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


In [None]:
# Record count per traffic condition, bars ordered by frequency.
traffic_order = df_cleaned['traffic_condition'].value_counts().index
plt.figure(figsize=(6, 4))
ax = sns.countplot(data=df_cleaned, x='traffic_condition', order=traffic_order, palette='Set2')
ax.set_title("Traffic Condition Distribution")
plt.tight_layout()
plt.show()


In [None]:
# Engine norm distribution; drawn only when the column exists in the frame.
if 'engine_norm_type' in df_cleaned.columns:
    norm_order = df_cleaned['engine_norm_type'].value_counts().index
    plt.figure(figsize=(6, 4))
    ax = sns.countplot(data=df_cleaned, x='engine_norm_type', order=norm_order, palette='cubehelix')
    ax.set_title("Engine Norm Type Distribution")
    plt.tight_layout()
    plt.show()


UNIVARIATE ANALYSIS

In [None]:
# Categorical columns used in the univariate section (note: this list omits
# 'engine_norm_type' and replaces the earlier `categorical_cols`).
categorical_cols = [
    'mode',
    'logistics_partner',
    'vehicle_type',
    'fuel_type',
    'traffic_condition',
]
for col in categorical_cols:
    as_category = df_cleaned[col].astype('category')
    df_cleaned[col] = as_category

In [None]:
# Descriptive statistics: default describe() first, then category columns only.
numeric_summary = df_cleaned.describe()
print("\nNumerical Summary:\n", numeric_summary)
category_summary = df_cleaned.describe(include='category')
print("\nCategorical Summary:\n", category_summary)

In [None]:
# Frequency table for every category-typed column.
category_columns = df_cleaned.select_dtypes(include='category').columns
for col in category_columns:
    level_counts = df_cleaned[col].value_counts()
    print(f"\nValue counts for {col}:\n{level_counts}")

In [None]:
# One histogram (with KDE overlay) per float64/int64 column.
numeric_cols = df_cleaned.select_dtypes(include=['float64', 'int64']).columns

for col in numeric_cols:
    plt.figure(figsize=(8, 4))
    ax = sns.histplot(data=df_cleaned, x=col, kde=True, bins=30)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Count')
    plt.tight_layout()
    plt.show()


In [None]:
# Count plot for each column in `categorical_cols`, bars ordered by frequency.
for col in categorical_cols:
    level_order = df_cleaned[col].value_counts().index
    plt.figure(figsize=(10, 4))
    ax = sns.countplot(data=df_cleaned, x=col, order=level_order)
    ax.set_title(f'Count Plot of {col}')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()


In [None]:
# Pairwise correlation between all numeric columns.
numeric_corr = df_cleaned.select_dtypes(include='number').corr()
plt.figure(figsize=(10, 8))
ax = sns.heatmap(numeric_corr, annot=True, fmt='.2f', cmap='RdBu_r')
ax.set_title("Correlation Heatmap - All Numerical Features")
plt.tight_layout()
plt.show()


BIVARIATE ANALYSIS

NUMERICAL VS NUMERICAL

In [None]:
# Relationship between route distance and CO2 emission
ax = sns.scatterplot(
    data=df_cleaned,
    x='distance_in_km_per_route',
    y='c02_emission_kg',
)
ax.set_title("Distance vs CO2 Emission")
plt.tight_layout()
plt.show()


In [None]:
# Relationship between route distance and fuel consumption
ax = sns.scatterplot(
    data=df_cleaned,
    x='distance_in_km_per_route',
    y='fuel_consumption_in_litre',
)
ax.set_title("Distance vs Fuel Consumption")
plt.tight_layout()
plt.show()


In [None]:
# Relationship between fuel consumption and CO2 emission, with a fitted regression line
fuel_co2_grid = sns.lmplot(
    data=df_cleaned,
    x='fuel_consumption_in_litre',
    y='c02_emission_kg',
    aspect=2,
    height=5,
)
plt.title("Fuel Consumption vs CO₂ Emission")
plt.tight_layout()
plt.show()


In [None]:
# Relationship between vehicle age and CO2 emission
ax = sns.scatterplot(
    data=df_cleaned,
    x='vehicle_age_in_years',
    y='c02_emission_kg',
)
ax.set_title("Vehicle Age vs CO₂ Emission")
plt.tight_layout()
plt.show()


In [None]:
# Relationship between average speed and adjusted fuel efficiency
ax = sns.scatterplot(
    data=df_cleaned,
    x='average_speed_in_km_per_hr',
    y='adjusted_fuel_efficiency_in_km_litre',
)
ax.set_title("Speed vs Fuel Efficiency")
plt.tight_layout()
plt.show()


In [None]:
# Relationship between load factor and fuel consumption
ax = sns.scatterplot(
    data=df_cleaned,
    x='load_factor',
    y='fuel_consumption_in_litre',
)
ax.set_title("Load Factor vs Fuel Consumption")
plt.tight_layout()
plt.show()


In [None]:
# Scatter-matrix of the five core numeric measures.
pairplot_columns = [
    'distance_in_km_per_route',
    'fuel_consumption_in_litre',
    'c02_emission_kg',
    'load_factor',
    'vehicle_age_in_years',
]
sns.pairplot(df_cleaned[pairplot_columns])
plt.show()


CATEGORICAL VS CATEGORICAL

In [None]:
# Contingency table: vehicle type (rows) against fuel type (columns).
cross_tab = pd.crosstab(index=df_cleaned['vehicle_type'], columns=df_cleaned['fuel_type'])
print("\nCross-tabulation of Vehicle Type vs Fuel Type:\n")
print(cross_tab)

# Heatmap of the table; rows map to the y-axis and columns to the x-axis.
plt.figure(figsize=(8, 6))
ax = sns.heatmap(cross_tab, annot=True, fmt="d", cmap="YlGnBu")
ax.set_title("Vehicle Type vs Fuel Type")
ax.set_xlabel("Fuel Type")
ax.set_ylabel("Vehicle Type")
plt.tight_layout()
plt.show()

In [None]:
# Contingency table: fuel type (rows) against transport mode (columns).
cross_tab = pd.crosstab(df_cleaned['fuel_type'], df_cleaned['mode'])
print("\nCross-tabulation of Fuel Type vs Mode:\n")
print(cross_tab)

# Heatmap for visualizing cross-tab result
plt.figure(figsize=(8, 6))
sns.heatmap(cross_tab, annot=True, fmt="d", cmap="YlGnBu")
plt.title("Fuel Type vs Mode")
# The heatmap draws the table's columns (mode) along x and its rows (fuel type)
# along y, so the labels follow that orientation.
plt.xlabel("Mode")
plt.ylabel("Fuel Type")
plt.tight_layout()
plt.show()

In [None]:
# Contingency table: logistics partner (rows) against transport mode (columns).
cross_tab = pd.crosstab(df_cleaned['logistics_partner'], df_cleaned['mode'])
print("\nCross-tabulation of Logistics Partner vs Mode:\n")
print(cross_tab)

# Heatmap for visualizing cross-tab result
plt.figure(figsize=(8, 6))
sns.heatmap(cross_tab, annot=True, fmt="d", cmap="YlGnBu")
plt.title("Logistics Partner vs Mode")
# The heatmap draws the table's columns (mode) along x and its rows (logistics
# partner) along y, so the labels follow that orientation.
plt.xlabel("Mode")
plt.ylabel("Logistics Partner")
plt.tight_layout()
plt.show()

NUMERICAL VS CATEGORICAL

In [None]:
# Column sets compared in the numerical-vs-categorical plots below.
categorical_cols = [
    'mode',
    'logistics_partner',
    'vehicle_type',
    'fuel_type',
    'traffic_condition',
]
numerical_cols = [
    'distance_in_km_per_route',
    'fuel_consumption_in_litre',
    'c02_emission_kg',
    'load_factor',
    'vehicle_age_in_years',
]
for col in categorical_cols:
    as_category = df_cleaned[col].astype('category')
    df_cleaned[col] = as_category

In [None]:
# Box plots: one figure for every (categorical, numerical) column pair.
for cat_col in categorical_cols:
    for num_col in numerical_cols:
        plt.figure(figsize=(10, 5))
        ax = sns.boxplot(data=df_cleaned, x=cat_col, y=num_col)
        ax.set_title(f'{num_col} by {cat_col}')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()

In [None]:
# Violin plots: one figure for every (categorical, numerical) column pair.
for cat_col in categorical_cols:
    for num_col in numerical_cols:
        plt.figure(figsize=(10, 5))
        ax = sns.violinplot(data=df_cleaned, x=cat_col, y=num_col)
        ax.set_title(f'{num_col} distribution by {cat_col}')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()

In [None]:
# Bar plots of the per-category mean for every (categorical, numerical) pair.
for cat_col in categorical_cols:
    for num_col in numerical_cols:
        plt.figure(figsize=(10, 5))
        ax = sns.barplot(data=df_cleaned, x=cat_col, y=num_col, estimator='mean')
        ax.set_title(f'Mean {num_col} by {cat_col}')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()

DETECTING OUTLIERS

In [None]:
# Single-variable box plot for each numeric column to eyeball outliers.
numeric_cols = df_cleaned.select_dtypes(include='number').columns
for col in numeric_cols:
    plt.figure(figsize=(8, 4))
    ax = sns.boxplot(x=df_cleaned[col])
    ax.set_title(f'Boxplot of {col}')
    ax.set_xlabel(col)
    plt.tight_layout()
    plt.show()

In [None]:
# Z-score outlier detection: a value counts as an outlier when its absolute
# z-score within its column exceeds `threshold`.
numeric_df = df_cleaned.select_dtypes(include='number')
# Wrap the scores back into a DataFrame with the original labels; scipy may
# hand back a bare NumPy array, in which case the per-column counts printed
# below would carry no column names.
z_scores = pd.DataFrame(
    np.abs(np.asarray(zscore(numeric_df, nan_policy='omit'))),
    index=numeric_df.index,
    columns=numeric_df.columns,
)
threshold = 3
outlier_mask = (z_scores > threshold)  # NaN scores compare as False
outlier_counts = outlier_mask.sum(axis=0)

print(" Outlier count per column using Z-score method:")
print(outlier_counts)


FEATURE ENGINEERING

In [None]:
# Ratio features. Each one divides two existing columns; a zero denominator
# produces +/-inf, which is converted to NaN so it is treated as missing.
infinities = [np.inf, -np.inf]

# Load utilization: load carried relative to vehicle capacity.
# Infinities are removed BEFORE clipping; otherwise a zero capacity (+inf) would
# be clipped to 1.0 and silently reported as a fully loaded vehicle.
df_cleaned['load_utilization'] = df_cleaned['load_capacity_in_kg'] / df_cleaned['vehicle_capacity_in_kg']
df_cleaned['load_utilization'] = df_cleaned['load_utilization'].replace(infinities, np.nan)
df_cleaned['load_utilization'] = df_cleaned['load_utilization'].clip(upper=1.0)  # Load cannot exceed 100%

# Fuel consumption per km
df_cleaned['fuel_per_km'] = df_cleaned['fuel_consumption_in_litre'] / df_cleaned['distance_in_km_per_route']
df_cleaned['fuel_per_km'] = df_cleaned['fuel_per_km'].replace(infinities, np.nan)

# CO2 emission per km
df_cleaned['co2_per_km'] = df_cleaned['c02_emission_kg'] / df_cleaned['distance_in_km_per_route']
df_cleaned['co2_per_km'] = df_cleaned['co2_per_km'].replace(infinities, np.nan)

# CO2 emission per litre of fuel. No rows are filtered out here; rows with zero
# fuel consumption end up as NaN through the infinity replacement.
df_cleaned['co2_per_litre'] = df_cleaned['c02_emission_kg'] / df_cleaned['fuel_consumption_in_litre']
df_cleaned['co2_per_litre'] = df_cleaned['co2_per_litre'].replace(infinities, np.nan)

# Speed per stop — might indicate operational efficiency
df_cleaned['speed_per_stop'] = df_cleaned['average_speed_in_km_per_hr'] / df_cleaned['no_of_stop']
df_cleaned['speed_per_stop'] = df_cleaned['speed_per_stop'].replace(infinities, np.nan)

print("\nFeature engineering complete. New columns added:")
print(['load_utilization', 'fuel_per_km', 'co2_per_km', 'co2_per_litre', 'speed_per_stop'])
# Last expression of the cell: rendered as the cell output.
df_cleaned[['load_utilization', 'fuel_per_km', 'co2_per_km', 'co2_per_litre', 'speed_per_stop']].describe()


In [None]:
# Names of the derived columns, reused by later cells.
engineered_features = [
    'load_utilization',
    'fuel_per_km',
    'co2_per_km',
    'co2_per_litre',
    'speed_per_stop',
]

# Distribution of each derived column.
for feature in engineered_features:
    plt.figure(figsize=(10, 4))
    ax = sns.histplot(df_cleaned[feature], bins=30, kde=True)
    ax.set_title(f'Distribution of {feature}')
    plt.show()


In [None]:
# Null counts in the derived columns (division by zero was mapped to NaN).
missing_engineered = df_cleaned[engineered_features].isnull().sum()
engineered_with_nulls = missing_engineered[missing_engineered > 0]
print("Missing Values in Engineered Features:\n", engineered_with_nulls)


In [None]:
# Load utilization spread per vehicle type
plt.figure(figsize=(12, 6))
ax = sns.boxplot(data=df_cleaned, x='vehicle_type', y='load_utilization')
ax.set_title("Load Utilization by Vehicle Type")
plt.xticks(rotation=45)
plt.show()


In [None]:
# CO2 per litre spread per fuel type
plt.figure(figsize=(12, 6))
ax = sns.boxplot(data=df_cleaned, x='fuel_type', y='co2_per_litre')
ax.set_title("CO2 per Litre by Fuel Type")
plt.xticks(rotation=45)
plt.show()


In [None]:
# CO2 per litre spread per engine norm type
plt.figure(figsize=(12, 6))
ax = sns.boxplot(data=df_cleaned, x='engine_norm_type', y='co2_per_litre')
ax.set_title("CO2 per Litre by Engine Norm Type")
plt.xticks(rotation=45)
plt.show()


In [None]:
# CO2 per km spread per vehicle type
plt.figure(figsize=(12, 6))
ax = sns.boxplot(data=df_cleaned, x='vehicle_type', y='co2_per_km')
ax.set_title("CO2 per km by Vehicle Type")
plt.xticks(rotation=45)
plt.show()


In [None]:
# CO2 per km spread per traffic condition
plt.figure(figsize=(12, 6))
ax = sns.boxplot(data=df_cleaned, x='traffic_condition', y='co2_per_km')
ax.set_title("CO2 per km by traffic_condition Type")
plt.xticks(rotation=45)
plt.show()


In [None]:
# Per-km fuel use against per-km CO2
ax = sns.scatterplot(data=df_cleaned, x='fuel_per_km', y='co2_per_km')
ax.set_title("CO2 vs Fuel Consumption per km")
plt.show()


In [None]:
# Speed per stop against per-km CO2
ax = sns.scatterplot(data=df_cleaned, x='speed_per_stop', y='co2_per_km')
ax.set_title("CO2 vs Speed per stop")
plt.show()


In [None]:
# Mean per-km / per-litre metrics for each fuel type, lowest fuel use first.
fuel_type_means = df_cleaned.groupby('fuel_type')[['fuel_per_km', 'co2_per_km', 'co2_per_litre']].mean()
fuel_type_means.sort_values(by='fuel_per_km')


In [None]:
# Grouped bar chart of the mean per-km / per-litre metrics by fuel type.
emission_metrics = ['fuel_per_km', 'co2_per_km', 'co2_per_litre']
df_summary = df_cleaned.groupby('fuel_type')[emission_metrics].mean()

ax = df_summary.plot(kind='bar', figsize=(10,6))
ax.set_title("Fuel Efficiency & Emissions by Fuel Type")
ax.set_ylabel("Values")
plt.xticks(rotation=45)
ax.grid(True)
plt.tight_layout()
plt.show()


In [None]:
# Correlation between the derived columns.
corr_columns = ['fuel_per_km', 'co2_per_km', 'co2_per_litre', 'load_utilization', 'speed_per_stop']
corr = df_cleaned[corr_columns].corr()

ax = sns.heatmap(corr, annot=True, cmap='coolwarm')
ax.set_title("Correlation Matrix of Engineered Features")
plt.show()


In [None]:
# Mean per-km fuel use and CO2 for each vehicle type, lowest fuel use first.
vehicle_type_means = df_cleaned.groupby('vehicle_type')[['fuel_per_km', 'co2_per_km']].mean()
vehicle_type_means.sort_values('fuel_per_km')


In [None]:
# Bar chart of mean per-km fuel use and CO2 by vehicle type.
per_km_metrics = ['fuel_per_km', 'co2_per_km']
df_vehicle = df_cleaned.groupby('vehicle_type')[per_km_metrics].mean().sort_values('fuel_per_km')

ax = df_vehicle.plot(kind='bar', figsize=(10,6), color=['skyblue', 'salmon'])
ax.set_title("Fuel & CO2 Efficiency by Vehicle Type")
ax.set_ylabel("Average Value per km")
plt.xticks(rotation=45)
ax.grid(True)
plt.tight_layout()
plt.show()


In [None]:
# Hex-binned joint distribution of per-km fuel use and per-km CO2.
joint_grid = sns.jointplot(
    data=df_cleaned,
    x='fuel_per_km',
    y='co2_per_km',
    kind='hex',
    cmap='coolwarm',
)
plt.suptitle("Joint Distribution: Fuel per km vs CO2 per km", y=1.02)
plt.show()


In [None]:
# Per-km fuel use against per-km CO2 with a fitted regression line.
per_km_grid = sns.lmplot(
    data=df_cleaned,
    x='fuel_per_km',
    y='co2_per_km',
    aspect=2,
)
plt.title("Fuel per km vs CO₂ per km (with Trend Line)")
plt.tight_layout()
plt.show()


MULTIVARIATE ANALYSIS

In [None]:
# Scatter-matrix of the derived columns, coloured by fuel type.
multivariate_columns = [
    'fuel_per_km',
    'co2_per_km',
    'co2_per_litre',
    'speed_per_stop',
    'load_utilization',
    'fuel_type',
]
sns.pairplot(df_cleaned[multivariate_columns], hue='fuel_type', palette='Set2')
plt.suptitle("Pairplot Colored by Fuel Type", y=1.02)
plt.show()


COMPARISON: ORIGINAL VS ENGINEERED FEATURES

In [None]:
# Mean total fuel consumption and mean fuel per km for each vehicle type,
# ordered by fuel per km.
fuel_compare = df_cleaned.groupby('vehicle_type')[
    ['fuel_consumption_in_litre', 'fuel_per_km']
].mean().sort_values('fuel_per_km')
# Two bar series on one figure: the original measure on the left y-axis and the
# per-km measure on a secondary (right) y-axis. `position` 0 vs 1 places the
# two bar sets on opposite sides of each tick.
ax = fuel_compare['fuel_consumption_in_litre'].plot(kind='bar', figsize=(10,6), color='skyblue', position=0, width=0.4, label='Fuel Consumption (L)')
fuel_compare['fuel_per_km'].plot(kind='bar', ax=ax, color='salmon', position=1, width=0.4, secondary_y=True, label='Fuel per Km')

ax.set_title("Original Fuel Consumption vs Fuel per Km by Vehicle Type")
ax.set_ylabel("Fuel Consumption (L)")
# `ax.right_ax` is the secondary axis created by the secondary_y plot above.
ax.right_ax.set_ylabel("Fuel per Km")
ax.set_xticklabels(fuel_compare.index, rotation=45)
# Each axis gets its own legend.
ax.legend(loc='upper left')
ax.right_ax.legend(loc='upper right')
plt.tight_layout()
plt.show()


In [None]:
# Mean total CO2 emission and mean CO2 per km for each vehicle type,
# ordered by CO2 per km.
co2_compare = df_cleaned.groupby('vehicle_type')[
    ['c02_emission_kg', 'co2_per_km']
].mean().sort_values('co2_per_km')
# Two bar series on one figure: the original measure on the left y-axis and the
# per-km measure on a secondary (right) y-axis. `position` 0 vs 1 places the
# two bar sets on opposite sides of each tick.
ax = co2_compare['c02_emission_kg'].plot(kind='bar', figsize=(10,6), color='teal', position=0, width=0.4, label='CO₂ Emission (kg)')
co2_compare['co2_per_km'].plot(kind='bar', ax=ax, color='coral', position=1, width=0.4, secondary_y=True, label='CO₂ per Km')

ax.set_title("Original CO₂ Emission vs CO₂ per Km by Vehicle Type")
ax.set_ylabel("CO₂ Emission (kg)")
# `ax.right_ax` is the secondary axis created by the secondary_y plot above.
ax.right_ax.set_ylabel("CO₂ per Km")
ax.set_xticklabels(co2_compare.index, rotation=45)
# Each axis gets its own legend.
ax.legend(loc='upper left')
ax.right_ax.legend(loc='upper right')

plt.tight_layout()
plt.show()


SUMMARIZING TOP 5 AND BOTTOM 5

In [None]:
# Five rows with the lowest and five with the highest fuel_per_km.
fuel_view_columns = ['vehicle_type', 'fuel_type', 'fuel_per_km']

lowest_fuel_rows = df_cleaned.sort_values('fuel_per_km').head(5)
print("Top 5 Efficient Vehicles (fuel_per_km):")
print(lowest_fuel_rows[fuel_view_columns])

highest_fuel_rows = df_cleaned.sort_values('fuel_per_km', ascending=False).head(5)
print("\nLeast Efficient Vehicles (fuel_per_km):")
print(highest_fuel_rows[fuel_view_columns])


In [None]:
# Five rows with the lowest and five with the highest co2_per_km.
co2_view_columns = ['vehicle_type', 'fuel_type', 'co2_per_km']

lowest_co2_rows = df_cleaned.sort_values('co2_per_km').head(5)
print("Top 5 Efficient Vehicles (co2_per_km):")
print(lowest_co2_rows[co2_view_columns])

highest_co2_rows = df_cleaned.sort_values('co2_per_km', ascending=False).head(5)
print("\nLeast Efficient Vehicles (co2_per_km):")
print(highest_co2_rows[co2_view_columns])


In [None]:
# For each categorical column: heatmap of the mean derived features per level.
# Relies on `categorical_cols` and `engineered_features` from earlier cells.
for cat_col in categorical_cols:
    grouped = df_cleaned.groupby(cat_col)[engineered_features].mean()
    plt.figure(figsize=(10, 5))
    ax = sns.heatmap(grouped, annot=True, cmap="YlGnBu")
    ax.set_title(f"Mean Engineered Features by {cat_col}")
    plt.tight_layout()
    plt.show()


In [None]:
# Bar charts of the five lowest and five highest co2_per_km rows.
co2_rows = df_cleaned[['vehicle_type', 'fuel_type', 'co2_per_km']]

# Top 5 (lowest CO2 per km)
top5 = co2_rows.sort_values('co2_per_km').head(5)
ax = top5.plot(kind='bar', x='vehicle_type', y='co2_per_km', color='green', title='Top 5 Efficient Vehicles (CO₂ per Km)')
ax.set_ylabel("CO₂ per Km")
plt.tight_layout()
plt.show()

# Bottom 5 (highest CO2 per km)
bottom5 = co2_rows.sort_values('co2_per_km', ascending=False).head(5)
ax = bottom5.plot(kind='bar', x='vehicle_type', y='co2_per_km', color='red', title='Bottom 5 Least Efficient Vehicles (CO₂ per Km)')
ax.set_ylabel("CO₂ per Km")
plt.tight_layout()
plt.show()


In [None]:
# CO2 per km spread per engine norm type
ax = sns.boxplot(data=df_cleaned, x='engine_norm_type', y='co2_per_km')
ax.set_title("CO₂ per Km by Engine Norm Type")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


In [None]:
# Flag rows whose co2_per_km is above 1.0 (NaN compares as False), then tally.
df_cleaned['high_emitter'] = df_cleaned['co2_per_km'].gt(1.0)
high_emitter_tally = df_cleaned['high_emitter'].value_counts()
print(high_emitter_tally)


In [None]:
# Number of flagged high emitters per vehicle type, largest first.
high_emitters_per_type = (
    df_cleaned[df_cleaned['high_emitter']]
    .groupby('vehicle_type')
    .size()
    .sort_values(ascending=False)
)
ax = high_emitters_per_type.plot(kind='bar', figsize=(10,5), color='crimson')
ax.set_title("Count of High Emitters by Vehicle Type")
ax.set_ylabel("Number of High Emitters")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


In [None]:
# Efficiency score = 1/fuel_per_km + 1/co2_per_km; infinities (zero
# denominators) are mapped to NaN.
inverse_fuel_per_km = 1 / df_cleaned['fuel_per_km']
inverse_co2_per_km = 1 / df_cleaned['co2_per_km']
raw_score = inverse_fuel_per_km + inverse_co2_per_km
df_cleaned['efficiency_score'] = raw_score.replace([np.inf, -np.inf], np.nan)

# View top 10 efficient vehicles (rows with a missing value in these columns are dropped)
score_view = df_cleaned[['vehicle_type', 'fuel_type', 'efficiency_score']].dropna()
top_eff = score_view.sort_values('efficiency_score', ascending=False).head(10)
print(top_eff)


In [None]:
# Flag rows whose fuel_per_km is above the column's 75th percentile.
fuel_per_km_q75 = df_cleaned['fuel_per_km'].quantile(0.75)
df_cleaned['low_efficiency'] = df_cleaned['fuel_per_km'].gt(fuel_per_km_q75)

print(df_cleaned['low_efficiency'].value_counts())

# Visualize: flagged rows per vehicle type, largest first
low_efficiency_per_type = (
    df_cleaned[df_cleaned['low_efficiency']]
    .groupby('vehicle_type')
    .size()
    .sort_values(ascending=False)
)
ax = low_efficiency_per_type.plot(kind='bar', figsize=(10,5), color='orange')
ax.set_title("Count of Low Fuel-Efficient Vehicles by Type")
ax.set_ylabel("Number of Low Efficiency Vehicles")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


In [None]:
# Split co2_per_km into three quantile-based tiers and count them by fuel type.
tier_labels = ['Low', 'Medium', 'High']
df_cleaned['co2_tier'] = pd.qcut(df_cleaned['co2_per_km'], q=3, labels=tier_labels)
ax = sns.countplot(data=df_cleaned, x='co2_tier', hue='fuel_type')
ax.set_title("Emission Tiers by Fuel Type")
plt.tight_layout()
plt.show()


In [None]:
# Rank logistics partners by their mean CO2 per km.
# Rows with a missing or placeholder ("Unknown"/"unknown") partner are excluded.
filtered_df = df_cleaned[~df_cleaned['logistics_partner'].isin(['Unknown', 'unknown']) & df_cleaned['logistics_partner'].notna()]
# Aggregate the FILTERED rows (the filter was previously built but not used).
# observed=True keeps only partners that still have rows after filtering, so an
# excluded category does not reappear as an empty group; dropna() removes any
# partner whose mean is undefined so it cannot land in the "bottom 5".
logistics_efficiency = (
    filtered_df.groupby('logistics_partner', observed=True)['co2_per_km']
    .mean()
    .dropna()
    .sort_values()
)

# Top 5 most efficient
top5_logistics = logistics_efficiency.head(5)
print("Top 5 Efficient Logistics Partners (Lowest CO₂ per Km):")
print(top5_logistics)

# Bottom 5 least efficient
bottom5_logistics = logistics_efficiency.tail(5)
print("\nBottom 5 Least Efficient Logistics Partners (Highest CO₂ per Km):")
print(bottom5_logistics)


In [None]:
# Bar plots of the five best and five worst partners by mean CO2 per km.
# Uses `top5_logistics` / `bottom5_logistics` from the previous cell.
partner_charts = (
    (top5_logistics, 'green', 'Top 5 Efficient Logistics Partners (CO₂ per Km)'),
    (bottom5_logistics, 'red', 'Bottom 5 Least Efficient Logistics Partners (CO₂ per Km)'),
)
for partner_means, bar_color, chart_title in partner_charts:
    plt.figure(figsize=(10,5))
    partner_means.plot(kind='bar', color=bar_color)
    plt.title(chart_title)
    plt.ylabel('Average CO₂ per Km')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()


In [None]:
# Persist the frame, including every column added in this notebook, to a new
# CSV file; the row index is not written.
df_cleaned.to_csv("../dataset/carbon_emission_final_engineered.csv", index=False)