In [9]:
import pandas as pd
import os
from lazypredict.Supervised import LazyRegressor
from sklearn.model_selection import train_test_split

data_dir = "../data/processed"

def prepare_vehicle_data(filename, bev_col='BEV', af_cols=['BEV','PHEV','H2','LPG','CNG','LNG'], suffix=''):
    df = pd.read_csv(os.path.join(data_dir, filename))
    # Ensure columns exist
    cols = [c for c in af_cols if c in df.columns]
    # Group by Country, Year and sum/mean as appropriate
    group_cols = ['Country', 'Year']
    agg_df = df.groupby(group_cols)[cols].sum().reset_index()
    # Create AF column
    agg_df[f'AF{suffix}'] = agg_df[cols].sum(axis=1)
    # Keep only BEV and AF
    out = agg_df[group_cols + [bev_col, f'AF{suffix}']]
    out = out.rename(columns={bev_col: f'BEV{suffix}'})
    return out

# Prepare each vehicle dataset
fleet = prepare_vehicle_data("combined_fleet_data.csv", suffix='_fleet')
newreg = prepare_vehicle_data("combined_newreg_data.csv", suffix='_newreg')
mktshare = prepare_vehicle_data("combined_registrations_mktshare_data.csv", suffix='_mktshare')

# Merge all into one
vehicle_data = fleet.merge(newreg, on=['Country','Year']).merge(mktshare, on=['Country','Year'])
vehicle_data.to_csv(os.path.join(data_dir, "combined_vehicle_data.csv"), index=False)

# Load AQ annual averages
aq = pd.read_csv(os.path.join(data_dir, "AQ_annual_averages.csv"))

# Merge vehicle data with AQ data
data = aq.merge(vehicle_data, on=['Country','Year'])

# Identify feature columns (all BEV/AF columns)
feature_cols = [col for col in data.columns if col.startswith('BEV') or col.startswith('AF')]
# Identify target columns (all pollutant annual averages)
target_cols = [col for col in data.columns if 'AnnualAvg' in col and col not in feature_cols]

# For each pollutant (Pollutant column), run regression for each annual average column
for pollutant in data['Pollutant'].unique():
    pollutant_data = data[data['Pollutant'] == pollutant]
    print(f"\n=== Modeling for pollutant: {pollutant} ===")
    for target in [c for c in target_cols if pollutant in c or c.endswith('_'+pollutant)]:
        print(f"\nTarget: {target}")
        for feature in feature_cols:
            X = pollutant_data[[feature]]
            y = pollutant_data[target]
            mask = y.notnull() & X[feature].notnull()
            if mask.sum() < 10:  # Not enough data
                print(f"  Skipping {feature} (not enough data)")
                continue
            X_train, X_test, y_train, y_test = train_test_split(X[mask], y[mask], test_size=0.2, random_state=42)
            reg = LazyRegressor(verbose=0, ignore_warnings=True, custom_metric=None)
            models, predictions = reg.fit(X_train, X_test, y_train, y_test)
            print(f"\nFeature: {feature}")
            print(models.head(5))  # Show top 5 models

print("Analysis complete.")


=== Modeling for pollutant: CO2 ===

=== Modeling for pollutant: NO ===

=== Modeling for pollutant: NO2 ===

=== Modeling for pollutant: NOX as NO2 ===

=== Modeling for pollutant: PM10 ===

=== Modeling for pollutant: PM2.5 ===
Analysis complete.


In [None]:
import pandas as pd
import os
from lazypredict.Supervised import LazyRegressor
from sklearn.model_selection import train_test_split

data_dir = "../data/processed"


def prepare_vehicle_data(filename, bev_col='BEV', af_cols=['BEV','PHEV','H2','LPG','CNG','LNG'], suffix=''):
    df = pd.read_csv(os.path.join(data_dir, filename))
    cols = [c for c in af_cols if c in df.columns]
    group_cols = ['Country', 'Year']
    agg_df = df.groupby(group_cols)[cols].sum().reset_index()
    agg_df[f'AF{suffix}'] = agg_df[cols].sum(axis=1)
    out = agg_df[group_cols + [bev_col, f'AF{suffix}']]
    out = out.rename(columns={bev_col: f'BEV{suffix}'})
    return out

fleet = prepare_vehicle_data("combined_fleet_data.csv", suffix='_fleet')
newreg = prepare_vehicle_data("combined_newreg_data.csv", suffix='_newreg')
mktshare = prepare_vehicle_data("combined_registrations_mktshare_data.csv", suffix='_mktshare')

vehicle_data = fleet.merge(newreg, on=['Country','Year']).merge(mktshare, on=['Country','Year'])
vehicle_data.to_csv(os.path.join(data_dir, "combined_vehicle_data.csv"), index=False)

aq = pd.read_csv(os.path.join(data_dir, "AQ_annual_averages.csv"))
data = aq.merge(vehicle_data, on=['Country','Year'])

feature_cols = [col for col in data.columns if col.startswith('BEV') or col.startswith('AF')]
target_cols = [col for col in data.columns if 'AnnualAvg' in col and col not in feature_cols]

# Collect all results in a list of dicts
results_list = []

print("Merged data shape:", data.shape)
print(data.head())
print("Feature columns:", feature_cols)
print("Target columns:", target_cols)

for pollutant in data['Pollutant'].unique():
    pollutant_data = data[data['Pollutant'] == pollutant]
    for target in target_cols:
        for feature in feature_cols:
            X = pollutant_data[[feature]]
            y = pollutant_data[target]
            mask = y.notnull() & X[feature].notnull()
            print(f"Pollutant: {pollutant}, Target: {target}, Feature: {feature}, Valid rows: {mask.sum()}")
            if mask.sum() < 10:
                continue
            X_train, X_test, y_train, y_test = train_test_split(X[mask], y[mask], test_size=0.2, random_state=42)
            reg = LazyRegressor(verbose=0, ignore_warnings=True, custom_metric=None)
            models, predictions = reg.fit(X_train, X_test, y_train, y_test)
            # Add identifying columns to the models DataFrame
            models = models.reset_index().rename(columns={'index': 'Model'})
            models['Pollutant'] = pollutant
            models['Target'] = target
            models['Feature'] = feature
            results_list.append(models)

# Concatenate all results
all_results = pd.concat(results_list, ignore_index=True)
# Save to CSV
all_results.to_csv(os.path.join(data_dir, "model_results_summary.csv"), index=False)

if not results_list:
    print("No valid model runs: check your data and thresholds!")
else:
    all_results = pd.concat(results_list, ignore_index=True)
    all_results.to_csv(os.path.join(data_dir, "model_results_summary.csv"), index=False)
    print("Saved all model results to model_results_summary.csv")
    print(all_results.head())