# 03 – Model Training (City-level EV Count)

Objective: Train regression models to predict yearly EV count per city; select the best by R² on a held-out 20% test split, using MAE as the tie-breaker.

Inputs: `data/processed/model_ready_city_ev.csv`
Outputs: `models/best_model.pkl`, `data/processed/model_eval.csv`


In [2]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
import joblib


In [3]:
# Locate the project root no matter where the kernel was started
CWD = Path.cwd()
# A working directory named 'notebooks' means the root is one level up;
# any other working directory is treated as the root itself.
BASE_DIR = CWD.parent if CWD.name == 'notebooks' else CWD

# Input table, model output folder, and evaluation report location
IN_PATH = BASE_DIR / 'data/processed/model_ready_city_ev.csv'
MODEL_DIR = BASE_DIR / 'models'
EVAL_OUT = BASE_DIR / 'data/processed/model_eval.csv'

# Stop immediately when the input file is absent
if not IN_PATH.exists():
    raise FileNotFoundError(f"Missing input: {IN_PATH}")

# Create the model folder (and any missing parents) if it is not there yet
MODEL_DIR.mkdir(parents=True, exist_ok=True)

# Echo the resolved locations, one "label: value" line each
print("\n".join(
    f"{label}: {value}"
    for label, value in (
        ("Current directory", CWD),
        ("Base directory", BASE_DIR),
        ("Input path", IN_PATH),
        ("Model directory", MODEL_DIR),
        ("Evaluation output", EVAL_OUT),
        ("Input exists", IN_PATH.exists()),
        ("Model directory exists", MODEL_DIR.exists()),
    )
))


Current directory: e:\FDM\PROJECT\NEW\Smart-Charge-Locator\notebooks
Base directory: e:\FDM\PROJECT\NEW\Smart-Charge-Locator
Input path: e:\FDM\PROJECT\NEW\Smart-Charge-Locator\data\processed\model_ready_city_ev.csv
Model directory: e:\FDM\PROJECT\NEW\Smart-Charge-Locator\models
Evaluation output: e:\FDM\PROJECT\NEW\Smart-Charge-Locator\data\processed\model_eval.csv
Input exists: True
Model directory exists: True


In [4]:
# Read the model-ready table and separate predictors from the target
df = pd.read_csv(IN_PATH)

# Predictor columns fed to every candidate model; the target is 'EV_Count'
feature_cols = ['Model Year', 'Prev_Year_EV_Count', 'Year_Delta']
X = df.loc[:, feature_cols].copy()
y = df['EV_Count']

# Report the dimensions of the full table, the predictors, and the target
print("\n".join(
    f"{label} shape: {table.shape}"
    for label, table in (("Data", df), ("Features", X), ("Target", y))
))
print("Feature columns:", X.columns.tolist())
print("Sample features:")
# Last expression of the cell: rendered as a rich table by the notebook
X.head()


Data shape: (5190, 5)
Features shape: (5190, 3)
Target shape: (5190,)
Feature columns: ['Model Year', 'Prev_Year_EV_Count', 'Year_Delta']
Sample features:


Unnamed: 0,Model Year,Prev_Year_EV_Count,Year_Delta
0,2011,0.0,11
1,2013,1.0,13
2,2014,8.0,14
3,2015,4.0,15
4,2016,5.0,16


In [5]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Row counts per split
print(f"Training set: {len(X_train)} samples")
print(f"Test set: {len(X_test)} samples")
# Min/max of the target in each split
print("\n".join(
    f"{split} target range: {target.min()} - {target.max()}"
    for split, target in (("Train", y_train), ("Test", y_test))
))


Training set: 4152 samples
Test set: 1038 samples
Train target range: 1 - 8900
Test target range: 1 - 2330


In [6]:
# Candidate regressors, keyed by the name used in the evaluation report
candidates = dict(
    LinearRegression=LinearRegression(),
    Ridge=Ridge(alpha=1.0),
    RandomForest=RandomForestRegressor(
        n_estimators=200,
        random_state=42,  # fixed seed so the forest is repeatable
        n_jobs=-1,
    ),
)

# List each candidate next to the estimator class that backs it
print("Model candidates:")
print("\n".join(
    f"  - {label}: {type(estimator).__name__}"
    for label, estimator in candidates.items()
))


Model candidates:
  - LinearRegression: LinearRegression
  - Ridge: Ridge
  - RandomForest: RandomForestRegressor


In [7]:
# Fit every candidate on the training split and score it on the test split.
# NOTE(review): the winner is picked on the same test split whose scores are
# printed as the "best" figures, so those figures may be optimistic.
records = []
best_name = None
best_model = None
best_score = -np.inf  # highest R² seen so far
best_mae = np.inf     # MAE of the model holding that R²

print("Training and evaluating models...")
for name, model in candidates.items():
    print(f"  Training {name}...")
    pred = model.fit(X_train, y_train).predict(X_test)
    r2 = r2_score(y_test, pred)
    mae = mean_absolute_error(y_test, pred)
    records.append(dict(model=name, r2=r2, mae=mae))
    print(f"    R² = {r2:.3f}, MAE = {mae:.3f}")

    # Higher R² wins; on an exactly equal R² the lower MAE wins
    higher_r2 = r2 > best_score
    tie_with_lower_mae = r2 == best_score and mae < best_mae
    if higher_r2 or tie_with_lower_mae:
        best_name, best_model = name, model
        best_score, best_mae = r2, mae

# Summary of the selected model
print(f"\nBest model: {best_name}")
print(f"Best R²: {best_score:.3f}")
print(f"Best MAE: {best_mae:.3f}")


Training and evaluating models...
  Training LinearRegression...
    R² = 0.701, MAE = 25.164
  Training Ridge...
    R² = 0.701, MAE = 25.164
  Training RandomForest...
    R² = 0.959, MAE = 9.342

Best model: RandomForest
Best R²: 0.959
Best MAE: 9.342


In [8]:
# Persist the evaluation table and the winning model.
# The results table is built once and reused for both the CSV and the summary.
eval_df = pd.DataFrame(records)
eval_df.to_csv(EVAL_OUT, index=False)

# best_model is still None when no candidate ever beat the initial -inf score
# (an empty candidate set, or R² values that are NaN and therefore never
# compare as greater). Refuse to write a pickle of None in that case.
if best_model is None:
    raise RuntimeError("No model was selected; nothing to save as best_model.pkl")

best_model_path = MODEL_DIR / 'best_model.pkl'
joblib.dump(best_model, best_model_path)

print("Results saved:")
print(f"  - Model evaluation: {EVAL_OUT}")
print(f"  - Best model: {best_model_path}")

# Show the candidates ranked from highest to lowest R²
print("\nModel evaluation results:")
print(eval_df.sort_values('r2', ascending=False))


Results saved:
  - Model evaluation: e:\FDM\PROJECT\NEW\Smart-Charge-Locator\data\processed\model_eval.csv
  - Best model: e:\FDM\PROJECT\NEW\Smart-Charge-Locator\models\best_model.pkl

Model evaluation results:
              model        r2        mae
2      RandomForest  0.959353   9.342170
1             Ridge  0.701039  25.163649
0  LinearRegression  0.701039  25.163656
