In [1]:
# Cell 1: Imports
import pickle
from pathlib import Path

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor # Changed import
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder



In [None]:
# Cell 2: Load Data
# Read the CSV stored under data/processed into a DataFrame. The path is
# relative, so it resolves against the kernel's current working directory.
df = pd.read_csv(filepath_or_buffer='../../data/processed/model_ready_ev_data.csv')



In [None]:
# Cell 3: Define Features (X) and Target (y) and Split
# Last model year that belongs to the training set; every later year is held
# out for evaluation. Defined once so the two sides of the split cannot drift.
SPLIT_YEAR = 2022

X = df[['Postal Code', 'Model Year', 'Prev_Year_EV_Count', 'Year_Delta']]
y = df['EV_Count']

# Build each boolean mask once and apply it to both X and y, so features and
# labels are always taken from exactly the same rows (no index lookup needed).
# Rows whose 'Model Year' is missing satisfy neither comparison and therefore
# end up in neither split.
train_mask = X['Model Year'] <= SPLIT_YEAR
test_mask = X['Model Year'] > SPLIT_YEAR
X_train, X_test = X.loc[train_mask], X.loc[test_mask]
y_train, y_test = y.loc[train_mask], y.loc[test_mask]

# Fail here with a readable message instead of later inside fit()/predict().
assert len(X_train) > 0, f"No rows with 'Model Year' <= {SPLIT_YEAR}; training split is empty"
assert len(X_test) > 0, f"No rows with 'Model Year' > {SPLIT_YEAR}; test split is empty"



In [None]:
# Cell 4: Create Preprocessing and Model Pipeline
numeric_features = ['Model Year', 'Prev_Year_EV_Count', 'Year_Delta']
categorical_features = ['Postal Code']

# Numeric columns are passed through unchanged; the postal code is one-hot
# encoded, and categories not seen during fit are ignored at transform time
# rather than raising an error.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ]
)

# n_jobs=-1 uses all available CPU cores to speed up training;
# random_state fixes the seed so repeated runs build the same forest.
forest = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('regressor', forest)])



In [None]:
# Cell 5: Train and Evaluate
# Fit the full pipeline (preprocessing + forest) on the training split, then
# predict on the held-out split.
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# Keep the metrics in named variables so later cells can reuse or compare them
# without recomputing.
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# The header has no placeholders, so it is a plain string rather than an f-string.
print("--- Random Forest Evaluation ---")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"R-squared (R2) Score: {r2:.2f}")



In [None]:
# Cell 6: Save Model
model_path = '../../models/random_forest.pkl'

# Create the target directory first: without it, open() raises
# FileNotFoundError and the freshly trained model is never written.
Path(model_path).parent.mkdir(parents=True, exist_ok=True)

# NOTE: pickle files can execute arbitrary code when loaded, so only unpickle
# this artifact from a trusted location.
with open(model_path, 'wb') as file:
    pickle.dump(pipeline, file)
print(f"Model saved to: {model_path}")