# Milestone 1 — EV Charging Demand Prediction

## 1. Problem Understanding

**Objective:** Build a machine-learning / time-series forecasting system that predicts EV charging demand at stations using historical usage data from the **UrbanEVDataset** (Shenzhen, China — Sep 2022 to Feb 2023).

### Use-Case Description
EV charging stations in urban areas experience highly variable demand patterns driven by:
- **Time of day** (rush hours vs. off-peak)
- **Day of week** (weekdays vs. weekends)
- **Season** (weather, holidays)
- **Station location** (residential vs. commercial areas)

Accurate demand forecasting enables:
- Grid operators to balance electricity load
- Station managers to schedule maintenance
- EV drivers to find available chargers

### Input–Output Specification
| Field | Description |
|---|---|
| **Inputs** | Historical 5-min interval station data (busy/idle piles, energy volume, price, duration) |
| **Station Info** | GPS coordinates, pile counts, zone (TAZID) |
| **Outputs** | Hourly energy demand forecast (kWh) per zone |
| **Evaluation** | MAE, RMSE on held-out 20% test split |

## 2. Forecasting Pipeline (System Architecture)

```
┌─────────────────────────────────────────────────────────────────────┐
│                     FORECASTING PIPELINE                           │
│                                                                     │
│  Raw Station CSVs (5-min)                                           │
│       │                                                             │
│       ▼                                                             │
│  Data Cleaning & Gap-Filling (ffill/bfill)                          │
│       │                                                             │
│       ▼                                                             │
│  Remove Bad Stations (all-idle / header-only)                       │
│       │                                                             │
│       ▼                                                             │
│  Hourly Aggregation (per-station → sum/mean)                        │
│       │                                                             │
│  Station Info CSV ──► Zone Mapping (station_id → TAZID)            │
│       │                                                             │
│       ▼                                                             │
│  Zone-Level Hourly Demand (sum of station volumes per zone)         │
│       │                                                             │
│       ▼                                                             │
│  Feature Engineering                                                │
│     hour, dayofweek, month, season, is_weekend, hour/dow sin-cos,   │
│     lag_1h, lag_24h, lag_168h, roll_24h_mean                        │
│       │                                                             │
│       ▼                                                             │
│  Model Training (Linear Regression)                                 │
│     Train: 80% | Test: 20% (time-ordered)                           │
│       │                                                             │
│       ▼                                                             │
│  Evaluation: MAE, RMSE per zone                                     │
│       │                                                             │
│       ▼                                                             │
│  Streamlit Dashboard (demand forecast + peak detection)             │
└─────────────────────────────────────────────────────────────────────┘
```

In [None]:
import pandas as pd
import numpy as np
import matplotlib .pyplot as plt
import seaborn as sns
import os
import glob
import warnings
warnings .filterwarnings ('ignore')
from sklearn .linear_model import LinearRegression
from sklearn .metrics import mean_absolute_error ,mean_squared_error
from sklearn .model_selection import train_test_split
# Project root: the directory containing this file when it is run as a script,
# otherwise (e.g. inside a notebook, where __file__ is undefined) the current
# working directory.
# NOTE: __file__ must be used as a variable here; passing the string
# '__file__' to abspath() would silently resolve to the working directory.
if '__file__' in globals():
    BASE_DIR = os.path.dirname(os.path.abspath(__file__))
else:
    BASE_DIR = os.getcwd()

# Input locations (raw dataset as downloaded).
RAW_DIR = os.path.join(BASE_DIR, '20220901-20230228_station-raw')
STATION_INFO_PATH = os.path.join(RAW_DIR, 'station_information.csv')
RAW_5MIN_GLOB = os.path.join(RAW_DIR, 'charge_5min', '*.csv')

# Output locations (everything the pipeline writes goes under CLEAN_DIR).
CLEAN_DIR = os.path.join(BASE_DIR, 'processed')
OUT_5MIN = os.path.join(CLEAN_DIR, 'charge_5min')
OUT_1HOUR = os.path.join(CLEAN_DIR, 'charge_1hour')
OUT_ZONE = os.path.join(CLEAN_DIR, 'zone_hourly_volume_long.csv')

# Creating the two leaf directories also creates CLEAN_DIR itself.
os.makedirs(OUT_5MIN, exist_ok=True)
os.makedirs(OUT_1HOUR, exist_ok=True)

print(' Directories ready')
print(f'   RAW:     {RAW_DIR }')
print(f'   OUTPUT:  {CLEAN_DIR }')


## 3. Station Information & Zone Mapping

In [None]:
# Load the station metadata CSV and report how many stations and zones it has.
station_info = pd.read_csv(STATION_INFO_PATH)

n_stations = len(station_info)
n_zones = station_info["TAZID"].nunique()
print(f'Stations: {n_stations}')
print(f'Zones covered: {n_zones}')

# When run as a notebook cell only the final expression is rendered, so the
# describe() summary is what gets displayed; head() is evaluated before it.
station_info.head()
station_info.describe()

In [None]:
# Lookup table station_id -> TAZID (zone id), built from the station metadata.
# If a station_id appears more than once, the last row wins.
station_ids = station_info['station_id'].tolist()
zone_ids = station_info['TAZID'].tolist()
station_to_zone = dict(zip(station_ids, zone_ids))

## 4. Finding Bad Stations

In [None]:
# This is how we found the bad stations on Kaggle before shifting to local.
# Every raw 5-min station file is placed on a regular 5-minute grid, gaps are
# forward- then backward-filled, and any file that still contains NaNs is
# reported as "Fail".

input_folder = '/kaggle/input/datasets/aayushchaturvedii/urbanevdataset/UrbanEVDataset/20220901-20230228_station-raw/charge_5min/*.csv'
output_folder = '/kaggle/working/charge_5min'

files = glob.glob(input_folder)
# To debug a single station, point `files` at one CSV instead:
# files = ['/kaggle/input/datasets/aayushchaturvedii/urbanevdataset/UrbanEVDataset/20220901-20230228_station-raw/charge_5min/1523.csv']

# Only create the Kaggle output directory when there is Kaggle input to
# process, so running this cell locally does not try to create /kaggle/...
if files:
    os.makedirs(output_folder, exist_ok=True)

for file_path in files:
    df = pd.read_csv(file_path)
    # The file name (without extension) is the station id; basename() handles
    # the platform's path separator, matching the later preprocessing cells.
    station_id = int(os.path.basename(file_path).replace(".csv", ""))
    zone_id = station_to_zone[station_id]
    df["TAZID"] = zone_id
    df["time"] = pd.to_datetime(df["time"])

    df = df.sort_values("time").reset_index(drop=True).set_index('time')

    # Regular 5-minute grid spanning this station's first to last timestamp.
    full_index = pd.date_range(
        start=df.index.min(),
        end=df.index.max(),
        freq="5min"
    )
    df = df.reindex(full_index)

    # Fill gaps from the previous row; bfill covers leading NaNs.
    df = df.ffill().bfill()

    # reindex() leaves the index unnamed, so reset_index() yields an "index"
    # column that is renamed back to "time".
    df = df.reset_index().rename(columns={"index": "time"})

    if (df.isna().sum().sum() == 0):
        print("Success")
    else:
        print(file_path,"Fail")

    output_path = os.path.join(output_folder, f"{station_id}.csv")
    df.to_csv(output_path, index=False)

## 5. Preprocessing — All Stations (5-min cleaning + gap fill)

In [None]:
# Station ids that the cleaning loop skips outright (listed in ascending order).
BAD_STATIONS = {
    1034, 1036, 1039, 1055, 1082, 1101, 1113, 1337, 1478, 1487,
    1497, 1501, 1663, 1681, 1722, 2125, 2129, 2138, 2291, 2337,
}

# Sampling interval the raw station files are re-gridded to.
EXPECTED_5MIN_FREQ = '5min'

# All raw per-station CSVs, in a stable (sorted) order.
raw_paths = glob.glob(RAW_5MIN_GLOB)
files = sorted(raw_paths)
print(f'Total raw station files: {len(files)}')


In [None]:
# Outcome buckets; each collects station ids.
success, skipped, fail = [], [], []

for file_path in files:
    # The file name (without extension) is the station id.
    station_id = int(os.path.basename(file_path).replace('.csv', ''))

    # Known-bad stations and stations with no zone mapping are skipped.
    if station_id in BAD_STATIONS or station_id not in station_to_zone:
        skipped.append(station_id)
        continue
    zone_id = station_to_zone[station_id]

    try:
        df = pd.read_csv(file_path)

        # Files with fewer than 10 rows are treated as unusable.
        if len(df) < 10:
            skipped.append(station_id)
            continue

        df['time'] = pd.to_datetime(df['time'])
        df['TAZID'] = zone_id
        df = df.sort_values('time').set_index('time')

        # Re-grid onto a regular 5-minute index from first to last timestamp,
        # then fill the gaps forward and (for leading NaNs) backward.
        grid = pd.date_range(
            start=df.index.min(),
            end=df.index.max(),
            freq=EXPECTED_5MIN_FREQ,
        )
        df = df.reindex(grid).ffill().bfill()

        # The re-gridded index is unnamed, so it comes back as "index".
        df = df.reset_index().rename(columns={'index': 'time'})

        # Anything still missing after filling marks the station as failed.
        remaining_nulls = df.isnull().sum().sum()
        if remaining_nulls > 0:
            fail.append(station_id)
            continue

        df.to_csv(os.path.join(OUT_5MIN, f'{station_id}.csv'), index=False)
        success.append(station_id)
    except Exception as e:
        # Best effort: report the problem and carry on with the next station.
        print(f'   Station {station_id}: {e}')
        fail.append(station_id)

print(f'\n Cleaned: {len(success)} stations')
print(f'⏭  Skipped (bad/unmapped): {len(skipped)} stations')
print(f' Failed: {len(fail)} stations')


## 6. Preprocessing — Hourly Aggregation (per-station)

In [None]:
# Per-column rule for collapsing 5-minute rows into one hourly row:
# busy/idle counts and prices are averaged, duration and volume are summed,
# and the (constant per station) zone id is carried through.
AGG_RULES = {
    'busy': 'mean',
    'idle': 'mean',
    'fast_busy': 'mean',
    'fast_idle': 'mean',
    'slow_busy': 'mean',
    'slow_idle': 'mean',
    'duration': 'sum',
    'volume': 'sum',
    's_price': 'mean',
    'e_price': 'mean',
    'TAZID': 'first',
}

cleaned_files = sorted(glob.glob(os.path.join(OUT_5MIN, '*.csv')))
print(f'Aggregating {len(cleaned_files)} stations to hourly …')

for file_path in cleaned_files:
    station_id = int(os.path.basename(file_path).replace('.csv', ''))
    df = pd.read_csv(file_path)
    df['time'] = pd.to_datetime(df['time'])
    df = df.set_index('time')
    df_hourly = df.resample('h').agg(AGG_RULES).reset_index()
    out_path = os.path.join(OUT_1HOUR, f'{station_id}.csv')
    df_hourly.to_csv(out_path, index=False)

# Sorted like the other file listings so the downstream order is stable.
hourly_files = sorted(glob.glob(os.path.join(OUT_1HOUR, '*.csv')))
print(f' Hourly files written: {len(hourly_files)}')

# Show one file as a sanity check, but only if there is something to show.
if hourly_files:
    sample_h = pd.read_csv(hourly_files[0])
    print(f'   Sample shape: {sample_h.shape} — columns: {list(sample_h.columns)}')
else:
    print('   No hourly files found — nothing to sample.')


## 7. Zone-Level Aggregation

In [None]:
def _load_zone_volume(path):
    """Read one hourly station CSV and keep only time, TAZID and volume."""
    station_hourly = pd.read_csv(path)
    station_hourly['time'] = pd.to_datetime(station_hourly['time'])
    return station_hourly[['time', 'TAZID', 'volume']]


# Stack every station's hourly volume into one long table ...
station_frames = [_load_zone_volume(path) for path in hourly_files]
all_data = pd.concat(station_frames, ignore_index=True)

# ... then add up the stations that share a zone within each hour.
by_hour_and_zone = all_data.groupby(['time', 'TAZID'], as_index=False)
zone_hourly = by_hour_and_zone.agg({'volume': 'sum'})
zone_hourly.to_csv(OUT_ZONE, index=False)

first_ts = zone_hourly["time"].min()
last_ts = zone_hourly["time"].max()
print(f'Zone-hourly dataset: {zone_hourly.shape}')
print(f'Zones: {zone_hourly["TAZID"].nunique()}')
print(f'Time range: {first_ts} → {last_ts}')
zone_hourly.head()


In [None]:
# Summary statistics of the zone-level hourly volume column.
print('Volume statistics (kWh) across all zones and hours:')
volume_stats = zone_hourly['volume'].describe()
volume_stats


## 8. Feature Engineering

In [None]:
def add_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` extended with calendar, cyclical and lag features.

    The input is not modified. Its index must expose ``hour``, ``dayofweek``
    and ``month`` (i.e. be datetime-like) and it must have a ``volume`` column.

    Lags and the rolling mean are row-based shifts of ``volume``, so they only
    correspond to 1 h / 24 h / 168 h when rows are consecutive hours.
    The rolling mean is taken over the 24 rows *before* the current one.

    Rows containing any NaN are dropped from the result, which removes at
    least the first 168 rows because of the 168-row lag.
    """
    out = df.copy()
    stamps = out.index

    # Calendar features read straight off the index.
    out['hour'] = stamps.hour
    out['dayofweek'] = stamps.dayofweek
    out['month'] = stamps.month
    out['is_weekend'] = (stamps.dayofweek >= 5).astype(int)

    # 0 = Dec-Feb, 1 = Mar-May, 2 = Jun-Aug, 3 = Sep-Nov.
    month_to_season = {m: (m % 12) // 3 for m in range(1, 13)}
    out['season'] = out['month'].map(month_to_season)

    # Sine/cosine encoding so the end of a cycle sits next to its start.
    for prefix, source, period in (('hour', 'hour', 24), ('dow', 'dayofweek', 7)):
        angle = 2 * np.pi * out[source] / period
        out[f'{prefix}_sin'] = np.sin(angle)
        out[f'{prefix}_cos'] = np.cos(angle)

    # History features built from past rows of the target only.
    volume = out['volume']
    for column, steps in (('lag_1h', 1), ('lag_24h', 24), ('lag_168h', 168)):
        out[column] = volume.shift(steps)
    out['roll_24h_mean'] = volume.shift(1).rolling(24).mean()

    return out.dropna()
# Model input columns, in the order they are passed to the regressor.
FEATURES = [
    # calendar
    'hour', 'dayofweek', 'month', 'is_weekend', 'season',
    # cyclical encodings
    'hour_sin', 'hour_cos', 'dow_sin', 'dow_cos',
    # history of the target
    'lag_1h', 'lag_24h', 'lag_168h', 'roll_24h_mean',
]

print('Feature set:')
for f in FEATURES:
    print(f'  · {f}')


## 9. Model Training Per Zone

In [None]:
# Destination for the per-zone metrics table. Nothing earlier in this file
# defines OUT_RESULTS, so provide a default under the processed-data folder
# while still honouring a value that was set elsewhere.
# NOTE(review): the file name is a placeholder choice — confirm against
# whatever consumes this CSV.
if 'OUT_RESULTS' not in globals():
    OUT_RESULTS = os.path.join(CLEAN_DIR, 'zone_model_results.csv')

RESULT_COLUMNS = ["zone", "model", "MAE", "RMSE"]

zone_hourly_df = pd.read_csv(OUT_ZONE)
zone_hourly_df["time"] = pd.to_datetime(zone_hourly_df["time"])
zones = zone_hourly_df["TAZID"].unique()

results = []
for zone_id in zones:
    # One time-ordered hourly series per zone, indexed by timestamp.
    zdf = (
        zone_hourly_df[zone_hourly_df["TAZID"] == zone_id]
        .copy()
        .sort_values("time")
        .set_index("time")
    )
    zdf = add_features(zdf)

    # Zones with fewer than 200 usable rows after feature construction
    # are not modelled.
    if len(zdf) < 200:
        continue

    X, y = zdf[FEATURES], zdf["volume"]

    # Time-ordered split: the first 80% of rows train, the last 20% test.
    split = int(len(X) * 0.8)
    X_tr, X_te = X.iloc[:split], X.iloc[split:]
    y_tr, y_te = y.iloc[:split], y.iloc[split:]

    model = LinearRegression()
    model.fit(X_tr, y_tr)

    # Negative predictions are clipped to zero before scoring.
    y_pred = np.maximum(model.predict(X_te), 0)
    mae = mean_absolute_error(y_te, y_pred)
    rmse = np.sqrt(mean_squared_error(y_te, y_pred))
    results.append({"zone": zone_id, "model": "LinearRegression", "MAE": mae, "RMSE": rmse})

evaluated = len(results)

# Fixing the columns keeps the CSV header and the column selection below
# valid even when no zone qualified.
results_df = pd.DataFrame(results, columns=RESULT_COLUMNS)
results_df.to_csv(OUT_RESULTS, index=False)

print(f"   Evaluated {evaluated} zones")
print(f"\n   Mean performance across all zones ")
if results_df.empty:
    print("No zones had enough data to evaluate.")
else:
    print(results_df[["MAE", "RMSE"]].mean().round(3).to_string())