# AML Project 3

# Imports

In [106]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Load Data

In [107]:
plant1 = pd.read_csv("Plant_1_Generation_Data.csv")
df = plant1.copy()

df['DATE_TIME'] = pd.to_datetime(df['DATE_TIME'], format="%d-%m-%Y %H:%M", dayfirst=True)
df = df.sort_values('DATE_TIME')

daily = (
    df.groupby(df['DATE_TIME'].dt.date)['DAILY_YIELD']
      .sum()
      .reset_index()
      .rename(columns={'DATE_TIME': 'date', 'DAILY_YIELD': 'daily_yield'})
)

daily['date'] = pd.to_datetime(daily['date'])

# Engineer Features

In [108]:
# Basic time features
daily['dayofyear'] = daily['date'].dt.dayofyear
daily['month'] = daily['date'].dt.month
daily['week'] = daily['date'].dt.isocalendar().week.astype(int)
daily['weekday'] = daily['date'].dt.weekday
daily['is_weekend'] = daily['weekday'].isin([5,6]).astype(int)
daily['is_month_start'] = daily['date'].dt.is_month_start.astype(int)
daily['is_month_end'] = daily['date'].dt.is_month_end.astype(int)
daily['quarter'] = daily['date'].dt.quarter

# Lag features
for lag in [1,2,3,7,14]:
    daily[f'lag_{lag}'] = daily['daily_yield'].shift(lag)

# Rolling statistics
for window in [3,7,14]:
    daily[f'ma_{window}'] = daily['daily_yield'].rolling(window).mean()
    daily[f'std_{window}'] = daily['daily_yield'].rolling(window).std()
    daily[f'max_{window}'] = daily['daily_yield'].rolling(window).max()
    daily[f'min_{window}'] = daily['daily_yield'].rolling(window).min()

# Growth/change features
daily['diff_1'] = daily['daily_yield'].diff(1)
daily['diff_7'] = daily['daily_yield'].diff(7)
daily['pct_change_1'] = daily['daily_yield'].pct_change(1)
daily['pct_change_7'] = daily['daily_yield'].pct_change(7)

# Seasonal encoding
daily['sin_dayofyear'] = np.sin(2*np.pi*daily['dayofyear']/365)
daily['cos_dayofyear'] = np.cos(2*np.pi*daily['dayofyear']/365)

# Train/Test Split

In [109]:
daily_full = daily.copy()

train = daily_full.iloc[:-14]
test = daily_full.iloc[-14:]

train = train.dropna()
X_train = train.drop(columns=['date', 'daily_yield'])
y_train = train['daily_yield']

X_test = test.drop(columns=['date', 'daily_yield'])
y_test = test['daily_yield']

# Preprocessing And Pipeline

In [110]:
model = Ridge(alpha=1.0)  # L2 regularization
model.fit(X_train, y_train)

# Prediction

In [111]:
preds = model.predict(X_test)

mae = mean_absolute_error(y_test, preds)
mse = mean_squared_error(y_test, preds)
rmse = np.sqrt(mse)

print("MAE:", mae)
print("RMSE:", rmse)

MAE: 473494.00308204634
RMSE: 574611.61202544


In [112]:
results = pd.DataFrame({
    "date": test['date'],
    "actual": y_test.values,
    "predicted": preds
})

print(results)

         date        actual     predicted
20 2020-06-04  6.091402e+06  6.568513e+06
21 2020-06-05  6.384127e+06  7.102767e+06
22 2020-06-06  5.076817e+06  5.849498e+06
23 2020-06-07  7.747108e+06  7.950903e+06
24 2020-06-08  8.505252e+06  7.891317e+06
25 2020-06-09  8.260297e+06  7.268404e+06
26 2020-06-10  6.679794e+06  6.666732e+06
27 2020-06-11  5.538094e+06  6.593008e+06
28 2020-06-12  5.848817e+06  6.452223e+06
29 2020-06-13  8.705923e+06  8.621451e+06
30 2020-06-14  7.240643e+06  7.077198e+06
31 2020-06-15  7.137433e+06  7.426799e+06
32 2020-06-16  6.512585e+06  6.635968e+06
33 2020-06-17  6.138883e+06  6.657697e+06
