## Experiment Notebook
This notebook trains multiple models on the same dataset, monitors their performance, compares them, and selects the best one.

## Global Variables

## Initialization

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LassoCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error

from datetime import datetime

In [2]:
# Read the raw energy dataset from the CSV that sits next to the notebook and
# display it (pandas truncates the rendering of large frames automatically).
ENERGY_CSV_PATH = "./KAG_energydata_complete.csv"
dataframe = pd.read_csv(ENERGY_CSV_PATH)
dataframe

Unnamed: 0,date,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,2016-01-11 17:00:00,60,30,19.890000,47.596667,19.200000,44.790000,19.790000,44.730000,19.000000,...,17.033333,45.5300,6.600000,733.5,92.000000,7.000000,63.000000,5.300000,13.275433,13.275433
1,2016-01-11 17:10:00,60,30,19.890000,46.693333,19.200000,44.722500,19.790000,44.790000,19.000000,...,17.066667,45.5600,6.483333,733.6,92.000000,6.666667,59.166667,5.200000,18.606195,18.606195
2,2016-01-11 17:20:00,50,30,19.890000,46.300000,19.200000,44.626667,19.790000,44.933333,18.926667,...,17.000000,45.5000,6.366667,733.7,92.000000,6.333333,55.333333,5.100000,28.642668,28.642668
3,2016-01-11 17:30:00,50,40,19.890000,46.066667,19.200000,44.590000,19.790000,45.000000,18.890000,...,17.000000,45.4000,6.250000,733.8,92.000000,6.000000,51.500000,5.000000,45.410389,45.410389
4,2016-01-11 17:40:00,60,40,19.890000,46.333333,19.200000,44.530000,19.790000,45.000000,18.890000,...,17.000000,45.4000,6.133333,733.9,92.000000,5.666667,47.666667,4.900000,10.084097,10.084097
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19730,2016-05-27 17:20:00,100,0,25.566667,46.560000,25.890000,42.025714,27.200000,41.163333,24.700000,...,23.200000,46.7900,22.733333,755.2,55.666667,3.333333,23.666667,13.333333,43.096812,43.096812
19731,2016-05-27 17:30:00,90,0,25.500000,46.500000,25.754000,42.080000,27.133333,41.223333,24.700000,...,23.200000,46.7900,22.600000,755.2,56.000000,3.500000,24.500000,13.300000,49.282940,49.282940
19732,2016-05-27 17:40:00,270,10,25.500000,46.596667,25.628571,42.768571,27.050000,41.690000,24.700000,...,23.200000,46.7900,22.466667,755.2,56.333333,3.666667,25.333333,13.266667,29.199117,29.199117
19733,2016-05-27 17:50:00,420,10,25.500000,46.990000,25.414000,43.036000,26.890000,41.290000,24.700000,...,23.200000,46.8175,22.333333,755.2,56.666667,3.833333,26.166667,13.233333,6.322784,6.322784


## Research Engine

In [3]:
def cyclic_encode(data, max_val):
  """Encode a periodic quantity as (sin, cos) coordinates on the unit circle.

  Values that are close on the cycle (e.g. hour 23 and hour 0) end up close
  in the encoded space, which a plain integer encoding does not provide.

  Parameters
  ----------
  data : array-like of numbers
      Values to encode. Lists, tuples, pandas Series and NumPy arrays are
      all accepted; the input is converted with ``np.asarray``.
  max_val : number
      Length of one full period (e.g. 24 for hours, 7 for weekdays).
      Must be non-zero.

  Returns
  -------
  numpy.ndarray
      For 1-D input, an array of shape ``(len(data), 2)`` whose first column
      is the sine component and whose second column is the cosine component.

  Raises
  ------
  ValueError
      If ``max_val`` is zero (the encoding would be all NaN/inf).
  """
  if np.any(np.asarray(max_val) == 0):
    raise ValueError("max_val must be non-zero to define a period")
  # Convert up front so plain Python sequences work too (float * list raises TypeError).
  angles = 2 * np.pi * np.asarray(data, dtype=float) / max_val
  return np.column_stack([np.sin(angles), np.cos(angles)])

def ohe_encode(data):
  """One-hot encode a 1-D NumPy array of category values.

  A fresh dense-output ``OneHotEncoder`` is fitted on ``data`` (reshaped to a
  single column) on every call.

  Returns a ``(encoded, encoder)`` tuple: the encoded matrix produced by
  ``fit_transform`` and the fitted encoder, so callers can query its feature
  names or reuse it later.
  """
  fitted_encoder = OneHotEncoder(sparse_output=False)
  as_column = data.reshape(-1, 1)  # the encoder expects a 2-D (n_samples, 1) input
  one_hot = fitted_encoder.fit_transform(as_column)
  return one_hot, fitted_encoder

def binned_encode(data):
  """Placeholder for a binned encoding.

  Not implemented yet: the argument is ignored and ``None`` is returned.
  """
  return None

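`ModelCreator` is a helper class that builds the features and trains a model for a given specification (weekday/hour encoding, regularization type, polynomial degree) in a single line.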

In [4]:
class ModelCreator:
  """Builds features from the raw energy dataframe and trains a regression pipeline.

  The constructor only records the specification. ``fit`` turns the ``date``
  column into hour-of-day and weekday features, optionally expands the
  features polynomially, splits the data into train/test parts and fits a
  ``StandardScaler`` + regressor pipeline. ``predict`` applies the same
  feature construction to new data, reusing the encoders and the polynomial
  expansion fitted during ``fit`` so the feature layout matches the pipeline.

  Attributes set by ``fit``
  -------------------------
  pipeline : the fitted sklearn ``Pipeline``
  X_train, X_test, y_train, y_test : the train/test split used for fitting
  encoder_hour, encoder_weekday : fitted one-hot encoders (only for 'ohe')
  poly, valid : fitted ``PolynomialFeatures`` and the indices of the
      polynomial terms that are kept (only when ``poly_deg > 1``)
  """

  # Values accepted for `weekday_enc` / `hour_enc`. 'binned' is reserved but
  # has no implementation yet, so it is rejected with NotImplementedError.
  _ENCODINGS = ('ohe', 'trig', 'binned')

  def __init__(self, weekday_enc='ohe', hour_enc='ohe', reg_type='lasso', poly_deg=1, random_state=None, trainer=None):
    """Record the model specification; nothing is computed here.

    Parameters
    ----------
    weekday_enc, hour_enc : {'ohe', 'trig', 'binned'}
        Encoding of the weekday / hour-of-day features.
    reg_type : {'lasso', 'ridge'}
        Regression family. Only 'lasso' is implemented.
    poly_deg : int
        Degree of the polynomial feature expansion; values <= 1 disable it.
    random_state : int or None
        Seed passed to the train/test split and to the default regressor.
    trainer : estimator or None
        Regressor to use as the last pipeline step. When None, a default
        ``LassoCV`` is created on the first ``fit``.
    """
    self.params = {
      'weekday_enc': weekday_enc,
      'hour_enc': hour_enc,
      'reg_type': reg_type,
      'poly_deg': poly_deg,
      'random_state': random_state,
      'trainer': trainer
    }
    # Fitted preprocessing state, filled in by `fit` and reused by `predict`.
    self.encoder_hour = None
    self.encoder_weekday = None
    self.poly = None
    self.valid = []

  def _validate_encodings(self):
    """Return ``(hour_enc, weekday_enc)`` after checking both are usable.

    Raises ValueError for unknown values and NotImplementedError for the
    reserved-but-unimplemented 'binned' encoding.
    """
    hour_enc = self.params['hour_enc']
    weekday_enc = self.params['weekday_enc']
    if hour_enc not in self._ENCODINGS:
      raise ValueError("this hour_enc is not supported or is not written correctly, please double check. supported hour_enc values: ohe, trig, binned")
    if weekday_enc not in self._ENCODINGS:
      raise ValueError("this weekday_enc is not supported or is not written correctly, please double check. supported weekday_enc values: ohe, trig, binned")
    if 'binned' in (hour_enc, weekday_enc):
      raise NotImplementedError("'binned' encoding is not implemented yet; use 'ohe' or 'trig'")
    return hour_enc, weekday_enc

  def _encode_time_feature(self, values, prefix, period, encoding, encoder_attr, fit):
    """Encode one calendar feature and return ``(encoded_matrix, column_names)``.

    For 'ohe' the encoder is fitted when ``fit`` is true (or when none has
    been stored yet) and kept on ``self`` under ``encoder_attr``; otherwise
    the stored encoder is reused so the columns match the training layout.
    Any other encoding is treated as 'trig' (callers validate beforehand).
    """
    if encoding == 'ohe':
      encoder = getattr(self, encoder_attr, None)
      if fit or encoder is None:
        encoded, encoder = ohe_encode(values)
        setattr(self, encoder_attr, encoder)
      else:
        encoded = encoder.transform(values.reshape(-1, 1))
      names = encoder.get_feature_names_out([prefix])
    else:
      encoded = cyclic_encode(values, period)
      names = np.array([prefix + '_sin', prefix + '_cos'])
    return encoded, names

  def _process_dataframe(self, df, fit=True):
    """Turn a raw dataframe into the feature matrix ``X`` and target ``y``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``date`` column parseable by ``pd.to_datetime``.
        The ``Appliances`` column is the target and is optional.
    fit : bool, default True
        When True the one-hot encoders and the polynomial expansion are
        (re)fitted on ``df``. When False the previously fitted ones are
        reused (falling back to fitting if none exist yet).

    Returns
    -------
    (X, y)
        ``X`` is a DataFrame when ``poly_deg <= 1`` and a NumPy array
        otherwise. ``y`` is a one-column DataFrame, or None when ``df`` has
        no ``Appliances`` column.

    Raises
    ------
    ValueError
        If ``hour_enc`` or ``weekday_enc`` is not a supported value.
    NotImplementedError
        If either encoding is 'binned'.
    """
    hour_enc, weekday_enc = self._validate_encodings()

    # --- Handle The Dataframe ---
    dataframe = df.copy()
    date_column = pd.to_datetime(dataframe['date'])
    hours = date_column.dt.hour.values        # hour of day per row
    weekdays = date_column.dt.weekday.values  # weekday per row (Monday == 0)
    dataframe = dataframe.drop('date', axis=1)

    # --- Handle Hour / Weekday Encoding ---
    hour_encoded, hour_names = self._encode_time_feature(
      hours, 'hour', 24, hour_enc, 'encoder_hour', fit)
    weekday_encoded, weekday_names = self._encode_time_feature(
      weekdays, 'weekday', 7, weekday_enc, 'encoder_weekday', fit)

    # Reuse the input index: concat(axis=1) aligns on index labels, so a
    # default RangeIndex here would misalign rows for any non-default index.
    hour_df = pd.DataFrame(hour_encoded, columns=hour_names, index=dataframe.index)
    weekday_df = pd.DataFrame(weekday_encoded, columns=weekday_names, index=dataframe.index)
    # Column order: weekday features, hour features, then the original columns.
    dataframe = pd.concat([weekday_df, hour_df, dataframe], axis=1)

    # --- Separate features and target ---
    if 'Appliances' in df:
      X = dataframe.drop('Appliances', axis=1)
      y = dataframe[['Appliances']]
    else:
      X = dataframe
      y = None

    # --- Handle Polynomial Features ---
    if self.params['poly_deg'] > 1:
      if fit or self.poly is None:
        self.poly = PolynomialFeatures(degree=self.params['poly_deg'], include_bias=False)
        X = self.poly.fit_transform(X)

        # Column ranges occupied by one-hot blocks, derived from the actual
        # number of encoded columns rather than assuming 7 weekdays / 24 hours.
        n_weekday = len(weekday_names)
        n_hour = len(hour_names)
        ohe_groups = []
        if weekday_enc == 'ohe':
          ohe_groups.append((0, n_weekday))
        if hour_enc == 'ohe':
          ohe_groups.append((n_weekday, n_weekday + n_hour))

        # Filtering out bad ohe combinations: a term whose total power inside
        # one one-hot block exceeds 1 is either a product of two mutually
        # exclusive indicators (always 0) or a power of one indicator
        # (identical to the indicator itself).
        self.valid = []
        if ohe_groups:
          powers = self.poly.powers_
          keep = np.ones(len(powers), dtype=bool)
          for start, stop in ohe_groups:
            keep &= powers[:, start:stop].sum(axis=1) <= 1
          self.valid = np.flatnonzero(keep).tolist()
      else:
        X = self.poly.transform(X)

      # `valid` is non-empty exactly when a one-hot filter was built.
      if self.valid:
        X = X[:, self.valid]
    return (X, y)

  def _extract_weekday(self, date_string: str) -> int:
    """Return the weekday (Monday == 0) of a date-only string in ``%Y-%m-%d`` format."""
    return datetime.strptime(date_string, "%Y-%m-%d").weekday()

  def fit(self, df):
    """Build the features, split the data and train the pipeline.

    The train/test parts are stored on ``self`` (``X_train``, ``X_test``,
    ``y_train``, ``y_test``) and the fitted pipeline in ``self.pipeline``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data containing the ``date`` and ``Appliances`` columns.

    Returns
    -------
    ModelCreator
        ``self``, to allow chaining.

    Raises
    ------
    NotImplementedError
        If ``reg_type`` is 'ridge' (not implemented yet) or an encoding is 'binned'.
    ValueError
        If ``reg_type`` or an encoding is unsupported, or ``df`` has no
        ``Appliances`` column.
    """
    # Fail fast on the regression type before doing any expensive work.
    reg_type = self.params['reg_type']
    if reg_type == 'ridge':
      raise NotImplementedError("reg_type 'ridge' is not implemented yet; use 'lasso'")
    if reg_type != 'lasso':
      raise ValueError("this reg_type is not supported or is not written correctly, please double check. supported reg_type values: lasso, ridge")

    X, y = self._process_dataframe(df, fit=True)
    if y is None:
      raise ValueError("fit requires an 'Appliances' column as the training target")

    # --- Split ---
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=self.params['random_state'])
    y_train = y_train.values.squeeze()
    y_test = y_test.values.squeeze()

    self.X_test = X_test
    self.y_test = y_test
    self.X_train = X_train
    self.y_train = y_train

    # --- PipeLine: Feature scaling & Training ---
    if self.params['trainer'] is None:
      # The default regressor is stored in params, so later fits reuse it.
      self.params['trainer'] = LassoCV(cv=5, random_state=self.params['random_state'], tol=0.1, n_jobs=-1, max_iter=40000)
    self.pipeline = Pipeline([
      ('scaler', StandardScaler()),
      ('lasso', self.params['trainer'])
    ])

    self.pipeline.fit(X_train, y_train)

    return self

  def predict(self, df, return_labels=False):
    """Predict from a raw dataframe shaped like the training data.

    The ``Appliances`` column is optional. Features are built with the
    encoders and polynomial expansion fitted by ``fit``.

    Returns the predictions, or ``(predictions, y)`` when ``return_labels``
    is true, where ``y`` is the ``Appliances`` column as a one-column
    DataFrame (None if the column is absent).
    """
    X, y = self._process_dataframe(df, fit=False)
    y_pred = self.pipeline.predict(X)
    if return_labels: return (y_pred, y)
    return y_pred

  # TODO: add an evaluate() method reporting metrics (e.g. RMSE, R^2) on held-out data.

## Training Models

In [5]:
# Baseline model with the default specification (one-hot hour/weekday, lasso,
# no polynomial expansion). `fit` returns the creator itself, so construction
# and training can be chained.
linear_lasso = ModelCreator(random_state=42).fit(dataframe)

# Score on the held-out split that `fit` stored on the creator.
y_pred = linear_lasso.pipeline.predict(linear_lasso.X_test)
RMSE = np.sqrt(mean_squared_error(linear_lasso.y_test, y_pred))
RMSE

np.float64(87.67673464772226)