In [4]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
import pandas as pd
import os
from common import data_folder_path
import matplotlib.pyplot as plt
import numpy as np

In [2]:
data_file = 'train.csv'
df = pd.read_csv(os.path.join(data_folder_path,data_file))
df['datetime'] = pd.to_datetime(df['datetime'])
df['Time of Day']=df['datetime'].dt.hour
df['Day of Week']=df['datetime'].dt.day_of_week
df

Unnamed: 0,county,is_business,product_type,target,is_consumption,datetime,data_block_id,row_id,prediction_unit_id,Time of Day,Day of Week
0,0,0,1,0.713,0,2021-09-01 00:00:00,0,0,0,0,2
1,0,0,1,96.590,1,2021-09-01 00:00:00,0,1,0,0,2
2,0,0,2,0.000,0,2021-09-01 00:00:00,0,2,1,0,2
3,0,0,2,17.314,1,2021-09-01 00:00:00,0,3,1,0,2
4,0,0,3,2.904,0,2021-09-01 00:00:00,0,4,2,0,2
...,...,...,...,...,...,...,...,...,...,...,...
2018347,15,1,0,197.233,1,2023-05-31 23:00:00,637,2018347,64,23,2
2018348,15,1,1,0.000,0,2023-05-31 23:00:00,637,2018348,59,23,2
2018349,15,1,1,28.404,1,2023-05-31 23:00:00,637,2018349,59,23,2
2018350,15,1,3,0.000,0,2023-05-31 23:00:00,637,2018350,60,23,2


In [3]:
#based on this https://seasonsyear.com/Estonia
def get_season(row):
    month = row.month
    if month in [12,1,2,3]: #'Snow starts to fall at around middle of December and remains to mid-March. By the beginning of December temperature of water finally drops below zero (0C) thus the ice begins to appear by the coasts.'
        return 'Winter'
    elif month in [4,5]:
        return 'Spring'
    elif month in [6,7,8]:
        return 'Summer' #'Climate of the summer months: ... June ... July ... August'
    else:
        return 'Autumn' #'In Estonia coming of actual Autumn is almost simultaneous to calendar change of seasons.'

df['Season'] = df['datetime'].apply(get_season)
df['Season'].value_counts()#.plot(kind='bar')
assert df['Season'].value_counts().sum()==df.shape[0] #sanity check that all rows have a season

In [9]:
#print('product_type',product_type)
train_cols = ['county','Time of Day','Day of Week', 'Season','is_business','is_consumption','product_type','target']
train_data = df[train_cols].dropna()

X,y = train_data.iloc[:,:-1],train_data.iloc[:,-1]
#print(X,y)
y_ss = StandardScaler()
y = y_ss.fit_transform(y.values.reshape(-1,1)).flatten()

tod_onehot = pd.get_dummies(X['Time of Day'],prefix='tod')
X = X.join(tod_onehot)
seasons = X['Season'].astype('category')
X['Season'] = seasons.cat.codes

X = X.drop(columns=['Season','county','Day of Week','Time of Day','product_type','is_business','is_consumption'])




print('splitting dataset...')
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=42)

print("training regression...")
lr = DecisionTreeRegressor(max_depth=10)
lr.fit(X_train,y_train)

y_pred = lr.predict(X_test)
y_test = y_ss.inverse_transform(y_test.reshape(-1, 1))
y_pred = y_ss.inverse_transform(y_pred.reshape(-1, 1))

train_mae = mean_absolute_error(y_train,y_ss.inverse_transform(lr.predict(X_train).reshape(-1, 1)))
test_mae = mean_absolute_error(y_test,y_pred)



#coefs = list(zip(lr.coef_,lr.feature_names_in_))
train_mae,test_mae#,coefs

splitting dataset...
training regression...


(274.2199807077129, 368.1615600137166)

In [10]:
#print('product_type',product_type)
train_cols = ['county','Time of Day','Day of Week', 'Season','is_business','is_consumption','product_type','target']
train_data = df[train_cols].dropna()

X,y = train_data.iloc[:,:-1],train_data.iloc[:,-1]
#print(X,y)
y_ss = StandardScaler()
y = y_ss.fit_transform(y.values.reshape(-1,1)).flatten()

tod_onehot = pd.get_dummies(X['Time of Day'],prefix='tod')
X = X.join(tod_onehot)
seasons = X['Season'].astype('category')
X['Season'] = seasons.cat.codes

X = X.drop(columns=['Season','county','Day of Week','Time of Day','product_type','is_business','is_consumption'])




print('splitting dataset...')
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=42)

print("training regression...")
lr = RandomForestRegressor(max_depth=10)
lr.fit(X_train,y_train)

y_pred = lr.predict(X_test)
y_test = y_ss.inverse_transform(y_test.reshape(-1, 1))
y_pred = y_ss.inverse_transform(y_pred.reshape(-1, 1))

train_mae = mean_absolute_error(y_train,y_ss.inverse_transform(lr.predict(X_train).reshape(-1, 1)))
test_mae = mean_absolute_error(y_test,y_pred)



#coefs = list(zip(lr.coef_,lr.feature_names_in_))
train_mae,test_mae#,coefs

splitting dataset...
training regression...


(274.27281783037387, 368.19077399717713)