In [104]:
import numpy as np
import pandas as pd
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Load the raw data set; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="data/df_main.csv")

## Prepare data

In [4]:
# One row per (day, weekday, month, TAVG) combination; "count" is the number of
# non-null "Duration" entries that fall into that combination.
df_day = (
    df.groupby(["day", "weekday", "month", "TAVG"])["Duration"]
    .count()
    .reset_index(name="count")
)

### Train-test-split

In [128]:
# Hold out 33 % of the rows of df_day; "count" (the last column) is the target,
# every other column is a raw feature. The seed keeps the split reproducible.
X_t, X_test, y_t, y_test = train_test_split(
    df_day.drop(columns="count"),
    df_day["count"],
    test_size=0.33,
    random_state=42,
)

In [116]:
# Sanity check: shapes of the four pieces produced by the split.
tuple(part.shape for part in (X_t, X_test, y_t, y_test))

((244, 4), (121, 4), (244,), (121,))

### Feature Engineering

In [126]:
def feature_engineering(data):
    """Build the model feature matrix from one split of the daily data.

    The steps run in a fixed order because each one relies on columns (or on
    the index) produced by the previous one:

    1. ``encode_weekends``        - adds the ``weekend`` flag.
    2. ``encode_holidays``        - adds the ``holiday`` flag and moves ``day``
                                    into the index.
    3. ``dummies_weekdays``       - one-hot encodes ``weekday``.
    4. ``polynomial_interaction`` - adds degree-2 terms of ``month``/``TAVG``.
    5. ``scaling_s``              - standardises the numeric columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with the columns ``day``, ``weekday``, ``month`` and ``TAVG``.
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        Engineered features, indexed by ``day``.
    """
    # Work on a private copy: the helper steps add columns and change the index
    # in place, which would otherwise alter the caller's frame and make a second
    # call on the same frame fail (``day`` would already be the index).
    data_w = encode_weekends(data.copy())
    data_h = encode_holidays(data_w)
    data_d = dummies_weekdays(data_h)
    data_p = polynomial_interaction(data_d)
    # NOTE(review): scaling_s fits a new StandardScaler on whatever frame it is
    # given, so train and test features are scaled with different statistics.
    # That is the likely reason test scores get worse with scaling.
    data_n = scaling_s(data_p)

    return data_n

In [118]:
def encode_weekends(X_train):
    """Return a copy of ``X_train`` with a binary ``weekend`` column.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Frame with a ``weekday`` column holding English day names.

    Returns
    -------
    pandas.DataFrame
        Copy of the input with ``weekend`` set to 1.0 for "Saturday"/"Sunday"
        rows and 0.0 for every other row. The input frame is left untouched.
    """
    # Copy first: the frames coming out of train_test_split are slices of
    # df_day, and writing to them in place triggers SettingWithCopyWarning.
    encoded = X_train.copy()
    is_weekend = encoded["weekday"].isin(["Saturday", "Sunday"])
    # float keeps the dtype the previous loc-assign + fillna(0) produced
    encoded["weekend"] = is_weekend.astype(float)
    return encoded

In [119]:
def encode_holidays(X_train):
    """Return a copy of ``X_train`` with a ``holiday`` flag, indexed by ``day``.

    US federal holidays are looked up for the date range covered by the
    frame's own ``day`` column, so the function no longer depends on the
    notebook-level ``df_day`` variable.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Frame with a ``day`` column whose values compare equal to
        ``str(date)`` (e.g. ``"2019-07-04"``).
        NOTE(review): assumes ISO-formatted date strings as read from the CSV
        - confirm.

    Returns
    -------
    pandas.DataFrame
        Copy of the input with ``holiday`` set to 1.0 on federal holidays and
        0.0 otherwise, and with ``day`` moved into the index. The input frame
        is left untouched.
    """
    encoded = X_train.copy()
    days = encoded["day"]
    if len(days):
        holiday_dates = calendar().holidays(start=days.min(), end=days.max())
        holiday_labels = [str(stamp.date()) for stamp in holiday_dates]
    else:
        # min()/max() of an empty column are not valid range bounds
        holiday_labels = []
    # Build the mask from this frame's own rows instead of aligning a mask
    # computed on a different (global) frame.
    encoded["holiday"] = days.isin(holiday_labels).astype(float)
    return encoded.set_index("day")

In [120]:
def dummies_weekdays(X_train):
    """One-hot encode ``weekday`` with a fixed set of output columns.

    "Monday" is the reference level and gets no column. The six remaining day
    columns are always present, in alphabetical order, even when a day does
    not occur in the frame. This keeps the column layout identical across
    train and test frames and avoids a KeyError when no Monday is present.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Frame with a ``weekday`` column holding English day names.

    Returns
    -------
    pandas.DataFrame
        New frame without ``weekday`` and with integer 0/1 columns
        ``Friday, Saturday, Sunday, Thursday, Tuesday, Wednesday`` appended.
        The input frame is left untouched.
    """
    # Alphabetical order matches what get_dummies produced before.
    kept_days = ["Friday", "Saturday", "Sunday", "Thursday", "Tuesday", "Wednesday"]
    dummies = pd.get_dummies(X_train["weekday"], dtype=int).reindex(
        columns=kept_days, fill_value=0
    )
    return pd.concat([X_train.drop(columns="weekday"), dummies], axis=1)

In [121]:
def polynomial_interaction(X_train):
    """Append degree-2 polynomial terms of ``month`` and ``TAVG``.

    The generated columns are named ``1, x0, x1, x0^2, x0 x1, x1^2`` where
    ``x0`` is ``month`` and ``x1`` is ``TAVG``. ``scaling_s`` selects the
    columns by exactly these names.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Frame with numeric ``month`` and ``TAVG`` columns.

    Returns
    -------
    pandas.DataFrame
        The input with its index reset to a column, plus the polynomial
        columns appended row by row.
    """
    poly = PolynomialFeatures(degree=2)
    # Fit on a bare array: with a DataFrame, newer scikit-learn versions would
    # name the outputs after the input columns ("month", "TAVG", ...) instead
    # of the "x0"/"x1" names used downstream.
    x_poly = poly.fit_transform(X_train[["month", "TAVG"]].to_numpy())
    # get_feature_names() was removed in scikit-learn 1.2; fall back to it only
    # on versions that predate get_feature_names_out().
    if hasattr(poly, "get_feature_names_out"):
        c_names = list(poly.get_feature_names_out())
    else:
        c_names = poly.get_feature_names()
    df_poly = pd.DataFrame(x_poly, columns=c_names)
    # Both frames now carry a 0..n-1 index, so concat lines them up by position.
    X_train = X_train.reset_index()
    return pd.concat([X_train, df_poly], axis=1)

In [122]:
def scaling_s(X_train, scaler=None):
    """Standardise the numeric feature columns and index the result by ``day``.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Frame with a ``day`` column and the columns
        ``month, TAVG, x0, x1, x0^2, x0 x1, x1^2``.
    scaler : object, optional
        Scaler with ``fit_transform``/``transform`` methods. When omitted, a
        new ``StandardScaler`` is fitted on ``X_train`` (the previous
        behaviour). A scaler that is already fitted (it has a ``scale_``
        attribute) is only applied, so the statistics learned on the training
        frame can be reused for the test frame. An unfitted scaler is fitted
        here and can be passed in again afterwards.

    Returns
    -------
    pandas.DataFrame
        Frame indexed by ``day``; the scaled columns are moved to the end.
    """
    scaled_columns = ["month", "TAVG", "x0", "x1", "x0^2", "x0 x1", "x1^2"]
    if scaler is None:
        scaler = StandardScaler()

    numeric = X_train[scaled_columns]
    if hasattr(scaler, "scale_"):
        values = scaler.transform(numeric)
    else:
        values = scaler.fit_transform(numeric)

    # Reuse the input's index so the concat below aligns rows even when the
    # frame does not carry a plain 0..n-1 index.
    scaled = pd.DataFrame(values, columns=scaled_columns, index=X_train.index)
    remaining = X_train.drop(columns=scaled_columns)
    return pd.concat([remaining, scaled], axis=1).set_index("day")

In [129]:
# Engineered feature matrix for the training split.
X = feature_engineering(data=X_t)

In [94]:
#X.set_index("day", inplace=True)

## Models

## KNN-Regressor

## Linear Regression with Regularization (Ridge)

In [109]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

In [131]:
# Ridge regression on the engineered training features. The target is y_t, the
# training target produced by the train/test split (y_train is never defined
# in this notebook).
m = Ridge(alpha=0.01)
m.fit(X, y_t)
# R^2 on the training data (score() of a regressor returns R^2, not accuracy)
m.score(X, y_t)

0.7304403228418743

In [96]:
# Mean squared error of the Ridge model on the training split; compared against
# y_t, the training target produced by the train/test split.
ypred = m.predict(X)
mean_squared_error(y_t, ypred)

3080273.4895620258

In [None]:
# Hyperparameter optimization

### Test-set evaluation

In [132]:
# Engineered feature matrix for the held-out split (same pipeline as for X).
x_test = feature_engineering(data=X_test)

In [99]:
#x_test.set_index("day", inplace=True)

In [133]:
# R^2 of the Ridge model on the held-out split
m.score(X=x_test, y=y_test)

0.597818191409438

## Random Forest

In [67]:
from sklearn.ensemble import RandomForestRegressor

In [101]:
# Fixed seed so the forest (bootstrap samples, feature sub-sampling) and the
# scores derived from it are reproducible across kernel restarts.
rf = RandomForestRegressor(max_depth=5, random_state=42)

In [134]:
# Fit on the training split; y_t is the training target produced by the
# train/test split (y_train is never defined in this notebook).
rf.fit(X, y_t)
# R^2 on the training data
rf.score(X, y_t)

0.8538172983162207

In [135]:
# R^2 of the random forest on the held-out split
rf.score(X=x_test, y=y_test)

0.6077591994535917

In [None]:
## Hyperparameter Optimization

In [None]:
#split train/test set
from sklearn.model_selection import train_test_split

In [None]:
# Carve a validation split out of the *training* data only, so the real test
# split stays untouched during the hyperparameter search. X and y_t are the
# engineered training features and their target (Xtrain / ytrain are never
# defined in this notebook).
Xtrain_forest, Xtest_forest, ytrain_forest, ytest_forest = train_test_split(
    X, y_t, test_size=0.3, random_state=42
)
Xtrain_forest.shape, ytrain_forest.shape

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Search space for the random forest: tree depth x number of trees.
param_grid = dict(
    max_depth=[6, 8, 16],
    n_estimators=[10, 100, 200],
)
param_grid

In [None]:
# Exhaustive search over param_grid with 2-fold cross-validation on 4 workers.
cv = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=2,
    n_jobs=4,
    return_train_score=True,
)

# fit() returns the search object itself, so `forest` and `cv` are the same object.
forest = cv.fit(Xtrain_forest, ytrain_forest)

# Predictions and score on the validation part of the training data.
ypred_forest = forest.predict(Xtest_forest)
forest.score(Xtest_forest, ytest_forest)

In [None]:
# Show the estimator that the grid search refit with its best-scoring parameters.
cv.best_estimator_

## Gradient Descent

In [None]:
# NOTE(review): assumes df has 'temp_scaled' and 'count' columns; neither is
# created in the visible part of this notebook - confirm.
# .copy() gives the loop below its own frame to add 'count_pred' to, instead of
# writing into a slice of df (SettingWithCopyWarning).
temp = df[['temp_scaled', 'count']].copy()

# Building the model: count_pred = a + b * temp_scaled
a = 100  # initial intercept
b = 200  # initial slope

learning_rate = 0.01
max_iterations = 15000
threshold = 0.01  # stop once the gradient norm drops to this value

n = temp['count'].count()  # number of non-null observations

In [None]:
# Gradient descent for the line count_pred = a + b * temp_scaled, minimising
# the mean squared error. Uses pandas' vectorised .sum() instead of Python's
# built-in sum(), which iterates element by element on every one of the up to
# max_iterations passes. Unlike the built-in, .sum() skips missing values.
for i in range(max_iterations):
    temp['count_pred'] = temp['temp_scaled'] * b + a    # temp_scaled = xtrain, count_pred = ypred
    residual = temp['count'] - temp['count_pred']

    # Partial derivatives of the MSE with respect to a and b
    derivative_a = (-2 / n) * residual.sum()
    derivative_b = (-2 / n) * (residual * temp['temp_scaled']).sum()
    mse = (residual ** 2).sum() / n

    # Euclidean length of the gradient (d/da, d/db): close to zero means the
    # loss is flat around the current (a, b), i.e. a minimum has been reached.
    grad_norm = np.sqrt(derivative_a**2 + derivative_b**2)
    if grad_norm <= threshold:
        print (a, b, mse, 'threshold reached')
        break

    a = a - learning_rate * derivative_a  # Update a
    b = b - learning_rate * derivative_b  # Update b

    if i % 100 == 0:
        print (i, a, b, mse, grad_norm)

In [None]:
# Inspect the intercept gradient left over from the last gradient-descent iteration.
derivative_a

In [None]:
# After getting good models: retrain them on ALL data! 