In [132]:
import sklearn
from sklearn import datasets
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit
from sklearn import metrics
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle

In [133]:
# Read weather and load csv
# Drop "HE" and nulls
# Add a day and hour count variable
dataset = pd.read_csv("weather and load.csv")
dataset = pd.DataFrame(dataset)
dataset = dataset.drop(columns="HE")
dataset = dataset.dropna()

for x in range(8):
    dataset[f"day_{x}"] = (dataset["Day Of Week"] == x) * 1
dataset["hour_count"] = list(range(len(dataset)))
dataset["hour"] = dataset.Dates.apply(lambda x: int(x.split()[-1].split(":")[0]))
dataset.head(30)

Unnamed: 0,Dates,Day Of Week,Temp,Dew Point,Humidity,Cloud Cover,Load,day_0,day_1,day_2,day_3,day_4,day_5,day_6,day_7,hour_count,hour
0,1/1/2014 0:00,4,41,35,79,30,2096,0,0,0,0,1,0,0,0,0,0
1,1/1/2014 1:00,4,41,34,76,30,1986,0,0,0,0,1,0,0,0,1,1
2,1/1/2014 2:00,4,39,31,73,30,1936,0,0,0,0,1,0,0,0,2,2
3,1/1/2014 3:00,4,39,32,76,30,1896,0,0,0,0,1,0,0,0,3,3
4,1/1/2014 4:00,4,41,32,70,30,1899,0,0,0,0,1,0,0,0,4,4
5,1/1/2014 5:00,4,41,31,67,30,1922,0,0,0,0,1,0,0,0,5,5
6,1/1/2014 6:00,4,41,30,65,30,2011,0,0,0,0,1,0,0,0,6,6
7,1/1/2014 7:00,4,39,27,62,30,2055,0,0,0,0,1,0,0,0,7,7
8,1/1/2014 8:00,4,49,28,44,30,2141,0,0,0,0,1,0,0,0,8,8
9,1/1/2014 9:00,4,60,28,30,30,2250,0,0,0,0,1,0,0,0,9,9


In [134]:
# Split date and hour into separate columns
dataset.Dates[0].split()

['1/1/2014', '0:00']

In [135]:
# Set X and y 
X = dataset[
    ["Temp", "Dew Point",
     "Humidity", "Cloud Cover", "day_0", 
     "day_1", "day_2", "day_3", "day_4", "day_5", 
     "day_6", "day_7", "hour_count", "hour"]]
y = dataset[["Load"]]

# Create TimeSeriesSplit
tscv = TimeSeriesSplit(n_splits=4)
print(tscv)  

# Loop through TimeSeriesSplits and train RandomForrestRegressor
for train_index, test_index in tscv.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # Set RFR params
    rfr = RandomForestRegressor(max_features=.8, max_depth=7, min_samples_leaf=3, max_leaf_nodes=200)
    rfr.fit(X_train, y_train)
    print(rfr.score(X_train, y_train))
    print(rfr.score(X_test, y_test))

TimeSeriesSplit(max_train_size=None, n_splits=4)
TRAIN: [   0    1    2 ... 8764 8765 8766] TEST: [ 8767  8768  8769 ... 17527 17528 17529]
0.9545140398299554
0.8270205062929696
TRAIN: [    0     1     2 ... 17527 17528 17529] TEST: [17530 17531 17532 ... 26290 26291 26292]




0.9368552894605255
0.7917333792563221
TRAIN: [    0     1     2 ... 26290 26291 26292] TEST: [26293 26294 26295 ... 35053 35054 35055]




0.9205544908110403
0.6898483597288861
TRAIN: [    0     1     2 ... 35053 35054 35055] TEST: [35056 35057 35058 ... 43816 43817 43818]




0.9020804005131513
0.8200887389063859


In [141]:
#Create DF of the actual and predicted loads
pred_y = rfr.predict(X_test)

df = pd.DataFrame({ "Hour Count" : X_test.hour_count,
                    "Actual: " : y_test.values.ravel(),
                    "Predicted: " : pred_y.ravel()})
df.head()

Unnamed: 0,Hour Count,Actual:,Predicted:
35056,35056,1911,1895.717316
35057,35057,1963,1895.717316
35058,35058,2033,1898.796714
35059,35059,2132,2075.219835
35060,35060,2304,2531.420622


In [142]:
# Print final R^2 value
print(rfr.score(X_test, y_test))

0.8200887389063859


In [143]:
# Save CSV
export_csv = df.to_csv("Actual vs Predicted Load.csv", index=True)

In [144]:
# Write/Pickle Model
rf_model = open("Random_Forest_Model.pkl", "wb")
pickle.dump(rfr, rf_model)
rf_model.close()

In [145]:
# Read Model from Pickle
import_model = open("Random_Forest_Model.pkl", "rb")
model = pickle.load(import_model)
model

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=7,
           max_features=0.8, max_leaf_nodes=200, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=3,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [146]:
# Check Features
model.feature_importances_

array([2.41715243e-01, 1.44710973e-01, 7.82448615e-03, 1.77875120e-03,
       0.00000000e+00, 2.56551351e-02, 7.76911207e-06, 7.28614296e-05,
       9.12896266e-05, 4.58371114e-05, 2.84329513e-05, 1.68337317e-02,
       5.68510065e-02, 5.04384483e-01])

In [147]:
# Temp, Dew Point, Humidity, Cloud Cover, 
# day_0, day_1, day_2, day_3, day_4, day_5,
# day_6, day_7, day_count, hour
# Actual Load for below conditions = 2096
weather_conditions = [[41, 35, 79, 30, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]]
model.predict(weather_conditions)

array([1966.47625742])