In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import pickle

from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import precision_score, accuracy_score, recall_score, f1_score
from sklearn.metrics import plot_roc_curve, mean_absolute_error, mean_squared_error
from sklearn.metrics import confusion_matrix, classification_report


In [3]:
# Read the training CSV from the working directory into a DataFrame and
# preview its first five rows.
data = pd.read_csv('./training_data.csv')
data.head(n=5)

Unnamed: 0,date,year,month,day,weekday,hour,demand,temperature
0,2003/3/1,2003,3,1,7,1,12863.0,23
1,2003/3/1,2003,3,1,7,2,12389.0,22
2,2003/3/1,2003,3,1,7,3,12155.0,21
3,2003/3/1,2003,3,1,7,4,12072.0,21
4,2003/3/1,2003,3,1,7,5,12160.0,22


In [4]:
# `demand` is the regression target. The features are every remaining column
# except the raw `date` column.
y = data['demand']
X = data.drop(columns=['demand', 'date'])

In [5]:
# Seed NumPy's global RNG. `np.random.seed` must be *called*: assigning to it
# (`np.random.seed = 42`) only rebinds the function to an int and seeds
# nothing. If that assignment already ran in this kernel, restart the kernel
# before running this cell.
np.random.seed(42)

# The seed is also passed to the splitter so the partition is identical on
# every run, regardless of what has drawn from the global RNG beforehand.
# NOTE(review): test_size=0.8 holds out 80% of the rows and trains on only
# 20% -- confirm this is intended rather than test_size=0.2.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.8, random_state=42
)

# Show the shapes of the four resulting pieces.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((20755, 6), (83021, 6), (20755,), (83021,))

In [6]:
# Fit a random forest with default hyperparameters on the training split.
# random_state is fixed so the fitted model, and every metric computed from
# it, is reproducible across runs without depending on the global NumPy RNG.
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

RandomForestRegressor()

In [7]:
# Score the fitted model on the held-out split (for scikit-learn regressors
# `score` reports the R^2 coefficient of determination).
model.score(X=X_test, y=y_test)

0.9580371913911857

In [8]:
# Predict the target for every row of the held-out features.
y_preds = model.predict(X=X_test)

In [9]:
# Mean absolute error between the held-out targets and the predictions.
mean_absolute_error(y_true=y_test, y_pred=y_preds)

394.91357367413065

In [10]:
# Mean squared error between the held-out targets and the predictions.
mean_squared_error(y_true=y_test, y_pred=y_preds)

352194.52096848516

In [11]:
# Stack actual and predicted values as two rows of one 2-D array. Going
# through NumPy pairs the values by position and discards y_test's index.
result = np.vstack((y_test, y_preds))

In [12]:
# Display the stacked array: first row holds y_test, second row holds y_preds.
result

array([[14348.  , 12138.  , 15610.  , ..., 21100.  , 15891.  , 19014.  ],
       [14597.84, 12766.44, 15584.47, ..., 20251.99, 15861.1 , 20187.33]])

In [13]:
# Wrap the stacked array in a DataFrame (still one row per series here).
result = pd.DataFrame(data=result)

In [14]:
# Flip rows and columns so each row is one observation.
# NOTE: this rebinds `result` to its own transpose, so running the cell a
# second time flips the frame back.
result = result.transpose()

In [15]:
# Give the two columns readable labels, in the order the series were stacked
# (y_test first, y_preds second).
result.columns = [
    'Actual Value',
    'Predicted Value',
]

In [16]:
# Preview the first five actual/predicted pairs.
result.head(n=5)

Unnamed: 0,Actual Value,Predicted Value
0,14348.0,14597.84
1,12138.0,12766.44
2,15610.0,15584.47
3,16541.0,16270.73
4,22138.0,20964.13


In [17]:
# Read the separate evaluation CSV and preview its first five rows.
# NOTE(review): the rows previewed for this file match the first rows
# previewed for training_data.csv -- confirm this file is not a subset of the
# data the model was fitted on, otherwise the comparison below is optimistic.
sample = pd.read_csv('./test_data.csv')
sample.head(n=5)

Unnamed: 0,date,year,month,day,weekday,hour,demand,temperature
0,2003/3/1,2003,3,1,7,1,12863,23
1,2003/3/1,2003,3,1,7,2,12389,22
2,2003/3/1,2003,3,1,7,3,12155,21
3,2003/3/1,2003,3,1,7,4,12072,21
4,2003/3/1,2003,3,1,7,5,12160,22


In [18]:
# Split the evaluation file into features and target the same way as the
# training data.
# NOTE: this rebinds X_test / y_test, replacing the hold-out split created by
# train_test_split earlier in the notebook.
X_test = sample.drop(columns=['date', 'demand'])
y_test = sample['demand']

In [19]:
# Predict the target for the evaluation file (replaces the earlier y_preds).
y_preds = model.predict(X=X_test)

In [20]:
# Display the predictions for the evaluation file.
y_preds

array([12865.72, 12722.6 , 12572.29, 12531.17, 12464.14, 12699.85,
       13300.54, 14204.23, 15452.41, 15819.15, 15741.5 , 15466.94,
       14937.89, 14431.25, 14401.7 , 14337.93, 14952.57, 15055.32,
       15660.5 , 15626.2 , 15777.18, 15032.56, 13929.47, 12792.14,
       12247.42, 13014.35, 11255.95, 11241.25, 11247.94, 10900.04,
       11506.77, 12245.23, 13291.8 , 13516.49, 15006.52, 15017.78,
       14746.89, 14486.3 , 14442.2 , 14727.73, 14801.8 , 15224.07,
       15359.61, 15785.16, 15578.83, 14162.12, 13365.08, 12100.07,
       11563.56, 11445.81, 11391.51, 11486.67, 11909.61, 13696.86,
       16142.87, 18001.84, 18401.08, 18529.11, 18647.9 , 18648.44,
       18169.9 , 18079.69, 17741.7 , 17718.3 , 19465.13, 19866.99,
       20056.2 , 19736.14, 18972.3 , 18252.7 , 16706.98, 15738.36,
       14506.53, 14465.98, 14387.62, 14356.69, 14410.53, 15178.13,
       17395.99, 18520.65, 18702.12, 18228.46, 17820.62, 17683.21,
       17577.7 , 17248.73, 16961.5 , 16754.51, 17109.74, 17675

In [21]:
# Assemble year / actual / predicted as three rows, then transpose so each
# row is one observation. Passing through a single NumPy array gives all
# three columns one common dtype, and the columns keep the default labels
# 0, 1, 2.
result = pd.DataFrame(
    np.array([X_test['year'], y_test, y_preds])
).transpose()

In [22]:
# Display the comparison frame: column 0 is year, 1 is y_test, 2 is y_preds.
result

Unnamed: 0,0,1,2
0,2003.0,12863.0,12865.72
1,2003.0,12389.0,12722.60
2,2003.0,12155.0,12572.29
3,2003.0,12072.0,12531.17
4,2003.0,12160.0,12464.14
...,...,...,...
163,2003.0,17789.0,17927.14
164,2003.0,17137.0,17445.72
165,2003.0,16260.0,16972.61
166,2003.0,15039.0,15326.28


In [23]:
# Write the comparison frame to a CSV in the working directory.
result.to_csv(path_or_buf='./result.csv')

In [24]:
# Persist the fitted model to disk. The context manager guarantees the file
# handle is flushed and closed even if pickling raises; a bare
# `open(...)` passed inline is never explicitly closed.
with open('./stlf_model', 'wb') as model_file:
    pickle.dump(model, model_file)