In [1]:
#importing the necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

import pickle

%matplotlib inline

In [2]:
df_home = pd.read_csv('home_energy.csv', parse_dates= True, squeeze= True)
df_home['Time'] = pd.to_datetime(df_home['Time'])

In [3]:
df_home = df_home[['Equipment', 'Time', 'Condition']]
df_home.isna().sum()

Equipment    0
Time         0
Condition    0
dtype: int64

In [4]:
df_home.head()

Unnamed: 0,Equipment,Time,Condition
0,Device 1,2020-03-07 02:03:50,ON
1,Device 1,2020-03-07 05:44:10,ON
2,Device 1,2020-03-07 07:56:38,ON
3,Device 1,2020-03-07 08:55:41,OFF
4,Device 1,2020-03-07 15:15:50,ON


In [5]:
#make a series of the key data points and then index it 
time_serie = pd.Series(df_home['Condition'].values, index= df_home['Time'])
df_home['Hour'] = time_serie.index.hour
df_home['Minute'] = time_serie.index.minute
df_home['Seconds'] = time_serie.index.second

time_serie.head()

Time
2020-03-07 02:03:50     ON
2020-03-07 05:44:10     ON
2020-03-07 07:56:38     ON
2020-03-07 08:55:41    OFF
2020-03-07 15:15:50     ON
dtype: object

In [6]:
df_home.head()

Unnamed: 0,Equipment,Time,Condition,Hour,Minute,Seconds
0,Device 1,2020-03-07 02:03:50,ON,2,3,50
1,Device 1,2020-03-07 05:44:10,ON,5,44,10
2,Device 1,2020-03-07 07:56:38,ON,7,56,38
3,Device 1,2020-03-07 08:55:41,OFF,8,55,41
4,Device 1,2020-03-07 15:15:50,ON,15,15,50


In [7]:
df_home = pd.get_dummies(df_home, columns=['Equipment'], prefix=['Equipment'])

In [8]:
df_home.head()

Unnamed: 0,Time,Condition,Hour,Minute,Seconds,Equipment_Device 1,Equipment_Device 2,Equipment_Device 3,Equipment_Device 4
0,2020-03-07 02:03:50,ON,2,3,50,1,0,0,0
1,2020-03-07 05:44:10,ON,5,44,10,1,0,0,0
2,2020-03-07 07:56:38,ON,7,56,38,1,0,0,0
3,2020-03-07 08:55:41,OFF,8,55,41,1,0,0,0
4,2020-03-07 15:15:50,ON,15,15,50,1,0,0,0


In [9]:
y = df_home['Condition'].replace(['ON', 'OFF'], [0,1])
x = df_home.drop(axis= 1, columns= ['Condition', 'Time'])

In [10]:
x.head()

Unnamed: 0,Hour,Minute,Seconds,Equipment_Device 1,Equipment_Device 2,Equipment_Device 3,Equipment_Device 4
0,2,3,50,1,0,0,0
1,5,44,10,1,0,0,0
2,7,56,38,1,0,0,0
3,8,55,41,1,0,0,0
4,15,15,50,1,0,0,0


In [11]:
y.value_counts()

0    275
1    239
Name: Condition, dtype: int64

In [12]:
x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=7,test_size=0.2)
classifier = RandomForestClassifier(random_state=42)

In [14]:
classifier.fit(x_train, y_train)
y_preds = classifier.predict(x_test)

print('The accuracy score is', accuracy_score(y_test, y_preds))

The accuracy score is 0.6796116504854369


In [15]:
print(classifier.get_params)

<bound method BaseEstimator.get_params of RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)>


In [None]:
# Number of trees in random forest
n_estimators = np.arange(10, 100, 10)
# Number of features to consider at every split
max_features = ['auto', 'sqrt', "log2", 1,2,0.2,6]
# Maximum number of levels in tree
max_depth = np.arange(1, 20)
# Minimum number of samples required to split a node
min_samples_split = np.arange(1, 20)
# Minimum number of samples required at each leaf node
min_samples_leaf = np.arange(1, 10)
# Method of selecting samples for training each tree
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}

rf_random = GridSearchCV(estimator = classifier, param_grid = random_grid, cv = 3, verbose=2, n_jobs = -1)
rf_random.fit(x_train, y_train)

Fitting 3 folds for each of 204687 candidates, totalling 614061 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done 244 tasks      | elapsed:    5.4s
[Parallel(n_jobs=-1)]: Done 650 tasks      | elapsed:   13.0s
[Parallel(n_jobs=-1)]: Done 1216 tasks      | elapsed:   22.5s
[Parallel(n_jobs=-1)]: Done 1946 tasks      | elapsed:   39.3s
[Parallel(n_jobs=-1)]: Done 2836 tasks      | elapsed:   54.8s
[Parallel(n_jobs=-1)]: Done 3890 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 5104 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 6482 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 8020 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 9722 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 11584 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done 13610 tasks      | elapsed:  4.3min
[Parallel(n_jobs=-1)]: Done 15796 tasks      | elapsed:  5.0min
[Parallel(n_jobs=-1)]: Done 18146 tasks   

In [None]:
#analysing the random forest algorithm
rf_random.best_params_

In [None]:
y_preds_random = rf_random.predict(x_test)

print('The accuracy score is', accuracy_score(y_test, y_preds_random))

In [None]:
#saving the model in a pickle file
file_name = "random_forest_model.pkl"
with open(file_name, 'wb') as file:
    pickle.dump(rf_random.best_estimator_, file)