# Import packages

In [209]:
import glob
import pandas as pd
import numpy as np
import sklearn as sk
import datetime as dt
import matplotlib.pyplot as plt
import time

In [160]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

# Read in data

In [59]:
# Create list of house names based on folder names
houses = ['house_' + str(i) for i in range(1,7)]

In [7]:
# Create empty filenames list of lists for each house
filenames = [[]]*6

In [44]:
for i in range(6):
    filenames[i] = glob.glob('./REDD/low_freq/house_' + str(i+1) + '/*.dat')
    
    # Drop labels file path
    filenames[i] = filenames[i][:-1]
    
    # Append 0 to single digit channels for sorting purposes (otherwise 10,11,... appears before 2,3,...)
    for j in range(len(filenames[i])):
        if len(filenames[i][j]) == 37:
            filenames[i][j] = filenames[i][j][:32] + '0' + filenames[i][j][32:]
    
    # Sort so that single digit channels appear first
    filenames[i].sort()
    
    # Remove 0 from single digit channels to return accurate channel names
    for j in range(9):
        filenames[i][j] = filenames[i][j][:32] + filenames[i][j][33:]

In [47]:
# Create dictionary of every house with a corresponding list of appliance file paths
filenames_dict = dict(zip(houses,filenames))

In [49]:
labels = [[]]*6

In [53]:
for i in range(6):
    df_temp = pd.read_csv('./REDD/low_freq/house_' + str(i+1) + '/labels.dat', names = ['Labels'])
    labels[i] = df_temp['Labels'].tolist()
    
    if len(labels[i]) >= 9:
        for j in range(9):
            labels[i][j] = '0' + labels[i][j]

In [55]:
labels_dict = dict(zip(houses,labels))

In [77]:
column_names = ['Date','Power']

df_list = [[]]*6
df_house_dict_list = [[]]*6

# Create house dictionary of {label : dataframe} for each appliance in each house, then store as list
for i, house in enumerate(houses):
    df_list[i] = [pd.read_csv(file,sep = ' ',names = column_names) for file in filenames_dict[house]]
    df_dict_list[i] = dict(zip(labels_dict[house],df_list[i]))    

# Store all data in a dictionary of house number and house dataframe

In [78]:
# For each appliance in each house dictionary
for i in range(6):
    for key in df_dict_list[i]:
        
        # Set date column as index
        df_dict_list[i][key]['Date'] = pd.to_datetime(df_dict_list[i][key]['Date'],unit = 's')
        df_dict_list[i][key] = df_dict_list[i][key].set_index(['Date'])
        
        # Relabel Power column as the appliance label
        df_dict_list[i][key].columns = [key]

In [79]:
# Create list of house dataframes
df_total_house_list = [[]]*6

for i in range(6):
    df_total_house_list[i] = pd.concat([df_dict_list[i][key] for key in df_dict_list[i]],
                                    axis = 1,
                                    join = 'inner')

In [108]:
for i in range(6):
    print(df_total_house_list[i].shape)

(406748, 21)
(316840, 12)
(376150, 23)
(428076, 21)
(77451, 27)
(192192, 18)


In [109]:
dict_df_all_houses = dict(zip(houses,df_total_house_list))

# Clean data

In [110]:
for house in houses:
    
    # Create sum of mains entry
    dict_df_all_houses[house]['00 mains'] = dict_df_all_houses[house]['01 mains'] + \
                                            dict_df_all_houses[house]['02 mains']
    
    # Drop original mains columns
    dict_df_all_houses[house] = dict_df_all_houses[house].drop(['01 mains', '02 mains'], axis = 1)
    
    # Sort to bring '00 mains' to first column
    dict_df_all_houses[house] = dict_df_all_houses[house].sort_index(axis=1)

In [111]:
# Filter to leave only columns contributing >5% of total energy

for house in houses:
    frac_total_energy = dict_df_all_houses[house].sum() / dict_df_all_houses[house]['00 mains'].sum()
    over_5_percent = frac_total_energy > 0.05
    dict_df_all_houses[house] = dict_df_all_houses[house][dict_df_all_houses[house].columns[over_5_percent]]

In [130]:
df_io = dict_df_all_houses.copy()

# Convert energy signals to on/off
for house in houses:
    df_io[house].iloc[:,1:] = df_io[house].iloc[:,1:] > 0
    df_io[house].iloc[:,1:] = df_io[house].iloc[:,1:].astype(int)

In [131]:
# Check if given label is "always on" - these need to be excluded from the classification

for house in houses:
    for appliance in df_io[house].columns:
        
        if df_io[house][appliance].sum() == df_io[house].shape[0]:
            print(house, appliance)
            df_io[house] = df_io[house].drop([appliance], axis = 1)

house_1 07 kitchen_outlets
house_1 08 kitchen_outlets
house_1 11 microwave
house_2 t-0
house_3 06 electronics
house_4 04 furance
house_4 13 lighting
house_5 23 lighting
house_6 14 lighting


# Create timeseries features

In [132]:
# Create shifted ten-window time series columns to be used as features

for house in houses:
    for i in range(10):
        df_io[house]['t-' + str(i)] = df_io[house]['00 mains'].shift(i)
    
    # Remove original main total column
    df_io[house] = df_io[house].drop(['00 mains'], axis = 1)
    
    # Drop NaNs, hopefully only from time shifted mains columns
    df_io[house] = df_io[house].dropna(how = 'any')

# Set up training and results storage

In [142]:
# Drop timestamps before shuffling
for house in houses:
    df_io[house] = df_io[house].reset_index(drop = True)

In [200]:
X_dict = dict(zip(houses,[df_io[house].iloc[:,-10:] for house in df_io.keys()]))
Y_all_dict = dict(zip(houses,[df_io[house].iloc[:,:-10] for house in df_io.keys()]))

In [203]:
# Create model dictionary to iterate over
model_names = ['logreg', 'nb', 'knn', 'rf']
models = [LogisticRegression(), GaussianNB(), KNeighborsClassifier(n_neighbors = 3), RandomForestClassifier()]

model_dict = dict(zip(model_names, models))

In [204]:
# Create metrics dictionary. These metrics are most relevant given most appliances are highly skewed (on or off).

metrics = ['precision','recall','f1 score']
metric_objects = [sk.metrics.precision_score,
                  sk.metrics.recall_score,
                  sk.metrics.f1_score]

metric_dict = dict(zip(metrics, metric_objects))

In [251]:
blank_data = np.zeros((len(model_names),len(metrics)))

In [307]:
results = dict(zip(houses,[{} for _ in range(6)]))
results

{'house_1': {},
 'house_2': {},
 'house_3': {},
 'house_4': {},
 'house_5': {},
 'house_6': {}}

In [308]:
df_blank = pd.DataFrame(data = blank_data, columns = metrics, index = model_names)

for house in houses:
    
    for appliance in Y_all_dict[house].columns:
        
        results[house][appliance] = df_blank.copy()

{'house_1': {'05 refrigerator':         precision  recall  f1 score
  logreg        0.0     0.0       0.0
  nb            0.0     0.0       0.0
  knn           0.0     0.0       0.0
  rf            0.0     0.0       0.0,
  '06 dishwasher':         precision  recall  f1 score
  logreg        0.0     0.0       0.0
  nb            0.0     0.0       0.0
  knn           0.0     0.0       0.0
  rf            0.0     0.0       0.0,
  '09 lighting':         precision  recall  f1 score
  logreg        0.0     0.0       0.0
  nb            0.0     0.0       0.0
  knn           0.0     0.0       0.0
  rf            0.0     0.0       0.0,
  '20 washer_dryer':         precision  recall  f1 score
  logreg        0.0     0.0       0.0
  nb            0.0     0.0       0.0
  knn           0.0     0.0       0.0
  rf            0.0     0.0       0.0},
 'house_2': {'04 lighting':         precision  recall  f1 score
  logreg        0.0     0.0       0.0
  nb            0.0     0.0       0.0
  knn         

In [256]:
Y_all_dict['house_1'].columns

Index(['05 refrigerator', '06 dishwasher', '09 lighting', '20 washer_dryer'], dtype='object')

# Train all models. WARNING: Takes >10 min.

In [310]:
for house in houses:
    print('Training for: ', house)
    # Create training and test splits using 20%
    X_train, X_test, Y_train_all, Y_test_all = \
        train_test_split(X_dict[house], Y_all_dict[house], test_size=0.2, random_state=100)

    for appliance in Y_train_all.columns:
        
        print(' '*2, 'Training for: ', appliance)
        
        # Set each Y as one appliance at a time
        Y_train = Y_train_all[appliance]
        Y_test = Y_test_all[appliance]
        
        for model in model_names:
            
            print(' '*4, 'Training model: ', model)
            start = time.time()
            
            # Train each model
            model_dict[model].fit(X_train, Y_train)
            
            # Predict Y values
            Y_pred = model_dict[model].predict(X_test)
            
            for metric in metrics:
                results[house][appliance][metric][model] = metric_dict[metric](Y_test, Y_pred)
            
            finish = time.time()
            print(' '*6, 'Finished training model:', model, 'in', finish-start, 's.')

Training for:  house_1
   Training for:  05 refrigerator
     Training model:  logreg
       Finished training model: logreg in 9.988631010055542 s.
     Training model:  nb
       Finished training model: nb in 0.22813796997070312 s.
     Training model:  knn
       Finished training model: knn in 18.200364112854004 s.
     Training model:  rf
       Finished training model: rf in 4.866636037826538 s.
   Training for:  06 dishwasher
     Training model:  logreg
       Finished training model: logreg in 12.95094609260559 s.
     Training model:  nb
       Finished training model: nb in 0.1626570224761963 s.
     Training model:  knn
       Finished training model: knn in 17.71109890937805 s.
     Training model:  rf
       Finished training model: rf in 14.195065021514893 s.
   Training for:  09 lighting
     Training model:  logreg
       Finished training model: logreg in 12.367313146591187 s.
     Training model:  nb
       Finished training model: nb in 0.16730427742004395 s.
     

In [311]:
results['house_1']

{'05 refrigerator':         precision    recall  f1 score
 logreg   0.999889  0.999988  0.999939
 nb       1.000000  0.982493  0.991169
 knn      0.999889  0.999988  0.999939
 rf       0.999939  1.000000  0.999969,
 '06 dishwasher':         precision    recall  f1 score
 logreg   0.432749  0.010045  0.019634
 nb       0.453154  0.269173  0.337733
 knn      0.670433  0.563866  0.612549
 rf       0.816483  0.521786  0.636687,
 '09 lighting':         precision    recall  f1 score
 logreg   0.941533  0.999974  0.969874
 nb       0.942531  0.963390  0.952846
 knn      0.967393  0.987388  0.977288
 rf       0.969197  0.992937  0.980923,
 '20 washer_dryer':         precision    recall  f1 score
 logreg   0.407534  0.117011  0.181818
 nb       0.390778  0.983284  0.559284
 knn      0.964611  0.938053  0.951147
 rf       0.979529  0.941003  0.959880}

# Create summary table

In [312]:
scores = ['worst case', 'average']
blank_data = np.zeros((len(model_names), len(metrics)*len(scores)))
iterables = [metrics, scores]
column_labels = pd.MultiIndex.from_product(iterables, names = ['metrics', 'type'])

summary_results = pd.DataFrame(data = blank_data, columns = column_labels, index = model_names)
summary_results

metrics,precision,precision,recall,recall,f1 score,f1 score
type,worst case,average,worst case,average,worst case,average
logreg,0.0,0.0,0.0,0.0,0.0,0.0
nb,0.0,0.0,0.0,0.0,0.0,0.0
knn,0.0,0.0,0.0,0.0,0.0,0.0
rf,0.0,0.0,0.0,0.0,0.0,0.0


In [190]:
summary_results['precision', 'average']['logreg']

0.0

In [313]:
for model in model_names:
    for metric in metrics:
        
        all_appliance_list = []
        
        for house in houses:
            for appliance in Y_all_dict[house].columns:
                
                # Create list of all results for a specific model and metric for every appliance in every house
                all_appliance_list = all_appliance_list + [results[house][appliance][metric][model]]
          
        summary_results[metric, 'worst case'][model] = min(all_appliance_list)
        summary_results[metric, 'average'][model] = np.mean(all_appliance_list)

In [314]:
summary_results

metrics,precision,precision,recall,recall,f1 score,f1 score
type,worst case,average,worst case,average,worst case,average
logreg,0.407534,0.775528,0.000138,0.610795,0.000276,0.598997
nb,0.34724,0.765889,0.145753,0.76387,0.205322,0.727799
knn,0.670433,0.897129,0.563866,0.876499,0.612549,0.886287
rf,0.757942,0.927926,0.521786,0.870415,0.636687,0.896037
