## INTRODUCTION:

The objective is to develop classification models for different household appliances. Load monitoring provides detailed information about electricity consumption and about the usage of individual appliances in residential buildings. Our aim is to build, for each target appliance, a classifier that detects whether the appliance is being used in each time interval. Python is used to develop the classification models because it provides a wide variety of libraries for time series analysis, feature extraction and machine learning.

In [1]:
import pandas as pd

from sklearn.metrics import PrecisionRecallDisplay

%matplotlib inline
import re
from io import StringIO
import csv
import numpy as np
import string
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
import warnings
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import f1_score, make_scorer, precision_score, recall_score
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
warnings.filterwarnings('ignore')
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix, matthews_corrcoef
from sklearn.metrics import precision_recall_curve
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import RandomForestClassifier
#from sklearn import cross_validation
from sklearn.model_selection import cross_val_score
import pandas as pd
import numpy as np
# from sklearn.datasets import make_classification
from sklearn.svm import SVC
# from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.experimental import enable_halving_search_cv  # noqa
from tsfresh import extract_features
import itertools
import torch # for getting information if GPU is present



# Load the raw data: the labelled training set first, then the unlabelled
# test set. Both files are expected in the current working directory.
train_df, test_df = (
    pd.read_csv(csv_name)
    for csv_name in ("train_data_withlabels.csv", "test_data_nolabels.csv")
)


# Function for splitting the train data and putting it in dictionary
def creating_data_dicts(train_data, train_data_dict,
                        test_data, test_data_dict):
    '''
    This function creates one training dataframe and one test entry per
    appliance and inserts them in the given dictionaries.

    For every appliance in ac, ev, oven, wash and dryer the appliance column
    of the train data is renamed to 'label', and the feature columns plus
    'label' are stored under the key '<appliance>_trainData'. The test data is
    stored as-is under the key '<appliance>_testData'.

    Params:
    1. train_data: The train dataframe holding the feature columns and one
                   column per appliance.
    2. train_data_dict: The dictionary in which the train dataframes are
                        inserted (modified in place).
    3. test_data: The test dataframe.
    4. test_data_dict: The dictionary in which the test data is inserted
                       (modified in place).

    Returns
    1. train_data_dict: The train dictionary with one entry per appliance.
    2. test_data_dict: The test dictionary with one entry per appliance.
    '''

    # Appliances for which a separate dataset is created
    class_list = ['ac', 'ev', 'oven', 'wash', 'dryer']

    # Columns kept for every appliance; 'label' is the renamed appliance column
    selected_columns = ['Unnamed: 0', 'load', 'hourofday', 'dayofweek',
                        'dif', 'absdif', 'max', 'var', 'entropy',
                        'nonlinear', 'hurst', 'label']

    for appliance in class_list:
        # rename returns a new dataframe, so train_data itself is not changed
        train_data_label = train_data.rename(columns={appliance: 'label'})
        train_data_dict[appliance + '_trainData'] = \
            train_data_label[selected_columns]

    for appliance in class_list:
        # NOTE: every key refers to the SAME test_data object (no copy is
        # made), so an in-place change through one key is visible in all.
        test_data_dict[appliance + '_testData'] = test_data

    return train_data_dict, test_data_dict



# Start from two empty dictionaries and let creating_data_dicts fill them
# with one entry per appliance.
train_data_dict, test_data_dict = creating_data_dicts(
    train_df, dict(),
    test_df, dict(),
)


# Function to perform cleaning on the dataframe
def cleaning_df(df,df_type = 'Train Dataframe'):

    '''
    This function is used to perform the basic dataframe cleaning:
        * Removing duplicate rows (skipped when df_type is 'Test Dataframe')
        * Removing the rows that contain NA values in any column

        Counts before and after each step will be displayed.

    The dataframe passed in is not modified; a cleaned dataframe is returned.

    Params:
    1. df: The dataframe that needs to be cleaned.
    2. df_type: The type of df that requires cleaning i.e. 'Train Dataframe'
                or 'Test Dataframe'. It is also used in the printed messages.

    Returns
    1. df: The cleaned dataframe.
    '''

    print(f'Number of rows before removing duplicates from {df_type}:', \
          len(df))

    if df_type != 'Test Dataframe':
        # Duplicates are removed only for the train dataframe. The non
        # in-place call keeps the caller's dataframe untouched.
        df = df.drop_duplicates()
        print(f'Number of rows after removing duplicates from {df_type}:', \
              len(df))

    print(f'Number of rows before removing missing values from {df_type}:',\
          len(df))

    # Removing the rows with NAs (applies to both train and test dataframes)
    df = df.dropna()

    print(f'Number of rows after removing missing values from {df_type}:',\
          len(df))

    print('\n')

    return df

# Clean every per-appliance dataframe and store the cleaned result back under
# the same key: first all training frames, then all test frames.
for data_dict, frame_kind in ((train_data_dict, 'Train Dataframe'),
                              (test_data_dict, 'Test Dataframe')):
    for key in list(data_dict):
        # Keys look like '<appliance>_trainData' / '<appliance>_testData'
        appliance = key.split('_')[0]
        print(f'The below statements are for the dataframe for {appliance}')
        data_dict[key] = cleaning_df(data_dict[key], df_type = frame_kind)
    
    

def one_hot_encoding(df, column_list, prefixes_list):

    '''
    This function replaces categorical columns by dummy (one-hot) columns.

    For every column in column_list the dummy columns are created with the
    matching prefix from prefixes_list. The dummy column of the first category
    is dropped, so a column with k categories yields k-1 dummy columns. The
    original categorical columns are removed and the dummy columns are
    appended after the remaining columns.

    Params:
    1. df: The dataframe to encode. It is not modified.
    2. column_list: The names of the categorical columns to encode.
    3. prefixes_list: The prefixes for the dummy columns, one per entry of
                      column_list.

    Returns
    1. df: A new dataframe with the encoded columns.

    Raises
    1. ValueError: If column_list and prefixes_list differ in length.
    '''

    if len(column_list) != len(prefixes_list):
        raise ValueError('column_list and prefixes_list must have the '
                         'same length')

    # drop_first=True removes the dummy column of the first category
    dummy_frames = [pd.get_dummies(df[column], prefix=prefix, drop_first=True)
                    for column, prefix in zip(column_list, prefixes_list)]

    remaining = df.drop(columns=list(column_list))

    return pd.concat([remaining] + dummy_frames, axis = 1)


def average_window_appliance_was_on(train_df):

    '''
    This function returns the average number of consecutive rows for which
    the appliance was on, i.e. the average length of the runs of rows whose
    'label' equals 1.

    The runs of 1s are counted directly, so the result is also correct when
    the series starts and/or ends while the appliance is on.

    Params:
    1. train_df: The dataframe with a 'label' column (1 = appliance on).

    Returns
    1. The rounded average run length as an integer.

    Raises
    1. ValueError: If no row has 'label' equal to 1.
    '''

    is_on = train_df['label'] == 1

    # A run of 1s starts at every "on" row whose previous row is not "on".
    # The first row has no previous row, which counts as "not on".
    previous_on = is_on.shift(1, fill_value=False).astype(bool)
    number_of_runs = int((is_on & ~previous_on).sum())

    if number_of_runs == 0:
        raise ValueError("'label' has no rows equal to 1, the average "
                         "window cannot be computed")

    return round(int(is_on.sum()) / number_of_runs)

    
def create_ts_features(df, window):

    '''
    This function adds tsfresh features computed on the 'load' column.

    The 'Unnamed: 0' column is renamed to 'time' and an 'id' column is built
    from the dataframe index as int(index / window) + 1. The configured
    tsfresh features are extracted per 'id' (sorted by 'time') and merged back
    on 'id'. Columns containing any NA are then dropped, as are the helper
    columns 'time' and 'id'.

    Params:
    1. df: The dataframe with at least the columns 'Unnamed: 0' and 'load'.
    2. window: The divisor applied to the index to build the 'id' column.

    Returns
    1. The dataframe with the extracted feature columns added.
    '''

    frame = df.rename(columns = {'Unnamed: 0': 'time'})
    frame['id'] = ((frame.index/window) + 1).astype(int)

    # Feature calculators that take no parameters
    parameter_free = ('variance_larger_than_standard_deviation',
                      'abs_energy',
                      'mean_abs_change',
                      'mean_change',
                      'mean_second_derivative_central',
                      'median',
                      'standard_deviation',
                      'variation_coefficient',
                      'variance',
                      'root_mean_square',
                      'benford_correlation')
    trend_attributes = ('pvalue', 'rvalue', 'intercept', 'slope', 'stderr')

    fc_settings = dict.fromkeys(parameter_free)
    # Feature calculators that are evaluated once per parameter set
    fc_settings['time_reversal_asymmetry_statistic'] = \
        [{'lag': lag} for lag in (1, 2, 3)]
    fc_settings['c3'] = [{'lag': lag} for lag in (1, 2, 3)]
    fc_settings['large_standard_deviation'] = [{'r': r} for r in (0.05, 0.1)]
    fc_settings['linear_trend'] = \
        [{'attr': attr} for attr in trend_attributes]
    fc_settings['linear_trend_timewise'] = \
        [{'attr': attr} for attr in trend_attributes]

    features = extract_features(frame[['id','time', 'load']],
                                column_id="id", column_sort="time",
                                n_jobs = 20,
                                default_fc_parameters=fc_settings)

    # Expose the index of the feature frame as an 'id' column so that it can
    # be used as the merge key (presumably the index holds the id values)
    features['id'] = features.index

    merged = pd.merge(frame, features, how="left", on='id')

    return merged.dropna(axis=1).drop(['time','id'], axis=1)


def logisticRegression_model(x_train, y_train, x_test, label):

    '''
    This function fits a LogisticRegression model for one appliance label.

    Params:
    1. x_train: The training features.
    2. y_train: The training labels.
    3. x_test: Not used; kept so that existing calls keep working.
    4. label: The appliance name, used only in the progress messages.

    Returns
    1. model: The fitted LogisticRegression model.
    '''

    model = LogisticRegression(random_state=111, n_jobs = -1, C = 1,
                               max_iter = 2000, solver='newton-cg')

    model_name = model.__class__.__name__

    print(f'Building {model_name} model for label: {label}\n')

    model.fit(x_train, y_train)

    print(f'Finished building {model_name} model for label: {label}\n')

    return model


def catboost_model(x_train, y_train, x_test, label, task_type = None):

    '''
    This function fits a CatBoostClassifier model for one appliance label.

    The positive class is weighted with half the ratio of negative to
    positive training labels.

    Params:
    1. x_train: The training features.
    2. y_train: The training labels (0 / 1).
    3. x_test: Not used; kept so that existing calls keep working.
    4. label: The appliance name, used only in the progress messages.
    5. task_type: 'GPU' or 'CPU'. When None, 'GPU' is used if CUDA is
                  available and 'CPU' otherwise.

    Returns
    1. model: The fitted CatBoostClassifier model.

    Raises
    1. ValueError: If y_train contains no positive (1) label.
    '''

    ylist = np.asarray(y_train)
    n_negative = int((ylist == 0).sum())
    n_positive = int((ylist == 1).sum())

    if n_positive == 0:
        raise ValueError('y_train has no positive labels, the class weight '
                         'cannot be computed')

    if task_type is None:
        # Training on the GPU fails on machines without CUDA, so fall back
        # to the CPU there
        task_type = 'GPU' if torch.cuda.is_available() else 'CPU'

    model = CatBoostClassifier(random_state=111, task_type = task_type,
                               class_weights=[1.0, (n_negative/n_positive)/2],
                               iterations=50, learning_rate=0.1, max_depth = 13)

    model_name = model.__class__.__name__

    print(f'Building {model_name} model for label: {label}\n')

    model.fit(x_train, y_train)

    print(f'Finished building {model_name} model for label: {label}\n')

    return model




# CatBoost is trained on the GPU only when CUDA is available
use_cuda = torch.cuda.is_available()
cb_model_type = 'GPU' if use_cuda else 'CPU'

# One single-column dataframe of predictions per appliance
pred_labels_list = []

# Both dictionaries were filled in the same appliance order, so walking them
# in lockstep pairs every training dataframe with its test dataframe.
for (key, tr_df), x_test in zip(train_data_dict.items(),
                                test_data_dict.values()):

    # Keys look like '<appliance>_trainData'; the first part is the label
    label = key.split('_')[0]

    tr_df = one_hot_encoding(tr_df, ['dayofweek'], ['dayofweek'])

    # The window learned from the training labels is reused for the test data
    window = average_window_appliance_was_on(tr_df)
    x_train = create_ts_features(tr_df, window)

    x_test = one_hot_encoding(x_test, ['dayofweek'], ['dayofweek'])
    x_test = create_ts_features(x_test, window)

    # Train and test are encoded and pruned independently, so their columns
    # can differ in order and in content. The model must see the same
    # features in the same order when fitting and when predicting, hence only
    # the shared features are used, in one fixed (sorted) order.
    feature_cols = [col for col in x_train.columns.difference(['label'])
                    if col in x_test.columns]
    y_train = x_train['label'].to_numpy()

    if label == 'ev':
        model = LogisticRegression(C = 1, max_iter = 2000, solver = 'newton-cg',
                                   random_state=111, n_jobs = -1)
    else:
        # The positive class is weighted with the negative/positive ratio
        # to account for the class imbalance
        n_negative = int((y_train == 0).sum())
        n_positive = int((y_train == 1).sum())
        # Already tuned hyperparameters
        model = CatBoostClassifier(random_state=111, task_type = cb_model_type,
                                   class_weights=[1.0, n_negative/n_positive],
                                   iterations = 50, learning_rate = 0.1, max_depth = 13)

    model_name = model.__class__.__name__

    print('Building ' + model_name + ' model for label: ' + label + '\n')

    model.fit(x_train[feature_cols], y_train)

    print('Finished building ' + model_name + ' model for label: ' + label + '\n')

    # Do the prediction; ravel guards against a column-shaped result
    predictions = np.asarray(model.predict(x_test[feature_cols])).ravel()
    pred_labels_list.append(pd.DataFrame({label: predictions}))

# One column per appliance, preceded by a 1-based 'id' column
pred_labels_df = pd.concat(pred_labels_list, axis = 1)
pred_labels_df.insert(0, "id", list(range(1,(len(pred_labels_df) + 1))))
pred_labels_df.to_csv('pred_labels.csv',index = False)
pred_labels_df.head() # optional, can be removed later

The below statements are for the dataframe for ac
Number of rows before removing duplicates from Train Dataframe: 417720
Number of rows after removing duplicates from Train Dataframe: 417720
Number of rows before removing missing values from Train Dataframe: 417720
Number of rows after removing missing values from Train Dataframe: 417720


The below statements are for the dataframe for ev
Number of rows before removing duplicates from Train Dataframe: 417720
Number of rows after removing duplicates from Train Dataframe: 417720
Number of rows before removing missing values from Train Dataframe: 417720
Number of rows after removing missing values from Train Dataframe: 417720


The below statements are for the dataframe for oven
Number of rows before removing duplicates from Train Dataframe: 417720
Number of rows after removing duplicates from Train Dataframe: 417720
Number of rows before removing missing values from Train Dataframe: 417720
Number of rows after removing missing values fro

Feature Extraction: 100%|██████████| 100/100 [00:25<00:00,  3.92it/s]
Feature Extraction: 100%|██████████| 99/99 [00:05<00:00, 17.74it/s]


Building CatBoostClassifier model for label: ac

0:	learn: 0.4153349	total: 1.4s	remaining: 1m 8s
1:	learn: 0.2518350	total: 3.55s	remaining: 1m 25s
2:	learn: 0.1585842	total: 5.11s	remaining: 1m 20s
3:	learn: 0.1073081	total: 6.45s	remaining: 1m 14s
4:	learn: 0.0795874	total: 7.63s	remaining: 1m 8s
5:	learn: 0.0627084	total: 8.85s	remaining: 1m 4s
6:	learn: 0.0491076	total: 9.98s	remaining: 1m 1s
7:	learn: 0.0400897	total: 11.1s	remaining: 58.5s
8:	learn: 0.0341203	total: 12.4s	remaining: 56.3s
9:	learn: 0.0300008	total: 13.6s	remaining: 54.3s
10:	learn: 0.0270095	total: 14.7s	remaining: 52.2s
11:	learn: 0.0246721	total: 16.1s	remaining: 51s
12:	learn: 0.0230699	total: 17.4s	remaining: 49.5s
13:	learn: 0.0213968	total: 18.5s	remaining: 47.6s
14:	learn: 0.0199259	total: 19.9s	remaining: 46.4s
15:	learn: 0.0192003	total: 21.3s	remaining: 45.2s
16:	learn: 0.0182339	total: 22.3s	remaining: 43.3s
17:	learn: 0.0175672	total: 23.4s	remaining: 41.6s
18:	learn: 0.0167914	total: 24.8s	remaining

Feature Extraction: 100%|██████████| 100/100 [00:08<00:00, 12.37it/s]
Feature Extraction: 100%|██████████| 96/96 [00:01<00:00, 53.94it/s]


Building LogisticRegression model for label: ev

Finished building LogisticRegression model for label: ev



Feature Extraction: 100%|██████████| 100/100 [01:21<00:00,  1.23it/s]
Feature Extraction: 100%|██████████| 100/100 [00:19<00:00,  5.05it/s]


Building CatBoostClassifier model for label: oven

0:	learn: 0.4883140	total: 1.2s	remaining: 58.8s
1:	learn: 0.3545935	total: 2.3s	remaining: 55.2s
2:	learn: 0.2692829	total: 3.51s	remaining: 55s
3:	learn: 0.2126644	total: 4.63s	remaining: 53.3s
4:	learn: 0.1768594	total: 5.69s	remaining: 51.2s
5:	learn: 0.1486666	total: 6.68s	remaining: 49s
6:	learn: 0.1282920	total: 7.72s	remaining: 47.4s
7:	learn: 0.1117640	total: 8.81s	remaining: 46.3s
8:	learn: 0.0984935	total: 9.83s	remaining: 44.8s
9:	learn: 0.0894134	total: 10.9s	remaining: 43.7s
10:	learn: 0.0811792	total: 12s	remaining: 42.7s
11:	learn: 0.0745617	total: 13.2s	remaining: 41.7s
12:	learn: 0.0703409	total: 14.4s	remaining: 40.9s
13:	learn: 0.0642889	total: 15.5s	remaining: 39.9s
14:	learn: 0.0605929	total: 16.5s	remaining: 38.5s
15:	learn: 0.0570588	total: 17.5s	remaining: 37.1s
16:	learn: 0.0536220	total: 18.5s	remaining: 35.9s
17:	learn: 0.0514037	total: 19.6s	remaining: 34.9s
18:	learn: 0.0485824	total: 20.7s	remaining: 33.7

Feature Extraction: 100%|██████████| 100/100 [01:22<00:00,  1.21it/s]
Feature Extraction: 100%|██████████| 100/100 [00:20<00:00,  4.91it/s]


Building CatBoostClassifier model for label: wash

0:	learn: 0.6001776	total: 1.26s	remaining: 1m 1s
1:	learn: 0.5287165	total: 2.52s	remaining: 1m
2:	learn: 0.4807273	total: 4.02s	remaining: 1m 2s
3:	learn: 0.4472775	total: 5.33s	remaining: 1m 1s
4:	learn: 0.4150371	total: 6.8s	remaining: 1m 1s
5:	learn: 0.3865736	total: 8.04s	remaining: 59s
6:	learn: 0.3734562	total: 9.35s	remaining: 57.5s
7:	learn: 0.3609680	total: 10.8s	remaining: 56.7s
8:	learn: 0.3418632	total: 12s	remaining: 54.4s
9:	learn: 0.3282036	total: 13.1s	remaining: 52.3s
10:	learn: 0.3203937	total: 14.2s	remaining: 50.3s
11:	learn: 0.3058220	total: 15.3s	remaining: 48.6s
12:	learn: 0.2973277	total: 16.5s	remaining: 47s
13:	learn: 0.2873282	total: 17.7s	remaining: 45.4s
14:	learn: 0.2730401	total: 18.8s	remaining: 44s
15:	learn: 0.2643064	total: 19.9s	remaining: 42.3s
16:	learn: 0.2517835	total: 21s	remaining: 40.8s
17:	learn: 0.2445017	total: 22.2s	remaining: 39.4s
18:	learn: 0.2376645	total: 23.3s	remaining: 38s
19:	le

Feature Extraction: 100%|██████████| 99/99 [00:06<00:00, 15.37it/s]
Feature Extraction: 100%|██████████| 95/95 [00:01<00:00, 64.54it/s]


Building CatBoostClassifier model for label: dryer

0:	learn: 0.5750133	total: 1.06s	remaining: 52.1s
1:	learn: 0.5075876	total: 2.08s	remaining: 49.9s
2:	learn: 0.4372198	total: 3.63s	remaining: 56.8s
3:	learn: 0.3767262	total: 5.46s	remaining: 1m 2s
4:	learn: 0.3334218	total: 7.03s	remaining: 1m 3s
5:	learn: 0.2909662	total: 8.35s	remaining: 1m 1s
6:	learn: 0.2539043	total: 9.59s	remaining: 58.9s
7:	learn: 0.2321505	total: 10.8s	remaining: 57s
8:	learn: 0.2056101	total: 12.3s	remaining: 55.8s
9:	learn: 0.1858559	total: 13.8s	remaining: 55.3s
10:	learn: 0.1704684	total: 15.1s	remaining: 53.6s
11:	learn: 0.1570879	total: 16.6s	remaining: 52.4s
12:	learn: 0.1413557	total: 17.8s	remaining: 50.6s
13:	learn: 0.1271780	total: 19.1s	remaining: 49.2s
14:	learn: 0.1208051	total: 20.6s	remaining: 48.1s
15:	learn: 0.1128903	total: 22s	remaining: 46.7s
16:	learn: 0.1058970	total: 23.2s	remaining: 45s
17:	learn: 0.0998455	total: 24.3s	remaining: 43.2s
18:	learn: 0.0957092	total: 25.4s	remaining: 4

Unnamed: 0,id,ac,ev,oven,wash,dryer
0,1,0,1,0,0,1
1,2,0,1,0,0,1
2,3,0,1,0,0,1
3,4,0,1,0,1,1
4,5,0,1,0,0,1


In [4]:
import pandas as pd
import numpy as np
import csv
import warnings
import sklearn
from catboost import CatBoostClassifier
warnings.filterwarnings('ignore')
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from tsfresh import extract_features
import torch # for getting information if GPU is present
from copy import deepcopy


# Read the labelled training data and the unlabelled testing data csv files
train_df, test_df = map(
    pd.read_csv,
    ("train_data_withlabels.csv", "test_data_nolabels.csv"),
)


# Function for splitting the train data and putting it in dictionary
def creating_data_dicts(train_data, train_data_dict):

    '''
    This function creates one training dataframe per appliance and inserts
    it in the given dictionary.

    For every appliance in ac, ev, oven, wash and dryer the appliance column
    of the train data is renamed to 'label', and the feature columns plus
    'label' are stored under the key '<appliance>_trainData'.

    Params:
    1. train_data: The train dataframe holding the feature columns and one
                   column per appliance.
    2. train_data_dict: The dictionary in which the train dataframes are
                        inserted (modified in place).

    Returns
    1. train_data_dict: The train dictionary with one entry per appliance.
    '''

    # Appliances for which a separate training dataset is created
    class_list = ['ac', 'ev', 'oven', 'wash', 'dryer']

    # Columns kept for every appliance; 'label' is the renamed appliance column
    selected_columns = ['Unnamed: 0', 'load', 'hourofday', 'dayofweek',
                        'dif', 'absdif', 'max', 'var', 'entropy',
                        'nonlinear', 'hurst', 'label']

    for appliance in class_list:
        # rename returns a new dataframe, so train_data itself is not changed
        train_data_label = train_data.rename(columns={appliance: 'label'})
        train_data_dict[appliance + '_trainData'] = \
            train_data_label[selected_columns]

    return train_data_dict


# Build the training dictionary (one entry per appliance label), starting
# from an empty dictionary
train_data_dict = creating_data_dicts(train_df, {})

  

def one_hot_encoding(df, column_list, prefixes_list):

    '''
    This function one-hot encodes the given categorical columns.

    Each column in column_list is turned into dummy columns named with the
    matching prefix from prefixes_list. The dummy column of the first category
    is left out, so k categories give k-1 dummy columns. The encoded columns
    replace the original categorical columns and are appended after the
    remaining columns.

    Params:
    1. df: The dataframe to encode. It is not modified.
    2. column_list: The names of the categorical columns to encode.
    3. prefixes_list: The prefixes for the dummy columns, one per entry of
                      column_list.

    Returns
    1. df: A new dataframe with the encoded columns.

    Raises
    1. ValueError: If column_list and prefixes_list differ in length.
    '''

    if len(column_list) != len(prefixes_list):
        raise ValueError('column_list and prefixes_list must have the '
                         'same length')

    encoded_parts = [df.drop(columns=list(column_list))]

    for column, prefix in zip(column_list, prefixes_list):
        # drop_first=True leaves out the dummy column of the first category
        encoded_parts.append(pd.get_dummies(df[column], prefix=prefix,
                                            drop_first=True))

    return pd.concat(encoded_parts, axis = 1)


def average_window_appliance_was_on(train_df):

    '''
    This function computes the average length, in rows, of the periods in
    which the appliance was on (consecutive rows with 'label' equal to 1).

    The periods are counted directly from their starting rows, so a series
    that begins or ends with the appliance on is handled correctly.

    Params:
    1. train_df: The dataframe with a 'label' column (1 = appliance on).

    Returns
    1. The rounded average period length as an integer.

    Raises
    1. ValueError: If no row has 'label' equal to 1.
    '''

    on_mask = train_df['label'] == 1

    # An "on" period starts wherever a row is on and the row before it is
    # not; the very first row is treated as following an "off" row.
    period_starts = on_mask & ~on_mask.shift(1, fill_value=False).astype(bool)
    n_periods = int(period_starts.sum())

    if n_periods == 0:
        raise ValueError("'label' has no rows equal to 1, the average "
                         "window cannot be computed")

    rows_on = int(on_mask.sum())

    return round(rows_on / n_periods)

    
def create_ts_features(df, window):

    '''
    This function extends the dataframe with tsfresh features of 'load'.

    Steps:
        * 'Unnamed: 0' is renamed to 'time'.
        * An 'id' column is derived from the index as int(index / window) + 1.
        * The configured tsfresh features are extracted from 'load' per 'id',
          sorted by 'time', and merged back on 'id'.
        * Columns containing any NA are dropped, then 'time' and 'id'.

    Params:
    1. df: The dataframe with at least the columns 'Unnamed: 0' and 'load'.
    2. window: The divisor applied to the index to build the 'id' column.

    Returns
    1. The dataframe with the extracted feature columns added.
    '''

    # Rename the 'Unnamed' column to 'time'
    data = df.rename(columns = {'Unnamed: 0': 'time'})

    # Rows whose index falls in the same multiple of `window` share one id
    data['id'] = ((data.index/window) + 1).astype(int)

    # Parameter sets shared by several feature calculators
    lag_params = [{'lag': 1}, {'lag': 2}, {'lag': 3}]
    trend_params = [{'attr': name} for name in
                    ('pvalue', 'rvalue', 'intercept', 'slope', 'stderr')]

    settings = {
        **dict.fromkeys(['variance_larger_than_standard_deviation',
                         'abs_energy',
                         'mean_abs_change',
                         'mean_change',
                         'mean_second_derivative_central',
                         'median',
                         'standard_deviation',
                         'variation_coefficient',
                         'variance',
                         'root_mean_square',
                         'benford_correlation'], None),
        'time_reversal_asymmetry_statistic': list(lag_params),
        'c3': list(lag_params),
        'large_standard_deviation': [{'r': 0.05}, {'r': 0.1}],
        'linear_trend': list(trend_params),
        'linear_trend_timewise': list(trend_params),
    }

    # Extract the features from the id / time / load subset
    timeseries = data[['id','time', 'load']]
    extracted = extract_features(timeseries, column_id="id",
                                 column_sort="time", n_jobs = 20,
                                 default_fc_parameters=settings)

    # Copy the index of the feature frame into an 'id' column to merge on
    # (presumably the index holds the id values)
    extracted['id'] = extracted.index

    # Attach the per-id features to every row with that id
    data = data.merge(extracted, how="left", on='id')

    # Drop every column that contains an NA
    data = data.dropna(axis=1)

    # Drop the helper columns
    return data.drop(['time','id'], axis=1)



# Check if gpu is available; CatBoost is trained on the CPU otherwise
use_cuda = torch.cuda.is_available()
cb_model_type = 'GPU' if use_cuda else 'CPU'

# Initialize list of predictions (one single-column dataframe per label)
pred_labels_list = []

# The one hot encoding of the testing data does not depend on the label, so
# it is done once. one_hot_encoding returns a new dataframe; test_df itself
# is left unchanged.
x_test_encoded = one_hot_encoding(test_df, ['dayofweek'], ['dayofweek'])

# Looping over the train_data_dict dictionary
for key, tr_df in train_data_dict.items():

    # Getting the appliance label from the train key ('<label>_trainData')
    label = key.split('_')[0]

    # Perform one hot encoding for the categorical column in the training data
    tr_df = one_hot_encoding(tr_df, ['dayofweek'], ['dayofweek'])

    # Get the average window size for the appliance
    window = average_window_appliance_was_on(tr_df)
    # Create additional ts features for given training data
    x_train = create_ts_features(tr_df, window)

    print('Window size for Label : '+ str(label) + ' is =' +str(window))

    # Create additional ts features for the test data with the same window
    x_test = create_ts_features(x_test_encoded, window)

    # Train and test features are built independently, so their columns can
    # differ in order and in content. The model must see the same features in
    # the same order when fitting and when predicting, hence only the shared
    # features are used, in one fixed (sorted) order.
    feature_cols = [col for col in x_train.columns.difference(['label'])
                    if col in x_test.columns]
    y_train = x_train['label'].to_numpy()

    # For 'ev' appliance we create a LogisticRegression model, for all other labels we use CatBoost model
    if label == 'ev':
        # Create logistic regression model with the optimized hyperparameters
        model = LogisticRegression(C = 1, max_iter = 2000, solver = 'newton-cg',
                                   random_state=111, n_jobs = -1)
    else:
        # We add class weights for the labels to account for the huge class
        # imbalance present in each label.
        n_negative = int((y_train == 0).sum())
        n_positive = int((y_train == 1).sum())
        # CatBoost model with the already tuned hyperparameters
        model = CatBoostClassifier(random_state=111, task_type = cb_model_type,
                                   class_weights=[1.0, n_negative/n_positive],
                                   iterations = 50, learning_rate = 0.1, max_depth = 13)
    # Get model name
    model_name = model.__class__.__name__

    print('Building ' + model_name + ' model for label: ' + label + '\n')

    # Fit the model on the training data
    model.fit(x_train[feature_cols], y_train)

    print('Finished building ' + model_name + ' model for label: ' + label + '\n')
    print('Start prediction using ' + model_name + ' model for label: ' + label + '\n')
    # Get the predictions for the given label and append it to the predictions list;
    # ravel guards against a column-shaped result
    predictions = np.asarray(model.predict(x_test[feature_cols])).ravel()
    pred_labels_list.append(pd.DataFrame({label: predictions}))
    print('Finished prediction using ' + model_name + ' model for label: ' + label + '\n')

# Create a dataframe from the predictions list (one column per label)
pred_labels_df = pd.concat(pred_labels_list, axis = 1)
# Add a 1-based 'id' column as the first column
pred_labels_df.insert(0, "id", list(range(1,(len(pred_labels_df) + 1))))
# Write the predictions dataframe to a csv
pred_labels_df.to_csv('pred_labels.csv',index = False)
pred_labels_df.head() # optional, can be removed later

Feature Extraction: 100%|██████████| 100/100 [00:23<00:00,  4.25it/s]


Window size for Label : ac is =21


Feature Extraction: 100%|██████████| 99/99 [00:05<00:00, 18.23it/s]


Building CatBoostClassifier model for label: ac

0:	learn: 0.4153349	total: 1.19s	remaining: 58.2s
1:	learn: 0.2518350	total: 2.33s	remaining: 56s
2:	learn: 0.1585842	total: 3.6s	remaining: 56.3s
3:	learn: 0.1073081	total: 4.75s	remaining: 54.6s
4:	learn: 0.0795874	total: 5.81s	remaining: 52.3s
5:	learn: 0.0627084	total: 7.09s	remaining: 52s
6:	learn: 0.0491076	total: 8.21s	remaining: 50.4s
7:	learn: 0.0400897	total: 9.37s	remaining: 49.2s
8:	learn: 0.0341203	total: 10.5s	remaining: 47.7s
9:	learn: 0.0300008	total: 11.6s	remaining: 46.4s
10:	learn: 0.0270095	total: 12.8s	remaining: 45.3s
11:	learn: 0.0246721	total: 13.8s	remaining: 43.8s
12:	learn: 0.0230699	total: 14.9s	remaining: 42.4s
13:	learn: 0.0213968	total: 16s	remaining: 41.1s
14:	learn: 0.0199259	total: 17s	remaining: 39.7s
15:	learn: 0.0192003	total: 18.1s	remaining: 38.5s
16:	learn: 0.0182339	total: 19.2s	remaining: 37.3s
17:	learn: 0.0175672	total: 20.3s	remaining: 36s
18:	learn: 0.0167914	total: 21.3s	remaining: 34.8s
19:

Feature Extraction: 100%|██████████| 100/100 [00:06<00:00, 15.71it/s]


Window size for Label : ev is =69


Feature Extraction: 100%|██████████| 96/96 [00:01<00:00, 58.49it/s]


Building LogisticRegression model for label: ev

Finished building LogisticRegression model for label: ev

Start prediction using LogisticRegression model for label: ev

Finished prediction using LogisticRegression model for label: ev



Feature Extraction: 100%|██████████| 100/100 [01:01<00:00,  1.62it/s]


Window size for Label : oven is =6


Feature Extraction: 100%|██████████| 100/100 [00:15<00:00,  6.58it/s]


Building CatBoostClassifier model for label: oven

0:	learn: 0.4883140	total: 929ms	remaining: 45.5s
1:	learn: 0.3545935	total: 1.84s	remaining: 44.3s
2:	learn: 0.2692829	total: 2.79s	remaining: 43.8s
3:	learn: 0.2126644	total: 3.82s	remaining: 44s
4:	learn: 0.1768594	total: 4.76s	remaining: 42.9s
5:	learn: 0.1486666	total: 5.82s	remaining: 42.7s
6:	learn: 0.1282920	total: 6.88s	remaining: 42.3s
7:	learn: 0.1117640	total: 7.82s	remaining: 41s
8:	learn: 0.0984935	total: 8.75s	remaining: 39.9s
9:	learn: 0.0894134	total: 9.71s	remaining: 38.8s
10:	learn: 0.0811792	total: 10.7s	remaining: 37.8s
11:	learn: 0.0745617	total: 11.7s	remaining: 36.9s
12:	learn: 0.0703409	total: 12.6s	remaining: 36s
13:	learn: 0.0642889	total: 13.6s	remaining: 34.9s
14:	learn: 0.0605929	total: 14.6s	remaining: 34s
15:	learn: 0.0570588	total: 15.5s	remaining: 32.9s
16:	learn: 0.0536220	total: 16.4s	remaining: 31.8s
17:	learn: 0.0514037	total: 17.4s	remaining: 30.9s
18:	learn: 0.0485824	total: 18.3s	remaining: 29.9

Feature Extraction: 100%|██████████| 100/100 [01:01<00:00,  1.64it/s]


Window size for Label : wash is =6


Feature Extraction: 100%|██████████| 100/100 [00:17<00:00,  5.68it/s]


Building CatBoostClassifier model for label: wash

0:	learn: 0.6001776	total: 1.19s	remaining: 58.1s
1:	learn: 0.5287165	total: 2.27s	remaining: 54.4s
2:	learn: 0.4807273	total: 3.57s	remaining: 56s
3:	learn: 0.4472775	total: 5.31s	remaining: 1m 1s
4:	learn: 0.4150371	total: 7.05s	remaining: 1m 3s
5:	learn: 0.3865736	total: 8.77s	remaining: 1m 4s
6:	learn: 0.3734562	total: 10.4s	remaining: 1m 3s
7:	learn: 0.3609680	total: 11.9s	remaining: 1m 2s
8:	learn: 0.3418632	total: 13.2s	remaining: 1m
9:	learn: 0.3282036	total: 15s	remaining: 59.9s
10:	learn: 0.3203937	total: 16.6s	remaining: 58.9s
11:	learn: 0.3058220	total: 18.2s	remaining: 57.6s
12:	learn: 0.2973277	total: 19.6s	remaining: 55.7s
13:	learn: 0.2873282	total: 20.8s	remaining: 53.6s
14:	learn: 0.2730401	total: 22s	remaining: 51.4s
15:	learn: 0.2643064	total: 23.2s	remaining: 49.3s
16:	learn: 0.2517835	total: 24.4s	remaining: 47.3s
17:	learn: 0.2445017	total: 25.5s	remaining: 45.3s
18:	learn: 0.2376645	total: 26.6s	remaining: 43.4s

Feature Extraction: 100%|██████████| 99/99 [00:04<00:00, 19.86it/s]


Window size for Label : dryer is =80


Feature Extraction: 100%|██████████| 95/95 [00:01<00:00, 64.98it/s]


Building CatBoostClassifier model for label: dryer

0:	learn: 0.5750133	total: 1.37s	remaining: 1m 7s
1:	learn: 0.5075876	total: 2.33s	remaining: 56s
2:	learn: 0.4372198	total: 3.25s	remaining: 51s
3:	learn: 0.3767262	total: 4.55s	remaining: 52.3s
4:	learn: 0.3334218	total: 5.57s	remaining: 50.1s
5:	learn: 0.2909662	total: 6.63s	remaining: 48.6s
6:	learn: 0.2539043	total: 7.71s	remaining: 47.4s
7:	learn: 0.2321505	total: 8.8s	remaining: 46.2s
8:	learn: 0.2056101	total: 9.81s	remaining: 44.7s
9:	learn: 0.1858559	total: 10.8s	remaining: 43.4s
10:	learn: 0.1704684	total: 12s	remaining: 42.7s
11:	learn: 0.1570879	total: 13.4s	remaining: 42.3s
12:	learn: 0.1413557	total: 14.5s	remaining: 41.2s
13:	learn: 0.1271780	total: 15.6s	remaining: 40s
14:	learn: 0.1208051	total: 16.6s	remaining: 38.8s
15:	learn: 0.1128903	total: 17.6s	remaining: 37.5s
16:	learn: 0.1058970	total: 18.5s	remaining: 36s
17:	learn: 0.0998455	total: 19.4s	remaining: 34.5s
18:	learn: 0.0957092	total: 20.3s	remaining: 33.1s


Unnamed: 0,id,ac,ev,oven,wash,dryer
0,1,0,1,0,0,1
1,2,0,1,0,0,1
2,3,0,1,0,0,1
3,4,0,1,0,1,1
4,5,0,1,0,0,1
