In [13]:
import os
import random
from itertools import chain
from time import localtime, strftime
from typing import Union

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import tqdm
from IPython.core.interactiveshell import InteractiveShell

# Render matplotlib figures directly in the notebook output.
%matplotlib inline
# Default figure size (width, height) applied to every plot in this notebook.
plt.rcParams['figure.figsize'] = [20, 12]

# Show the result of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# NOTE(review): `!cd` is executed in a temporary subshell, so this presumably
# does not change the kernel's working directory (`%cd` would) — confirm intent.
# The path is also specific to one machine.
!cd C:\Users\Josef\Google Drive\Uni\Master\3 Wintersemester 20-21\Seminar Information Systems\Contribution

In [3]:
# Directory containing the AMPds CSV files. The trailing slash matters because
# file names are appended to this string by plain concatenation.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
AMPds_PATH = "C:/Users/Josef/Google Drive/Uni/Master/3 Wintersemester 20-21/Seminar Information Systems/Contribution/data/AMPds/"

In [233]:
# Electricity readings, indexed by the UNIX_TS column interpreted as epoch
# seconds (the UNIX_TS column itself is kept in the frame).
amp = (
    pd.read_csv(AMPds_PATH + "Electricity_P.csv")
    .pipe(lambda frame: frame.set_index(pd.to_datetime(frame["UNIX_TS"], unit="s")))
)

# Hourly weather table, loaded as-is.
weather = pd.read_csv(AMPds_PATH + "Climate_HourlyWeather.csv")

In [290]:
class DataStreamerNilm:
    """Returns batches of a given dataset.

    Takes a given dataset, keeps only the mains column and the selected
    appliance columns, cuts the rows into consecutive non-overlapping windows
    and serves them batch by batch through ``generate_batch``. Note that this
    class applies no preprocessing, so the input data needs to be processed
    beforehand. Trailing rows that do not fill a whole window, and trailing
    windows that do not fill a whole batch, are dropped.
    """

    def __init__(
        self,
        dataset,
        mains_col: str,
        appliance_cols: Union[str, list],
        batch_size: int = 8192,
        window_size: int = 1,
        shuffle: bool = False,
        chunksize: int = -1,
        random_state: int = None,
        validation_size: float = 0.2
    ):
        """Initialize NILM data streamer.

        Args:
            dataset: pd.DataFrame of mains and appliance data.
              TODO: Load file from disk.
            mains_col: Name of the column containing the mains readings.
            appliance_cols: Either single name or list of appliance names to
              return.
            batch_size: Number of windows per batch.
            window_size: In case sequential training data is needed, each
              batch item consists of a time window with given length. Leave at
              1 to return independent singular observations.
            shuffle: Shuffle the windows before batching. The data is first
              split into window-sized continuous chunks and only the chunks
              are shuffled, so the order inside a window is preserved.
            chunksize: Currently not implemented (only stored). Number of
              observations to load from disk.
              TODO: If file is loaded from memory, enable chunkwise loading.
            random_state: Seed for NumPy's global random generator, used to
              get reproducible shuffling results. ``None`` leaves the
              generator untouched.
            validation_size: Fraction of the data to be used for validation.
              Currently only stored; no split is performed.

        Raises:
            ValueError: If ``batch_size`` or ``window_size`` is smaller than 1.
        """
        if batch_size < 1 or window_size < 1:
            raise ValueError("batch_size and window_size must both be >= 1")

        self.mains_col = mains_col
        self.appliance_cols = appliance_cols
        self.batch_size = batch_size
        self.window_size = window_size
        self.shuffle = shuffle
        self.chunksize = chunksize
        self.random_state = random_state
        self.validation_size = validation_size

        # Compare against None so that a seed of 0 is honoured as well.
        if self.random_state is not None:
            np.random.seed(self.random_state)

        # We only need to keep mains + selected appliances in memory. The
        # mains column is listed first because reset_iterator splits off the
        # first column of every window.
        if isinstance(appliance_cols, str):
            selected = [mains_col, appliance_cols]
        else:
            selected = [mains_col] + list(appliance_cols)
        self.dataset = dataset.filter(selected)

        self.reset_iterator(self.dataset)

    def generate_batch(self):
        """Return the next batch as a ``(target, features)`` pair.

        ``target`` is the list of first-column (mains) windows and
        ``features`` the list of matching windows of the remaining columns.

        Raises:
            StopIteration: When all batches have been consumed; call
              ``reset_iterator`` to start over.
        """
        target, features = next(self.dataset_iterator)
        return target, features

    def _dataset_to_list(self, data: pd.DataFrame, mains_col: str):
        """Unfinished helper: separates mains from appliances, returns None.

        NOTE(review): nothing in this class calls this method and it does not
        return the values it computes.
        """
        # Steps:
        # 1 Split into rows or chunks
        # 2 Create numpy arrays with format
        # batch_size x window_length x appliances/data

        mains = data[mains_col].values
        appliances = data.drop(columns=[mains_col])

    def _dataset_iterator(self, data: list):
        """
        Yields batches of data. Expects list of batches, each containing two
        lists, one with mains windows and one with corresponding feature
        windows.
        """
        yield from data

    def reset_iterator(self, data: pd.DataFrame = None) -> None:
        """Rebuild the batch iterator from scratch.

        Args:
            data: Frame to stream, with the mains readings in the first
              column. Defaults to the dataset selected at construction time,
              so the method can be called without arguments between epochs.
        """
        if data is None:
            data = self.dataset

        n_cols = data.shape[1]

        # Cut off the trailing rows that do not fill a complete window so the
        # reshape below works.
        n_windows = data.shape[0] // self.window_size
        values = data.to_numpy()[: n_windows * self.window_size]
        windows = values.reshape((n_windows, self.window_size, n_cols))

        if self.shuffle:
            # The reshaped array can be a view of the DataFrame's own memory;
            # shuffle a private copy so the stored dataset is never permuted
            # (and read-only views do not make the in-place shuffle fail).
            windows = windows.copy()
            np.random.shuffle(windows)

        # Keep only as many windows as fill complete batches.
        n_batches = n_windows // self.batch_size
        windows = windows[: n_batches * self.batch_size]

        # First column of each window is the target, the rest are features.
        targets = windows[:, :, :1]
        features = windows[:, :, 1:]

        batches = []
        for start in range(0, n_batches * self.batch_size, self.batch_size):
            stop = start + self.batch_size
            batches.append([list(targets[start:stop]),
                            list(features[start:stop])])

        self.dataset_iterator = self._dataset_iterator(batches)

In [291]:
# Stream the AMPds readings in their original order: "MHE" is the mains
# column, "FGE" and "UNE" are the appliance columns, and every batch holds
# two windows of eight consecutive rows.
ts = DataStreamerNilm(
    amp,
    "MHE",
    ["FGE", "UNE"],
    batch_size=2,
    window_size=8,
    shuffle=False,
)

In [None]:
class Seq2Point(nn.Module):
    """Network skeleton holding its layers in ``self.layers``.

    At the moment only one layer is registered: a 2-D convolution mapping
    1 input channel to 30 output channels with a (10, 1) kernel and unit
    stride. No ``forward`` method is defined in this class.
    """

    def __init__(self):
        super().__init__()

        first_conv = nn.Conv2d(1, 30, kernel_size=(10, 1), stride=(1, 1))
        self.layers = nn.ModuleList([first_conv])

In [307]:
import torch.nn as nn

In [None]:
class Trainer:
    """Training loop that optimises two embedding tables with Adam.

    NOTE(review): this class depends on the names ``vocab_size``,
    ``embedding_dim`` and ``SkipGramLoss``, none of which are defined in the
    visible notebook, and it unpacks three values from
    ``dataloader.generate_batch()`` while ``DataStreamerNilm.generate_batch``
    returns two. It looks like it was written for a different (skip-gram)
    data loader — confirm before using it with ``DataStreamerNilm``.
    """

    def __init__(self,
                 dataloader,
                 device="cpu",
                 validation_split=0.2):
        """Create the embedding models, optimizer and loss.

        Args:
            dataloader: Object providing ``reset_iterator()`` and
              ``generate_batch()``; the latter must raise ``StopIteration``
              once all batches of an epoch have been served.
            device: Torch device the models and batches are moved to.
            validation_split: Stored only; no validation split is performed.
        """
        self.device = device
        self.dataloader = dataloader
        self.validation_split = validation_split

        # Initialize input and output embeddings (w_in and w_out)
        self.center_model = nn.Embedding(vocab_size, embedding_dim).to(self.device)
        self.context_model = nn.Embedding(vocab_size, embedding_dim).to(self.device)

        # Pass iterable of both center and context embeddings to optimizer
        self.optim = torch.optim.Adam(params=chain(self.center_model.parameters(), self.context_model.parameters()),
                                      lr=0.001,
                                      betas=(0.9, 0.999),
                                      eps=1e-08)

        self.loss = SkipGramLoss()

    def train(self, epochs=5):
        """Run ``epochs`` passes over the dataloader and print the mean loss.

        Only ``StopIteration`` from ``generate_batch`` ends an epoch; any
        other exception propagates so real errors are not hidden. If an epoch
        yields no batch at all, its loss is reported as ``nan``.
        """
        self.epochs = epochs

        print(f"Training for {self.epochs} epochs started at {strftime('%H:%M:%S', localtime())}")

        for epoch in range(self.epochs):

            # Reset iterator (as the loader runs through its iterator completely during each epoch)
            self.dataloader.reset_iterator()

            # Used for outputting running statistics
            running_loss = 0.0
            batch = 0

            while True:
                # An exhausted dataloader is the only expected way to leave the loop.
                try:
                    center, context, negative_samples = self.dataloader.generate_batch()
                except StopIteration:
                    break

                # Reset gradient
                self.optim.zero_grad()

                # Convert indices to tensors on the target device (needed for lookup in embedding layer)
                center = torch.tensor(center).to(self.device)
                context = torch.tensor(context).to(self.device)
                negative_samples = torch.tensor(negative_samples).to(self.device)

                # Get embeddings from embedding layers
                center_embed = self.center_model(center)
                context_embed = self.context_model(context)
                negative_samples_embed = self.context_model(negative_samples.long())

                # Calculate loss
                loss = self.loss(center_embed, context_embed, negative_samples_embed)

                # Backprop
                loss.backward()

                # Make a step
                self.optim.step()

                # Save running loss and batch count for output
                running_loss += loss.item()
                batch += 1

            # Avoid dividing by zero when the dataloader produced no batch.
            mean_loss = running_loss / batch if batch else float("nan")
            print(f"Finished epoch {epoch+1} at {strftime('%H:%M:%S', localtime())}\t Loss: {mean_loss:9.6f}")

# Use the GPU when one is available and fall back to the CPU otherwise, so the
# cell also runs on machines without CUDA.
trainer = Trainer(dataloader=ts,
                  device="cuda" if torch.cuda.is_available() else "cpu",
                  validation_split=0.2)

trainer.train(epochs=10)