In [3]:

## classic pydata stack
import os 
import numpy as np
from tqdm import tqdm
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf

%matplotlib inline 

plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (15,7)



## torch
import torch
from torch.utils.data import Dataset
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

## SEEDING

# Seed PyTorch's random number generator so torch-side randomness (e.g. weight
# initialisation) is repeatable between runs. Only torch is seeded here; NumPy is not.
torch.manual_seed(1)


# This flag is not read by any of the visible cells.
# NOTE(review): presumably toggles rebuilding the processed dataset from the raw .npy files — confirm.
REBUILD_DATA = True

In [4]:
# Load the two raw recordings; the following cells index each array per series (arr[0], arr[:10]).
# NOTE(review): allow_pickle=True unpickles Python objects stored in the file, which can run
# arbitrary code — only load .npy files that come from a trusted source.
arr04 = np.load("AA00400AA.npy",allow_pickle=True)
arr64 = np.load("AA66466AA.npy",allow_pickle=True)

In [14]:
# Peek at the first series stored in arr64: a 2-column array, one row per sample.
arr64[0]

array([[ 0.08    , 62.162514],
       [ 0.09    , 56.76618 ],
       [ 0.1     , 51.997746],
       [ 0.11    , 48.05742 ],
       [ 0.12    , 44.999798],
       [ 0.13    , 42.879486],
       [ 0.14    , 41.623676],
       [ 0.15    , 41.095875],
       [ 0.16    , 41.095875],
       [ 0.17    , 41.405277],
       [ 0.18    , 41.78748 ],
       [ 0.19    , 42.02408 ],
       [ 0.2     , 41.91488 ],
       [ 0.21    , 41.368877],
       [ 0.22    , 40.30417 ],
       [ 0.23    , 38.76626 ],
       [ 0.24    , 36.809746],
       [ 0.25    , 34.57113 ],
       [ 0.26    , 32.196014],
       [ 0.27    , 29.8664  ],
       [ 0.28    , 27.727884],
       [ 0.29    , 25.926073],
       [ 0.3     , 24.533764],
       [ 0.31    , 23.596457],
       [ 0.32    , 23.132355],
       [ 0.33    , 23.068653],
       [ 0.34    , 23.359856],
       [ 0.35    , 23.905859],
       [ 0.36    , 24.615664],
       [ 0.37    , 25.39827 ],
       [ 0.38    , 26.171774],
       [ 0.39    , 26.899778],
       [

In [15]:
# Shape of that first series: (number of samples, 2).
arr64[0].shape

(463, 2)

In [16]:
# Take the first arr04 series and split it into its two columns.
obs = arr04[0]
time, current = obs[:, 0], obs[:, 1]

In [9]:
# Scratch check: flatten a list of lists by one level.
res = [[1, 2], [3, 4]]
[item for row in res for item in row]


[1, 2, 3, 4]

In [20]:
# Scratch check: appending an int array to an empty (float64) array yields floats.
res = np.concatenate((np.array([]), np.array([1, 2])), axis=None)
np.concatenate((res, np.array([1, 2])), axis=0)

array([1., 2., 1., 2.])

In [30]:
# divmod(a, b) returns the pair (a // b, a % b); it is used below to size the blocks.
divmod(7,5)

(1, 2)

In [None]:
# divmod needs two arguments: calling it bare raises TypeError and would stop a
# "Restart & Run All". Show its documentation instead of calling it.
print(divmod.__doc__)

In [34]:
def extract_features(instance, num_blocks):
    """Slice `instance` into `num_blocks` equal blocks plus a trailing remainder and flatten them.

    This scratch version computes no statistics: every slice is passed through
    unchanged, so the result holds the values of `instance` in order, flattened to 1D.

    Args:
        instance (numpy.ndarray): series to slice along its first axis
        num_blocks (int): number of equally sized blocks to cut before the remainder

    Returns:
        numpy.ndarray: 1D array made of all blocks followed by the remainder
    """
    step = len(instance) // num_blocks

    # start from an empty float array so the result dtype is promoted the same way
    # as when concatenating onto np.array([])
    pieces = [np.array([])]

    # the num_blocks full-size blocks
    pieces.extend(instance[k * step:(k + 1) * step] for k in range(num_blocks))

    # whatever is left after the last full block
    pieces.append(instance[step * num_blocks:])

    return np.concatenate(pieces, axis=None)

In [35]:
# Sanity check: 7 samples cut into 5 blocks of one sample, plus a remainder of two.
test = np.arange(1, 8)
extract_features(test, num_blocks=5)

array([1., 2., 3., 4., 5., 6., 7.])

In [61]:
# Scratch check: apply a list of summary functions to a small array containing an outlier.
# NOTE(review): the name `list` rebinds the builtin `list` for the rest of the session, and a
# later cell reads this variable by that name — rename both together when cleaning up.
list = np.array([1,2,3,4,4000])
func = [np.mean,np.median,np.std,len]

# print each summary value on its own line
for f in func:
    print(f(list))

802.0
3.0
1599.000312695404
5


In [62]:
# Feature vector of the toy series: 2 blocks, label 0.
# The instance gets its own name so that the builtin `input` is not rebound.
# NOTE(review): `Input` is defined in a cell further down; on a fresh kernel that cell must run first.
toy_input = Input(list, 2, 0)
toy_input.input

tensor([1.5000, 1.5000, 0.5000, 1.0000, 2.0000, 2.0000, 3.5000, 3.5000, 0.5000,
        3.0000, 4.0000, 2.0000])

In [59]:
class Input():
    """Feature representation of one raw time series, together with its class label.

    Attributes:
        label: the label passed to the constructor, stored unchanged
        input (torch.Tensor): flat 1D float32 tensor of block features, ordered block by
            block (6 features per block) for every series returned by `transform`
    """

    def __init__(self, raw_series, num_blocks, label):
        """ Initializes an input object from a raw time series i.e. an input suitable to feed to a recurrent neural network

        Args:
            raw_series (numpy array of shape (num_timesteps,2)): raw time series from npy data i.e. arr[0] where arr = np.load("data.npy")
            num_blocks (int): number of "feature blocks" into which the time series will be sliced i.e. the number of times we need to feed
            the LSTM to train on the entire time series
            label (int): Whether it was a "00" backbone (label:0) or a "66" backbone (label:1)

        Raises:
            ValueError: if num_blocks < 1 or the series has fewer samples than num_blocks
        """

        self.label = label
        self.input = self.process(raw_series, num_blocks)


    def process(self, raw_series, num_blocks):
        """ Function that does the entire processing of going from raw time series to a suitable input to feed to a recurrent neural network

        Args:
            raw_series (numpy array of shape (num_timesteps,2)): raw time series from npy data i.e. arr[0] where arr = np.load("data.npy")
            num_blocks (int): number of "feature blocks" into which the time series will be sliced i.e. the number of times we need to feed
            the LSTM to train on the entire time series

        Returns:
            torch.Tensor: flat 1D float32 tensor of features from a single raw time series instance
        """

        ## chunk every transformed series into blocks and extract the features of each block
        per_instance = [
            self.extract_features(instance, num_blocks)
            for instance in self.transform(raw_series)
        ]

        ## join everything once; the leading empty array keeps the result float64
        ## (and valid) even if transform returned no series
        res = np.concatenate([np.array([])] + per_instance, axis=None)

        return torch.Tensor(res)


    def transform(self, raw_series):
        """ Given a raw time series, outputs several transformations applied to it
            Transformations may be filtering, projecting, ...
            Currently only the unmodified series is returned.

        Args:
            raw_series (numpy.ndarray): the raw time series

        Returns:
            List(numpy.ndarray): list of all transformations
        """

        return [raw_series]

    def extract_features(self, instance, num_blocks):

        """ From a time series, divides it into num_blocks blocks and from each block, extract numerical features usable for a neural network

        Every block holds len(instance) // num_blocks samples. Samples left over after
        the last full block are not used, so the output length is the same for every series.

        Args:
            instance (numpy.ndarray): array containing numerical values, sliced along its first axis
            num_blocks (int): number of "feature blocks" into which the time series will be sliced i.e. the number of times we need to feed
            the LSTM to train on the entire time series

        Returns:
            numpy.ndarray: 1D array of length num_blocks*num_features_per_block containing all the features from a time series

        Raises:
            ValueError: if num_blocks < 1 or instance has fewer samples than num_blocks
        """

        if num_blocks < 1:
            raise ValueError(f"num_blocks must be at least 1, got {num_blocks}")

        length = len(instance)
        block_size = length // num_blocks

        # an empty block has no min/max, so fail early with a readable message
        if block_size == 0:
            raise ValueError(
                f"cannot split a series of {length} samples into {num_blocks} non-empty blocks"
            )

        # features (mean, std, length, ...) of each block, in block order
        block_features = [
            self.features(instance[block_size * i: block_size * (i + 1)])
            for i in range(num_blocks)
        ]

        return np.concatenate(block_features, axis=None)

    def features(self, instance):
        """
        From a block of a time series, extracts numerical features usable for a neural network

        The statistics are taken over all elements of the block (no axis is given), and
        the length is the number of entries along the first axis.
        NOTE(review): for a 2-column (time, current) block this pools both columns into
        one statistic — confirm this is intended rather than using the current column only.

        Args:
            instance (numpy.ndarray): array containing numerical values

        Returns:
            numpy.ndarray: 1D float64 array [mean, median, std, min, max, length]
        """

        # list of functions applied to the array for feature extraction
        functions = [np.mean, np.median, np.std, np.min, np.max, len]

        return np.array([func(instance) for func in functions], dtype=np.float64)

In [None]:
from torch.utils.data import Dataset

class PolymerDataset(Dataset):
    """Dataset of block-feature tensors built from one .npy file per class.

    Each series of each file is converted with `Input(series, num_blocks, label).input`.
    NOTE(review): the label of a series is taken to be the position of its file in
    `data_paths` (e.g. ["00 file", "66 file"] gives labels 0 and 1) — confirm this matches
    the intended labelling.
    """

    def __init__(self, data_paths, num_blocks):
        """
        Args:
            data_paths (list of str): paths of the .npy files, one per class
            num_blocks (int): number of blocks each series is sliced into
        """
        super().__init__()
        self.process(data_paths, num_blocks)

    def __len__(self):
        """Number of series kept in the dataset."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Returns the (feature tensor, label) pair stored at position idx."""
        return self.data[idx], self.labels[idx]


    def process(self, data_paths, num_blocks):
        """Loads every file and fills self.data / self.labels.

        Args:
            data_paths (list of str): paths of the .npy files, one per class
            num_blocks (int): number of blocks each series is sliced into
        """
        self.data = []
        self.labels = []

        for label, data_path in enumerate(data_paths):
            # NOTE(review): allow_pickle=True can execute arbitrary code from the file;
            # only load files from a trusted source.
            raw_data = np.load(data_path, allow_pickle=True)

            for raw_series in raw_data:
                # a series with fewer samples than blocks cannot be sliced into
                # non-empty blocks, so it is left out
                if len(raw_series) < num_blocks:
                    continue
                self.data.append(Input(raw_series, num_blocks, label).input)
                self.labels.append(label)

        ## TODO: balance the dataset by removing signals that are too short or too long

In [63]:
# Feature vectors of the first ten arr04 series (3 blocks each, label 0).
# The instance is named `sample` so that the builtin `input` is not rebound.
for series in arr04[:10]:
    sample = Input(series, 3, 0)
    print(sample.input)

tensor([1.7267e+01, 7.9033e+00, 1.8510e+01, 1.0000e-02, 9.8117e+01, 3.2900e+02,
        2.1010e+01, 1.1798e+01, 1.7708e+01, 3.3000e+00, 5.3896e+01, 3.2900e+02,
        2.2852e+01, 1.1092e+01, 1.7905e+01, 6.5900e+00, 9.9822e+01, 3.2900e+02])
tensor([2.3154e+01, 1.2368e+01, 2.8816e+01, 1.0000e-02, 1.0380e+02, 5.2000e+01,
        1.5590e+01, 1.0491e+01, 1.6001e+01, 5.3000e-01, 4.5435e+01, 5.2000e+01,
        1.9771e+01, 7.3575e+00, 2.5505e+01, 1.0500e+00, 1.0255e+02, 5.2000e+01])
tensor([1.8614e+01, 8.1489e+00, 1.7053e+01, 1.0000e-01, 5.5625e+01, 6.4700e+02,
        2.1664e+01, 1.2965e+01, 1.3983e+01, 6.5700e+00, 5.4387e+01, 6.4700e+02,
        2.5568e+01, 1.8845e+01, 1.2416e+01, 1.0625e+01, 5.7936e+01, 6.4700e+02])
tensor([4.4198e+01, 4.1954e+01, 4.4227e+01, 1.0000e-02, 9.5292e+01, 1.2000e+01,
        4.5637e+01, 4.2562e+01, 4.5517e+01, 1.3000e-01, 9.4654e+01, 1.2000e+01,
        4.8245e+01, 4.7347e+01, 4.7958e+01, 2.5000e-01, 9.9557e+01, 1.2000e+01])
tensor([1.7423e+01, 5.7177e+00, 1.70

In [6]:
class LSTM(nn.Module):
    """Single-layer LSTM followed by a linear layer that scores two classes."""

    def __init__(self, input_dim, hidden_dim):
        """
        Args:
            input_dim (int): number of features per block (size of each LSTM input)
            hidden_dim (int): size of the LSTM hidden state
        """
        # super() must be called to get the proxy object; `super.__init__()` raises TypeError
        super().__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim)
        self.fc1 = nn.Linear(hidden_dim, 2)


    def forward(self, input):
        """ Forward pass of our network

        Args:
            input (torch.Tensor): should be our current time series preprocessed with shape (num_blocks, num_features)
            where num_blocks is the number of blocks in which we have divided our time series and num_features is the number of features per block

        Returns:
            torch.Tensor: log-probabilities of shape (num_blocks, 2), one row per block
        """
        num_blocks = input.shape[0]
        ## nn.LSTM expects a 3D tensor (sequence, batch, features): use a batch of size 1
        lstm_out, _ = self.lstm(input.view(num_blocks, 1, -1))
        ## drop the batch axis, then pass every step's hidden state through the linear layer and log-softmax
        last = self.fc1(lstm_out.view(num_blocks, -1))
        scores = F.log_softmax(last, dim=1)

        return scores
