<a href="https://colab.research.google.com/github/IIF0403/Thesis/blob/main/Dataset_class.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd 
import torch
from google.colab import files
from google.colab import output
from google.colab import drive
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils
import random

In [3]:

### Function to load a UCR time_series dataset from my github ###
def load_dataset(Dataset):
  """Load one UCR time-series dataset from GitHub into a single DataFrame.

  The `<name>_TRAIN` and `<name>_TEST` files are read as header-less CSV and
  stacked (train rows first). The first CSV column is treated as the class
  label; the remaining columns are the time-series values.

  Args:
    Dataset: Name of the dataset to load. Example: "ECG5000"

  Returns:
    (data, classes) where
      data: DataFrame with columns ["Dataset", "T", <label>, <values...>].
        "Dataset" holds the dataset name and "T" the time-series length
        (both are used later, when several datasets are combined).
      classes: number of distinct class labels in the dataset.
  """
  #github URL
  url_raw = 'https://raw.githubusercontent.com/IIF0403/Thesis/main/data/'
  url_train = url_raw + Dataset+'/'+Dataset+'_TRAIN'
  url_test = url_raw + Dataset+'/'+Dataset+'_TEST'

  #Loading the data
  data_train = pd.read_csv(url_train, header=None)
  data_test = pd.read_csv(url_test, header=None)
  data = pd.concat((data_train, data_test))

  #Want all datasets to have classes as integers starting from 0.
  #Each label is mapped to its rank among the sorted unique labels. A min/max
  #rescaling would only give integers for evenly spaced labels and would
  #divide by zero for a single-class dataset.
  unique_labels, label_index = np.unique(data.iloc[:, 0].to_numpy(), return_inverse=True)
  classes = len(unique_labels)
  #Labels are kept as floats (0.0, 1.0, ...) so stored CSVs and downstream code see the same dtype as before
  data[data.columns[0]] = label_index.reshape(-1).astype(float)

  #Inserting the name of the dataset as a column (for later use, when several datasets will be combined)
  data.insert(loc=0, column="Dataset", value=Dataset)

  #Inserting the length of the time series T as a column (for later use)
  T = data.shape[1]-2 #The length of the time series: all columns except "Dataset" and the label
  data.insert(loc=1, column="T", value=T)

  return data, classes


### Function to make a big dataset out of several datasets loaded from github ###
def make_big_dataset(Datasets):
  """Load several datasets with load_dataset and stack them into one DataFrame.

  The class labels of each dataset are shifted by the number of classes of all
  datasets loaded before it, so labels from different datasets never collide.

  Args:
    Datasets: A list of the names of the datasets to load.
      Example: ["ECG5000", "FordA", "FordB"]

  Returns:
    (data, used_classes) where data is the stacked DataFrame (rows in the
    order of `Datasets`) and used_classes is the total number of classes.

  Raises:
    ValueError: if `Datasets` is empty.
  """
  if len(Datasets) == 0:
    raise ValueError("Datasets must contain at least one dataset name")

  frames = []
  used_classes = 0 #keeping track of the class-labels already used

  for name in Datasets:
    dataset, classes = load_dataset(name)

    #Shift the labels (third column) past the labels already used.
    #Going through to_numpy() on the label column alone keeps it numeric;
    #`dataset.values` would turn it into an object column because of the
    #string "Dataset" column.
    label_column = dataset.columns[2]
    dataset[label_column] = dataset.iloc[:, 2].to_numpy() + used_classes

    used_classes += classes
    frames.append(dataset)

  #One concat at the end instead of re-copying the growing frame on every iteration
  data = pd.concat(frames)

  return data, used_classes



### Dataset class ###
class Timeseries_Dataset(Dataset):
  """PyTorch Dataset over one or several UCR time-series datasets.

  The underlying DataFrame is expected to have the layout produced by
  load_dataset / make_big_dataset: column 0 = dataset name, column 1 = T
  (time-series length), column 2 = class label, columns 3.. = time-series
  values (rows shorter than the widest series carry NaN at the end).
  """

  def __init__(self, Datasets, Drive=False, Save=None, transform=None):
    """Load the data and optionally store it on Google Drive.

    Args:
      Datasets: a list with the names of the datasets; can hold one or several.
        Several datasets are combined into one big dataset. A single name
        given as a plain string is accepted and treated as a one-element list.
      Drive: True means loading the already saved dataset `Datasets[0]` from
        Google Drive; False means loading the data from github.
      Save: name of the new dataset, if it should be saved to Google Drive.
      transform: optional callable applied to every sample in __getitem__.

    Raises:
      ValueError: if `Datasets` is empty.
    """
    #A bare string would otherwise be iterated character by character
    if isinstance(Datasets, str):
      Datasets = [Datasets]
    if len(Datasets) == 0:
      raise ValueError("Datasets must contain at least one dataset name")

    ##Loading the data
    if Drive:
      #the data is saved in Google Drive, load it from there
      name = Datasets[0]
      print("Loading '",name,"' from Google Drive")
      drive.mount("/content/gdrive")
      data = pd.read_csv('/content/gdrive/My Drive/Datasets/'+name+'.csv')
      classes = len(np.unique(data.iloc[:, 2].to_numpy()))
    elif len(Datasets) > 1:
      #several datasets: make a big dataset out of them
      print("Loading and combining ", Datasets," from github")
      data, classes = make_big_dataset(Datasets)
    else:
      #only one dataset: load it from github
      name = Datasets[0]
      print("Loading '",name,"' from github")
      data, classes = load_dataset(name)

    #Save the new dataset to Google Drive as '<Save>.csv' if a name was given
    if Save is not None:
      print("Saving new dataset to google drive")
      drive.mount("/content/gdrive")
      data.to_csv('/content/gdrive/My Drive/Datasets/'+Save+'.csv', index=False)

    self.dataframe = data
    self.transform = transform
    self.classes = classes
    self.Datasets = Datasets

  def __len__(self):
    """Number of time series (rows) in the dataset."""
    return len(self.dataframe)

  def __getitem__(self, idx):
    """Return the idx'th sample.

    Without a transform the sample is a dict with keys 'time_series',
    'label', 'dataset' and 'T'; with a transform it is whatever the
    transform returns for that dict.
    """
    if torch.is_tensor(idx):
      idx = idx.tolist()

    dataframe = self.dataframe

    dataset = dataframe.iloc[idx, 0] #the dataset-name
    T = dataframe.iloc[idx, 1] #the timeseries-length
    label = dataframe.iloc[idx, 2] #the label
    time_series_with_nan = dataframe.iloc[idx, 3:].to_numpy() #the timeseries (may contain nan-values at the end)
    time_series = time_series_with_nan[:T] #keep only the first T values, dropping nan-values at the end

    #a sample is one timeseries with its corresponding label (and some extra information)
    sample = {'time_series': time_series, 'label': label, 'dataset': dataset, "T": T}

    if self.transform:
      sample = self.transform(sample)

    return sample

  def info(self):
    """Print the included datasets, the number of classes and the dataset size."""
    print("Datasets included: ", self.Datasets)
    print("Number of classes : ", self.classes)
    print("Size of dataset: ", len(self))


In [8]:

### Transformation class; segment timeseries into two augmentations
class TwoSegments(object):
  """Transformation that cuts two equally long windows out of one time series.

  The first window ("aug1") starts at `start_pos`; the second ("aug2") starts
  `window_gap` steps after the end of the first. Both have length
  int(horizon * T).
  """

  #Candidate values used when the corresponding random_* option is switched on
  POSSIBLE_HORIZONS = (0.15, 0.2, 0.25, 0.3, 0.35, 0.4)
  POSSIBLE_WINDOW_GAPS = (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10)

  def __init__(self, horizon=0.3, window_gap=1, random_startpos = False, random_horizon=False, random_window_gap=False):
    """
    Args:
      horizon: horizon*T = window_length; the length of the two augmentations
      window_gap: the gap between the two augmentations
      random_startpos: True means that the first augmentation starts at a random position
      random_horizon: True means that a random horizon is drawn for every sample
      random_window_gap: True means that a random window_gap is drawn for every sample
    """
    self.horizon = horizon
    self.window_gap = window_gap
    self.random_startpos = random_startpos
    self.random_horizon = random_horizon
    self.random_window_gap = random_window_gap

  def __call__(self, sample):
    """Split sample['time_series'] into two windows.

    Args:
      sample: dict with keys 'time_series', 'label', 'dataset' and 'T'.

    Returns:
      dict with keys 'aug1', 'aug2', 'label', 'dataset' and 'T'.
    """
    dataset = sample['dataset']
    time_series = sample['time_series']
    T = sample['T']
    label = sample['label']

    #horizon
    if self.random_horizon:
      horizon = random.choice(self.POSSIBLE_HORIZONS) #draw a random horizon
    else:
      horizon = self.horizon

    #window gap
    if self.random_window_gap:
      window_gap = random.choice(self.POSSIBLE_WINDOW_GAPS) #draw random window_gap
    else:
      window_gap = self.window_gap

    window_length = int(horizon*T) #length of each augmentation

    #finding start-position of the first augmentation
    if self.random_startpos:
      #Largest start for which both windows and the gap still fit inside the series.
      #If they do not fit at all, fall back to 0 (the same start as the non-random case).
      max_start = max(0, int(T) - (2*window_length + window_gap))
      start_pos = random.randint(0, max_start) #both ends inclusive
    else:
      start_pos = 0

    #make the two augmentations of the timeseries
    second_start = start_pos + window_length + window_gap
    augmentation_1 = time_series[start_pos : start_pos + window_length]
    augmentation_2 = time_series[second_start : second_start + window_length]

    #create a new sample with the two augmentations
    new_sample = {'aug1': augmentation_1, 'aug2': augmentation_2, 'label': label, 'dataset': dataset, "T": T}

    return new_sample

### Transformation class; convert into Tensor-data for PyTorch-use
class ToTensor(object):
  """Transformation that converts the numeric parts of a sample to torch tensors.

  Handles both sample layouts used in this notebook:
    * segmented samples with keys 'aug1' and 'aug2' (output of TwoSegments)
    * plain samples with key 'time_series'
  'dataset' and 'T' are passed through unchanged. Tensors are created on the
  GPU when one is available, otherwise on the CPU.
  """

  @staticmethod
  def _to_float_tensor(values, device):
    #np.asarray with dtype=float also converts object arrays (rows taken out of a mixed-dtype DataFrame)
    return torch.tensor(np.asarray(values, dtype=float), dtype=torch.float32, device=device)

  def __call__(self, sample):
    """Return a new sample dict with tensors in place of arrays and label.

    Args:
      sample: dict with 'label', 'dataset', 'T' and either 'aug1'/'aug2' or
        'time_series'.
    """
    dataset = sample['dataset']
    T = sample['T']

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    #int(float(...)) accepts numpy scalars as well as plain Python numbers
    label_tensor = torch.tensor(int(float(sample['label'])), dtype=torch.long, device=device)

    if 'aug1' in sample:
      #sample with two augmentations
      torch_sample = {
        'aug1': self._to_float_tensor(sample['aug1'], device),
        'aug2': self._to_float_tensor(sample['aug2'], device),
        'label': label_tensor, 'dataset': dataset, "T": T}
    else:
      #plain sample with the whole time series
      torch_sample = {
        'time_series': self._to_float_tensor(sample['time_series'], device),
        'label': label_tensor, 'dataset': dataset, "T": T}

    return torch_sample




In [9]:
##Smoke test 1: a single dataset loaded from github, no transformation
Datdat1 = ["ChlorineConcentration"]
test_1_dataset = Timeseries_Dataset(Datasets=Datdat1)
test_1_dataset.info()

##Smoke test 2: two datasets combined into one, no transformation
Datdat2 = ["ChlorineConcentration", "ECG5000"]
test_2_dataset = Timeseries_Dataset(Datasets=Datdat2)
test_2_dataset.info()



Loading ' ChlorineConcentration ' from github
Datasets included:  ['ChlorineConcentration']
Number of classes :  3
Size of dataset:  4307
Loading and combining  ['ChlorineConcentration', 'ECG5000']  from github
Datasets included:  ['ChlorineConcentration', 'ECG5000']
Number of classes :  8
Size of dataset:  9307


In [10]:
##One-off step (kept commented out): combine many UCR datasets into one big dataset and save it in google drive as "Big_dataset" for later use
#Datasets_UCR = ["ChlorineConcentration", "ECG5000", "ElectricDevices", "FordA", "FordB", "NonInvasiveFatalECG_Thorax1", "PhalangesOutlinesCorrect", "Two_Patterns", "uWaveGestureLibrary_X", "yoga"]
#Big_dataset = Timeseries_Dataset(Datasets_UCR, Save="Big_dataset") 


##Loading the big dataset that is already saved in google drive as "Big_dataset"
Dataset_big = ["Big_dataset"]
Big_dataset = Timeseries_Dataset(Datasets=Dataset_big, Drive=True)
Big_dataset.info()


Loading ' Big_dataset ' from Google Drive
Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
Datasets included:  ['Big_dataset']
Number of classes :  77
Size of dataset:  54512


In [11]:
#Checking one sample (index 5000) from the big dataset
sample = Big_dataset[5000]
series = sample['time_series']

print("Dataset: ", sample['dataset'])
print("label: ", sample['label'])
print("time_series", series )
print("length", len(series)," = ", sample['T'])


Dataset:  ECG5000
label:  3.0
time_series [-0.46884 -1.7882 -3.1189 -3.9938 -4.4052 -4.0492 -3.442 -2.3566 -1.3525
 -1.1544 -0.62167 -0.16988 -0.25256 -0.23814 -0.21724 -0.24842 -0.31426
 -0.25116 -0.28998 -0.20823000000000005 -0.30376 -0.28241 -0.17132 -0.2781
 -0.39058 -0.4749 -0.34214 -0.25839 -0.2532 -0.16116 -0.037112 -0.2593
 -0.27182 -0.21782 -0.24647 -0.24586 -0.36634 -0.18614 -0.12941 -0.17467
 -0.21508 -0.052802 -0.065706 -0.090324 -0.0058875 -0.030413 0.055523
 0.090912 0.19572 0.14973 -0.018653 0.14499 0.093002 0.30346 0.21618
 0.15542 0.06979099999999999 0.13327 0.1438 0.025846 0.092808 -0.04918
 -0.066777 0.0066759 0.043886 0.095751 -0.025595 0.055351 0.14149 0.16483
 0.20983 0.25157 0.1959 0.47397 0.51012 0.32099 0.43829 0.5027
 0.4114600000000001 0.42173 0.48473 0.35172 0.30354000000000003 0.37531
 0.50374 0.46707 0.38394 0.3919 0.37201 0.5624600000000001 0.5508 0.38089
 0.34915 0.43562 0.5854 0.86035 0.89151 1.0449 1.2377 1.4923 1.8545 2.2274
 2.3473 2.1307 1.9825 1.66

In [14]:
## Testing dataset with transformation

##Testing the Dataset class with one dataset with transformation
Datdat1 = ["ChlorineConcentration"]

dat1 = Timeseries_Dataset(Datdat1) #dataset not transformed

tens_dat1 = Timeseries_Dataset(Datdat1, transform = transforms.Compose( [TwoSegments(), ToTensor()] ) ) #dataset segmented into two augmentations and converted to tensors

#Same underlying row, once raw and once transformed.
#(A lookup in `trans_dat1` used to sit here; that name is never defined in this
#notebook and its result was unused, so the cell failed on a fresh kernel.)
sample = dat1[0]
sample_tens = tens_dat1[0]

print("time series: ", sample['time_series'])
print("aug1: ", sample_tens['aug1'])
print("aug2: ", sample_tens['aug2'])





Loading ' ChlorineConcentration ' from github
Loading ' ChlorineConcentration ' from github
time series:  [2.6173 3.2310000000000003 2.8508 2.7515 2.3457 2.2746 1.9898 1.849 1.4533
 1.3171 1.1623 0.9791 0.76748 -0.27063000000000004 1.4944 1.4457
 1.2830000000000001 1.1554 0.73141 0.53898 0.26045 0.058329 -0.30429
 -0.60378 -0.92625 -1.0126 1.8559 1.5001 1.3893 0.9980899999999999 0.68913
 0.35189000000000004 0.078764 -0.41918 -0.74724 -1.0089 -1.0009 -1.0751
 0.9516600000000001 1.4072 0.83868 0.91775 -0.18871 -0.76323 -0.57789
 -1.0485 -0.68799 -0.94221 -1.0463 -1.0259 0.41773999999999994 1.3372
 0.9633 0.65598 -0.14524 -0.58696 -0.87084 -0.89159 -1.0501 -1.0585
 -1.0934 -1.015 -1.1281 1.093 0.71907 0.46547 0.030246 -0.3837
 -0.6169399999999999 -0.7783100000000001 -0.88265 -1.0328 -0.9803 0.22255
 0.029094 -0.056766 -0.26168 -1.0202 0.5554600000000001 0.013634
 -0.44581000000000004 -0.80943 1.3638 1.2515 0.8237200000000001 0.8472
 0.43723999999999996 0.093769 -0.39741 -0.522 -0.69016 -1

In [None]:
#Testing DataLoader: batches of 4 samples, in dataset order, loaded in the main process
data_loader1 = DataLoader(
  tens_dat1,
  batch_size=4,
  shuffle=False,
  num_workers=0,
)


In [None]:
#Print the size of the batched first augmentation for every batch the loader yields
for batch in data_loader1:
  print(batch['aug1'].size())


In [None]:

#### Transfer the created dataset to Drive
#drive.mount("/content/gdrive")
#big_dataset.to_csv('/content/gdrive/My Drive/Datasets/big_dataset.csv', index=False)

#### Download the csv file
#files.download("big_dataset.csv")

#### Open the dataset from Drive
#drive.mount("/content/gdrive")
#the_data = pd.read_csv('/content/gdrive/My Drive/Datasets/big_dataset.csv')

