<a href="https://colab.research.google.com/github/JacopoBartoli/vas_regression/blob/main/preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#1) Install packages and organize imports.

In this section we install and import the needed packages. Then we mount our Google Drive.

In [None]:
!pip install tensorflow-addons

In [None]:
import tensorflow as tf
import tensorflow_addons as tfa
import tensorflow_datasets as tfds
import tensorboard
import numpy as np
import pandas as pd
from tqdm import tqdm
import time
import datetime
import math
import sklearn.preprocessing
import copy

Useful paths.

In [None]:
# Path to the directory that holds the dataset .csv files.
# It is located under '/content/gdrive', the mount point of the drive.
# File names are appended to it by plain string concatenation, so it has to
# end with a '/'.
DATASET_DIR = '/content/gdrive/My Drive/IVA/data/'

Mount the drive.

In [None]:
# Mount your drive at '/content/gdrive' to access the dataset.
# The dataset files have to be reachable from your drive under the folder
# that DATASET_DIR points to.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


#2) Manage the train set.
In this section we manipulate and extract the data.

##2.1) Load the train set.
Define the name of the dataset used for 
training.


In [None]:
# Name of the .csv file (inside DATASET_DIR) that holds the train set before
# any resampling.
UNSAMPLED_NAME = 'train-velocity-662.csv'

Load the train set from a .csv file.

In [None]:
# Read the train set into a DataFrame and show its first rows.
df = pd.read_csv(f'{DATASET_DIR}{UNSAMPLED_NAME}')
print(df.head())

## 2.2) Extract the labels from the dataset.

First of all we scale the features of the dataset. The labels are scaled too: they are mapped from the range [0, 10] to the range [0, 1].

In [None]:
# Define the scalers:
#  - a min-max scaler, mapping to [0, 1], is used for the labels;
#  - a standard scaler is used for all the other parameters.
min_max_scaler = sklearn.preprocessing.MinMaxScaler(feature_range=(0,1))
std_scaler = sklearn.preprocessing.StandardScaler()

# Show the data before scaling.
print(df.columns)
print(df.head())
print(list(df))

# Every column is a feature to scale, except the sequence index, the frame
# index and the label.
feature_to_scale = list(df.columns)
for not_a_feature in ('Sequenza', 'Frame', 'Label'):
  feature_to_scale.remove(not_a_feature)

# Fit the standard scaler on the features and scale them in a single step.
df[feature_to_scale] = std_scaler.fit_transform(df[feature_to_scale])

# Scale the labels.
df[['Label']] = min_max_scaler.fit_transform(df[['Label']])

Index(['Sequenza', 'Frame', 'Vel0', 'Vel1', 'Vel2', 'Vel3', 'Vel4', 'Vel5',
       'Vel6', 'Vel7', 'Vel8', 'Vel9', 'Vel10', 'Vel11', 'Vel12', 'Vel13',
       'Vel14', 'Vel15', 'Vel16', 'Vel17', 'Vel18', 'Vel19', 'Vel20', 'Vel21',
       'Vel22', 'Vel23', 'Vel24', 'Vel25', 'Vel26', 'Vel27', 'Vel28', 'Vel29',
       'Vel30', 'Vel31', 'Vel32', 'Vel33', 'Vel34', 'Vel35', 'Vel36', 'Vel37',
       'Vel38', 'Vel39', 'Vel40', 'Vel41', 'Vel42', 'Vel43', 'Vel44', 'Vel45',
       'Vel46', 'Vel47', 'Vel48', 'Vel49', 'Vel50', 'Vel51', 'Vel52', 'Vel53',
       'Vel54', 'Vel55', 'Vel56', 'Vel57', 'Vel58', 'Vel59', 'Vel60', 'Vel61',
       'Vel62', 'Vel63', 'Vel64', 'Vel65', 'Label'],
      dtype='object')
   Sequenza  Frame      Vel0      Vel1  ...     Vel63     Vel64     Vel65  Label
0         0      0  0.070404  0.156512  ...  0.130682  0.264413  0.229226      0
1         0      1  0.125513  0.114344  ...  0.120285  0.138199  0.126977      0
2         0      2  0.253132  0.231857  ...  0.081278  0.

In [None]:
# Show the first rows again, now that the features and labels are scaled.
print(df.head())

   Sequenza  Frame      Vel0      Vel1  ...     Vel63     Vel64     Vel65  Label
0         0      0 -0.560851 -0.318372  ... -0.067525  0.503878  0.438662    0.0
1         0      1 -0.426654 -0.428096  ... -0.117698 -0.059046 -0.070476    0.0
2         0      2 -0.115885 -0.122321  ... -0.305934 -0.180287 -0.182083    0.0
3         0      3 -0.275672 -0.271911  ... -0.401472 -0.364537 -0.397344    0.0
4         0      4 -0.173521 -0.211368  ... -0.451282 -0.463410 -0.472155    0.0

[5 rows x 69 columns]


## 2.3) Oversampling or Downsampling.

Perform the oversampling and downsampling operations.


In [None]:
# Define some constants needed for the sampling operation.
# chunk_dim is passed to the sampling functions as the number of frames of
# each resampled sequence.
chunk_dim = 230
# Passed to compute_oversampling as its `excluded_percentage` argument.
tail_exclusion_percentage = 10

num_columns = len(df.columns)
# Remove from the count the label column and the frame and sequence indexes.
num_features = num_columns - 3

We extract the id of each sequence.

In [None]:
# Collect the distinct sequence ids, keeping the order of first appearance
# (dict keys are unique and preserve insertion order).
sequences_number = list(dict.fromkeys(df['Sequenza'].tolist()))

Define the utility functions needed to perform the downsampling and the oversampling operations.

In [None]:
def compute_downsampling(chunk_dim, sequence, padding_item):
  """Split a long sequence into interleaved chunks of `chunk_dim` rows.

  With `n` chunks, chunk `i` takes the rows `i, i + n, i + 2n, ...`, so every
  chunk spans the whole sequence at a lower sampling rate and no row is
  used twice.

  When the sequence length is not a multiple of `chunk_dim`:
    * if the remainder is smaller than half a chunk, the first `remainder`
      rows are discarded;
    * otherwise the sequence is padded at its head with `padding_item` up to
      the next multiple of `chunk_dim`, and one extra chunk is produced.

  Args:
    chunk_dim: number of rows of every produced chunk.
    sequence: pandas DataFrame with the rows of a single sequence.
    padding_item: pandas Series used as filler row; its index has to match
      the columns of `sequence`.

  Returns:
    A list of chunks; each chunk is a list of `chunk_dim` pandas Series.
  """
  seq_length = len(sequence)
  remainder = seq_length % chunk_dim
  num_chunks = seq_length // chunk_dim

  if remainder == 0:
    source, offset = sequence, 0
  elif remainder < chunk_dim / 2:
    # Remove the exceeding elements if they are too few.
    source, offset = sequence, remainder
  else:
    # Pad the head so that the length becomes a multiple of chunk_dim.
    padding = [padding_item] * (chunk_dim - remainder)
    source = pd.concat([pd.DataFrame(padding), sequence], ignore_index=True)
    offset = 0
    num_chunks += 1

  # The stride has to be the number of chunks actually produced, otherwise
  # the chunks overlap and the tail of the sequence is never sampled.
  return [
      [source.iloc[offset + i + j * num_chunks] for j in range(chunk_dim)]
      for i in range(num_chunks)
  ]

def compute_oversampling(chunk_dim, sequence, padding_item, excluded_percentage):
  """Stretch a short sequence to exactly `chunk_dim` rows.

  The head and the tail of the sequence (`excluded_percentage` percent of the
  rows each) are copied once. Every row in between is repeated the same
  number of times, and the rows that are still missing to reach `chunk_dim`
  are filled with `padding_item`, placed right after the head.

  Args:
    chunk_dim: number of rows of the produced sequence.
    sequence: pandas DataFrame with the rows of a single sequence, shorter
      than `chunk_dim`.
    padding_item: pandas Series used as filler row.
    excluded_percentage: percentage (0-100) of rows, at each end of the
      sequence, that are not oversampled.

  Returns:
    A list of `chunk_dim` pandas Series.

  Raises:
    ZeroDivisionError: if the excluded head and tail cover the whole
      sequence, so that no row is left to oversample.
  """
  seq_length = len(sequence)
  # Number of rows kept as they are at each end of the sequence.
  excluded_item = math.floor(seq_length * excluded_percentage / 100)
  # How many times each middle row is repeated, and how many padding rows
  # are needed to reach chunk_dim exactly.
  sampling_factor, change = divmod(
      chunk_dim - 2 * excluded_item, seq_length - 2 * excluded_item)

  oversampled = [sequence.iloc[i] for i in range(excluded_item)]
  oversampled.extend([padding_item] * change)
  # `i` is already an absolute position inside the sequence.
  for i in range(excluded_item, seq_length - excluded_item):
    for _ in range(sampling_factor):
      oversampled.append(sequence.iloc[i])
  for i in range(seq_length - excluded_item, seq_length):
    oversampled.append(sequence.iloc[i])

  return oversampled

def create_padding_item(sequence, num_features):
  """Build the filler row used to pad `sequence`.

  The row carries the 'Sequenza' and 'Label' values of the first row of
  `sequence`, while the second entry and the `num_features` entries after it
  are all set to 0.

  Args:
    sequence: pandas DataFrame with the rows of a single sequence.
    num_features: number of zero entries placed between the second entry
      and the label.

  Returns:
    A pandas Series indexed by the columns of `sequence`.
  """
  first_row = sequence.iloc[0]
  values = [first_row['Sequenza'], 0, *([0] * num_features), first_row['Label']]
  return pd.Series(values, index=sequence.columns)

Now we iterate over the sequence ids and, for each sequence, we perform an oversampling or a downsampling operation.

In [None]:
# Each element of data is a frame (a pandas Series) of a resampled sequence.
# Every resampled sequence gets a new, progressive sequence number.
data = []
new_seq_number = 0
for seq_id in tqdm(sequences_number):
  sequence = df.loc[df['Sequenza'] == seq_id]
  padding_item = create_padding_item(sequence, num_features)
  if len(sequence) > chunk_dim:
    # A long sequence is split into one or more chunks.
    chunks = compute_downsampling(chunk_dim, sequence, padding_item)
  elif len(sequence) < chunk_dim:
    # A short sequence is stretched into a single chunk.
    chunks = [compute_oversampling(
        chunk_dim, sequence, padding_item, tail_exclusion_percentage)]
  else:
    # The sequence already has the right length: keep it unchanged instead
    # of dropping it.
    chunks = [[sequence.iloc[k] for k in range(chunk_dim)]]

  for chunk in chunks:
    for item in chunk:
      # Work on a copy, so that neither the original rows nor the shared
      # padding item are modified.
      item = item.copy()
      item['Sequenza'] = new_seq_number
      data.append(item)
    new_seq_number += 1

In [None]:
# The elements of data are pandas Series, so we need to concatenate them.
# Concatenating along axis=1 puts one Series in each column; the transpose
# turns the result into one row per Series.
sampled_df = pd.concat(data, axis=1).T

In [None]:
# Show the last rows of the resampled data.
print(sampled_df.tail())

In [None]:
# The output file takes the name of the input one, with a '-sampled' suffix.
SAMPLED_NAME = UNSAMPLED_NAME.replace('.csv', '-sampled.csv')
# Save the resampled data next to the original file, without the index column.
sampled_df.to_csv(f'{DATASET_DIR}{SAMPLED_NAME}', index=False)

#3) Manage the test set.
In this section we manipulate and extract the data.

##3.1) Load the test set.
Define the name of the dataset used for 
test.


In [None]:
# Name of the .csv file (inside DATASET_DIR) that holds the test set before
# any resampling.
UNSAMPLED_NAME = 'test-velocity-66.csv'

Load the test set from a .csv file.

In [None]:
# Read the test set into a DataFrame and show its first rows.
df = pd.read_csv(f'{DATASET_DIR}{UNSAMPLED_NAME}')
print(df.head())

## 3.2) Extract the labels from the dataset.

In this section we scale the test data too. The features are scaled using the StandardScaler() that was fitted on the train data.

In [None]:
# Scale the test data with the scalers that were fitted on the train set.
# The scalers must NOT be fitted again here: the test set has to go through
# exactly the same transformation that was applied to the train set.

# Every column is a feature to scale, except the sequence index, the frame
# index and the label.
feature_to_scale = list(df.columns)
feature_to_scale.remove('Sequenza')
feature_to_scale.remove('Frame')
feature_to_scale.remove('Label')
df[feature_to_scale] = std_scaler.transform(df[feature_to_scale])

# Scale the labels. Only transform() is used: fitting on the test labels
# would map them to a different scale whenever their minimum or maximum
# differs from the one of the train labels.
df[['Label']] = min_max_scaler.transform(df[['Label']])

In [None]:
# Show the first rows of the test set after the scaling.
print(df.head())

## 3.3) Oversampling or Downsampling.

Perform oversampling and downsampling operations.


In [None]:
# Define some constants needed for the sampling operation.
# chunk_dim is passed to the sampling functions as the number of frames of
# each resampled sequence.
chunk_dim = 230
# Percentage of samples excluded from oversampling on the head and the tail of
# the sequences.
tail_exclusion_percentage = 10

num_columns = len(df.columns)
# Remove from the count the label column and the frame and sequence indexes.
num_features = num_columns - 3

We extract the id of each sequence.

In [None]:
# Collect the distinct sequence ids, keeping the order of first appearance
# (dict keys are unique and preserve insertion order).
sequences_number = list(dict.fromkeys(df['Sequenza'].tolist()))

We use the utility functions defined above to perform the downsampling and oversampling operations.

In [None]:
# Each element of data is a frame (a pandas Series) of a resampled sequence.
# Every resampled sequence gets a new, progressive sequence number.
data = []
new_seq_number = 0
df_copied = df.copy()
for seq_id in tqdm(sequences_number):
  sequence = df_copied.loc[df_copied.Sequenza == seq_id].copy()
  padding_item = create_padding_item(sequence, num_features)
  if len(sequence) > chunk_dim:
    # A long sequence is split into one or more chunks.
    chunks = compute_downsampling(chunk_dim, sequence, padding_item)
  elif len(sequence) < chunk_dim:
    # A short sequence is stretched into a single chunk.
    chunks = [compute_oversampling(
        chunk_dim, sequence, padding_item, tail_exclusion_percentage)]
  else:
    # The sequence already has the right length: keep it unchanged instead
    # of dropping it.
    chunks = [[sequence.iloc[k] for k in range(chunk_dim)]]

  for chunk in chunks:
    for item in chunk:
      # Work on a copy, so that neither the original rows nor the shared
      # padding item are modified.
      item = item.copy()
      item['Sequenza'] = new_seq_number
      data.append(item)
    new_seq_number += 1

In [None]:
# The elements of data are pandas Series, so we need to concatenate them.
# Concatenating along axis=1 puts one Series in each column; the transpose
# turns the result into one row per Series.
sampled_df = pd.concat(data, axis=1).T

In [None]:
# Show the last rows of the resampled data.
print(sampled_df.tail())

In [None]:
# The output file takes the name of the input one, with a '-sampled' suffix.
SAMPLED_NAME = UNSAMPLED_NAME.replace('.csv', '-sampled.csv')
# Save the resampled data next to the original file, without the index column.
sampled_df.to_csv(f'{DATASET_DIR}{SAMPLED_NAME}', index=False)