# Connect To Drive

# Import Libraries

In [None]:
# Fix randomness and hide warnings.
# A single seed value is reused for every random number generator seeded below.
seed = 42

import os
# These environment variables are set before TensorFlow / matplotlib are imported
# in the following cells.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
# NOTE(review): Python reads PYTHONHASHSEED at interpreter start-up, so setting it
# here presumably only affects child processes, not this kernel - confirm intent.
os.environ['PYTHONHASHSEED'] = str(seed)
# Point matplotlib's config directory at ./configs/ under the current working directory.
os.environ['MPLCONFIGDIR'] = os.getcwd()+'/configs/'

import warnings
# Ignore FutureWarning, then every Warning subclass (the second filter already
# covers the first because Warning is the base class).
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=Warning)

import numpy as np
# Seed NumPy's legacy global random state.
np.random.seed(seed)

# `logging` is used by the TensorFlow cell below to lower the logger level.
import logging

import random
# Seed Python's built-in random module.
random.seed(seed)

In [None]:
# Import TensorFlow and its Keras API under the short aliases tf / tfk / tfkl.
import tensorflow as tf
from tensorflow import keras as tfk
from tensorflow.keras import layers as tfkl
# Reduce log output: autograph verbosity to 0, TF loggers to ERROR level.
# (`logging` and `seed` come from the previous setup cell.)
tf.autograph.set_verbosity(0)
tf.get_logger().setLevel(logging.ERROR)
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
# Seed TensorFlow through both the current and the TF1-compat API.
tf.random.set_seed(seed)
tf.compat.v1.set_random_seed(seed)
# Record the TensorFlow version in the cell output.
print(tf.__version__)

2.15.0


In [None]:
# Third-party imports for data handling, plotting, preprocessing and statistics.
import pandas as pd
import seaborn as sns
from datetime import datetime
import matplotlib.pyplot as plt
# Set the default matplotlib font size to 16 for all figures created afterwards.
plt.rc('font', size=16)
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import statsmodels.api as sm


# Load Data

In [None]:
# Directory holding the .npy input files. Change this single constant to point the
# notebook at another location (for example a local copy of the data).
# NOTE(review): /gdrive is presumably the Colab Google Drive mount point; no mount
# cell is visible in this part of the notebook - confirm it is executed beforehand.
DATA_DIR = '/gdrive/MyDrive/ANNDL/Homework2'

# Load the three arrays used by the following cells.
train_data = np.load(os.path.join(DATA_DIR, 'training_data.npy'))
categories = np.load(os.path.join(DATA_DIR, 'categories.npy'))
valid_periods = np.load(os.path.join(DATA_DIR, 'valid_periods.npy'))

In [None]:
# Show the dimensions of the loaded training array as the cell output.
train_data.shape

(48000, 2776)

# Data Visualization

In [None]:
def plot_each_category(dataset, categories=categories, valid_periods=valid_periods):
  """Plot one example series per category, stacked vertically in one figure.

  For every distinct value in `categories`, the first entry carrying that value
  is selected and `dataset[idx]` is drawn, sliced to
  `valid_periods[idx][0]:valid_periods[idx][1]`.

  Args:
    dataset: Indexable collection of series; `dataset[idx]` must support slicing.
    categories: Category label of each series. Defaults to the notebook-level
      `categories` as it exists when this cell is executed.
    valid_periods: Per-series pair of slice bounds. Defaults to the
      notebook-level `valid_periods` as it exists when this cell is executed.
  """
  # return_index=True gives, for each unique label, the index of its first occurrence.
  unique_categories, first_indices = np.unique(categories, return_index=True)
  # squeeze=False always returns a 2-D array of Axes. Without it a single
  # category yields a bare Axes object and indexing it would fail.
  fig, axs = plt.subplots(len(unique_categories), 1, sharex=True,
                          figsize=(17,17), squeeze=False)
  for ax, category, idx in zip(axs[:, 0], unique_categories, first_indices):
    start, end = valid_periods[idx][0], valid_periods[idx][1]
    ax.plot(dataset[idx][start:end])
    ax.set_title(category)
  plt.show()

In [None]:
# Draw one example series per category from the training array, using the
# function's default `categories` and `valid_periods`.
plot_each_category(train_data)

# Data Cleaning

In [None]:
# Convert the loaded arrays to DataFrames (default integer row/column labels).
# The training data is additionally cast to float32.
df = pd.DataFrame(train_data).astype('float32')
categories_dataframe = pd.DataFrame(categories)
valid_dataframe = pd.DataFrame(valid_periods)

In [None]:
import pandas as pd
from tqdm import tqdm

def interpolate_midpoints(frame, decimals=6):
  """Insert the mean of every pair of consecutive columns between them.

  Each row `[x0, x1, ..., xk]` becomes `[x0, m01, x1, m12, ..., xk]` with
  `m_ij = round((x_i + x_j) / 2, decimals)`, so a frame with `k + 1` columns
  yields one with `2k + 1` columns.

  Args:
    frame: Numeric DataFrame holding one series per row.
    decimals: Number of decimal places kept for the inserted midpoints.

  Returns:
    A float64 DataFrame with default integer row and column labels.
  """
  # Work in float64, matching the precision of the former element-wise Python loop.
  values = frame.to_numpy(dtype=np.float64)
  n_rows, n_cols = values.shape
  # max(..., 0) keeps a frame without columns from requesting a negative width.
  out = np.empty((n_rows, max(2 * n_cols - 1, 0)), dtype=np.float64)
  # Even output columns carry the original samples, odd ones the midpoints.
  out[:, 0::2] = values
  # NOTE: np.round rounds via scaling, so a midpoint lying almost exactly on a
  # rounding boundary may differ from Python's round() in the last kept digit.
  out[:, 1::2] = np.round((values[:, :-1] + values[:, 1:]) / 2, decimals)
  return pd.DataFrame(out)


# Compute the interpolation between every pair of consecutive elements of each
# row, for the whole frame at once instead of one Python iteration per value.
interpolated_df = interpolate_midpoints(df)


 94%|█████████▍| 45028/48000 [03:58<00:11, 249.92it/s]

In [None]:
# Display the interpolated frame as the cell output.
interpolated_df

In [None]:
def cleaning_period(valid_df, min_length=105):
  """Drop the rows of `valid_df` whose valid period is too short.

  The period length is the second column minus the first column, both
  selected by position.

  Args:
    valid_df: DataFrame whose first two columns hold the two bounds of each
      valid period.
    min_length: Smallest length (inclusive) a period needs in order to be kept.
      The default of 105 is the threshold originally hard-coded here; the
      original author notes that, thanks to interpolation, fewer time series
      need to be removed.

  Returns:
    The rows of `valid_df` that satisfy the threshold, with their original
    index labels.
  """
  period_length = valid_df.iloc[:, 1] - valid_df.iloc[:, 0]
  return valid_df[period_length >= min_length]

In [None]:
# Filter the valid periods first, then reuse the surviving row labels to pick the
# matching rows of the category frame and of the series frame.
# The labels are passed to .iloc, which relies on them being equal to row
# positions (true for frames built with the default integer index).
# NOTE(review): the series are taken from `df`, not from `interpolated_df` -
# confirm this is intended.
valid_clean = cleaning_period(valid_dataframe)
kept_rows = valid_clean.index
categories_clean = categories_dataframe.iloc[kept_rows, :]
df_clean = df.iloc[kept_rows, :]