# Dataset Processing
---
In questo notebook vengono elaborati e selezionati i dati per il progetto.


## Import Libraries


In [1]:
import pandas as pd
import numpy as np

## Load and Manipulate Data


In [2]:
# Absolute location of the simulated Vasicek CSV.
# NOTE(review): machine-specific path; consider a configurable data directory.
path = "/media/neurone-pc6/Volume/Michele/Prog_GAF_Michele/pythonProject/data/Vasiceksimulation_23_07.csv"
df = pd.read_csv(path)

# Number of leading rows kept for processing.
N_ROWS = 5000

# Keep the first N_ROWS rows AND drop the last column of the file.
# NOTE(review): presumably the last column is not part of the series values — confirm.
df_data = df.iloc[:N_ROWS, :-1]
df_data

In [5]:
# Function to split a row into chunks
def split_into_chunks(row: list, chunk_size: int = 32) -> list:
    """
    Split a single row into consecutive chunks of at most ``chunk_size`` elements.

    Parameters:
    - row (list): A sliceable sequence that supports ``len()`` (for example a
      list or a 1-D NumPy array).
    - chunk_size (int, optional): Maximum length of each chunk. Must be
      greater than zero. Default is 32.

    Returns:
    - list: The chunks in their original order. Every chunk has exactly
            ``chunk_size`` elements except possibly the last one, which holds
            the remainder when ``len(row)`` is not a multiple of
            ``chunk_size``. An empty row yields an empty list.

    Raises:
    - ValueError: If ``chunk_size`` is zero or negative.

    """
    # A negative step would make range() empty and silently return no chunks,
    # and a zero step fails with an unrelated range() message; reject both.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")
    return [row[start:start + chunk_size] for start in range(0, len(row), chunk_size)]

# Raw values of the selected data as a NumPy array
data_matrix = df_data.to_numpy()

# Chunk every row of the matrix using the default chunk size
split_rows = list(map(split_into_chunks, data_matrix))

# Concatenate the per-row chunk lists into one flat list, preserving order
rows = [
    piece
    for row_chunks in split_rows
    for piece in row_chunks
]

# Report how many chunks were produced in total
print("Number of rows after splitting:", len(rows))


Numero di righe dopo la suddivisione: 160000


## Labels

In [6]:
# Label a chunk by comparing its last value with its first
def calculate_label(chunk: list) -> int:
    """
    Derive a binary label from the endpoints of a chunk.

    Parameters:
    - chunk (list): A non-empty, indexable sequence of comparable values.

    Returns:
    - int: 1 when the final element is strictly greater than the initial
           element, 0 in every other case (including equality).

    """
    last, first = chunk[-1], chunk[0]
    if last > first:
        return 1
    return 0

# One label per chunk, in the same order as `rows`
labels = np.array(list(map(calculate_label, rows)))

# Tabulate the chunks (one chunk per row) and append the labels as a 'Label' column
df_split = pd.DataFrame(rows).assign(Label=labels)

# Preview the result and report its shape
print(df_split.head())
print("Dimensions of the final DataFrame:", df_split.shape)

          0         1         2         3         4         5         6  \
0  0.047060  0.047069  0.047009  0.046852  0.046831  0.046630  0.046554   
1  0.046182  0.046456  0.046494  0.046501  0.046688  0.046745  0.046386   
2  0.046236  0.046164  0.046007  0.045581  0.045556  0.045429  0.045752   
3  0.044570  0.044530  0.044174  0.044174  0.044151  0.044099  0.044456   
4  0.042343  0.042331  0.042530  0.042311  0.042063  0.042377  0.042463   

          7         8         9  ...        23        24        25        26  \
0  0.046445  0.046598  0.046537  ...  0.045962  0.045824  0.045785  0.045910   
1  0.046201  0.046405  0.046059  ...  0.046389  0.046449  0.046491  0.046259   
2  0.045778  0.046061  0.046010  ...  0.044968  0.045023  0.044988  0.044796   
3  0.044281  0.044078  0.044001  ...  0.043598  0.043549  0.043174  0.043195   
4  0.042418  0.042493  0.042283  ...  0.042240  0.042374  0.042095  0.042125   

         27        28        29        30        31  Label  
0  0.04

## Save Data

In [8]:
# Destination CSV for the chunked, labelled dataset
output_path = "/media/neurone-pc6/Volume/Michele/Prog_GAF_Michele/pythonProject/data/Vasiceksimulation_windows_23_07.csv"

# Write the DataFrame to disk, leaving out the index column
df_split.to_csv(path_or_buf=output_path, index=False)

# Confirm completion and echo where the file was written
print("Elaborazione completata e file salvato in:", output_path)

Elaborazione completata e file salvato in: /media/neurone-pc6/Volume/Michele/Prog_GAF_Michele/pythonProject/data/Vasiceksimulation_windows_23_07.csv
