# Data Processing on COMSOL data

This notebook arranges the data extracted from the COMSOL simulations. For each signal we extract the peak voltage and then construct the associated active-point (AP) labels. The data is also normalized.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
import glob
import seaborn as sns


# Folder that holds the "<k> points" sub-folders with the exported CSV files.
path = "Data"

# Names of the n* and p* voltage output columns in the exported CSV files.
n_outputs = tuple(f"Volt_n{k}_V_out" for k in range(1, 4))
p_outputs = tuple(f"Volt_p{k}_V_out" for k in range(1, 4))

# Column layout of the peaks table: six peak-voltage features followed by
# nine binary labels (APk == active point k).
headers = (
    [f"n{k}" for k in range(1, 4)]
    + [f"p{k}" for k in range(1, 4)]
    + [f"AP{k}" for k in range(1, 10)]
)

# Identifiers of the grid points and an all-zero label vector of the same
# length, used as the starting template for every label.
grid = np.arange(1, 10, dtype=int)
zero_active_points = np.zeros_like(grid)

# Empty table that is filled one row per processed file.
peaks_df = pd.DataFrame(columns=headers)

In [None]:
# active_points = [a,b,c,d,e,f,g,h,i] where each entry is 0 or 1, corresponding to points [1,2,3,4,5,6,7,8,9] in the grid

def get_label(file_name):
    """Build the active-point label vector encoded in a file name.

    The active points are read from the text between the first "(" and the
    next ")" in ``file_name``. Commas and spaces inside the brackets are
    ignored; every remaining character is interpreted as a single-digit
    point number that must be present in the module-level ``grid``.

    Parameters
    ----------
    file_name : str
        File name (or path) containing a bracketed list of points,
        for example ``"... (1, 4, 7).csv"``.

    Returns
    -------
    numpy.ndarray
        A copy of ``zero_active_points`` with a 1 at the position of each
        point listed in the file name. An empty ``()`` yields all zeros.

    Raises
    ------
    ValueError
        If ``file_name`` has no "(...)" group, or the group contains a
        character that is not a point of ``grid``.
    """
    left_bracket = file_name.find("(")
    right_bracket = file_name.find(")", left_bracket + 1)
    # str.find returns -1 when the bracket is missing; slicing with -1 would
    # silently pick up unrelated parts of the name, so fail loudly instead.
    if left_bracket == -1 or right_bracket == -1:
        raise ValueError(
            f"No '(...)' group with the active points found in file name {file_name!r}."
        )

    points = file_name[left_bracket + 1:right_bracket].replace(",", "").replace(" ", "")

    active_points = zero_active_points.copy()
    for point in points:
        try:
            value = int(point)
        except ValueError:
            raise ValueError(
                f"Invalid active point {point!r} in file name {file_name!r}."
            ) from None

        matches = np.flatnonzero(grid == value)
        if matches.size == 0:
            raise ValueError(
                f"Active point {value} in file name {file_name!r} is not part of the grid."
            )
        active_points[matches[0]] = 1

    return active_points

def _load_signals(file):
    """Read one ';'-separated CSV export and keep only the rows with Time < 10.

    When the file has no "Time" column the data is returned unfiltered.
    """
    df = pd.read_csv(file, index_col=None, sep=";")
    if "Time" in df.columns:
        df = df.loc[df["Time"] < 10]
    return df


def _signed_peak(signal):
    """Return the entry of ``signal`` with the largest magnitude, sign preserved.

    On ties the first occurrence is returned.
    """
    return signal.loc[signal.abs().idxmax()]


def create_row(files):
    """Append one row of peak voltages and labels to ``peaks_df`` per file.

    For every file the signed peak (value of largest magnitude) of each
    column in ``n_outputs + p_outputs`` is taken as a feature, and the label
    vector is obtained from the file name with ``get_label``. Each new row is
    inserted at the top of the module-level ``peaks_df``. After all files are
    processed the label columns (``headers[6:]``) are cast to ``int8``.

    The ``Time < 10`` filter is applied to every file that has a "Time"
    column. The auxiliary PVDF/temperature columns are not dropped because
    only the voltage output columns are read.
    NOTE(review): previously the time filter was skipped for files lacking
    the PVDF1..PVDF9 columns; this looked accidental - confirm.

    Parameters
    ----------
    files : iterable of str
        Paths of the CSV files to process.

    Notes
    -----
    Files that cannot be read, or that contain no rows after filtering, are
    reported with ``print`` and skipped, so a failed read can no longer reuse
    the data of the previously processed file.
    """
    global peaks_df

    for file in files:
        try:
            df = _load_signals(file)
        except (OSError, ValueError):
            # OSError: file missing/unreadable; ValueError covers pandas'
            # ParserError and EmptyDataError as well as decoding errors.
            print("Error in file: ", file)
            continue

        if df.empty:
            print("No usable samples in file: ", file)
            continue

        features = [_signed_peak(df[column]) for column in n_outputs + p_outputs]
        labels = get_label(file_name=file)

        # Joins the features list and the labels array into one flat row.
        row = np.concatenate((features, labels))

        # Prepend the row: store it under index -1, shift every index up by
        # one and re-sort so the newest row ends up at position 0.
        peaks_df.loc[-1] = row
        peaks_df.index = peaks_df.index + 1
        peaks_df = peaks_df.sort_index()

    # The row assignment above stores the labels as floats; set them to integers.
    peaks_df[headers[6:]] = peaks_df[headers[6:]].astype(np.int8)

# Process the "1 points" ... "9 points" sub-folders of ``path`` in turn.
for i in range(1, 10):
    the_path = f"{path}/{i} points/"
    # glob returns matches in arbitrary (filesystem-dependent) order; sort
    # them so the rows of peaks_df are produced in a reproducible order.
    that_path_files = sorted(glob.glob(the_path + "*.csv"))
    create_row(files=that_path_files)
