In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from scipy.stats import mannwhitneyu
import re
import os
import seaborn as sns
from matplotlib import cm, colors

In [2]:
# Show every DataFrame column when rendering (disable column truncation).
pd.options.display.max_columns = None

## Preprocessing Regression 

In [3]:
# Load the raw table that every later cell in this notebook slices and aggregates.
# The path is assembled with os.path.join so the separator matches the host OS;
# a literal backslash is a path separator only on Windows.
df_values = pd.read_csv(os.path.join('aux_file', 'data_preprocess_regression.csv'))

### Inteligibilidad

In [4]:
# Keep only the identifying columns plus the 'Inteligibilidad' rating.
cols_inteligibilidad = ['ID', 'filename', 'Inteligibilidad']
df_inteligibilidad = df_values.loc[:, cols_inteligibilidad]

In [5]:
# NOTE(review): this IQR outlier filter is entirely commented out, so no outlier
# removal is applied to 'Inteligibilidad' in this cell — confirm that this is
# intended, then either delete the cell or re-enable it.
# # Compute the interquartile range (IQR)
# Q1 = df_inteligibilidad['Inteligibilidad'].quantile(0.25)
# Q3 = df_inteligibilidad['Inteligibilidad'].quantile(0.75)
# IQR = Q3 - Q1

# # Define the lower and upper bounds for acceptable values
# lower_bound = Q1 - 1.5 * IQR
# upper_bound = Q3 + 1.5 * IQR

# mask_IQR = (df_inteligibilidad['Inteligibilidad'] >= lower_bound) & (df_inteligibilidad['Inteligibilidad'] <= upper_bound)

# # Filter the DataFrame to drop the outliers
# df_inteligibilidad = df_inteligibilidad[mask_IQR]

In [6]:
# Collapse the individual ratings into one row per (ID, filename) pair holding
# the mean 'Inteligibilidad', then restore the keys as columns and sort by them.
group_keys = ['ID', 'filename']
df_inteligibilidad = (
    df_inteligibilidad
    .pivot_table(index=group_keys, values=['Inteligibilidad'], aggfunc='mean')
    .reset_index()
    .sort_values(by=group_keys)
)

df_inteligibilidad

Unnamed: 0,ID,filename,Inteligibilidad
0,1,alonso-naturalreader.wav,4.9
1,2,hyacinthe-lovo-v3.wav,4.975
2,3,neural2C-pitchmodified-speedmodified.wav,4.447368
3,4,francisca-lovo-v2.wav,4.625
4,5,antoni-evenlabs-linguistico-v2.wav,4.55
5,6,lola-murf.wav,4.9
6,7,antonio-murf.wav,4.8
7,8,agata-lovo-pitchmodified.wav,4.4
8,9,neural2C-GCP-linguistico-speed0.78-pitchmodifi...,4.820513
9,10,rosa-lovo-pitchmodified.wav,4.5


### Pronunciación

In [7]:
# Keep only the identifying columns plus the 'Pronunciacion' rating.
cols_pronunciacion = ['ID', 'filename', 'Pronunciacion']
df_pronunciacion = df_values.loc[:, cols_pronunciacion]

In [8]:
# Outlier removal for the 'Pronunciacion' ratings using the 1.5 * IQR rule.
# The statistics and the mask are derived from the untouched `df_values` rather
# than from `df_pronunciacion`, so re-running this cell always gives the same
# result instead of trimming an already-filtered (or already-aggregated) frame.
pronunciacion_raw = df_values['Pronunciacion']

# Interquartile range (IQR) over the whole column, pooled across all files.
Q1 = pronunciacion_raw.quantile(0.25)
Q3 = pronunciacion_raw.quantile(0.75)
IQR = Q3 - Q1

# Lower and upper bounds for acceptable values.
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# True for ratings inside [lower_bound, upper_bound] (both ends inclusive);
# missing ratings evaluate to False and are therefore dropped as well.
mask_IQR = pronunciacion_raw.between(lower_bound, upper_bound)

# Keep the non-outlier rows, restricted to the columns used by the next cells.
df_pronunciacion = df_values.loc[mask_IQR, ['ID', 'filename', 'Pronunciacion']]

In [9]:
# Collapse the individual ratings into one row per (ID, filename) pair holding
# the mean 'Pronunciacion', then restore the keys as columns and sort by them.
group_keys = ['ID', 'filename']
df_pronunciacion = (
    df_pronunciacion
    .pivot_table(index=group_keys, values=['Pronunciacion'], aggfunc='mean')
    .reset_index()
    .sort_values(by=group_keys)
)

df_pronunciacion

Unnamed: 0,ID,filename,Pronunciacion
0,1,alonso-naturalreader.wav,4.641026
1,2,hyacinthe-lovo-v3.wav,4.641026
2,3,neural2C-pitchmodified-speedmodified.wav,3.882353
3,4,francisca-lovo-v2.wav,4.297297
4,5,antoni-evenlabs-linguistico-v2.wav,4.142857
5,6,lola-murf.wav,4.475
6,7,antonio-murf.wav,4.475
7,8,agata-lovo-pitchmodified.wav,4.108108
8,9,neural2C-GCP-linguistico-speed0.78-pitchmodifi...,4.564103
9,10,rosa-lovo-pitchmodified.wav,4.028571


### Acentuación

In [10]:
# Mean 'Acentuacion' rating per (ID, filename) pair: select the three relevant
# columns, aggregate, turn the keys back into columns and sort by them.
acentuacion_raw = df_values.loc[:, ['ID', 'filename', 'Acentuacion']]
df_acentuacion = (
    acentuacion_raw
    .pivot_table(index=['ID', 'filename'], values=['Acentuacion'], aggfunc='mean')
    .reset_index()
    .sort_values(by=['ID', 'filename'])
)

df_acentuacion

Unnamed: 0,ID,filename,Acentuacion
0,1,alonso-naturalreader.wav,4.5
1,2,hyacinthe-lovo-v3.wav,4.525
2,3,neural2C-pitchmodified-speedmodified.wav,2.9
3,4,francisca-lovo-v2.wav,4.225
4,5,antoni-evenlabs-linguistico-v2.wav,3.2
5,6,lola-murf.wav,4.325
6,7,antonio-murf.wav,4.25
7,8,agata-lovo-pitchmodified.wav,3.65
8,9,neural2C-GCP-linguistico-speed0.78-pitchmodifi...,4.225
9,10,rosa-lovo-pitchmodified.wav,2.95


### Naturalidad

In [11]:
# Mean 'Naturalidad' rating per (ID, filename) pair: select the three relevant
# columns, aggregate, turn the keys back into columns and sort by them.
naturalidad_raw = df_values.loc[:, ['ID', 'filename', 'Naturalidad']]
df_naturalidad = (
    naturalidad_raw
    .pivot_table(index=['ID', 'filename'], values=['Naturalidad'], aggfunc='mean')
    .reset_index()
    .sort_values(by=['ID', 'filename'])
)

df_naturalidad

Unnamed: 0,ID,filename,Naturalidad
0,1,alonso-naturalreader.wav,2.925
1,2,hyacinthe-lovo-v3.wav,4.1
2,3,neural2C-pitchmodified-speedmodified.wav,1.675
3,4,francisca-lovo-v2.wav,2.975
4,5,antoni-evenlabs-linguistico-v2.wav,1.825
5,6,lola-murf.wav,3.375
6,7,antonio-murf.wav,3.2
7,8,agata-lovo-pitchmodified.wav,2.075
8,9,neural2C-GCP-linguistico-speed0.78-pitchmodifi...,2.875
9,10,rosa-lovo-pitchmodified.wav,1.85


### Cadencia

In [12]:
# Mean 'Cadencia' rating per (ID, filename) pair: select the three relevant
# columns, aggregate, turn the keys back into columns and sort by them.
cadencia_raw = df_values.loc[:, ['ID', 'filename', 'Cadencia']]
df_cadencia = (
    cadencia_raw
    .pivot_table(index=['ID', 'filename'], values=['Cadencia'], aggfunc='mean')
    .reset_index()
    .sort_values(by=['ID', 'filename'])
)

df_cadencia

Unnamed: 0,ID,filename,Cadencia
0,1,alonso-naturalreader.wav,3.9
1,2,hyacinthe-lovo-v3.wav,4.175
2,3,neural2C-pitchmodified-speedmodified.wav,1.675
3,4,francisca-lovo-v2.wav,3.75
4,5,antoni-evenlabs-linguistico-v2.wav,1.6
5,6,lola-murf.wav,3.85
6,7,antonio-murf.wav,3.85
7,8,agata-lovo-pitchmodified.wav,2.2
8,9,neural2C-GCP-linguistico-speed0.78-pitchmodifi...,3.2
9,10,rosa-lovo-pitchmodified.wav,2.05


In [13]:
# Left-join each per-metric table onto the intelligibility table, one at a time,
# matching rows on the (ID, filename) pair.
merged_df = df_inteligibilidad
for metric_df in (df_pronunciacion, df_acentuacion, df_naturalidad, df_cadencia):
    merged_df = merged_df.merge(metric_df, on=['ID', 'filename'], how='left')

In [14]:
# Display the merged table (keys plus one column per metric) as the cell output.
merged_df

Unnamed: 0,ID,filename,Inteligibilidad,Pronunciacion,Acentuacion,Naturalidad,Cadencia
0,1,alonso-naturalreader.wav,4.9,4.641026,4.5,2.925,3.9
1,2,hyacinthe-lovo-v3.wav,4.975,4.641026,4.525,4.1,4.175
2,3,neural2C-pitchmodified-speedmodified.wav,4.447368,3.882353,2.9,1.675,1.675
3,4,francisca-lovo-v2.wav,4.625,4.297297,4.225,2.975,3.75
4,5,antoni-evenlabs-linguistico-v2.wav,4.55,4.142857,3.2,1.825,1.6
5,6,lola-murf.wav,4.9,4.475,4.325,3.375,3.85
6,7,antonio-murf.wav,4.8,4.475,4.25,3.2,3.85
7,8,agata-lovo-pitchmodified.wav,4.4,4.108108,3.65,2.075,2.2
8,9,neural2C-GCP-linguistico-speed0.78-pitchmodifi...,4.820513,4.564103,4.225,2.875,3.2
9,10,rosa-lovo-pitchmodified.wav,4.5,4.028571,2.95,1.85,2.05


In [15]:
# Write the merged table to disk without the DataFrame index.
# os.path.join keeps the output inside the aux_file directory on every OS; a
# literal backslash is a separator only on Windows and would otherwise become
# part of the file name.
merged_df.to_csv(os.path.join('aux_file', 'data_regression.csv'), index=False)