# OSA Speech Recordings Preprocessing

### Dependencies

In [2]:
import pandas as pd
import os
import librosa
import matplotlib.pyplot as plt

### Load data

We're going to use the preprocessed data from the machine learning report, "Clinical_data.xlsx", which contains both male and female patients, and for which NaN values aren't removed but replaced with the mean.

In [3]:
# Clinical Data
file_path = "datasets/Clinical_data.xlsx"
df = pd.read_excel(file_path)

# Audio Data
audio_directory = "datasets/SPEECH_VOWELS/SPEECH_VOWELS"

In [26]:
len(os.listdir(audio_directory))

3399

### Combine the two datasets:

In [15]:
# Function to map the number to the corresponding vowel
def map_vowel(number):
    return {5: 'a', 6: 'e', 7: 'i', 8: 'o', 9: 'u'}.get(number, '')

In [35]:
# List to store the new rows
new_rows = []

# Iterate through the dataframe
for index, row in df.iterrows():
    patient_id = row['Patient']
    # Extract the number part from the Patient ID
    patient_number = str(int(patient_id[1:])) # We convert to integer then to str to remove the zeros before the number
    base_filename = f"LB{patient_number}_F"
    # Check for the presence of the required audio files
    required_files = [f"{base_filename}{i}R1.wav" for i in range(5, 10)]
    if all(os.path.isfile(os.path.join(audio_directory, file)) for file in required_files):
        for i in range(5, 10):
            print(f'Found file for Patient {base_filename}')
            new_row = row.copy()
            new_row['Wav_File'] = f"{base_filename}{i}R1.wav"
            new_row['Vowel'] = map_vowel(i)
            new_rows.append(new_row)



Found file for Patient LB4_F
Found file for Patient LB4_F
Found file for Patient LB4_F
Found file for Patient LB4_F
Found file for Patient LB4_F
Found file for Patient LB5_F
Found file for Patient LB5_F
Found file for Patient LB5_F
Found file for Patient LB5_F
Found file for Patient LB5_F
Found file for Patient LB6_F
Found file for Patient LB6_F
Found file for Patient LB6_F
Found file for Patient LB6_F
Found file for Patient LB6_F
Found file for Patient LB7_F
Found file for Patient LB7_F
Found file for Patient LB7_F
Found file for Patient LB7_F
Found file for Patient LB7_F
Found file for Patient LB8_F
Found file for Patient LB8_F
Found file for Patient LB8_F
Found file for Patient LB8_F
Found file for Patient LB8_F
Found file for Patient LB10_F
Found file for Patient LB10_F
Found file for Patient LB10_F
Found file for Patient LB10_F
Found file for Patient LB10_F
Found file for Patient LB11_F
Found file for Patient LB11_F
Found file for Patient LB11_F
Found file for Patient LB11_F
Found

In [36]:
# Create a new dataframe from the new rows
new_df = pd.DataFrame(new_rows)

In [38]:
new_df.shape

(3205, 10)

In [39]:
new_df.head()

Unnamed: 0,Patient,Gender,IAH,Weight,Age,Height,Cervical,BMI,Wav_File,Vowel
3,P0004,1,19.7,78.0,39.0,168.0,42.0,27.636054,LB4_F5R1.wav,a
3,P0004,1,19.7,78.0,39.0,168.0,42.0,27.636054,LB4_F6R1.wav,e
3,P0004,1,19.7,78.0,39.0,168.0,42.0,27.636054,LB4_F7R1.wav,i
3,P0004,1,19.7,78.0,39.0,168.0,42.0,27.636054,LB4_F8R1.wav,o
3,P0004,1,19.7,78.0,39.0,168.0,42.0,27.636054,LB4_F9R1.wav,u


In [40]:
# Save the new dataframe
new_df.to_excel("datasets/OSA_DB_UPM_vowels.xlsx", index=False)