In [None]:
from google.colab import drive
import pandas as pd
import os
import wave
import numpy as np
import re

In [None]:
drive.mount('/content/drive') # Mounts Google Drive in Colab; change the mount point if needed.
raw_database_path = "" # Set to the raw_database/ directory (it must contain FetalPCGSpreadsheet.xlsx and the .wav recordings).
databases_infos_path = "" # Set to the directory where the .xlsx files with database information are read from and saved to.

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the spreadsheet that describes every recording in the raw database.
spreadsheet_path = os.path.join(raw_database_path, "FetalPCGSpreadsheet.xlsx")
database_info = pd.read_excel(spreadsheet_path)

In [None]:
# Preview the raw spreadsheet as plain text.
print(database_info)

    Subject ID Number of Gravid/Alive/Abortion  Maternal BMI  \
0       F93001                             NaN          28.5   
1       F93002                           2/0/1          29.7   
2       F93003                           1/0/0          32.9   
3       F93004                           1/0/0          27.7   
4       F93005                           1/0/0          21.5   
..         ...                             ...           ...   
127     F93108                           3/1/1          36.9   
128     F93109                           2/1/0          28.3   
129     F93110                           2/1/0          37.9   
130     F93111                           3/2/0          35.0   
131     F93112                           2/1/0          38.9   

     Pregnancy Term (weeks) Fetus gender (B: Boy, G: Girl)  \
0                      32.0                              B   
1                      40.0                              B   
2                      37.0                  

In [None]:
# Keep only the patient identifier ('Subject ID') and the CTG heart-rate column,
# shortening the latter's long spreadsheet header to 'BPM'.
bpm_header = 'CTG Heart-rate (BPM). Each number corresponds to the average fetal heart-rate over 10 seconds of the signal, whenever available. Brackets denote unreported values'
database_info = database_info.rename(columns={bpm_header: 'BPM'})[['Subject ID', 'BPM']]
print(database_info)

    Subject ID                             BPM
0       F93001                 132-148-150-150
1       F93002         150-147-152-150-148-150
2       F93003              []-142-130-135-136
3       F93004  []-152-144-142-143-138-135-131
4       F93005         150-156-156-154-158-155
..         ...                             ...
127     F93108                             NaN
128     F93109                             NaN
129     F93110                             NaN
130     F93111                             NaN
131     F93112                             NaN

[132 rows x 2 columns]


In [None]:
# Count the missing (NaN) values in each column.
database_info.isna().sum()

Unnamed: 0,0
Subject ID,6
BPM,27


In [None]:
# Drop the rows that have no 'Subject ID'. This is safe because those rows were
# only used to add extra comments about the same patient.
database_info = database_info.dropna(subset=['Subject ID'])

# Show the remaining rows together with the per-column count of missing values.
missing_per_column = database_info.isna().sum()
print(database_info, "\n", missing_per_column)

    Subject ID                             BPM
0       F93001                 132-148-150-150
1       F93002         150-147-152-150-148-150
2       F93003              []-142-130-135-136
3       F93004  []-152-144-142-143-138-135-131
4       F93005         150-156-156-154-158-155
..         ...                             ...
127     F93108                             NaN
128     F93109                             NaN
129     F93110                             NaN
130     F93111                             NaN
131     F93112                             NaN

[126 rows x 2 columns] 
 Subject ID     0
BPM           21
dtype: int64


In [None]:
# Make the patient IDs match the recording file names: an ID starting with "F93"
# becomes "f" followed by the integer value of its last three characters
# (e.g. F93001 -> f1). IDs that do not start with "F93" (the twins) are left as is.
subject_ids = database_info["Subject ID"].astype(str) # Work on string values
database_info["Subject ID"] = subject_ids.apply(
    lambda raw_id: "f" + str(int(raw_id[-3:])) if raw_id.startswith("F93") else raw_id
)

print(database_info['Subject ID'])

0        f1
1        f2
2        f3
3        f4
4        f5
       ... 
127    f108
128    f109
129    f110
130    f111
131    f112
Name: Subject ID, Length: 126, dtype: object


In [None]:
# Normalises the IDs of the twin instances, whose notation differs from the rest.
def transform_subject_id(subject_id):
    """Return the file-name style ID ('f<number>' or 'f<number>-<number>') for a subject ID.

    The ID is expected to start with an optional 'F', followed by digits and an
    optional '-<digits>' suffix; anything from an optional ':' onwards is discarded.
    IDs that do not start this way (including already normalised 'f...' IDs)
    are returned unchanged.
    """
    found = re.match(r'^(F?\d+-?\d*):?.*$', subject_id)
    if found is None:
        # Not in the "F<digits>[-<digits>]" notation: nothing to rewrite.
        return subject_id
    core_id = found.group(1)
    return 'f' + core_id.replace('F', '').lower()

# Regex for a plain (non-twin) normalised ID such as "f12".
# Written as a raw string so that \d is the regex digit class instead of an
# invalid string escape (which triggers a warning on recent Python versions).
# NOTE(review): this name is not referenced in the visible cells - confirm it is still needed.
pattern = r'^f\d+$'

# Rewrite every ID with transform_subject_id and list the full column to check the result.
normalized_ids = database_info['Subject ID'].apply(transform_subject_id)
database_info['Subject ID'] = normalized_ids

print(normalized_ids.to_string())

0          f1
1          f2
2          f3
3          f4
4          f5
5          f6
8          f7
9          f8
10         f9
12        f10
13        f11
14        f12
15        f13
16        f14
19        f15
20        f16
21        f17
22      f17-1
23      f17-2
25        f18
26        f19
27        f20
28        f21
29        f22
30        f23
31      f23-1
32      f23-2
33        f24
34        f25
35        f26
36        f27
37        f28
38        f29
39        f30
40        f31
41      f31-1
42      f31-2
43        f32
44        f33
45        f34
46        f35
47        f36
48        f37
49        f38
50        f39
51        f40
52        f41
53        f42
54        f43
55        f44
56        f45
57        f46
58        f47
59        f48
60        f49
61        f50
62      f50-1
63      f50-2
64        f51
65        f52
66        f53
67        f54
68        f55
69        f56
70        f57
71        f58
72        f59
73        f60
74        f61
75        f62
76        f63
77    

In [None]:
# Remove the rows without BPM data: cells equal to "[]" are turned into NaN,
# rows whose BPM is NaN are dropped, and the index is renumbered because
# dropna leaves it non-sequential.
database_info = (
    database_info
    .replace("[]", np.nan)
    .dropna(subset=['BPM'])
    .reset_index(drop=True)
)
print(database_info.to_string())

    Subject ID                                  BPM
0           f1                      132-148-150-150
1           f2              150-147-152-150-148-150
2           f3                   []-142-130-135-136
3           f4       []-152-144-142-143-138-135-131
4           f5              150-156-156-154-158-155
5           f6          138-132-135-140-132-138-143
6           f7          147-144-143-142-150-151-145
7           f8          137-139-132-138-132-126-128
8           f9               144-142-130-147-[]-140
9          f10  168-168-162-156-156-150-163-165-156
10         f11      138-140-144-144-146-137-135-141
11         f12  150-152-144-138-133-145-138-138-141
12         f13  180-168-183-171-177-162-159-173-176
13         f14  142-153-144-147-147-138-145-146-148
14         f15  144-147-150-148-153-150-148-150-144
15         f16  146-156-157-147-150-156-153-156-152
16       f17-1   139-149-157-153-[]-144-150-156-160
17       f17-2     154-147-150-144-[]-[]-120-[]-126
18         f

In [None]:
# Confirm that no missing (NaN) values remain in any column.
database_info.isna().sum()

Unnamed: 0,0
Subject ID,0
BPM,0


In [None]:
# Compare the number of labelled recordings (those with BPM) against the number
# of fetal .wav files present in the raw database directory.

# IDs of the recordings that have a BPM label.
lista_gravacoes_feto = database_info['Subject ID'].tolist()
print("Número de gravações do feto com BPM:", len(lista_gravacoes_feto))

# Every fetal recording on disk: .wav files whose name starts with 'f'.
lista_gravacoes_feto_dataset_completo = []
for file_name in os.listdir(raw_database_path):
    if file_name.startswith('f') and file_name.endswith(".wav"):
        lista_gravacoes_feto_dataset_completo.append(file_name)
print("Número de gravações totais do feto:", len(lista_gravacoes_feto_dataset_completo))


Número de gravações do feto com BPM: 104
Número de gravações totais do feto: 120


In [None]:
# Theoretical audio duration in seconds: the BPM string holds one "-"-separated
# entry per 10 seconds of signal, so the duration is 10 times the number of entries.
database_info['suposed_duration'] = database_info['BPM'].apply(
    lambda bpm_text: 10 * len(bpm_text.split("-"))
)
database_info

Unnamed: 0,Subject ID,BPM,suposed_duration
0,f1,132-148-150-150,40
1,f2,150-147-152-150-148-150,60
2,f3,[]-142-130-135-136,50
3,f4,[]-152-144-142-143-138-135-131,80
4,f5,150-156-156-154-158-155,60
...,...,...,...
99,f97,145-140-145-144-150-145,60
100,f98,138-132-131-129-132-133,60
101,f99,136-135-131-138-142-145,60
102,f100,126-127-131-125-130-144-125,70


Now add audio-file metadata (number of channels, sample width, frame rate, number of frames and duration) to `database_info`.

In [None]:
def get_wav_info(file_name):
    """Read the header of a WAV file and return its basic audio properties.

    Parameters:
        file_name: path of the .wav file to open.

    Returns:
        A tuple (num_channels, sample_width, frame_rate, num_frames, duration),
        where sample_width is in bytes and duration is num_frames / frame_rate,
        i.e. the length of the audio in seconds.
    """
    with wave.open(file_name, 'rb') as reader:
        header = reader.getparams()

    seconds = header.nframes / header.framerate
    return header.nchannels, header.sampwidth, header.framerate, header.nframes, seconds

In [None]:
# Read the WAV header of every labelled recording; the file is named after the subject ID.
wav_infos = [
    get_wav_info(os.path.join(raw_database_path, f"{subject_id}.wav"))
    for subject_id in database_info['Subject ID']
]

# Split the (channels, sample width, frame rate, frames, duration) tuples into one list per field.
num_channels_list = [info[0] for info in wav_infos]
sample_width_list = [info[1] for info in wav_infos]
frame_rate_list = [info[2] for info in wav_infos]
num_frames_list = [info[3] for info in wav_infos]
duration_list = [info[4] for info in wav_infos]

# The lists follow the row order of database_info, so they can be assigned directly as columns.
database_info['num_channels'] = num_channels_list
database_info['sample_width'] = sample_width_list
database_info['frame_rate'] = frame_rate_list
database_info['num_frames'] = num_frames_list
database_info['duration'] = duration_list

In [None]:
# Display the table with the audio metadata columns added.
database_info

Unnamed: 0,Subject ID,BPM,suposed_duration,num_channels,sample_width,frame_rate,num_frames,duration
0,f1,132-148-150-150,40,2,2,16000,608160,38.010
1,f2,150-147-152-150-148-150,60,2,2,16000,968400,60.525
2,f3,[]-142-130-135-136,50,2,2,16000,774480,48.405
3,f4,[]-152-144-142-143-138-135-131,80,2,2,16000,1304400,81.525
4,f5,150-156-156-154-158-155,60,2,2,16000,979920,61.245
...,...,...,...,...,...,...,...,...
99,f97,145-140-145-144-150-145,60,2,2,16000,1444800,90.300
100,f98,138-132-131-129-132-133,60,2,2,16000,1445760,90.360
101,f99,136-135-131-138-142-145,60,2,2,16000,1446000,90.375
102,f100,126-127-131-125-130-144-125,70,2,2,16000,1531200,95.700


In [None]:
# Check whether the audio files are standardized: a column with a single
# distinct value means every recording shares that property.
for column_name in ("num_channels", "frame_rate", "sample_width"):
    print(database_info[column_name].value_counts())

num_channels
2    104
Name: count, dtype: int64
frame_rate
16000    100
8000       4
Name: count, dtype: int64
sample_width
2    104
Name: count, dtype: int64


In [None]:
# Remove the recordings whose frame rate is 8000, the one that differs from the rest.
# .copy() makes the filtered result an independent DataFrame, so adding columns
# to it afterwards does not raise pandas' SettingWithCopyWarning.
database_info = database_info[database_info["frame_rate"] != 8000].copy()
print(database_info["frame_rate"].value_counts())

frame_rate
16000    100
Name: count, dtype: int64


In [None]:
# Difference between the measured audio duration and the duration implied by the
# BPM labels (positive means the audio is longer than the labels cover).
# assign() returns a new DataFrame instead of writing into a filtered one, which
# avoids pandas' SettingWithCopyWarning.
database_info = database_info.assign(
    duration_diff=database_info["duration"] - database_info["suposed_duration"]
)
database_info

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  database_info["duration_diff"] = database_info["duration"] - database_info["suposed_duration"]


Unnamed: 0,Subject ID,BPM,suposed_duration,num_channels,sample_width,frame_rate,num_frames,duration,duration_diff
0,f1,132-148-150-150,40,2,2,16000,608160,38.010,-1.990
1,f2,150-147-152-150-148-150,60,2,2,16000,968400,60.525,0.525
2,f3,[]-142-130-135-136,50,2,2,16000,774480,48.405,-1.595
3,f4,[]-152-144-142-143-138-135-131,80,2,2,16000,1304400,81.525,1.525
4,f5,150-156-156-154-158-155,60,2,2,16000,979920,61.245,1.245
...,...,...,...,...,...,...,...,...,...
99,f97,145-140-145-144-150-145,60,2,2,16000,1444800,90.300,30.300
100,f98,138-132-131-129-132-133,60,2,2,16000,1445760,90.360,30.360
101,f99,136-135-131-138-142-145,60,2,2,16000,1446000,90.375,30.375
102,f100,126-127-131-125-130-144-125,70,2,2,16000,1531200,95.700,25.700


In [None]:
# Summary statistics of the measured and the theoretical durations.
for column_name in ("duration", "suposed_duration"):
    print(database_info[column_name].describe())

count    100.000000
mean      83.910000
std       18.432324
min       28.650000
25%       90.225000
50%       90.412500
75%       90.753750
max      133.170000
Name: duration, dtype: float64
count    100.000000
mean      67.200000
std       18.261844
min       20.000000
25%       60.000000
50%       60.000000
75%       90.000000
max       90.000000
Name: suposed_duration, dtype: float64


In [None]:
# Save the cleaned table as an .xlsx file in the database information directory.
clean_info_path = os.path.join(databases_infos_path, "database_info.xlsx")
database_info.to_excel(clean_info_path)