### Configuração do notebook

In [120]:
import os

PATH_TO_DATASET = "/home/lozavival/Documents/AUDIOS-Dataset"

fake_audios_path = os.path.join(PATH_TO_DATASET, "fake_voices")
real_audios_path = os.path.join(PATH_TO_DATASET, "real_voices")

### Dicionário 1 - Listagem por Pessoa

In [None]:
people = {}

# For every spoofed folder, get the number of spoofed files
for folder in os.listdir(fake_audios_path):
    path = os.path.join(fake_audios_path, folder)
    files = os.listdir(path)
    
    person, ids, *_ = folder.split("_")
    gender = ids[0]

    people[person] = {
        "gender": gender,
        "id": ids,
        "spoof_count": len(files),
    }

# For every bonafide folder, get the number of bonafide files
for folder in os.listdir(real_audios_path):
    path = os.path.join(real_audios_path, folder)
    files = os.listdir(path)
    
    person, ids, *_ = folder.split("_")
    gender = ids[0]

    if people.get(person) is not None:
        people[person]["bonafide_count"] = len(files)
    else:
        # If the person is not in the dictionary, it means there are no spoof files
        people[person] = {
            "gender": gender,
            "id": ids,
            "spoof_count": 0,
            "bonafide_count": len(files),
        }

print(len(people))
print(list(people.items())[:5])

### Dicionário 2 - Listagem por Arquivo

In [121]:
all_files = []

for folder in os.listdir(fake_audios_path):
    path = os.path.join(fake_audios_path, folder)
    folder_files = os.listdir(path)
    
    person, ids, *_ = folder.split("_")
    gender = ids[0]
    for file in folder_files:
        file_path = os.path.join("fake_voices", folder, file)
        all_files.append([file_path, person, ids, gender, "spoof"])

for folder in os.listdir(real_audios_path):
    path = os.path.join(real_audios_path, folder)
    folder_files = os.listdir(path)
    
    person, ids, *_ = folder.split("_")
    gender = ids[0]
    for file in folder_files:
        file_path = os.path.join("real_voices", folder, file)
        all_files.append([file_path, person, ids, gender, "bona-fide"])

print(len(all_files))
print(all_files[:5])

179814
[['fake_voices/Paula_F026_Fake/590_fake.wav', 'Paula', 'F026', 'F', 'spoof'], ['fake_voices/Paula_F026_Fake/337_fake.wav', 'Paula', 'F026', 'F', 'spoof'], ['fake_voices/Paula_F026_Fake/469_fake.wav', 'Paula', 'F026', 'F', 'spoof'], ['fake_voices/Paula_F026_Fake/664_fake.wav', 'Paula', 'F026', 'F', 'spoof'], ['fake_voices/Paula_F026_Fake/72_fake.wav', 'Paula', 'F026', 'F', 'spoof']]


### Exporta Dicionário 1 para csv

In [None]:
import csv

fields = ["person", "gender", "id", "spoof_count", "bonafide_count"]
with open(os.path.join(PATH_TO_DATASET, "meta.csv"), "w") as f:
    writer = csv.writer(f)
    writer.writerow(fields)
    for person, data in people.items():
        writer.writerow([person, data["gender"], data["id"], data["spoof_count"], data["bonafide_count"]])
print("File written!")

### Exporta Dicionário 2 para csv

In [122]:
import csv

fields = ["file", "speaker", "id", "gender", "label"]
with open(os.path.join(PATH_TO_DATASET, "meta.csv"), "w") as f:
    writer = csv.writer(f)
    writer.writerow(fields)
    writer.writerows(all_files)
print("File written!")

File written!


### Exploração dos Dados

In [123]:
[x for x in os.listdir(PATH_TO_DATASET) if x.endswith('.csv')]

['meta.csv']

In [124]:
import pandas as pd

dataset_metadata_df = pd.read_csv(os.path.join(PATH_TO_DATASET, 'meta.csv'), keep_default_na=False)
dataset_metadata_df.head()

Unnamed: 0,file,speaker,id,gender,label
0,fake_voices/Paula_F026_Fake/590_fake.wav,Paula,F026,F,spoof
1,fake_voices/Paula_F026_Fake/337_fake.wav,Paula,F026,F,spoof
2,fake_voices/Paula_F026_Fake/469_fake.wav,Paula,F026,F,spoof
3,fake_voices/Paula_F026_Fake/664_fake.wav,Paula,F026,F,spoof
4,fake_voices/Paula_F026_Fake/72_fake.wav,Paula,F026,F,spoof


In [125]:
# load the durations of each audio
import librosa

# Function to get the duration
def get_duration(filename):
    try:
        audio_path = os.path.join(PATH_TO_DATASET, filename)
        y, sr = librosa.load(audio_path, sr=None)
        return librosa.get_duration(y=y, sr=sr)
    except Exception as e:
        print(f"Could not load file {filename}: {e}")
        return None

# Apply the function to get durations and add them as a new column
dataset_metadata_df['duration'] = dataset_metadata_df['file'].apply(get_duration)
print("duration of calculations done")

duration of calculations done


In [126]:
print("total audio clips:", dataset_metadata_df.duration.count())
print("mean duration of audio clips (seconds):", dataset_metadata_df.duration.mean())
print("N speakers:", dataset_metadata_df.speaker.nunique())
print("Total audio time (hours):", dataset_metadata_df.duration.sum() / 3600)

total audio clips: 179814
mean duration of audio clips (seconds): 4.916401206478175
N speakers: 101
Total audio time (hours): 245.56604626157403


In [127]:
print("audio samples per speaker")
dataset_metadata_df.speaker.value_counts()

audio samples per speaker


speaker
MarcosBittencourt    2000
JulioFaustino        2000
DenizeRamos          2000
Rodrigo              2000
DanielRibeiro        2000
                     ... 
Henrique             1000
EdsonCabral          1000
Geruza               1000
Emigoncalvez         1000
Elson                1000
Name: count, Length: 101, dtype: int64

In [128]:
print('Minutes per speaker')
dataset_metadata_df.groupby("speaker")['duration'].sum().sort_values(ascending=False) / 60

Minutes per speaker


speaker
JoseIldo         210.167957
TerezaSpedo      205.291261
Tulio            194.401519
AdrianaMalta     191.096592
AnnaPerez        187.529110
                    ...    
Henrique          85.454699
EdsonCabral       77.494906
Emigoncalvez      68.972505
EduardoTardin     64.391518
Elson             61.242120
Name: duration, Length: 101, dtype: float64

In [129]:
print("samples by class")
dataset_metadata_df['label'].value_counts()

samples by class


label
bona-fide    100998
spoof         78816
Name: count, dtype: int64

In [130]:
print("minutes by class")
dataset_metadata_df.groupby("label")['duration'].sum().sort_values(ascending=False) / 60

minutes by class


label
bona-fide    8679.425531
spoof        6054.537244
Name: duration, dtype: float64

In [131]:
# the proportions of samples and duration are diferrent, why?
print("duration mean of each class")
print("mean duration of spoof audios is bigger")
dataset_metadata_df.groupby("label")['duration'].mean()

duration mean of each class
mean duration of spoof audios is bigger


label
bona-fide    5.156196
spoof        4.609118
Name: duration, dtype: float64

In [132]:
print("Quantity of each audio per speaker")

spoof_per_speaker = dataset_metadata_df[dataset_metadata_df.label == 'spoof'].groupby("speaker").duration.count()
bonafide_per_speaker = dataset_metadata_df[dataset_metadata_df.label == 'bona-fide'].groupby("speaker").duration.count()

# Combine into a single DataFrame
counts_df = pd.DataFrame({
    'spoof_count': spoof_per_speaker,
    'bona_fide_count': bonafide_per_speaker
}).fillna(0)  # Fill NaN with 0 if some speakers have no 'spoof' or 'bona-fide' samples

# Calculate total count and ratio for each speaker
counts_df['total'] = counts_df['spoof_count'] + counts_df['bona_fide_count']
counts_df['spoof_ratio'] = counts_df['spoof_count'] / counts_df['total']

counts_df.sort_values("spoof_ratio")

Quantity of each audio per speaker


Unnamed: 0_level_0,spoof_count,bona_fide_count,total,spoof_ratio
speaker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Emigoncalvez,0.0,1000,1000.0,0.0
EduardoPereira,0.0,1000,1000.0,0.0
EduardoTardin,0.0,1000,1000.0,0.0
EdsonCabral,0.0,1000,1000.0,0.0
Elson,0.0,1000,1000.0,0.0
...,...,...,...,...
SilvanaFerreira,1000.0,1000,2000.0,0.5
SandraRocha,1000.0,1000,2000.0,0.5
Rafael,1000.0,1000,2000.0,0.5
TerezaSpedo,1000.0,1000,2000.0,0.5


In [133]:
print("Minutes of each audio per speaker")

import pandas as pd

# Sum durations for each label per speaker, converting to minutes
spoof_per_speaker = dataset_metadata_df[dataset_metadata_df.label == 'spoof'].groupby("speaker").duration.sum() / 60
bonafide_per_speaker = dataset_metadata_df[dataset_metadata_df.label == 'bona-fide'].groupby("speaker").duration.sum() / 60

# Combine into a single DataFrame
duration_df = pd.DataFrame({
    'spoof_duration (min)': spoof_per_speaker,
    'bona_fide_duration (min)': bonafide_per_speaker
}).fillna(0)  # Fill NaN with 0 if some speakers have no 'spoof' or 'bona-fide' samples

# Calculate total duration and ratio for each speaker
duration_df['total_duration (min)'] = duration_df['spoof_duration (min)'] + duration_df['bona_fide_duration (min)']
duration_df['spoof_ratio'] = duration_df['spoof_duration (min)'] / duration_df['total_duration (min)']

# Sort by spoof ratio
duration_df = duration_df.sort_values("spoof_ratio")

duration_df.sort_values("spoof_ratio")


Minutes of each audio per speaker


Unnamed: 0_level_0,spoof_duration (min),bona_fide_duration (min),total_duration (min),spoof_ratio
speaker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Emigoncalvez,0.000000,68.972505,68.972505,0.000000
EduardoPereira,0.000000,87.568842,87.568842,0.000000
EduardoTardin,0.000000,64.391518,64.391518,0.000000
EdsonCabral,0.000000,77.494906,77.494906,0.000000
Elson,0.000000,61.242120,61.242120,0.000000
...,...,...,...,...
Milena,79.580622,75.608156,155.188778,0.512799
Alessandra,78.716622,74.660711,153.377334,0.513222
Mariana,73.718400,69.405747,143.124147,0.515066
JeanCarlos,63.136533,57.312081,120.448615,0.524178
