# Analyse the EF (ejection fraction) distribution in EchoNet-Dynamic

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


# Load the EchoNet-Dynamic file index; later cells read its 'Split' and 'EF' columns.
data = pd.read_csv(filepath_or_buffer='data/EchoNet-Dynamic/FileList.csv')
# Preview the first five rows as the cell output.
data.head(n=5)

In [None]:
# Report the total number of rows, then the number of rows per split.
# Summing a boolean mask counts the rows whose 'Split' matches the label.
print(f"Number of files: {data.shape[0]}")
print(f"Number of train files: {(data['Split'] == 'TRAIN').sum()}")
print(f"Number of test files: {(data['Split'] == 'TEST').sum()}")
print(f"Number of val files: {(data['Split'] == 'VAL').sum()}")


In [None]:
# Collect the 'EF' value of every TRAIN row as a plain Python list.
# No missing-value filtering is done here, so the printed number is simply
# the number of TRAIN rows.
data_train_ef = data.loc[data['Split'] == 'TRAIN', 'EF'].tolist()
print(f"Number of train files with EF: {len(data_train_ef)}")

In [None]:
# Histogram of the TRAIN EF values: 100 equal-width bins over [0, 100],
# i.e. one bin per EF unit, so bin index i covers EF values in [i, i + 1).
# `counts` holds the per-bin sample counts and `bins` the bin edges.
counts, bins, _ = plt.hist(
    data_train_ef,
    bins=100,
    range=(0, 100),
)
plt.show()

In [None]:
# Summary statistics of the TRAIN EF distribution.
# NOTE: min_ef, max_ef and most_rep are indices into `counts` (first non-empty
# bin, last non-empty bin, fullest bin), not raw EF values; only mean_ef and
# std_ef are computed from the raw EF list.
min_ef, max_ef = np.where(counts > 0)[0][[0, -1]]
mean_ef, std_ef = np.mean(data_train_ef), np.std(data_train_ef)
most_rep = counts.argmax()
max_count = counts[most_rep]

print(f"Min EF: {min_ef}")
print(f"Max EF: {max_ef}")
print(f"Mean EF: {mean_ef}")
print(f"Std EF: {std_ef}")
print(f"Most Represented bin: {most_rep} (x{int(max_count)})")

# Prepare the list of EF values to generate so that each relevant bin reaches 100 samples

In [None]:
# Per-bin number of samples still missing to reach the target, where the
# target is 100 samples capped at the size of the fullest bin. Bins already
# at or above the target get 0 (never a negative count).
count_to_balance = np.maximum(np.minimum(max_count, 100) - counts, 0)
# Request nothing for the 10 lowest and 10 highest bins.
count_to_balance[:10] = count_to_balance[-10:] = 0
count_to_balance

In [None]:
# Stacked bar chart: existing per-bin counts, with the number of samples to
# generate for each bin stacked on top.
fig, ax = plt.subplots(figsize=(20, 10))

ax.bar(x=np.arange(counts.size), height=counts)
ax.bar(x=np.arange(counts.size), height=count_to_balance, bottom=counts)

plt.show()

In [None]:
# Total number of videos that must be generated to balance the dataset
# (sum of the per-bin missing counts).
count_to_balance.sum()

In [None]:
# Generate the fixed list of target EF values.
# For every bin index i, emit int(count_to_balance[i]) values of the form
# i + u with u drawn uniformly from [0, 1), so each value falls inside the
# one-unit-wide bin [i, i + 1).
# A locally seeded generator is used so that re-running this cell yields the
# same list every time (the list is written to disk further down), without
# touching NumPy's global random state.
rng = np.random.default_rng(seed=0)
generated_ef = np.repeat(
    np.arange(len(count_to_balance), dtype=float),
    np.asarray(count_to_balance).astype(int),
)
generated_ef = generated_ef + rng.random(generated_ef.size)
generated_ef

In [None]:
# Distribution of the generated EF values, using 100 equal-width bins over
# [0, 100]. `c` holds the per-bin counts, `v` the bin edges and `p` the
# drawn patches.
c, v, p = plt.hist(
    generated_ef,
    bins=100,
    range=(0, 100),
)
print(c, v)

In [None]:
# Sanity check: stack the histogram of the generated values (`c`) on top of
# the original per-bin counts to inspect the combined distribution.
fig, ax = plt.subplots(figsize=(20, 10))
ax.bar(x=np.arange(counts.size), height=counts)
ax.bar(x=np.arange(len(c)), height=c, bottom=counts)

plt.show()

In [None]:
# Write the generated EF values to disk as a single-column CSV
# (header "Target EF", no index column).
pd.DataFrame({"Target EF": generated_ef}).to_csv(
    'diffusion/generate_samples/balanced_ef_list.csv',
    index=False,
)