# Imports

In [46]:
import h5py
import os
import re
import zipfile
import pickle as pkl

from collections import defaultdict

In [38]:
# Single switch for the whole notebook: set to "bif" or "single".
# Every path below is derived from it, so changing dataset is a one-line edit.
dataset_name = "bif"

# The links-list file does not follow the "<dataset_name>_..." naming pattern,
# so it is looked up explicitly per dataset.
_links_paths = {
    "bif": "./bifurcating_list.txt",
    "single": "./single_list.txt",
}

file_path = f"./{dataset_name}.hdf5"        # source HDF5 dataset
links_path = _links_paths[dataset_name]     # text list of links (also base name for the .pkl)
tsv_path = f"./{dataset_name}_drive.tsv"    # TSV exported from the Google Sheet
output_dir = f"./{dataset_name}"            # directory receiving the split files

# Number of samples the complete dataset is expected to contain
sample_exact_number = 2000

# Check missing samples

In [6]:
missing = False
# Defined up front so later cells can reference it even when no gap is found.
missing_number = None

# Read-only access: this cell only inspects the file. Opening with 'r' also
# avoids silently creating an empty HDF5 file when file_path is wrong.
with h5py.File(file_path, 'r') as file:
    current_len = len(file)
    print(f"[{dataset_name}] Dataset len: ", current_len)
    sample_names = [name for name in file.keys() if name.startswith('sample_')]

# Zero-padded numeric suffix of every sample name, e.g. "sample_0042" -> "0042"
numbers = [re.search(r'_\d+', s).group()[1:] for s in sample_names]
present_numbers = set(numbers)

# Check every expected index rather than only the positions of the samples
# that exist: a gap at the very end of the dataset is reported as well.
for n in range(max(sample_exact_number, len(numbers))):
    counter = f'{n:04d}'
    if counter not in present_numbers:
        print(f"[{dataset_name}] Missing sample number: ", counter)
        missing_number = counter
        missing = True
        break

if not missing:
    print(f"[{dataset_name}] All samples are present")

[single] Dataset len:  2000
[single] All samples are present


# Complete dataset

In [7]:
# Fill the gap reported by the "Check missing samples" cell by duplicating an
# existing sample under the missing name.
if current_len >= sample_exact_number:
    print(f"[{dataset_name}] All samples are present")
else:
    campione_origine = "sample_1000"
    # missing_number is set by the previous cell when a gap was found
    campione_destinazione = "sample_" + missing_number

    # Append mode: the copy below writes into the same file it reads from
    with h5py.File(file_path, 'a') as file:
        if campione_origine in file:
            # Duplicate the source sample under the missing sample's name
            file.copy(campione_origine, file, name = campione_destinazione)
            print(f"[{dataset_name}] New dataset len: ",len(file))
        else:
            # Source sample is absent: report it and leave the file untouched
            error_string = f"Sample '{campione_origine}' doesn't exists"
            print(f"[{dataset_name}] " + error_string)

[single] All samples are present


# Split dataset

In [8]:
# Number of smaller HDF5 files the dataset is split into
num_files = 100

# Create the destination directory if needed; without it the first attempt to
# open an output file for writing fails on a fresh checkout.
os.makedirs(output_dir, exist_ok=True)

# Open the source HDF5 file read-only
with h5py.File(file_path, 'r') as file:
    # Names of all top-level entries, in the order h5py reports them
    sample_names = list(file.keys())

    # Base number of samples per file; the first `remainder` files get one extra
    samples_per_file, remainder = divmod(len(sample_names), num_files)

    # Position of the next sample to copy
    index = 0

    for i in range(num_files):
        samples_in_this_file = samples_per_file + int(i < remainder)

        # Output files are numbered from 1: <dataset_name>_1.hdf5, _2.hdf5, ...
        output_path = os.path.join(output_dir, f"{dataset_name}_{i + 1}.hdf5")
        with h5py.File(output_path, 'w') as output_file:
            # Copy this file's contiguous slice of samples
            for sample_name in sample_names[index:index + samples_in_this_file]:
                file.copy(sample_name, output_file)

        index += samples_in_this_file

print(f"[{dataset_name}] Data divided in {num_files} files hdf5")

[single] Data divided in 100 files hdf5


# Check results

In [9]:
def exctract_key(stringa):
    """Return the integer index embedded in a split file name.

    Accepts a bare file name or a full path such as ``./bif/bif_12.hdf5``.
    Only the final path component is inspected, so underscores in directory
    names do not disturb the parsing. The index is the text between the last
    underscore and the first dot that follows it.

    Raises IndexError if the name has no underscore and ValueError if the
    extracted text is not an integer.
    """
    nome_file = os.path.basename(stringa)
    return int(nome_file.rsplit('_', 1)[1].split('.')[0])

# Collect only the split HDF5 files; anything else in output_dir (e.g. zip
# archives left by a previous run) cannot be opened with h5py.
files = [
    os.path.join(root, name)
    for root, dirs, names in os.walk(output_dir)
    for name in names
    if name.endswith(".hdf5")
]

# Sort numerically by the index in the file name (…_2 before …_10)
sorted_files = sorted(files, key = exctract_key)

# Running total of samples found across all split files
checksum = 0

for split_path in sorted_files:
    # basename works with both "/" and "\\" separators on the host platform
    filename = os.path.basename(split_path)
    with h5py.File(split_path, 'r') as data_patch:
        sample_keys = list(data_patch.keys())
        data_len = len(sample_keys)
        checksum += data_len
        print(f"[{filename}] Samples: ",data_len)
        # An empty split file has no first/last sample to report
        if sample_keys:
            first_elem = sample_keys[0]
            last_elem = sample_keys[-1]
            print(f"[{filename}] First sample",first_elem)
            print(f"[{filename}] Last sample",last_elem)

# Compare against the configured expected size rather than a literal
if checksum == sample_exact_number:
    print(f"[{dataset_name}] Splitted successfully")
else:
    print(f"[{dataset_name}] Some error occours during splitting")

[single_1.hdf5] Samples:  20
[single_1.hdf5] First sample sample_0000
[single_1.hdf5] First sample sample_0019
[single_2.hdf5] Samples:  20
[single_2.hdf5] First sample sample_0020
[single_2.hdf5] First sample sample_0039
[single_3.hdf5] Samples:  20
[single_3.hdf5] First sample sample_0040
[single_3.hdf5] First sample sample_0059
[single_4.hdf5] Samples:  20
[single_4.hdf5] First sample sample_0060
[single_4.hdf5] First sample sample_0079
[single_5.hdf5] Samples:  20
[single_5.hdf5] First sample sample_0080
[single_5.hdf5] First sample sample_0099
[single_6.hdf5] Samples:  20
[single_6.hdf5] First sample sample_0100
[single_6.hdf5] First sample sample_0119
[single_7.hdf5] Samples:  20
[single_7.hdf5] First sample sample_0120
[single_7.hdf5] First sample sample_0139
[single_8.hdf5] Samples:  20
[single_8.hdf5] First sample sample_0140
[single_8.hdf5] First sample sample_0159
[single_9.hdf5] Samples:  20
[single_9.hdf5] First sample sample_0160
[single_9.hdf5] First sample sample_0179
[

# Save zip

In [10]:
# Wrap every split HDF5 file in its own zip archive, stored next to it.
files_hdf5 = []
for entry in os.listdir(output_dir):
    if entry.endswith('.hdf5'):
        files_hdf5.append(entry)

for file_hdf5 in files_hdf5:
    # File name without its extension, reused as the archive's base name
    stem = os.path.splitext(file_hdf5)[0]

    # Source HDF5 path and destination zip path, both inside output_dir
    percorso_file_hdf5 = f"{output_dir}/{file_hdf5}"
    percorso_file_zip = f"{output_dir}/{stem}.zip"

    # Store the HDF5 under its bare file name (no directory part) in the archive
    with zipfile.ZipFile(percorso_file_zip, 'w') as zipf:
        zipf.write(percorso_file_hdf5, arcname=file_hdf5)

# Clear data

In [11]:
# Regular files directly inside output_dir (sub-directories are ignored)
files = [
    f for f in os.listdir(output_dir)
    if os.path.isfile(os.path.join(output_dir, f))
]

# Keep only the files with the .hdf5 extension
hdf5_files = [f for f in files if f.endswith(".hdf5")]

# Remove each split HDF5 file
for hdf5_file in hdf5_files:
    # Use a dedicated name here: assigning to `file_path` would overwrite the
    # notebook-wide path of the source dataset set in the configuration cell,
    # so re-running earlier cells would then target a deleted split file.
    split_file_path = os.path.join(output_dir, hdf5_file)
    os.remove(split_file_path)
    print(f"File {hdf5_file} rimosso con successo.")

File single_1.hdf5 rimosso con successo.
File single_10.hdf5 rimosso con successo.
File single_100.hdf5 rimosso con successo.
File single_11.hdf5 rimosso con successo.
File single_12.hdf5 rimosso con successo.
File single_13.hdf5 rimosso con successo.
File single_14.hdf5 rimosso con successo.
File single_15.hdf5 rimosso con successo.
File single_16.hdf5 rimosso con successo.
File single_17.hdf5 rimosso con successo.
File single_18.hdf5 rimosso con successo.
File single_19.hdf5 rimosso con successo.
File single_2.hdf5 rimosso con successo.
File single_20.hdf5 rimosso con successo.
File single_21.hdf5 rimosso con successo.
File single_22.hdf5 rimosso con successo.
File single_23.hdf5 rimosso con successo.
File single_24.hdf5 rimosso con successo.
File single_25.hdf5 rimosso con successo.
File single_26.hdf5 rimosso con successo.
File single_27.hdf5 rimosso con successo.
File single_28.hdf5 rimosso con successo.
File single_29.hdf5 rimosso con successo.
File single_3.hdf5 rimosso con succ

# Save links list

In [36]:
# - Create google sheet
# - Open google scripts and paste:

# function myFunction() {
#   var ss=SpreadsheetApp.getActiveSpreadsheet();
#   var s=ss.getActiveSheet();
#   var c=s.getActiveCell();
#   var fldr=DriveApp.getFolderById("1GjOOJKpWZjl_IB0EWZueF9IVM9aT_WHW"); // <- Root folder ID
#   var files=fldr.getFiles();
#   var names=[],f,str;
#   while (files.hasNext()) {
#     f=files.next();
#     str='("' + f.getUrl() + '","' + f.getName() + '")';
#     names.push([str]);
#   }
#   s.getRange(c.getRow(),c.getColumn(),names.length).setValues(names);
# }


In [None]:
#from: https://drive.google.com/file/d/1_tjLpicJtQEAz43jhYMeE3TF4-WNxFVw/view?usp=drive_link
#to: https://drive.google.com/uc?export=download&id=1_tjLpicJtQEAz43jhYMeE3TF4-WNxFVw

In [90]:
# Maps the quoted archive name (e.g. '"bif_1.zip"') to its quoted Drive URL.
# The surrounding double quotes from the sheet cell are kept on purpose: the
# next cell strips the leading quote from the URL itself.
old_links = defaultdict(int)

with open(tsv_path, 'r', encoding = 'utf-8') as file_tsv:
    for riga in file_tsv:
        # Drop leading/trailing whitespace, including the newline
        riga = riga.strip()

        # Blank lines carry no entry
        if not riga:
            continue

        # First tab-separated column holds a cell of the form ("<url>","<name>")
        column = riga.split('\t')[0]
        splitted_column = column.strip("()").split(",", 1)

        # Rows that are not a (url, name) pair, e.g. a header, are skipped
        if len(splitted_column) != 2:
            continue

        link, filename = splitted_column

        # Only zip archives are of interest
        if ".zip" not in filename:
            continue

        old_links[filename] = link

# Re-create the mapping ordered by the numeric index in the archive name,
# so "…_2.zip" comes before "…_10.zip"
old_links = defaultdict(
    int,
    sorted(
        old_links.items(),
        key = lambda x: int(x[0].split('_')[1].split('.')[0])
    )
)

old_links

defaultdict(int,
            {'"bif_1.zip"': '"https://drive.google.com/file/d/1o1e_Vl3ub3fprAPjoMYh4K-rXWkmjM5t/view?usp=drivesdk"',
             '"bif_2.zip"': '"https://drive.google.com/file/d/1b7ns8iKaTZUJrY6kxYqZyHQL6kajzeRR/view?usp=drivesdk"',
             '"bif_3.zip"': '"https://drive.google.com/file/d/1mPRiznvvI_PYZG2yXDt-GLDzc2bqDbDw/view?usp=drivesdk"',
             '"bif_4.zip"': '"https://drive.google.com/file/d/1Hn-dO8vc-a1RnUgkZ5kijbrcKjdDeUzX/view?usp=drivesdk"',
             '"bif_5.zip"': '"https://drive.google.com/file/d/1CnK0VtdO191eww2C1_c8B44IjLAErPk8/view?usp=drivesdk"',
             '"bif_6.zip"': '"https://drive.google.com/file/d/1WzLcQwSpSc0ntmYJ09sBRby9jQ1nMQ3d/view?usp=drivesdk"',
             '"bif_7.zip"': '"https://drive.google.com/file/d/1-OWiVkSLX7B15YHS_8w_RzHuGtCLJQGq/view?usp=drivesdk"',
             '"bif_8.zip"': '"https://drive.google.com/file/d/1zTXVy5lOY-HjDMQXuO8ANynEH_cwSkUC/view?usp=drivesdk"',
             '"bif_9.zip"': '"https://drive.goo

In [100]:
# Turn each Drive "view" URL into a direct-download URL:
#   https://drive.google.com/file/d/<id>/view?...  ->
#   https://drive.google.com/uc?export=download&id=<id>
final_links = []

for raw_link in old_links.values():
    # Split on "/" and discard the empty piece produced by the "//" after the scheme
    parts = [piece for piece in raw_link.strip().split("/") if piece != '']

    # The file id is the second-to-last path component
    drive_id = parts[-2]
    scheme, host = parts[0], parts[1]

    # scheme still starts with the double quote carried over from the TSV cell
    final_links.append(scheme[1:] + "//" + host + "/uc?export=download&id=" + drive_id)

len(final_links)

100

In [99]:
# Save the list of download links as a pickle file, named after the links
# text file with ".txt" replaced by ".pkl"
pkl_path = links_path.replace(".txt",".pkl")
with open(pkl_path, 'wb') as file_pkl:
    pkl.dump(final_links, file_pkl)

In [103]:
# Read the pickle written by this notebook back in and show its content
pkl_path = links_path.replace(".txt",".pkl")
with open(pkl_path, 'rb') as file_pkl:
    fff = pkl.load(file_pkl)
print(fff)

['https://drive.google.com/uc?export=download&id=1o1e_Vl3ub3fprAPjoMYh4K-rXWkmjM5t', 'https://drive.google.com/uc?export=download&id=1b7ns8iKaTZUJrY6kxYqZyHQL6kajzeRR', 'https://drive.google.com/uc?export=download&id=1mPRiznvvI_PYZG2yXDt-GLDzc2bqDbDw', 'https://drive.google.com/uc?export=download&id=1Hn-dO8vc-a1RnUgkZ5kijbrcKjdDeUzX', 'https://drive.google.com/uc?export=download&id=1CnK0VtdO191eww2C1_c8B44IjLAErPk8', 'https://drive.google.com/uc?export=download&id=1WzLcQwSpSc0ntmYJ09sBRby9jQ1nMQ3d', 'https://drive.google.com/uc?export=download&id=1-OWiVkSLX7B15YHS_8w_RzHuGtCLJQGq', 'https://drive.google.com/uc?export=download&id=1zTXVy5lOY-HjDMQXuO8ANynEH_cwSkUC', 'https://drive.google.com/uc?export=download&id=1IBMZhnetwRe27x0sPX_R1p-dgjGYEoYW', 'https://drive.google.com/uc?export=download&id=1IFJRPVn-v5cXYGle70J5ru96ifanYCUt', 'https://drive.google.com/uc?export=download&id=1a8ePXP5jYlAeCRYqQMTYUuEf-qBRCbZ1', 'https://drive.google.com/uc?export=download&id=1XKfDi4idOQc6APD1L1ms5-smA_