In [1]:
import os
import zipfile
import numpy as np
import pandas as pd
from collections import defaultdict
# With defaultdict, missing keys are automatically initialized with a default value
from pathlib import Path
import pickle
from itertools import chain
import shutil



subject_matrices.pkl contains all the individual correlation matrixes for the 100 iterations

subject_statistics.pkl contains the summaries

In [None]:

zip_dir = "100iterations_fmriprep20"
corrupt_files = []  

for zip_file in os.listdir(zip_dir):
    if zip_file.endswith(".zip"):
        zip_path = os.path.join(zip_dir, zip_file)
        # print(f"Checking file: {zip_path}")
        try:
            with zipfile.ZipFile(zip_path, 'r') as test_zip:
                test_zip.testzip()  # Checks for corruption
                # print(f"{zip_file} is a valid ZIP file")
        except zipfile.BadZipFile:
            # print(f"{zip_file} is corrupt or not a ZIP file")
            corrupt_files.append(zip_file) 
        except Exception as e:
            # print(f"Error with {zip_file}: {e}")
            corrupt_files.append(zip_file) 

print("\SUMMARY")
if corrupt_files:
    print(f"Total Corrupt ZIP Files: {len(corrupt_files)}")
    print("List of Corrupt Files:")
    for file in corrupt_files:
        print(f"{file}")
else:
    print("No corrupt ZIP files found")



🚨 SUMMARY 🚨
🎉 No corrupt ZIP files found!


In [None]:
#checking which files are empty

zip_dir = "100iterations_fmriprep20"
for zip_file in os.listdir(zip_dir):
    zip_path = os.path.join(zip_dir, zip_file)
    print(f"{zip_file}: {os.path.getsize(zip_path)} bytes")



ds000201_sub-9040_time-20250304-214916_10299525_33.zip: 168529402 bytes
ds004712_sub-13192_time-20250304-175933_10299588_96.zip: 4194304 bytes
ds000201_sub-9040_time-20250304-214820_10299565_73.zip: 168147347 bytes
ds000201_sub-9040_time-20250304-214432_10299518_26.zip: 168982227 bytes
ds004712_sub-13192_time-20250304-180036_10299549_57.zip: 0 bytes
ds003540_sub-01_time-20250304-195101_10299518_26.zip: 195928544 bytes
ds004712_sub-13192_time-20250304-175905_10299493_1.zip: 0 bytes
ds000201_sub-9040_time-20250304-215041_10299514_22.zip: 168586058 bytes
ds002422_sub-09_time-20250304-203752_10299508_16.zip: 173902828 bytes
ds004712_sub-13192_time-20250304-180032_10299494_2.zip: 4194304 bytes
ds003540_sub-01_time-20250304-195149_10299552_60.zip: 194179246 bytes
ds002422_sub-09_time-20250304-204024_10299556_64.zip: 172808923 bytes
ds004712_sub-13192_time-20250304-175854_10299568_76.zip: 157593428 bytes
ds002422_sub-09_time-20250304-204112_10299540_48.zip: 172546914 bytes
ds000201_sub-9040_t

In [5]:
def extract_matrices(zip_path, extract_dir):
    '''
    Function to extract and read TSV files from one ZIP file.
    We return a dictionary that organizes the matrices per pipeline.
    '''
    
    #!  Make sure to remove old extracted files so we dont stack up matrices from other subjects
    if os.path.exists(extract_dir):
        shutil.rmtree(extract_dir)  
    os.makedirs(extract_dir, exist_ok=True) 
    
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_dir)
    
    matrices = defaultdict(list)
    total_matrices = 0  # Track the total number of matrices extracted

    for root, dirs, files in os.walk(extract_dir):
        for file in files:
            if file.endswith("_desc-correlation_matrix.tsv"):
                pipeline = file.split("feature-")[1].split("CorrMatrix")[0]
                file_path = os.path.join(root, file)
                try:
                    matrix = np.loadtxt(file_path, delimiter="\t")
                    if np.issubdtype(matrix.dtype, np.number):
                        matrices[pipeline].append(matrix)
                        total_matrices += 1
                    else:
                        print(f"Non-numeric data found in file: {file_path}")
                except ValueError as e:
                    print(f"Error loading {file_path}: {e}")

    print(f"Total matrices extracted from {zip_path}: {total_matrices}")
    
    return matrices


def process_all_matrices(zip_dir, extract_dir):
    '''
    Processes all ZIP files in a given directory, extracts correlation matrices, 
    and organizes them by subject and pipeline.
    This function implements logic to skip non-zip and empty (corrupted) files as well.

    '''

    all_subjects = {}
    for zip_file in os.listdir(zip_dir):
        zip_path = os.path.join(zip_dir, zip_file)
        if not zip_file.endswith(".zip"):
            print(f"Skipping non-ZIP file: {zip_file}")
            continue
        if os.path.getsize(zip_path) == 0:
            print(f"Skipping empty ZIP file: {zip_file}")
            continue  
        try:
            with zipfile.ZipFile(zip_path, 'r') as test_zip:
                test_zip.testzip()  # Check if it's a valid ZIP
            subject_id = zip_file.split("_")[1]  # Extract subject ID, assuming that is placed at the beginining of zipfile
            matrices = extract_matrices(zip_path, extract_dir) 
            print(f"✅ Processed {subject_id} from {zip_file}")
            if subject_id not in all_subjects:
                all_subjects[subject_id] = {}
            for pipeline, matrix_list in matrices.items():
                if pipeline not in all_subjects[subject_id]:
                    all_subjects[subject_id][pipeline] = []
                all_subjects[subject_id][pipeline].extend(matrix_list)
        except zipfile.BadZipFile:
            print(f"Skipping corrupt ZIP file: {zip_file}")
        except Exception as e:
            print(f"Unexpected error processing {zip_file}: {e}")

    return all_subjects

### 1. Create fm20_feature-matrices.pkl

In [7]:
zip_dir = "100iterations_fmriprep20" 
temp_extract_dir = "100iterations_extracted"  

all_subjects = process_all_matrices(zip_dir, temp_extract_dir)
print(f"Subjects processed: {list(all_subjects.keys())}")
print(f"Pipelines processed: {sorted(set(chain.from_iterable(d.keys() for d in all_subjects.values())))}")

with open("fm20_feature-matrices.pkl", "wb") as f:
    pickle.dump(all_subjects, f)

Total matrices extracted from 100iterations_fmriprep20/ds000201_sub-9040_time-20250304-214916_10299525_33.zip: 14
✅ Processed sub-9040 from ds000201_sub-9040_time-20250304-214916_10299525_33.zip
Total matrices extracted from 100iterations_fmriprep20/ds004712_sub-13192_time-20250305-181219_10320909_44.zip: 14
✅ Processed sub-13192 from ds004712_sub-13192_time-20250305-181219_10320909_44.zip
Total matrices extracted from 100iterations_fmriprep20/ds002422_sub-09_time-20250305-204850_10320919_54.zip: 14
✅ Processed sub-09 from ds002422_sub-09_time-20250305-204850_10320919_54.zip
Total matrices extracted from 100iterations_fmriprep20/ds000201_sub-9040_time-20250304-214820_10299565_73.zip: 14
✅ Processed sub-9040 from ds000201_sub-9040_time-20250304-214820_10299565_73.zip
Total matrices extracted from 100iterations_fmriprep20/ds004712_sub-13192_time-20250305-181012_10320887_22.zip: 14
✅ Processed sub-13192 from ds004712_sub-13192_time-20250305-181012_10320887_22.zip
Total matrices extracted 

KeyboardInterrupt: 