In [None]:
!pip install kagglehub



In [None]:
import os
import shutil
import kagglehub
import random
import zipfile

from tqdm import tqdm
from google.colab import drive

In [None]:
def count_files_in_dir(dir_name: str) -> str:
    """Counts the number of files in a directory"""
    return len(
        [f for f in os.listdir(dir_name) if os.path.isfile(os.path.join(dir_name, f))]
    )

def get_split_pos(n_items: int, train: float, test: float = -1.) -> tuple[int, int]:
    """makes train-test split positions. if 'test' fraction not specified,
    then all the items out of the 'train' set will be used as 'test' set.
    otherwise, rest of the items can be used as 'validation' set
    """
    n_train = int(n_items * train)
    return (n_train, int(n_items * test) if test > 0 else n_items - n_train)


Kagglehub package updates requires no credentials to download a free dataset, updates dataset automated, not download it again, if it wasn't changed.

Google colab has it's own storage space, accessible by path '/content'. This space is very fast, quite huge, but will be deleted as session will close. Google Drive disk can be connected to this space and them will be available as directory. But google drive is relatively slow to operate from collab and has limit of th einput-output operations for one day. solution is: use collab own space for all file operations. if we need a file from GDrive, mount it and copy to the collab space, if we want to save something for a while, mount GDrive and copy file there.

Here I will collect all the data in colab space, will create a tar.gz file with train-test-split and will copy the result to the GDrive

In [None]:
os.chdir('/content')

In [None]:
original_dataset = os.path.join(
    kagglehub.dataset_download("aryansinghal10/alzheimers-multiclass-dataset-equal-and-augmented"),
    'combined_images'
)

original_dirs = (
    ('NonDemented', os.path.join(original_dataset, 'NonDemented')),
    ('MildDemented', os.path.join(original_dataset, 'MildDemented')),
    ('ModerateDemented', os.path.join(original_dataset, 'ModerateDemented')),
    ('VeryMildDemented', os.path.join(original_dataset, 'VeryMildDemented')),
)

for name, f_dir in original_dirs:
    file_count = count_files_in_dir(f_dir)
    print(f"Number of files in {name}: {file_count}")

Number of files in NonDemented: 12800
Number of files in MildDemented: 10000
Number of files in ModerateDemented: 10000
Number of files in VeryMildDemented: 11200


In [None]:
output_base = 'AlzheimersData_Split'

classes = os.listdir(original_dataset)

# remove files from the previous runs
shutil.rmtree(output_base, ignore_errors=True)

# Ensure output folders exist
for split in ['train', 'val', 'test']:
    for cls in classes:
        os.makedirs(os.path.join(output_base, split, cls), exist_ok=True)

# Split and copy files
for cls in classes:
    cls_path = os.path.join(original_dataset, cls)

    images = os.listdir(cls_path)
    random.shuffle(images)

    n_train, n_test = get_split_pos(len(images), 0.7, 0.15)

    train_imgs = images[ : n_train]
    test_imgs = images[n_train : n_train + n_test]
    val_imgs = images[n_train + n_test : ]

    for img_list, split in zip([train_imgs, val_imgs, test_imgs], ['train', 'val', 'test']):
        for img in tqdm(img_list, desc=f'Copy {split} set, {cls} files'):
            src = os.path.join(cls_path, img)
            dst = os.path.join(output_base, split, cls, img)
            shutil.copy(src, dst)

Copy train set, ModerateDemented files: 100%|██████████| 7000/7000 [00:24<00:00, 287.69it/s]
Copy val set, ModerateDemented files: 100%|██████████| 1500/1500 [00:05<00:00, 273.34it/s]
Copy test set, ModerateDemented files: 100%|██████████| 1500/1500 [00:05<00:00, 272.15it/s]
Copy train set, NonDemented files: 100%|██████████| 8960/8960 [00:31<00:00, 281.73it/s]
Copy val set, NonDemented files: 100%|██████████| 1920/1920 [00:07<00:00, 244.61it/s]
Copy test set, NonDemented files: 100%|██████████| 1920/1920 [00:07<00:00, 240.29it/s]
Copy train set, VeryMildDemented files: 100%|██████████| 7839/7839 [00:24<00:00, 324.66it/s]
Copy val set, VeryMildDemented files: 100%|██████████| 1681/1681 [00:05<00:00, 322.12it/s]
Copy test set, VeryMildDemented files: 100%|██████████| 1680/1680 [00:04<00:00, 349.47it/s]
Copy train set, MildDemented files: 100%|██████████| 7000/7000 [00:09<00:00, 717.21it/s]
Copy val set, MildDemented files: 100%|██████████| 1500/1500 [00:01<00:00, 791.81it/s]
Copy test s

In [None]:
split_set_dirs = {
    'train': os.path.join(output_base, 'train'),
    'test': os.path.join(output_base, 'test'),
    'val': os.path.join(output_base, 'val')
}

for split_name, split_dir in split_set_dirs.items():
    count = 0
    print(f"Number of files in {split_name} set:")
    for cls in ('MildDemented', 'ModerateDemented', 'NonDemented', 'VeryMildDemented'):
        file_count = count_files_in_dir(os.path.join(split_dir, cls))
        count += file_count
        print(f"    {name}: {file_count}")

    print(f"  Total: {count}")
    print()


Number of files in train set:
    VeryMildDemented: 7000
    VeryMildDemented: 7000
    VeryMildDemented: 8960
    VeryMildDemented: 7839
  Total: 30799

Number of files in test set:
    VeryMildDemented: 1500
    VeryMildDemented: 1500
    VeryMildDemented: 1920
    VeryMildDemented: 1680
  Total: 6600

Number of files in val set:
    VeryMildDemented: 1500
    VeryMildDemented: 1500
    VeryMildDemented: 1920
    VeryMildDemented: 1681
  Total: 6601



In [None]:
def zip_folder(folder_path, output_path):
    with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for root, _, files in os.walk(folder_path):
            for file in files:
                file_path = os.path.join(root, file)
                # Store relative path inside the zip
                arcname = os.path.relpath(file_path, start=folder_path)
                zipf.write(file_path, arcname)

# Example usage:
folder_to_zip = 'AlzheimersData_Split'
output_zip_path = 'AlzheimersData_Split.zip'

zip_folder(folder_to_zip, output_zip_path)
print("✅ Folder zipped successfully!")


✅ Folder zipped successfully!


In [31]:
drive.mount('/content/drive', force_remount=True)
shutil.copy('AlzheimersData_Split.zip', '/content/drive/MyDrive/AlzheimersData_Split.zip')
drive.flush_and_unmount()

Mounted at /content/drive
