# Clean data (filenames)

In [1]:
import os
from shutil import copy2

In [2]:
from pathlib import Path

In [3]:
# Build the paths with pathlib joins instead of backslash string literals:
# a sequence like '\d' is an invalid escape in a normal string, and a
# hard-coded backslash separator only names the intended folder on Windows.
root_src_dir = Path('..') / 'data-first-test'  # dir with original image files with random names.
root_dst_dir = Path('..') / 'data-dev-test'  # destination dir with logical image file names.

## Copy the folder structure

In [4]:
def copy_subfolders(src_path, dst_path):
    '''
    Recreate the folder tree found under src_path below dst_path (no files).

    - Iterate over existing folders
    - Build the structure for the new folders based on existing ones
    - Check, if the new folder structure does not exist
    - If so, create new folder without files

    One line is printed per visited folder, reporting whether it was created
    or was already present. Nothing is returned.
    '''
    for dirpath, _dirnames, _filenames in os.walk(src_path):
        structure = os.path.join(dst_path, os.path.relpath(dirpath, src_path))
        if os.path.isdir(structure):
            print(f'{structure} does already exits.')
            continue
        # makedirs (rather than mkdir) also creates missing parent folders, so
        # the call succeeds even when dst_path itself does not exist yet.
        os.makedirs(structure, exist_ok=True)
        print(f'mkdir {structure}')

In [5]:
# Mirror the source folder tree (folders only, no files) under the destination root.
copy_subfolders(src_path=root_src_dir, dst_path=root_dst_dir)

..\data-dev-test\. does already exits.
mkdir ..\data-dev-test\alef-1
mkdir ..\data-dev-test\bet-2
mkdir ..\data-dev-test\dalet-5
mkdir ..\data-dev-test\gimel-4
mkdir ..\data-dev-test\he-6
mkdir ..\data-dev-test\vet-3


## Copy files into another directory.

In [6]:
def copy_files(src_dir, dst_dir):
    ''' copy files into dst dir with a filename based on their parent directory name and an index number.

    Every regular file directly inside src_dir is copied to dst_dir as
    '<letter>-<index>-t.jpg'. <letter> is the name of src_dir up to its last
    '-' (the whole name if it contains no '-'), and <index> numbers the files
    from 0 in sorted filename order. dst_dir is created if it is missing.
    One 'src ---> dst' line is printed per copied file. Nothing is returned.
    '''
    src_p = Path(src_dir)
    dst_p = Path(dst_dir)
    # Split on the last '-' only, so names with several hyphens (or none) do
    # not raise the ValueError a two-name unpack of split('-') would.
    letter = src_p.name.rsplit('-', 1)[0]
    dst_p.mkdir(parents=True, exist_ok=True)

    # os.listdir returns entries in arbitrary order; sorting makes the index
    # given to each file reproducible. Sub-directories are skipped because
    # copy2 can only copy files.
    filenames = sorted(name for name in os.listdir(src_p) if (src_p / name).is_file())

    for i, filename in enumerate(filenames):
        src = src_p / filename
        # NOTE(review): the '.jpg' suffix is applied regardless of the source
        # file's own extension (e.g. '.png'); kept as-is so existing consumers
        # of the generated names keep working -- confirm this is intended.
        dst = dst_p / f'{letter}-{i}-t.jpg'
        print(f'src: {src} ---> dst: {dst}')
        copy2(src, dst)

In [7]:
def get_subfolders(root_dir):
    '''Return the immediate sub-directories of root_dir as a sorted list of Path objects.

    Entries that are not directories are ignored. The result is sorted
    because os.scandir yields entries in arbitrary order, and callers that
    compare or pair listings need a reproducible sequence.
    '''
    # The context manager closes the scandir handle as soon as the listing is read.
    with os.scandir(root_dir) as entries:
        return sorted(Path(entry.path) for entry in entries if entry.is_dir())

In [8]:
# Send each source subfolder to the destination folder of the same name.
# Zipping two independent directory listings would pair folders by position,
# which silently mismatches them whenever the listings differ in order or content.
for src_sf in get_subfolders(root_src_dir):
    copy_files(src_sf, root_dst_dir / src_sf.name)

src: ..\data-first-test\alef-1\1.png ---> dst: ..\data-dev-test\alef-1\alef-0-t.jpg
src: ..\data-first-test\alef-1\2.png ---> dst: ..\data-dev-test\alef-1\alef-1-t.jpg
src: ..\data-first-test\alef-1\3.png ---> dst: ..\data-dev-test\alef-1\alef-2-t.jpg
src: ..\data-first-test\alef-1\4.png ---> dst: ..\data-dev-test\alef-1\alef-3-t.jpg
src: ..\data-first-test\alef-1\alef-0-t.jpg ---> dst: ..\data-dev-test\alef-1\alef-4-t.jpg
src: ..\data-first-test\alef-1\alef-1-t.jpg ---> dst: ..\data-dev-test\alef-1\alef-5-t.jpg
src: ..\data-first-test\alef-1\alef-2-t.jpg ---> dst: ..\data-dev-test\alef-1\alef-6-t.jpg
src: ..\data-first-test\alef-1\alef-3-t.jpg ---> dst: ..\data-dev-test\alef-1\alef-7-t.jpg
src: ..\data-first-test\bet-2\1.png ---> dst: ..\data-dev-test\bet-2\bet-0-t.jpg
src: ..\data-first-test\bet-2\2.png ---> dst: ..\data-dev-test\bet-2\bet-1-t.jpg
src: ..\data-first-test\bet-2\3.png ---> dst: ..\data-dev-test\bet-2\bet-2-t.jpg
src: ..\data-first-test\bet-2\4.png ---> dst: ..\data-dev