# Create project data structure

In [2]:
# For dealing with files we use the `Path` class from the built-in Python module `pathlib`.
# It provides a nice abstraction of the file system, compared to working with strings only.
# It also makes your code more portable, i.e. easier to share with someone using another operating system.
from pathlib import Path
# Some file system operations are not covered by `Path`, so we use `shutil` for those
import shutil

# Regular expressions are used to find patterns in strings
import re

# For splitting the data
from sklearn.model_selection import train_test_split



In [3]:
# Directory holding the Flickr8k photos, relative to the current working directory
path_Flickr_jpg = "./flickr8k/Images"

image_all = Path.cwd() / path_Flickr_jpg

# Sort the listing: the order in which a directory is iterated depends on the
# OS / file system, and the seeded train/validation split later in the notebook
# is only reproducible when its input order is fixed.
all_image_filenames = sorted(image_all.glob("*.jpg"))

# Fail with a clear message instead of an IndexError on the print below
if not all_image_filenames:
    raise FileNotFoundError(f"No .jpg files found in {image_all}")

print(len(all_image_filenames))
print(all_image_filenames[0])

8091
C:\Users\Wolfr\deep-machine-learning\project\flickr8k\Images\1000268201_693b08cb0e.jpg


In [8]:
# Fraction of the images that is set aside for validation
split_ratio_dataset = 0.2

# A fixed seed keeps the shuffle repeatable for a given input order
image_train, image_val = train_test_split(
    all_image_filenames,
    test_size=split_ratio_dataset,
    random_state=2,
)

print("The dataset will be comprised of:")
print(f"Training:\t{len(image_train)} \nValidation:\t{len(image_val)}")

The dataset will be comprised of:
Training:	6472 
Validation:	1619


In [10]:
# Map each split's target directory to the list of images that belong in it
subdirectories = {
    "./image_train": image_train,
    "./image_val": image_val,
}

# Make sure every target directory exists (no error if it is already there)
for subdirectory in map(Path, subdirectories):
    subdirectory.mkdir(parents=True, exist_ok=True)
    
def fill_sub_dir(sub_dir, file_subset):
    """Copy every file in `file_subset` into the directory `sub_dir`.

    A more efficient solution would be to use "symbolic links"
    (see https://kb.iu.edu/d/abbe), but for simplicity full copies are made instead.

    Parameters
    ----------
    sub_dir : str or Path
        Destination directory. A relative path is resolved against the current
        working directory; the directory is created if it does not exist yet.
    file_subset : iterable of str or Path
        Source files. Each one is copied under its own file name, so a file of
        the same name already present in `sub_dir` is replaced.

    Returns
    -------
    None
    """
    target_dir = Path.cwd() / sub_dir
    # Create the destination up front so the function also works on its own,
    # without relying on an earlier cell having made the directory.
    target_dir.mkdir(parents=True, exist_ok=True)

    for file in file_subset:
        file = Path(file)  # accept plain string paths as well as Path objects
        shutil.copyfile(file, target_dir / file.name)

# Populate each split directory with the images assigned to it
for sub_dir in subdirectories:
    file_subset = subdirectories[sub_dir]
    fill_sub_dir(sub_dir, file_subset)

Now, check that the images have been split:

In [34]:
# NOTE: from here on `image_train` / `image_val` refer to the split directories
# on disk, no longer to the lists of source files produced by the split.
train_path = "./image_train"
val_path = "./image_val"
image_train = Path.cwd() / train_path
image_val = Path.cwd() / val_path

# Number of images copied into the training directory
image_filenames = [*image_train.glob("*.jpg")]
print(len(image_filenames))

# Number of images copied into the validation directory
image_filenames = [*image_val.glob("*.jpg")]
print(len(image_filenames))

6472
1619


Now split the captions text file

In [36]:
# train_path = "./image_train"
# image_train = Path.cwd() / train_path
# val_path = "./image_val"
# image_val = Path.cwd() / val_path

# import os
# list_train = os.listdir(image_train)
# print(list_train[0])
# print(len(list_train))
# list_val = os.listdir(image_val)
# print(list_val[0])
# print(len(list_val))

1000268201_693b08cb0e.jpg
6472
1003163366_44323f5815.jpg
1619


In [37]:
# # Path to caption file
# import pandas as pd
# caption_file = './flickr8k/captions.txt'
# dataset = pd.read_csv(caption_file)