### Splitting Flicker8k or Flicker30k dataset
- This is to split the text file from the normal Flicker8k and Flicker30k dataset

In [9]:
import pandas as pd

def convert_labels_to_dataframe(labels_path):

    image_names = []
    captions = []

    with open(labels_path, 'r') as file:
        for line in file:
            line_details = line.strip().split(' ', 1)  #split at the first space to handle captions with spaces
            if len(line_details) == 2:
                image_name = line_details[0].split('#') #remove #caption_num
                image_names.append(image_name[0].strip())
                captions.append(line_details[1].strip())
            else:
                print("Invalid line:", line)

    # Create a DataFrame
    data = pd.DataFrame({'image_filename': image_names, 'image_caption': captions})

    return data

caption_data = convert_labels_to_dataframe('../input/Flicker8k/Flickr8k.token.txt')
caption_data

Invalid line: 2428275562_4bde2bc5ea.jpg#0	A

Invalid line: 3640443200_b8066f37f6.jpg#0	a



Unnamed: 0,image_filename,image_caption
0,1000268201_693b08cb0e.jpg,child in a pink dress is climbing up a set of ...
1,1000268201_693b08cb0e.jpg,girl going into a wooden building .
2,1000268201_693b08cb0e.jpg,little girl climbing into a wooden playhouse .
3,1000268201_693b08cb0e.jpg,little girl climbing the stairs to her playhou...
4,1000268201_693b08cb0e.jpg,little girl in a pink dress going into a woode...
...,...,...
40453,997722733_0cb5439472.jpg,man in a pink shirt climbs a rock face
40454,997722733_0cb5439472.jpg,man is rock climbing high in the air .
40455,997722733_0cb5439472.jpg,person in a red shirt climbing up a rock face ...
40456,997722733_0cb5439472.jpg,rock climber in a red shirt .


In [30]:
import shutil
import os
from tqdm import tqdm


def create_flicker_dataset(image_txt_files, folder_type, captions_data):
    image_path = f'../input/Flicker8k/{folder_type}/Images/'
    label_path = f'../input/Flicker8k/{folder_type}/Labels/'

    source_folder = f'../input/Flicker8k/Flicker8k_Dataset/'

    if not os.path.exists(image_path):
        os.makedirs(image_path)

    if not os.path.exists(label_path):
        os.makedirs(label_path)

    captions_dataset = pd.DataFrame(columns=['image_filename','image_caption'])

    with open(image_txt_files, 'r') as file:
        image_names = file.readlines()

    for image_name in tqdm(image_names):
        image_name = image_name.strip()
        source_path = os.path.join(source_folder, image_name)
        destination_path = os.path.join(image_path, image_name)

        filtered_captions = captions_data[captions_data['image_filename'] == image_name]
        captions_dataset = pd.concat([captions_dataset, filtered_captions], ignore_index=True)

        try:
            shutil.move(source_path, destination_path)
        except FileNotFoundError:
            print(f"File '{image_name}' not found in '{source_folder}'")
        except shutil.Error as e:
            print(f"Error occurred while moving '{image_name}': {e}")

    captions_dataset.to_csv(f"{label_path}Label.csv")
    print(f"Completed, {len(image_names)} images moved and {len(captions_dataset)} captions moved")
    return captions_dataset

In [31]:
create_flicker_dataset('../input/Flicker8k/Flickr_8k.trainImages.txt', 'Train', caption_data)

  0%|          | 0/6000 [00:00<?, ?it/s]

100%|██████████| 6000/6000 [03:00<00:00, 33.30it/s]


Completed, 6000 images moved and 30000 captions moved


In [32]:
create_flicker_dataset('../input/Flicker8k/Flickr_8k.testImages.txt', 'Test', caption_data)

100%|██████████| 1000/1000 [00:14<00:00, 70.42it/s]

Completed, 1000 images moved and 5000 captions moved





Unnamed: 0,image_filename,image_caption
0,3385593926_d3e9c21170.jpg,dogs are in the snow in front of a fence .
1,3385593926_d3e9c21170.jpg,dogs play on the snow .
2,3385593926_d3e9c21170.jpg,brown dogs playfully fight in the snow .
3,3385593926_d3e9c21170.jpg,brown dogs wrestle in the snow .
4,3385593926_d3e9c21170.jpg,dogs playing in the snow .
...,...,...
4995,3490736665_38710f4b91.jpg,big dog stands on his hand leg as tennis balls...
4996,3490736665_38710f4b91.jpg,brown and white dog in front of a shed overwhe...
4997,3490736665_38710f4b91.jpg,brown and white dogs stands in front of a wood...
4998,3490736665_38710f4b91.jpg,dog jumps for several tennis balls thrown at h...


In [33]:
create_flicker_dataset('../input/Flicker8k/Flickr_8k.devImages.txt', 'Validation', caption_data)

100%|██████████| 1000/1000 [00:10<00:00, 93.45it/s]

Completed, 1000 images moved and 5000 captions moved





Unnamed: 0,image_filename,image_caption
0,2090545563_a4e66ec76b.jpg,boy laying face down on a skateboard is being ...
1,2090545563_a4e66ec76b.jpg,girls play on a skateboard in a courtyard .
2,2090545563_a4e66ec76b.jpg,people play on a long skateboard .
3,2090545563_a4e66ec76b.jpg,small children in red shirts playing on a skat...
4,2090545563_a4e66ec76b.jpg,young children on a skateboard going across a ...
...,...,...
4995,522652105_a89f1cf260.jpg,girl playing is a pile of colorful balls .
4996,522652105_a89f1cf260.jpg,little girl plays in a ball pit .
4997,522652105_a89f1cf260.jpg,little girl plays in a pit of colorful balls .
4998,522652105_a89f1cf260.jpg,small girl is playing in a ball pit


### Splitting Filtered Flicker30k dataset
- This is to split the csv file from our filtered Flicker30k dataset

In [59]:
import pandas as pd
import os
import math
import random
import shutil

In [60]:
def split_flicker_dataset(csv_filepath:str, image_paths:str, output_paths:list, train_val_test_split:list):
    if sum(train_val_test_split) != 1:
        raise Exception("Train Test Validation Split should sum up to 1")

    dataset = pd.read_csv(csv_filepath)
    dataset.drop(columns="Unnamed: 0", inplace=True) #drop unamed column

    total_data = list(range(len(dataset)))
    random.shuffle(total_data)

    train_size = math.floor(train_val_test_split[0] * len(total_data))
    val_size = math.floor(train_val_test_split[1]* len(total_data))

    train_data = total_data[:train_size]
    val_data = total_data[train_size:val_size+train_size]
    test_data = total_data[val_size+train_size:]
    print(f"""training dataset size: {len(train_data)}
validation dataset size: {len(val_data)}
test dataset size: {len(test_data)}""")
    
    full_dataset = [train_data, val_data, test_data]
    
    for idx, data in enumerate(full_dataset):
        dataframe = dataset.iloc[data]
        dataframe.rename(columns={'image_filename': 'image_filename', 'caption': 'image_caption'}, inplace=True)
        dataframe.drop(columns=['label', 'confidence'], inplace=True)
        
        if not os.path.exists(f"{output_paths[idx]}/Images"):
            os.makedirs(f"{output_paths[idx]}/Images")
        
        if not os.path.exists(f"{output_paths[idx]}/Labels"):
            os.makedirs(f"{output_paths[idx]}/Labels")

        for index, row in dataframe.iterrows():
            image_name = row['image_filename']
            image_path =  f"{image_paths}/{image_name}"
            dest_path = f"{output_paths[idx]}/Images/{image_name}"

            shutil.copy(image_path, dest_path)

        dataframe.to_csv(f"{output_paths[idx]}/Labels/Label.csv")
        print("Saved Dataset")
            

In [61]:
split_flicker_dataset(csv_filepath="../../input/FilteredFlicker/flick30k_filtered_result.csv",
                              image_paths="../../input/FilteredFlicker/flicker30k_output_images",
                              output_paths=["../../input/FilteredFlicker/Train","../../input/FilteredFlicker/Validation", "../../input/FilteredFlicker/Test"],
                              train_val_test_split=[0.8, 0.1, 0.1])


training dataset size: 9431
validation dataset size: 1178
test dataset size: 1180


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe.rename(columns={'image_filename': 'image_filename', 'caption': 'image_caption'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe.drop(columns=['label', 'confidence'], inplace=True)


Saved Dataset


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe.rename(columns={'image_filename': 'image_filename', 'caption': 'image_caption'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe.drop(columns=['label', 'confidence'], inplace=True)


Saved Dataset


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe.rename(columns={'image_filename': 'image_filename', 'caption': 'image_caption'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe.drop(columns=['label', 'confidence'], inplace=True)


Saved Dataset
