In [13]:
import torch
import torchvision.transforms as transforms
import os
import spacy
from collections import Counter
from datasets import Dataset as HuggingFaceDataset
import pandas as pd
import pickle
import numpy as np

## Building the vocabulary

In [14]:
nlp = spacy.load("en_core_web_sm")

# Directory holding the HuggingFace ``.arrow`` shards of the training split.
# Built with os.path.join so the path is not tied to Windows separators.
data_path = os.path.join("data", "train")

# Upper bound on the number of distinct words kept in the vocabulary.
MAX_VOCAB_WORDS = 10000

# Indices reserved for the special tokens; regular words are numbered after them.
SPECIAL_TOKENS = {"<UNK>": 0, "<SOS>": 1, "<EOS>": 2}

# Collect every ``.arrow`` shard in the data directory. os.listdir returns
# entries in arbitrary order, so the list is sorted to keep the per-file
# processing order (and anything derived from it) reproducible between runs.
dataset_files = sorted(
    listed_file
    for listed_file in os.listdir(data_path)
    if listed_file.endswith(".arrow")
)

# Count how often each purely alphabetic token occurs across all prompts.
# Tokens containing digits or punctuation are skipped (token.is_alpha is False).
word_counts = Counter()
for file in dataset_files:
    ds = HuggingFaceDataset.from_file(os.path.join(data_path, file))
    for sentence in ds["prompt"]:
        word_counts.update(token.text for token in nlp(sentence) if token.is_alpha)

# Keep only the most frequent words; everything else is treated as unknown later.
vocab = [word for word, _ in word_counts.most_common(MAX_VOCAB_WORDS)]
vocab_size = len(vocab) + len(SPECIAL_TOKENS)  # words plus <UNK>, <SOS> and <EOS>

# Shift word indices past the reserved slots, then register the special tokens.
word2index = {word: i + len(SPECIAL_TOKENS) for i, word in enumerate(vocab)}
word2index.update(SPECIAL_TOKENS)

## Prompt Tokenization & Image Resizing

In [15]:
def tokenize(text):
    """Run ``text`` through the spaCy pipeline ``nlp`` and return its token strings.

    Args:
        text: The string to split into tokens.

    Returns:
        A list with the ``.text`` of every token, in document order.
    """
    token_texts = []
    for tok in nlp(text):
        token_texts.append(tok.text)
    return token_texts

def tokens_to_indices(tokens):
    """Convert token strings to vocabulary indices framed by <SOS> and <EOS>.

    Args:
        tokens: Iterable of token strings.

    Returns:
        A list that starts with the <SOS> index, continues with one index per
        token, and ends with the <EOS> index. Tokens that are not keys of
        ``word2index`` are mapped to index 0.
    """
    indices = [word2index["<SOS>"]]
    for word in tokens:
        # Fall back to 0 for words that are not in the vocabulary.
        indices.append(word2index.get(word, 0))
    indices.append(word2index["<EOS>"])
    return indices

def tokenize_and_index_prompts(prompts):
    """Tokenize each prompt and replace its tokens with vocabulary indices.

    Args:
        prompts: Object exposing ``.apply`` whose result exposes ``.to_numpy``
            (presumably a pandas Series of prompt strings; verify against caller).

    Returns:
        The result of ``.to_numpy()`` holding one index list per prompt, as
        produced by ``tokenize`` followed by ``tokens_to_indices``.
    """
    token_lists = prompts.apply(tokenize)
    index_lists = token_lists.apply(tokens_to_indices)
    return index_lists.to_numpy()

In [16]:
def transform_images(images):
    """Resize every image to 512x512 and convert it to a Torch tensor.

    Args:
        images: Object exposing ``.apply`` whose result exposes ``.to_numpy``
            (presumably a pandas Series of PIL images; verify against caller).

    Returns:
        The result of ``.to_numpy()`` holding one transformed image per input.
    """
    # Two-stage pipeline: resize first, then convert to a tensor.
    resize_step = transforms.Resize((512,512))
    tensor_step = transforms.ToTensor()
    pipeline = transforms.Compose([resize_step, tensor_step])

    return images.apply(pipeline).to_numpy()

In [18]:
# Output directory for the per-sample ``.npy`` files.
PREPROCESSED_DIR = "preprocessed"
os.makedirs(PREPROCESSED_DIR, exist_ok=True)

# Running sample counter shared by all shards, so every saved image/prompt
# pair gets a unique index in its file name.
total_index = 0
for file in dataset_files:
    # os.path.join keeps the shard path valid on any operating system.
    ds = pd.DataFrame(HuggingFaceDataset.from_file(os.path.join(data_path, file)))

    transformed_images = transform_images(ds["image"])
    tokenized_prompts = tokenize_and_index_prompts(ds["prompt"])

    # Both arrays hold one entry per row of the shard; save them pairwise
    # under the same index so image_N.npy and prompt_N.npy belong together.
    for image, prompt in zip(transformed_images, tokenized_prompts):
        np.save(os.path.join(PREPROCESSED_DIR, f"image_{total_index}.npy"), image)
        np.save(os.path.join(PREPROCESSED_DIR, f"prompt_{total_index}.npy"), prompt)
        total_index += 1

In [19]:
# Persist the word -> index mapping so it can be reloaded without rebuilding
# the vocabulary.
serialized_vocab = pickle.dumps(word2index)
with open("word2index.pkl", "wb") as f:
    f.write(serialized_vocab)