# Fine-Tuning a Hugging Face Model

# Preprocessing Data

In [3]:
from datasets import Dataset, DatasetDict, load_from_disk
from transformers import T5Tokenizer
import json

# Loading Data
# Each non-blank line of the JSONL file is parsed as a JSON object with
# "input" and "output" fields. The input is prefixed with the task
# instruction so the model sees the same prompt format for every example.
data = []
# Read as UTF-8 explicitly: relying on the platform default encoding (for
# example cp1252 on Windows) can fail on or garble non-ASCII characters such
# as emoji. JSON text is UTF-8 by specification.
with open("../Dataset/genz_to_corp_dataset.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            # Skip blank lines (typically a trailing newline at the end of
            # the file), which json.loads would reject.
            continue
        entry = json.loads(line)
        data.append({
            "input_text": f"translate informal to formal: {entry['input']}",
            "target_text": entry['output']
        })

# Converting to Hugging Face Dataset
raw_dataset = Dataset.from_list(data)

# 80/10/10 split: hold out 20% of the data, then halve the held-out part
# into validation and test. The fixed seed keeps the split reproducible.
split_dataset = raw_dataset.train_test_split(test_size=0.2, seed=42)
test_valid = split_dataset["test"].train_test_split(test_size=0.5, seed=42)

dataset = DatasetDict({
    "train": split_dataset["train"],
    "validation": test_valid["train"],
    "test": test_valid["test"]
})

# Tokenization
tokenizer = T5Tokenizer.from_pretrained("t5-small")
# Token limits used by preprocess_function for truncation and padding.
max_input_length = 128
max_target_length = 128

def preprocess_function(examples):
    """Tokenize a batch of examples into model inputs and loss-ready labels.

    Args:
        examples: A batch mapping with "input_text" and "target_text"
            entries, each a list of strings (the shape supplied by
            ``Dataset.map(..., batched=True)``).

    Returns:
        The tokenizer output for the inputs, with an added "labels" entry
        holding the target token ids in which every padding position has
        been replaced by -100.
    """
    inputs = tokenizer(
        examples["input_text"],
        max_length=max_input_length,
        padding="max_length",
        truncation=True
    )
    targets = tokenizer(
        examples["target_text"],
        max_length=max_target_length,
        padding="max_length",
        truncation=True
    )
    # Targets are padded to a fixed length. If the pad ids were kept as
    # labels, the training loss would also be computed on padding, rewarding
    # the model for predicting pad tokens. -100 is the label value that the
    # cross-entropy loss in Hugging Face seq2seq models ignores.
    pad_id = tokenizer.pad_token_id
    inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in sequence]
        for sequence in targets["input_ids"]
    ]
    return inputs

# Run preprocess_function over every split in batches, then persist the
# tokenized result so later cells can reload it.
tokenized_datasets = dataset.map(preprocess_function, batched=True)

output_path = "../Dataset/CleanedData/genz_to_corp_dataset_tokenized"
tokenized_datasets.save_to_disk(output_path)
print(f"Tokenized dataset saved to {output_path}")

Map: 100%|██████████| 120/120 [00:00<00:00, 4000.00 examples/s]
Map: 100%|██████████| 15/15 [00:00<00:00, 2143.23 examples/s]
Map: 100%|██████████| 15/15 [00:00<00:00, 2143.01 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 120/120 [00:00<00:00, 30005.75 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 15/15 [00:00<00:00, 4999.57 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 15/15 [00:00<00:00, 5000.76 examples/s]

Tokenized dataset saved to ../Dataset/CleanedData/genz_to_corp_dataset_tokenized





In [5]:
from datasets import load_from_disk
# Reload the tokenized splits from disk and report how many examples each holds.
tokenized_datasets = load_from_disk("../Dataset/CleanedData/genz_to_corp_dataset_tokenized")

train, validation, test = (
    tokenized_datasets[split_name]
    for split_name in ("train", "validation", "test")
)

print(
    f"Train size: {len(train)}",
    f"Validation size: {len(validation)}",
    f"Test size: {len(test)}",
    sep="\n",
)

Train size: 120
Validation size: 15
Test size: 15


# Validating Model

# Testing Model

# Preprocessing