# MODEL.PY

In [25]:
import yaml
from transformers import T5Config, T5Tokenizer, T5ForConditionalGeneration
import os

# Import Configuration
def load_model_config(config_path="/Users/frederikreimert/Library/CloudStorage/OneDrive-DanmarksTekniskeUniversitet/Kandidat_DTU/2024E/MLops/project_folder/ml_ops_project/configs/model/model_config.yaml"):
    with open(config_path, "r") as file:
        config_dict = yaml.safe_load(file)
    return T5Config(**config_dict)

# Load model
def initialize_model(config):
    model = T5ForConditionalGeneration(config)
    model.from_pretrained("google-t5/t5-small")
    return model

model = initialize_model(load_model_config())

# TRAIN.PY

In [11]:
import yaml
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from ml_ops_project.model import load_model_config, initialize_model
from ml_ops_project.data import Tokenize_data, OpusDataset
from torch.utils.data import Dataset
from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import DataCollatorForSeq2Seq


In [23]:
def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = Tokenize_data.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, Tokenize_data.pad_token_id)
    decoded_labels = Tokenize_data.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != Tokenize_data.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [9]:
# Load Data
train_data = OpusDataset("/Users/frederikreimert/Library/CloudStorage/OneDrive-DanmarksTekniskeUniversitet/Kandidat_DTU/2024E/MLops/project_folder/ml_ops_project/data/processed/train.txt")
test_data = OpusDataset("/Users/frederikreimert/Library/CloudStorage/OneDrive-DanmarksTekniskeUniversitet/Kandidat_DTU/2024E/MLops/project_folder/ml_ops_project/data/processed/test.txt")

train_dataloader = DataLoader(train_data, batch_size=1, shuffle=True)
test_dataloader = DataLoader(test_data, batch_size=1, shuffle=False)


In [26]:
data_collator = DataCollatorForSeq2Seq(tokenizer=Tokenize_data, model=model)

In [27]:
# Import Configuration
def load_training_config(config_path="/Users/frederikreimert/Library/CloudStorage/OneDrive-DanmarksTekniskeUniversitet/Kandidat_DTU/2024E/MLops/project_folder/ml_ops_project/configs/train/train_config.yaml"):
    with open(config_path, "r") as file:
        config_dict = yaml.safe_load(file)
    return Seq2SeqTrainingArguments(**config_dict)

training_args = load_training_config()

# Load Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataloader.dataset,
    eval_dataset=test_dataloader.dataset,
    processing_class=Tokenize_data,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)



ValueError: fp16 mixed precision requires a GPU (not 'mps').