In [1]:
import string
import re

from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments
from datasets import Dataset
import pandas as pd

import torch
import mlflow.pytorch


In [2]:
# Read the raw training and evaluation corpora into lists of lines.
# Iterating a text file handle yields its lines with the trailing newline
# kept, which is the same result as calling readlines().
with open('./1_train_str.txt', 'r', encoding='utf-8') as train_handle:
    train_data = list(train_handle)

with open('./1_eval_str.txt', 'r', encoding='utf-8') as eval_handle:
    eval_data = list(eval_handle)

In [3]:
def remove_punctuation(doc: str):
    """Replace punctuation and line/tab characters in ``doc`` with single spaces.

    Every character of ``string.punctuation`` except the pipe ``|``, plus
    newline, carriage return and tab, is mapped to a space. Runs of spaces
    are then collapsed into one. Leading and trailing spaces are not stripped.

    Args:
        doc: The text to clean.

    Returns:
        The cleaned text.
    """
    # The pipe is deliberately kept; everything else listed here becomes a space.
    chars_to_blank = ''.join(ch for ch in string.punctuation if ch != '|') + '\n\r\t'
    blank_table = {ord(ch): ' ' for ch in chars_to_blank}
    spaced = doc.translate(blank_table)
    return re.sub(' +', ' ', spaced)

In [4]:
# Apply the punctuation cleanup to every line of both corpora.
clean_train_data = list(map(remove_punctuation, train_data))
clean_eval_data = list(map(remove_punctuation, eval_data))

In [5]:
# Put each cleaned corpus into a one-column DataFrame; the column is named
# 'text' because the tokenization step reads examples["text"].
text_columns = ['text']
train_df = pd.DataFrame(data=clean_train_data, columns=text_columns)
eval_df = pd.DataFrame(data=clean_eval_data, columns=text_columns)

In [6]:
# Convert both DataFrames into `datasets.Dataset` objects (train first, then eval).
train_dataset, eval_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, eval_df)
)

In [7]:
# Checkpoint used for both the tokenizer and the language model.
model_name = 'gpt2'

tokenizer = GPT2Tokenizer.from_pretrained(model_name)
# Reuse the end-of-sequence token as the padding token so that the
# padding="max_length" tokenization below has a token to pad with.
tokenizer.pad_token = tokenizer.eos_token

model = GPT2LMHeadModel.from_pretrained(model_name)

In [8]:
def tokenize_function(examples, max_length=50):
    """Tokenize the ``"text"`` field and build causal-LM labels.

    Sequences are padded/truncated to ``max_length`` tokens. Labels mirror
    ``input_ids`` except at padding positions (``attention_mask == 0``),
    where they are set to -100 so the loss skips them. This matters here
    because the pad token is the EOS token: without masking, the model is
    trained to predict long runs of EOS after every short example.

    Args:
        examples: A mapping with a ``"text"`` entry holding either a single
            string or a list of strings (the ``batched=True`` form).
        max_length: Fixed sequence length to pad/truncate to.

    Returns:
        The tokenizer output with an added ``"labels"`` entry.
    """
    # Label value treated as "ignore" by the loss of Hugging Face LM heads.
    ignore_index = -100

    def _mask_padding(ids, mask):
        # Keep real tokens as targets; blank out padded positions.
        return [tok if keep else ignore_index for tok, keep in zip(ids, mask)]

    tokens = tokenizer(examples["text"], padding="max_length", truncation=True, max_length=max_length)

    if isinstance(examples["text"], str):
        # Single (non-batched) example: input_ids is a flat list of ids.
        tokens["labels"] = _mask_padding(tokens["input_ids"], tokens["attention_mask"])
    else:
        # Batched examples: one list of ids per input text.
        tokens["labels"] = [
            _mask_padding(ids, mask)
            for ids, mask in zip(tokens["input_ids"], tokens["attention_mask"])
        ]
    return tokens

In [9]:
# Tokenize both splits in batches (train first, then eval).
# Note: this rebinds train_dataset / eval_dataset to their tokenized versions.
train_dataset, eval_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, eval_dataset)
)

Map:   0%|          | 0/11321 [00:00<?, ? examples/s]

Map:   0%|          | 0/2853 [00:00<?, ? examples/s]

In [13]:
# Training configuration, gathered in one place before being handed to
# TrainingArguments.
training_config = dict(
    output_dir="./results",          # where the Trainer writes its outputs
    evaluation_strategy="epoch",     # evaluate once per epoch
    per_device_train_batch_size=2,
    num_train_epochs=20,
    remove_unused_columns=False,     # leave the dataset columns as they are
)
training_args = TrainingArguments(**training_config)

# The Trainer fine-tunes `model` on the tokenized train split and evaluates
# on the tokenized eval split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)



In [14]:
# NOTE(review): `device` is not referenced anywhere in this cell; it is kept
# in case later cells rely on it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

with mlflow.start_run() as run:
    # Log the run parameters BEFORE training so they are recorded even when
    # training is interrupted or raises.
    mlflow.log_params({"model_name": model_name, "epochs": training_args.num_train_epochs})

    trainer.train()

    # Store the fine-tuned model as an MLflow artifact under "model".
    mlflow.pytorch.log_model(model, "model")

# Persist the fine-tuned weights locally. The Trainer was constructed without
# a tokenizer, so the tokenizer is saved explicitly next to the weights to
# make './model' reloadable on its own.
trainer.save_model('./model')
tokenizer.save_pretrained('./model')

AttributeError: partially initialized module 'torch._dynamo' has no attribute 'external_utils' (most likely due to a circular import)