modules nécessaire pandas scikit-learn

In [25]:
import pandas as pd
from scipy.ndimage import label
from sklearn.model_selection import train_test_split

# Chargement des données

In [26]:
df = pd.read_csv('../data/faq_data.csv')
X = df['question']
y = df['answer']

# Nettoyage des colonnes 'question' et 'answer' en minuscules et suppression des ponctuations

In [27]:
import string
import re
def remove_punctuation(doc: str):
    punc = string.punctuation
    punc += '\n\r\t'
    return re.sub(' +', ' ', doc.translate(str.maketrans(punc, ' ' * len(punc))))
    

In [28]:
df = df.applymap(str.lower).applymap(remove_punctuation)
df

  df = df.applymap(str.lower).applymap(remove_punctuation)


Unnamed: 0,question,answer
0,what is mlops,mlops is a set of practices to streamline the ...
1,why is mlops important,mlops helps in scaling monitoring and maintain...
2,what is a data drift,data drift is a change in the statistical prop...
3,how does versioning work in mlops,versioning allows you to track and manage diff...
4,what is a model registry,a model registry is a centralized storage for ...
5,how can i monitor a model in production,by tracking metrics like accuracy latency and ...
6,what is the purpose of docker,docker helps in creating reproducible environm...
7,what are ci cd pipelines,ci cd pipelines automate the testing and deplo...
8,why use mlflow,mlflow tracks experiments manages models and f...
9,how does fastapi work with ml models,fastapi allows you to build an api for serving...


# Séparation en ensembles d'entraînement et de test

In [29]:
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

# Sauvegarde des versions de données
data/train_data.csv
data/test_data.csv

In [30]:
df_train.to_csv('../data/train_data.csv')
df_test.to_csv('../data/test_data.csv')

### Étape 2 : Entraînement du modèle 

Entraînez un modèle de langage de type LLM sur les données d’entraînement. Suivez les expérimentations avec MLflow pour enregistrer les hyperparamètres et les performances du modèle.

### Tâche 1 : Entraînement du modèle

Utilisez un modèle préentraîné (par exemple, GPT-2 de Hugging Face).
Entraînez le modèle sur l'ensemble d'entraînement, en suivant les hyperparamètres et les métriques de performance.
Tâche 2 : Suivi des expérimentations avec MLflow

Créez une nouvelle expérience dans MLflow et enregistrez les paramètres, métriques, et le modèle.

modules nécessaire mlflow transformers[torch] datasets

# Activation mlflow server avec backend-store-uri sqlite default-artifact-root file:/home/$USER/mlops/2024/mlruns ip et port à définir

In [31]:
import mlflow
import mlflow.pytorch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import pandas as pd
from datasets import Dataset
from sklearn.model_selection import train_test_split
import os

# Désactiver wandb

In [32]:
os.environ['WANDB_DISABLED'] = 'true'

# Charger les données d'entraînement et créer une colonne 'text'

In [33]:
df['text'] = [' '] * len(df)
df

Unnamed: 0,question,answer,text
0,what is mlops,mlops is a set of practices to streamline the ...,
1,why is mlops important,mlops helps in scaling monitoring and maintain...,
2,what is a data drift,data drift is a change in the statistical prop...,
3,how does versioning work in mlops,versioning allows you to track and manage diff...,
4,what is a model registry,a model registry is a centralized storage for ...,
5,how can i monitor a model in production,by tracking metrics like accuracy latency and ...,
6,what is the purpose of docker,docker helps in creating reproducible environm...,
7,what are ci cd pipelines,ci cd pipelines automate the testing and deplo...,
8,why use mlflow,mlflow tracks experiments manages models and f...,
9,how does fastapi work with ml models,fastapi allows you to build an api for serving...,


In [34]:
text_train = pd.read_csv('../data/train_data.csv')
text_train['text'] = text_train['question'] + text_train['answer']
text_train = text_train.drop(columns=['question', 'answer', 'Unnamed: 0'])

# Diviser les données en ensembles d'entraînement et de validation

In [35]:
text_train, text_val = train_test_split(text_train, train_size=0.8, random_state=42)
# X_train, y_train = text_train[:, 0], text_train[:, 1]
# X_val, y_val = text_val[:, 0], text_val[:, 1]

In [36]:
text_train

Unnamed: 0,text
0,how can i monitor a model in production by tra...
7,what is the purpose of docker docker helps in ...
2,what are ci cd pipelines ci cd pipelines autom...
4,how does fastapi work with ml models fastapi a...
3,what is a data drift data drift is a change in...
6,how does versioning work in mlops versioning a...


# Convertir en Dataset et supprimer les colonnes inutiles

In [37]:
train_dataset = Dataset.from_pandas(text_train).remove_columns(["__index_level_0__"])
eval_dataset = Dataset.from_pandas(text_val).remove_columns(["__index_level_0__"])

# Charger le modèle et le tokenizer, et définir un token de remplissage

In [38]:
model_name = 'gpt2'
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# Tokeniser le dataset avec les labels

In [39]:
# with mlflow.start_run():
#     tokenized = tokenizer(train_dataset['text'], padding=True, return_tensors='pt')
#     output = model(**tokenized, labels=tokenized['input_ids'])
def tokenize_function(examples):
    tokens = tokenizer(examples["text"], padding="max_length", truncation=True, max_length=50)
    tokens["labels"] = tokens["input_ids"].copy()  # Utiliser input_ids comme labels
    return tokens

train_dataset = train_dataset.map(tokenize_function, batched=True)
eval_dataset = eval_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/6 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

In [40]:
train_dataset = train_dataset.map(tokenize_function, batched=True)
eval_dataset = eval_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/6 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

# Définir les arguments d'entraînement

In [41]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    per_device_train_batch_size=2,
    num_train_epochs=1,
    remove_unused_columns=False
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


# Configurer MLflow pour pointer vers votre serveur local

In [42]:
mlflow.set_tracking_uri('../mlruns')
# mlflow_id = mlflow.create_experiment('GPT-2 Chatbot') # A ne pas create si existant deja, mlflow_id == 1
mlflow_id = 1
mlflow.set_experiment('GPT-2 Chatbot')

<Experiment: artifact_location='file:///D:/SchoolWork/M2S3/REDS/M2S3-REDS-MLOps/src/../mlruns/326467713629892773', creation_time=1732186388623, experiment_id='326467713629892773', last_update_time=1732186388623, lifecycle_stage='active', name='GPT-2 Chatbot', tags={}>

# Initialiser le Trainer

In [43]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset
)

# Entraîner le modèle et enregistrer avec MLflow

In [44]:
with mlflow.start_run() as run:
    trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,2.641448


  # Log des paramètres et du modèle dans MLflow

In [45]:
mlflow.log_params({"model_name": model_name, "epochs": training_args.num_train_epochs})
mlflow.pytorch.log_model(model, "model")



<mlflow.models.model.ModelInfo at 0x1a1be76c710>

In [46]:
mlflow.pytorch.save_model(model, './model/GPT-2')

