In [1]:
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/customer-support/cleaned_customer_service_dataset_ready.jsonl


### **IMPORTS**

In [None]:
pip install evaluate

In [2]:
# Preprocessing
import re
import string
import shutil
from sklearn.utils import resample
from collections import Counter
from IPython.display import FileLink

# Normalisation
import os
import json
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
# import nltk
# from nltk.corpus import stopwords
# from nltk.stem.wordnet import WordNetLemmatizer

# Processing
import torch
from datasets import Dataset
from transformers import Trainer
from transformers import pipeline
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM
from transformers import TrainingArguments
from transformers import AutoTokenizer, DataCollatorForLanguageModeling

# Evaluation
import math
import random
import evaluate

In [3]:
# # Df preprocessed v1
# file_path = '/kaggle/input/customer-support/cleaned_customer_service_dataset_ready.jsonl'
# data = pd.read_json(file_path, lines=True)

# Données brutes (hugging face)
df = pd.read_csv("hf://datasets/bitext/Bitext-customer-support-llm-chatbot-training-dataset/Bitext_Sample_Customer_Support_Training_Dataset_27K_responses-v11.csv")

# pandas_df = pd.DataFrame(df)

In [4]:
# Nombre d'exemples par catégorie
category_counts = Counter(df['category'])
print(category_counts)

Counter({'ACCOUNT': 5986, 'ORDER': 3988, 'REFUND': 2992, 'INVOICE': 1999, 'CONTACT': 1999, 'PAYMENT': 1998, 'FEEDBACK': 1997, 'DELIVERY': 1994, 'SHIPPING': 1970, 'SUBSCRIPTION': 999, 'CANCEL': 950})


In [5]:
df.columns

Index(['flags', 'instruction', 'category', 'intent', 'response'], dtype='object')

### **STRATIFIED SAMPLING**
> Echantillonnage strat des données

In [6]:
# Nombre total d'exemples souhaités
total_samples = 2000

# Proportion d'exemples par catégorie
category_counts = df['category'].value_counts()
category_proportions = (category_counts / category_counts.sum())
print(f"proportion pour chaque catégorie : {category_proportions.values}")

# Nombre d'exemples pour chaque catégorie
samples_per_category = (category_proportions * total_samples).astype(int)

# Échantillonnage
balanced_df = pd.DataFrame()
for category, n_samples in samples_per_category.items():
    category_data = df[df['category'] == category]
    sampled_data = category_data.sample(n=min(n_samples, len(category_data)), random_state=42)
    balanced_df = pd.concat([balanced_df, sampled_data])

balanced_df = shuffle(balanced_df, random_state=42)
print(balanced_df['category'].value_counts())

proportion pour chaque catégorie : [0.22275975 0.14840726 0.11134266 0.0743897  0.0743897  0.07435249
 0.07431527 0.07420363 0.07331051 0.03717624 0.03535278]
category
ACCOUNT         445
ORDER           296
REFUND          222
DELIVERY        148
FEEDBACK        148
INVOICE         148
PAYMENT         148
CONTACT         148
SHIPPING        146
SUBSCRIPTION     74
CANCEL           70
Name: count, dtype: int64


In [7]:
# Sauvegarder les données réduites
balanced_df.to_json('reduced_dataset.jsonl', orient='records', lines=True)

In [8]:
balanced_df.query("category == 'ORDER'")

Unnamed: 0,flags,instruction,category,intent,response
70,BZ,I need assistancde cancelling order {{Order Nu...,ORDER,cancel_order,I've noticed that you require assistance with ...
25107,BKL,checking purchase {{Order Number}} current status,ORDER,track_order,Thanks for getting in touch! I grasp that you ...
1731,BL,I need to correct purchase {{Order Number}},ORDER,change_order,Thank you for informing us about the need to c...
1001,BCIQ,need to change order {{Order Number}} how to d...,ORDER,change_order,We've received your message to us for assistan...
25130,BIQ,where do i check the status of order {{Order N...,ORDER,track_order,I hear you! I'm following you that you would l...
...,...,...,...,...,...
1552,BLMQ,i need help to delete some products from order...,ORDER,change_order,We understand that you would like assistance i...
25197,BK,check status of order,ORDER,track_order,Thank you for reaching out! I'm here to help y...
1798,BILMQ,how do i remove several articles from order {{...,ORDER,change_order,We're here for you to us for assistance with r...
1003,BK,remove item from order {{Order Number}},ORDER,change_order,I've decoded that you need assistance with rem...


In [9]:
balanced_df.shape

(1993, 5)

### **PREPROCESSING**

In [10]:
def clean_text(text):
    text = text.strip()  # Supprime les espaces superflus
    text = re.sub(r'\s+', ' ', text)  # Remplace plusieurs espaces par un seul
    return text

In [11]:
# Drop NaN
balanced_df = balanced_df.dropna(subset=["instruction", "response"])

# Clean
balanced_df["instruction"] = balanced_df["instruction"].apply(clean_text)
balanced_df["response"] = balanced_df["response"].apply(clean_text)
balanced_df["text"] = balanced_df["category"] + ": " + balanced_df["instruction"] + " " + balanced_df["response"]  # structure les données

In [55]:
# balanced_df['text'].head(1)

25957    REFUND: I'm waiting for a reimbursement of 120...
Name: text, dtype: object

In [56]:
balanced_df

Unnamed: 0,flags,instruction,category,intent,response,text
25957,BLQZ,I'm waiting for a reimbursement of 1200dollars,REFUND,track_refund,I'm fully aware of how important it is for you...,REFUND: I'm waiting for a reimbursement of 120...
12854,BCILMQ,wanna see what shipment methods i can choose h...,DELIVERY,delivery_options,Of course! I'm thrilled to assist you in explo...,DELIVERY: wanna see what shipment methods i ca...
20492,BILQ,what do i have to do to retrieve my account pin,ACCOUNT,recover_password,I'm on your side your concern about retrieving...,ACCOUNT: what do i have to do to retrieve my a...
12385,BCILP,"I have to see the shipping options, could I ge...",DELIVERY,delivery_options,I'll take care of it! I'm here to guide you th...,"DELIVERY: I have to see the shipping options, ..."
26829,BLQW,I'm waiting for a goddamn compensation of {{Re...,REFUND,track_refund,I can relate to the understanding that you are...,REFUND: I'm waiting for a goddamn compensation...
...,...,...,...,...,...,...
9426,BLZ,I don't know how to speak with a humnan agent,CONTACT,contact_human_agent,Thank you for trusting us! I'm fully aware of ...,CONTACT: I don't know how to speak with a humn...
18153,BLMZ,help to solve an error withonline payments,PAYMENT,payment_issue,I appreciate you reaching out for assistance i...,PAYMENT: help to solve an error withonline pay...
26107,BLQ,i expect a reimbursement of {{Currency Symbol}...,REFUND,track_refund,We understand the importance of your expectati...,REFUND: i expect a reimbursement of {{Currency...
7800,BL,I call to lodge a customer complaint against y...,FEEDBACK,complaint,I'm sorry to hear that you're calling to lodge...,FEEDBACK: I call to lodge a customer complaint...


### **TRAIN - TEST SPLIT**

In [12]:
# Diviser en ensembles train/val/test
train_data, test_data = train_test_split(balanced_df, test_size=0.2, stratify=balanced_df["category"], random_state=42)
train_data, val_data = train_test_split(train_data, test_size=0.1, stratify=train_data["category"], random_state=42)

### **TOKENIZATION & ADJUSTMENT**

In [13]:
# Tokenizer
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-125M")  

# tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Utilise le token de fin de séquence (eos_token) comme token de padding
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/357 [00:00<?, ?B/s]

**V1**

In [14]:
# def preprocess_data(examples):
#     input_texts = examples["instruction"]
#     output_texts = examples["response"]
#     combined_texts = [f"{input_text}{output_text}" for input_text, output_text in zip(input_texts, output_texts)]

#     # Tokenisation
#     tokenized = tokenizer(
#         combined_texts,
#         truncation=True,
#         padding="max_length",
#         max_length=256,      # a ajuster 
#         return_tensors="pt"
#     )
#     return tokenized

In [15]:
# # Conversion en Dataset Hugging face
# dataset = Dataset.from_pandas(balanced_df)

# # Prétraiter le dataset
# tokenized_dataset = dataset.map(preprocess_data, batched=True)

In [16]:
# tokenized_dataset[0]

**V2**

In [17]:
# Tokenisation
train_encodings = tokenizer(list(train_data["text"]), truncation=True, padding=True, max_length=256)
val_encodings = tokenizer(list(val_data["text"]), truncation=True, padding=True, max_length=256)

In [18]:
# train_encodings

In [19]:
# DataCollator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

### **PROCESSING**

In [20]:
model_name = "EleutherAI/gpt-neo-125M"  # Modèle plus petit
model = AutoModelForCausalLM.from_pretrained(model_name)

# model = AutoModelForCausalLM.from_pretrained("gpt2") 

config.json:   0%|          | 0.00/1.01k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/526M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

In [21]:
# Utilisation du GPU pour accélérer l'entrainement
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

GPTNeoForCausalLM(
  (transformer): GPTNeoModel(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(2048, 768)
    (drop): Dropout(p=0.0, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPTNeoBlock(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPTNeoAttention(
          (attention): GPTNeoSelfAttention(
            (attn_dropout): Dropout(p=0.0, inplace=False)
            (resid_dropout): Dropout(p=0.0, inplace=False)
            (k_proj): Linear(in_features=768, out_features=768, bias=False)
            (v_proj): Linear(in_features=768, out_features=768, bias=False)
            (q_proj): Linear(in_features=768, out_features=768, bias=False)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPTNeoMLP(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (c_proj): Linear(in_fe

In [22]:
# Transforme les données en Dataset Hugging Face
train_dataset = Dataset.from_dict({
    "input_ids": train_encodings["input_ids"],
    "attention_mask": train_encodings["attention_mask"],
    "labels": train_encodings["input_ids"]  # Les labels sont les mêmes que les input_ids pour causal LM
})

val_dataset = Dataset.from_dict({
    "input_ids": val_encodings["input_ids"],
    "attention_mask": val_encodings["attention_mask"],
    "labels": val_encodings["input_ids"]
})

In [23]:
# Hyperparams de l'entrainement
training_args = TrainingArguments(
    output_dir="./gpt_neo_ft_2",  # Sauvegarde du modèle
    evaluation_strategy="no",  # Évaluation à la fin de chaque epoch / sinon "no"
    learning_rate=5e-5,           
    per_device_train_batch_size=4,  # Taille des batches d'entraînement
    per_device_eval_batch_size=4,   # Taille des batches d'évaluation
    num_train_epochs=3,          
    weight_decay=0.01,           # Régularisation L2
    save_strategy="no",          # Sauvegarde du modèle à chaque epoch
    logging_dir="./logs",        # Dossier pour les logs
    logging_steps=10,            # Intervalle de logging
    # save_total_limit=2,          # Sauvegarder seulement les 2 derniers checkpoints
    # load_best_model_at_end=True, # Charger le meilleur modèle basé sur la métrique d'évaluation
    fp16=True,                   # Utilisation de la précision mixte pour accélérer l'entraînement sur GPU
)




In [24]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,  # pour gérer le padding
    tokenizer=tokenizer,         
)

  trainer = Trainer(


In [25]:
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch,Training Loss,Validation Loss
1,1.2894,1.27085
2,1.0771,1.159278
3,1.0212,1.136602


TrainOutput(global_step=540, training_loss=1.2514916208055284, metrics={'train_runtime': 251.9816, 'train_samples_per_second': 17.073, 'train_steps_per_second': 2.143, 'total_flos': 561856466386944.0, 'train_loss': 1.2514916208055284, 'epoch': 3.0})

### **EVALUATION**

In [26]:
eval_results = trainer.evaluate()
print(f"Evaluation Results: {eval_results}")

Evaluation Results: {'eval_loss': 1.1366021633148193, 'eval_runtime': 3.3593, 'eval_samples_per_second': 47.629, 'eval_steps_per_second': 5.954, 'epoch': 3.0}


In [27]:
perplexity = math.exp(eval_results["eval_loss"])
print(f"Perplexity: {perplexity}")

Perplexity: 3.1161621462331133


### **SAVE THE MODEL**

In [28]:
trainer.save_model("./gpt_neo_ft_2")
tokenizer.save_pretrained("./gpt_neo_ft_2")

('./gpt_neo_ft_2/tokenizer_config.json',
 './gpt_neo_ft_2/special_tokens_map.json',
 './gpt_neo_ft_2/vocab.json',
 './gpt_neo_ft_2/merges.txt',
 './gpt_neo_ft_2/added_tokens.json',
 './gpt_neo_ft_2/tokenizer.json')

In [32]:
%cd /kaggle/working

/kaggle/working


In [34]:
# Crée un fichier zip contenant tous les fichiers du dossier fine_tuned_model
output_dir = "./gpt_neo_ft_2"

shutil.make_archive("gpt_neo_ft_2", 'zip', output_dir)

FileLink(r'gpt_neo_ft_2.zip')

In [39]:
# Libère la mémoire GPU inutilisée
torch.cuda.empty_cache()

### **TEST**

**V1**

In [35]:
# Charge le modèle fine-tuné
generator = pipeline("text-generation", model="./gpt_neo_ft_2", tokenizer=tokenizer)

# Générer une réponse
prompt = "I need assistance with my order."
response = generator(prompt, max_length=50, num_return_sequences=1)
print(response[0]["generated_text"])

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


I need assistance with my order. I'm here to assist you with the necessary steps to make your order from {{Order Number}}. To get started, could you please provide me with the specific details of your order? This will allow me to locate


**V2**

In [64]:
def chatbot_response(question, model, tokenizer, device):
    """
    Génère une réponse à une question en utilisant le modèle fine-tuné.
    
    Args:
        question (str): La question à poser au chatbot.
        model: Le modèle fine-tuné.
        tokenizer: Le tokenizer utilisé avec le modèle.
        device: L'appareil utilisé pour l'inférence (GPU ou CPU).

    Returns:
        str: La réponse générée par le modèle.
    """
    input_text = f"question: {question}"
    input_ids = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=256).input_ids.to(device)

    # Génère la réponse avec le modèle
    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_ids,
            max_new_tokens=100,     # Nombre maximum de tokens générés
            # max_length=100,         # Longueur maximale de la réponse
            num_beams=7,            # Beam search pour générer des réponses de meilleure qualité
            early_stopping=True,    # Arrêter si toutes les séquences sont complètes
            do_sample=True,         # Ajouter de la diversité dans la génération
            temperature=0.7,        # Contrôle de la probabilité pour la génération
            top_k=50,               # Limite des prédictions au top-k tokens
            top_p=0.9,              # Nucleus sampling (p-probability mass)
            repetition_penalty=1.2  # Réduction des répétitions dans la génération
        )

    # Décode les tokens en texte
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

In [47]:
# Exemple de question pour le chatbot
question = "How to change delivery address?"
response = chatbot_response(question, model, tokenizer, device)
print("Chatbot response:", response)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Chatbot response: question: How to change delivery address? Thank you for reaching out! I'm here to guide you through the process of changing your delivery address. To change your delivery address, you can follow these steps: 1. Log in to your account on our website. 2. Navigate to the 'My Account' or 'Profile' section. 3. Look for the 'Shipping Addresses' or similar option. 4. Click on the 'Add a New Address' or 'Edit Shipping Addresses'


### **VISUALIZE METRICS**

In [None]:
pip install rouge_score

In [None]:
pip install bert_score

In [62]:
def generate_comparison_responses(eval_dataset, model, tokenizer, device, num_samples=5):
    """
    Génère des réponses pour des échantillons aléatoires et compare avec les réponses réelles.

    Args:
        eval_dataset (Dataset): Dataset contenant les questions et réponses réelles.
        model: Le modèle fine-tuné.
        tokenizer: Le tokenizer utilisé avec le modèle.
        device: L'appareil utilisé pour l'inférence (GPU ou CPU).
        num_samples (int): Nombre d'échantillons à évaluer.

    Returns:
        pd.DataFrame: Un tableau comparant les réponses réelles et prédictions.
    """
    random_indices = random.sample(range(len(eval_dataset)), num_samples)
    results = []

    bleu_metric = evaluate.load("bleu")
    rouge_metric = evaluate.load("rouge")
    bertscore_metric = evaluate.load("bertscore")

    for idx in random_indices:
        sample = eval_dataset[idx]
        
        # Décoder la question (input_ids) et la réponse réelle (labels)
        question = tokenizer.decode(sample["input_ids"], skip_special_tokens=True)
        true_response = tokenizer.decode(sample["labels"], skip_special_tokens=True)
        
        # Générer la réponse prédite
        predicted_response = chatbot_response(question, model, tokenizer, device)

        # Calcul des métriques
        decoded_preds = [predicted_response.strip()]
        decoded_labels = [[true_response.strip()]]

        bleu = bleu_metric.compute(predictions=decoded_preds, references=decoded_labels)["bleu"]

        rouge = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels)

        bertscore = bertscore_metric.compute(predictions=decoded_preds, references=decoded_labels, lang="en")["f1"]

        results.append({
            "question": question,
            "true_response": true_response,
            "predicted_response": predicted_response,
            "bleu": bleu,
            "rouge": rouge,
            "bertscore": bertscore[0]
        })

    df = pd.DataFrame(results)
    
    return df

In [65]:
# Génère un rapport comparatif pour des échantillons aléatoires
num_samples = 5  
df_res = generate_comparison_responses(val_dataset, model, tokenizer, device, num_samples=num_samples)

# Afficher les résultats
pd.set_option('display.max_colwidth', None)
print(df_res.head(num_samples))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eo

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      question  \
0                                                                                                                                                                                                                     