In [None]:
!pip install transformers datasets torch
!pip install -q accelerate -U
!pip install datasets

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

In [None]:
import pandas as pd
import numpy as np
import re
import os

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


***Chargement et préparation des données***

In [None]:
  from datasets import Dataset, DatasetDict

  file_path = 'BreastCancerChatbotDataset.csv'
  df = pd.read_csv(file_path)

  #Formatage des données
  df['text'] = "Question: " + df['Questions'] + " Réponse: " + df['Answers']
  df = df[['text']]
  df_sample = df.sample(frac=0.1, random_state=42)

  #Conversion en Dataset Hugging Face
  dataset = Dataset.from_pandas(df)
  train_test_split = dataset.train_test_split(test_size=0.2)

  data_train = train_test_split['train']
  data_test = train_test_split['test']
  dataset_dict = DatasetDict({'train': data_train, 'test': data_test})

  #Conversion pour inspection
  train_df = data_train.to_pandas()
  test_df = data_test.to_pandas()

  # Visualiser les premières lignes de l'ensemble d'entraînement
  print("Ensemble d'entraînement :")
  print(train_df.head())

  # Visualiser les premières lignes de l'ensemble de test
  print("Ensemble de test :")
  print(test_df.head())

Ensemble d'entraînement :
                                                text
0  Question: Which medications in the drug class ...
1  Question: What is the role of hormone therapy ...
2  Question: Which family history factors increas...
3  Question: What is the role of age in the etiol...
4  Question: Which physical findings suggest brea...
Ensemble de test :
                                                text
0  Question: Which chemotherapy agents are used i...
1  Question: Which medications in the drug class ...
2  Question: Which organizations have issued guid...
3  Question: What are the NCCN guidelines for axi...
4  Question: According to ASCO guidelines, which ...




---


On prépare un dataset pour le fine-tuning de GPT-2 en formatant les données d’un fichier CSV avec des questions et réponses. Les données sont converties en un format adapté ("Question: [...] Réponse: [...]"), divisées en ensembles d'entraînement et de test (80/20), puis organisées dans un DatasetDict de Hugging Face, prêtes pour l’entraînement.


---



In [None]:
with open('Q_A_train.txt', 'w') as file:
    for _, row in train_df.iterrows():
        text = row['text']
        file.write(f"{text}\n\n")
with open('Q_A_test.txt', 'w') as file:
    for row in data_test:
        text = row['text']
        file.write(f"{text}\n\n")



---

On prépare les données au format texte brut, directement utilisable pour l’entraînement ou la validation du modèle.


---



# **Fine-tuning du modèle GPT-2**


In [None]:
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import Trainer, TrainingArguments

In [None]:
def load_dataset(file_path, tokenizer, block_size=64):
    return TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=block_size,
    )

def load_data_collator(tokenizer, mlm=False):
    return DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=mlm,
    )



---
Les fonctions load_dataset et load_data_collator préparent les données pour fine-tuner GPT-2 en formatant le texte en séquences et en le préparant pour l'entraînement. La première charge le fichier texte et divise en blocs, tandis que la seconde organise les données pour l'apprentissage en activant ou non le masquage de langage. Ces étapes sont cruciales pour l'entraînement du modèle.


---




In [None]:
def train(train_file_path, model_name, output_dir, overwrite_output_dir, per_device_train_batch_size, num_train_epochs, save_steps):
    # Charger le tokenizer et le modèle
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    model = GPT2LMHeadModel.from_pretrained(model_name)

    # Charger le dataset et le collator
    train_dataset = load_dataset(train_file_path, tokenizer)
    data_collator = load_data_collator(tokenizer)

    # Configurer les arguments d'entraînement
    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=overwrite_output_dir,
        per_device_train_batch_size=per_device_train_batch_size,
        num_train_epochs=num_train_epochs,
        save_steps=save_steps,
        fp16=True,  # Active la précision mixte pour accélérer
        logging_dir='./logs',  # Répertoire des journaux
        logging_steps=50,  # Fréquence d'enregistrement des journaux
    )
      # Initialiser l'entraîneur
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
    )

    # Entraîner le modèle
    trainer.train()
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)



---


On utilise la fonction train pour fine-tuner le modèle GPT-2 en chargeant le modèle et le tokenizer, préparant les données d'entraînement, et configurant les paramètres d'entraînement comme le nombre d’époques et la taille des batches. On utilise un objet Trainer pour entraîner le modèle et on sauvegarde le modèle fine-tuné ainsi que le tokenizer pour une utilisation future.


---



In [None]:
model_name = 'gpt2'
output_dir = 'Chat_Model/'
overwrite_output_dir = False
per_device_train_batch_size = 16
num_train_epochs = 50
save_steps = 5000

In [None]:
train_file_path = 'Q_A_train.txt'

train(
    train_file_path=train_file_path,
    model_name=model_name,
    output_dir=output_dir,
    overwrite_output_dir=overwrite_output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    num_train_epochs=num_train_epochs,
    save_steps=save_steps
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss
50,2.8958
100,2.1677
150,1.8074
200,1.4874
250,1.2155
300,0.9854
350,0.7843
400,0.6368
450,0.5143


Step,Training Loss
50,2.8958
100,2.1677
150,1.8074
200,1.4874
250,1.2155
300,0.9854
350,0.7843
400,0.6368
450,0.5143
500,0.4325


In [None]:
!cp -r /content/Chat_Model /content/drive/MyDrive/chatbot/Chat_Model

# ***Chargement et génération de texte avec GPT-2 fine-tuné***

In [None]:
from transformers import PreTrainedTokenizerFast, GPT2LMHeadModel, GPT2TokenizerFast, GPT2Tokenizer

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch # Importing the 'torch' module

def load_model(model_path):
    model = GPT2LMHeadModel.from_pretrained(model_path)
    model.to("cuda" if torch.cuda.is_available() else "cpu")  # Charger sur GPU si disponible
    return model

def load_tokenizer(tokenizer_path):
    tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_path)
    return tokenizer

# Charger le modèle et le tokenizer depuis Google Drive
model_path = "/content/drive/MyDrive/chatbot/Chat_Model"
tokenizer_path = "/content/drive/MyDrive/chatbot/Chat_Model"

model = load_model(model_path)
tokenizer = load_tokenizer(tokenizer_path)



---
On charge le modèle GPT-2 et son tokenizer à l’aide des fonctions load_model et load_tokenizer. La fonction load_model charge le modèle pré-entraîné et le place sur le CPU, tandis que load_tokenizer récupère le tokenizer associé, nécessaire pour convertir le texte en tokens et vice versa. Ces étapes sont essentielles pour préparer le modèle avant de générer du texte.


---




In [None]:
def generate_text(model_path, sequence, max_length):
    # Charger le modèle et le tokenizer
    model_path = "/content/drive/MyDrive/chatbot/Chat_Model"
    tokenizer_path = "/content/drive/MyDrive/chatbot/Chat_Model"
    model = load_model(model_path)
    tokenizer = load_tokenizer(model_path)

    # Tokenizer la séquence d'entrée
    ids = tokenizer.encode(sequence, return_tensors='pt').to("cpu")  # Assurez-vous que les tensors sont sur le GPU

    # Générer le texte
    final_outputs = model.generate(
        ids,
        do_sample=True,
        max_length=max_length,
        pad_token_id=tokenizer.eos_token_id,
        top_k=50,
        top_p=0.95,
    )

    # Décoder les prédictions
    generated_text = tokenizer.decode(final_outputs[0], skip_special_tokens=True)

    # Extraire uniquement la réponse
    # Supposons que la réponse commence après le mot "Réponse:"
    answer_start = generated_text.find("Réponse:") + len("Réponse:")
    answer_text = generated_text[answer_start:].strip()

    # Extraire la réponse jusqu'à la fin ou jusqu'à la prochaine question
    next_question_index = answer_text.find("Question:")
    if next_question_index != -1:
        answer_text = answer_text[:next_question_index].strip()

    return answer_text



---

On utilise la fonction generate_text pour générer du texte à partir d'une séquence d'entrée. Elle commence par charger le modèle et le tokenizer à partir du chemin spécifié. Ensuite, la séquence d'entrée est tokenisée, et le modèle génère une réponse en échantillonnant le texte jusqu'à la longueur maximale définie. Après avoir décodé les prédictions, la réponse est extraite, à partir du mot "Réponse:", et nettoyée pour enlever toute nouvelle question éventuelle. Cette fonction permet de produire une réponse basée sur le modèle fine-tuné.


---



In [None]:
sequences = [
    "Question: What is breast cancer?",
    "How can breast cancer be prevented?",
    "What are the risk factors for breast cancer?"
]
max_len = 50
model_path = "/content/drive/MyDrive/chatbot/Chat_Model"

# Loop through all sequences and generate responses
for sequence in sequences:
    generated_text = generate_text(model_path, sequence, max_len)
    print(f" {sequence}\nGenerated Response: {generated_text}\n")

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


 Question: What is breast cancer?
Generated Response: Breast cancers usually are epithelial tumors of ductal or lobular origin. All of the following features are important in deciding on a course of treatment for any breast tumor:

Size
Status

 How can breast cancer be prevented?
Generated Response: One of the most widely studied factors in breast cancer prevention is the use of exogenous hormones in the form of oral contraceptives (OCs) and hormone replacement therapy (HRT).  The overall

 What are the risk factors for breast cancer?
Generated Response: The risk factors for breast cancer are summarized as follows:

Family history of breast cancer
Recent breast cancer, particularly in relation to a first or second trimester delivery
Recent breast



# **Traduction entre Anglais et Darija avec Hugging Face Pipeline**

**Partie 1 :** Se concentre sur la traduction simple de texte entre l'anglais et le Darija.


---



In [None]:
from transformers import pipeline

# Création du pipeline de traduction de l'anglais vers le Darija
def create_translation_pipeline(model_name="lachkarsalim/Helsinki-translation-English_Moroccan-Arabic"):
    pipe = pipeline("text2text-generation", model=model_name, device=0)
    return pipe

In [None]:
# Fonction pour traduire une liste de textes
def translate_multiple_texts(pipe, texts):
    translations = []
    for text in texts:
        translated_text = pipe(text)[0]['generated_text']
        translations.append(translated_text)
    return translations

# Exemple d'utilisation
pipe = create_translation_pipeline()
texts = [
    "What is breast cancer?",
    "How can breast cancer be prevented?",
    "What are the symptoms of breast cancer?"
]
translated_texts = translate_multiple_texts(pipe, texts)

# Affichage des traductions
for original, translated in zip(texts, translated_texts):
    print(f"Original: {original}\nTranslated: {translated}\n")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/957M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/824 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/806k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/916k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.21M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/416 [00:00<?, ?B/s]

Device set to use cpu


Original: What is breast cancer?
Translated: شنو لسرطان ديال لثدي?

Original: How can breast cancer be prevented?
Translated: كيفاش إمكن توقي لسرطان ديال الثدي؟

Original: What are the symptoms of breast cancer?
Translated: اشناهوما لأعراض ديال سرطان ديال الثدي؟





---


Ce code se concentre uniquement sur la traduction de l'anglais vers le Darija en utilisant un modèle de traduction spécifique. Il ne génère pas de réponse mais effectue une simple traduction d'un texte anglais vers le Darija.

---



**Partie 2 :** Ajoute GPT-2 pour générer une réponse à une question en anglais, puis traduit la question et la réponse en Darija.


---



In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, pipeline

# Load both models and tokenizers
model_path = "/content/drive/MyDrive/chatbot/Chat_Model"  # Use the full path to your saved model
tokenizer_path = "/content/drive/MyDrive/chatbot/Chat_Model"  # Use the full path to your saved tokenizer

gpt2_model = GPT2LMHeadModel.from_pretrained(model_path).to("cpu")  # Load from the full path
gpt2_tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_path)  # Load from the full path

translation_pipe = pipeline("text2text-generation", model="lachkarsalim/Helsinki-translation-English_Moroccan-Arabic", device=0)  # If translation model needs GPU, keep device=0


# Function to generate answers and translate them
def handle_input(input_text):
    # Step 1: Generate the answer using GPT-2
    input_ids = gpt2_tokenizer.encode(input_text, return_tensors='pt').to("cpu")  # Changed to CPU

    # Generate the answer (note: `stop_sequence` removed)
    output = gpt2_model.generate(
        input_ids,
        max_length=50,
        pad_token_id=gpt2_tokenizer.eos_token_id,
        num_return_sequences=1,
        early_stopping=True  # Enable early stopping to reduce over-generation
    )
    generated_answer = gpt2_tokenizer.decode(output[0], skip_special_tokens=True)

    # Extract the answer after "Réponse:" without including unintended extra questions
    answer_start = generated_answer.find("Réponse:") + len("Réponse:")
    answer_text = generated_answer[answer_start:].split("Question:")[0].strip()  # Ensure splitting at the next "Question:"

    # Step 2: Translate the original question and the generated answer into Darija
    translated_question = translation_pipe(input_text)[0]['generated_text']
    translated_answer = translation_pipe(answer_text)[0]['generated_text']

    # Return both the original answer and the translated version
    return {
        "answer": answer_text,
        "translated_question": translated_question,
        "translated_answer": translated_answer
    }

Device set to use cpu


In [None]:
# Entrée de l'utilisateur
user_input = "What is breast cancer?"

# Fonction pour traiter l'entrée utilisateur, générer une réponse et effectuer la traduction
result = handle_input(user_input)

# Affichage des résultats
print("Answer:", result["answer"])  # Affiche la réponse générée par le modèle GPT-2
print("Translated Question:", result["translated_question"])  # Affiche la question traduite en Darija
print("Translated Answer:", result["translated_answer"])  # Affiche la réponse traduite en Darija



Answer: Breast cancers usually are epithelial tumors of ductal or lobular origin. All of the following features are important in deciding on a course of treatment for any breast tumor:

Size
Status of surgical
Translated Question: شنو لسرطان ديال لثدي?
Translated Answer: عادة مايكونو لسرطان ديال لثدي أورام طليلية من أصل قناهوي ولا فصامي. اع لخصائص لي تليا مهمة فاش تقرر العلاج لأي ديال لورم ديال لثدي: الحجم الحالة ديال الجراحة


In [None]:
from transformers import pipeline

pipe = pipeline("translation", model="lachkarsalim/LatinDarija_English-v2")

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/306M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/288 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/816 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/917k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.26M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/74.0 [00:00<?, ?B/s]

Device set to use cpu


In [None]:
#  Translate the word "salam" from Darija to English
translation = pipe("salam")

# Output the result
print(translation[0]['translation_text'])
# Translate the word "salam" from Darija to English
translation = pipe("سلام كيداير لباس عليك")

# Output the result
print(translation[0]['translation_text'])
# Translate the word "salam" from Darija to English
translation = pipe("شنو سرطان ديال لثدي   " )

# Output the result
print(translation[0]['translation_text'])

hello
hey how are you
what's a breast cancer




---
Ce code introduit GPT-2 pour générer des réponses à partir d'une question en anglais. Une fois la réponse générée, la question et la réponse sont traduites en Darija. Cela permet de combiner la génération de texte et la traduction.


---




**Partie 3 :** Prend une entrée en Darija, la traduit en anglais, génère une réponse avec GPT-2 en anglais, puis traduit cette réponse en Darija.


---



In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, pipeline

# Load models and tokenizers
model_path = "/content/drive/MyDrive/chatbot/Chat_Model"  # Use the full path to your saved model
tokenizer_path = "/content/drive/MyDrive/chatbot/Chat_Model"  # Use the full path to your saved tokenizer

gpt2_model = GPT2LMHeadModel.from_pretrained(model_path).to("cpu")  # Load from the full path
gpt2_tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_path)  # Load from the full path

# Pipeline for Darija to English translation
darija_to_english_pipe = pipeline("translation", model="lachkarsalim/LatinDarija_English-v2", device=0)

# Pipeline for English to Darija translation
english_to_darija_pipe = pipeline("text2text-generation", model="lachkarsalim/Helsinki-translation-English_Moroccan-Arabic", device=0)

# Function to handle input in Darija
def handle_input_darija(input_text):
    # Step 1: Translate the input question from Darija to English
    translated_question = darija_to_english_pipe(input_text)[0]['translation_text']

    # Step 2: Generate the answer using GPT-2
    input_ids = gpt2_tokenizer.encode(f"Question: {translated_question}", return_tensors='pt').to("cpu")

    output = gpt2_model.generate(
        input_ids,
        max_length=50,
        pad_token_id=gpt2_tokenizer.eos_token_id,
        num_return_sequences=1,
        early_stopping=True
    )
    generated_answer = gpt2_tokenizer.decode(output[0], skip_special_tokens=True)

    # Extract the answer after "Réponse:" and clean extra questions
    answer_start = generated_answer.find("Réponse:") + len("Réponse:")
    answer_text = generated_answer[answer_start:].split("Question:")[0].strip()

    # Step 3: Translate the generated answer from English back to Darija
    translated_answer = english_to_darija_pipe(answer_text)[0]['generated_text']

    # Return the original question and translated answer
    return {
        "original_question_in_darija": input_text,
        "translated_question_in_english": translated_question,
        "generated_answer_in_english": answer_text,
        "translated_answer_in_darija": translated_answer
    }

Device set to use cpu
Device set to use cpu


In [None]:
# Liste de questions en Darija à tester
darija_inputs = [
    "شنو أعراض ديال سرطان لثدي؟",
    "كيفاش نقدر نتفادى سرطان الثدي؟",
    "أشنو الأسباب لي كتزيد خطر الإصابة بسرطان الثدي؟",
    "شنو هي الفحوصات لي يمكن تديرها باش تكتشف السرطان؟",
    "شحال من مرة خاصني ندير فحص ديال الثدي؟"
]

# Traitement et affichage des résultats pour chaque question
for darija_input in darija_inputs:
    result = handle_input_darija(darija_input)

    print("\nOriginal Question (Darija):", result["original_question_in_darija"])
    print("Translated Question (English):", result["translated_question_in_english"])
    print("Generated Answer (English):", result["generated_answer_in_english"])
    print("Translated Answer (Darija):", result["translated_answer_in_darija"])


Original Question (Darija): شنو أعراض ديال سرطان لثدي؟
Translated Question (English): What are the symptoms of breast cancer?
Generated Answer (English): The following are typical features of breast cancer:

Hardness
Irregularity
Focal nodularity
Fixation to skin or muscle
Translated Answer (Darija): هادشي هي السمات النمطية لسرطان الثدي: الصلادة عدم الانتظام العقدة لبشرة ولا لعضلة

Original Question (Darija): كيفاش نقدر نتفادى سرطان الثدي؟
Translated Question (English): How can we avoid breast cancer?
Generated Answer (English): Many early breast carcinomas are asymptomatic; pain and discomfort are not usually a symptom of the disease. Pain and discomfort are not usually a symptom of breast cancer; only 5
Translated Answer (Darija): بزّاف ديال لورما ماعندوش أعراض; ألم أُلام مزعجة مايكونوش عادة من أعراض المرض. ألم أُلام مزعجة مايكونوش عادة من أعراض لسرطان ديال الثدي; غير 5

Original Question (Darija): أشنو الأسباب لي كتزيد خطر الإصابة بسرطان الثدي؟
Translated Question (English): What c



---
On prend des entrées en Darija, les traduit en anglais, génère une réponse en anglais avec GPT-2, puis traduit la réponse de l'anglais vers le Darija. Cette approche est inversée par rapport à la première partie, car elle commence par une entrée en Darija et effectue une traduction dans les deux sens (Darija → Anglais et Anglais → Darija).


---




# **INTERFACE**

In [2]:
!pip install streamlit
!pip install pyngrok

Collecting streamlit
  Downloading streamlit-1.41.1-py2.py3-none-any.whl.metadata (8.5 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.41.1-py2.py3-none-any.whl (9.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m48.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m49.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl (79 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[

In [3]:
%%writefile app.py

import streamlit as st
from transformers import GPT2LMHeadModel, GPT2Tokenizer, pipeline

# Charger les modèles et les tokenizers
model_path = "/content/drive/MyDrive/chatbot/Chat_Model"
tokenizer_path = "/content/drive/MyDrive/chatbot/Chat_Model"

# Vérification des chemins
import os
assert os.path.exists(model_path), "Le chemin du modèle GPT-2 est incorrect."
assert os.path.exists(tokenizer_path), "Le chemin du tokenizer GPT-2 est incorrect."

# Chargement du modèle et du tokenizer GPT-2
gpt2_model = GPT2LMHeadModel.from_pretrained(model_path).to("cpu")
gpt2_tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_path)

# Initialisation des pipelines de traduction
darija_to_english_pipe = pipeline("translation", model="lachkarsalim/LatinDarija_English-v2", device=0)
english_to_darija_pipe = pipeline("text2text-generation", model="lachkarsalim/Helsinki-translation-English_Moroccan-Arabic", device=0)

# Fonction pour générer une réponse avec GPT-2
def generate_answer_with_gpt2(input_text):
    input_ids = gpt2_tokenizer.encode(f"Question: {input_text}", return_tensors="pt").to("cpu")

    output = gpt2_model.generate(
        input_ids,
        max_length=50,
        pad_token_id=gpt2_tokenizer.eos_token_id,
        num_return_sequences=1,
        early_stopping=True
    )
    generated_text = gpt2_tokenizer.decode(output[0], skip_special_tokens=True)

    # Extraire la réponse après "Réponse:"
    answer_start = generated_text.find("Réponse:") + len("Réponse:")
    answer_text = generated_text[answer_start:].strip()
    return answer_text

# Fonction pour gérer les entrées en Darija
def handle_input_darija(darija_input):
    try:
        # Traduction de la question en anglais
        translated_question_in_english = darija_to_english_pipe(darija_input)[0]['translation_text']

        # Génération de la réponse en anglais
        generated_answer_in_english = generate_answer_with_gpt2(translated_question_in_english)

        # Traduction de la réponse en Darija
        translated_answer_in_darija = english_to_darija_pipe(generated_answer_in_english)[0]['generated_text']
    except Exception as e:
        return {
            "error": f"Une erreur est survenue : {str(e)}",
            "original_question_in_darija": darija_input
        }

    return {
        "original_question_in_darija": darija_input,
        "translated_question_in_english": translated_question_in_english,
        "generated_answer_in_english": generated_answer_in_english,
        "translated_answer_in_darija": translated_answer_in_darija
    }

# Interface Streamlit
st.title("Chatbot Breast Cancer")

st.write(
    """
    Merhaba bik f chatbot b darija. Ktoub so2al b darija w ghadi twsel bjawab <3.
    """
)

# Saisie de la question en Darija
darija_input = st.text_input("Entrez votre question en Darija :")

if darija_input:
    result = handle_input_darija(darija_input)

    if "error" in result:
        st.error(result["error"])
    else:
        # Affichage des résultats
        st.subheader("Résultats :")
        st.write("### Question originale (Darija) :")
        st.write(result["original_question_in_darija"])

        st.write("### Question traduite (Anglais) :")
        st.write(result["translated_question_in_english"])

        st.write("### Réponse générée (Anglais) :")
        st.write(result["generated_answer_in_english"])

        st.write("### Réponse traduite (Darija) :")
        st.write(result["translated_answer_in_darija"])

Writing app.py


In [4]:
!ngrok authtoken 2pQzGunRmrtFYBwmdLBMt0omse9_3suGg9RTh6Bm5HH97gQiP # Replace <your_authtoken> with your actual authtoken

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [5]:
from pyngrok import ngrok
import os

# Créer un tunnel public
tunnel = ngrok.connect(8501)
print("Streamlit app is live at:", tunnel.public_url)

# Démarrer Streamlit
os.system("streamlit run app.py &")

Streamlit app is live at: https://b97d-34-68-165-151.ngrok-free.app


0