Installation des dépendances



In [None]:
pip install transformers datasets

In [None]:
!pip install --upgrade transformers


Conversion du fichier input_output_pairs.json en fichier d'entraînement au format JSONL (training_data.jsonl)


In [None]:
from google.colab import files

# Upload the file from the local machine; the result is kept in `uploaded`.
# NOTE(review): the following cells read "input_output_pairs.json" from the
# working directory, so that is presumably the file to select here; confirm.
uploaded = files.upload()

In [None]:
import json  # imported in this cell so it also runs on a fresh kernel

# Read the raw input/output pairs. The encoding is explicit because the output
# below is written as UTF-8 with ensure_ascii=False, so non-ASCII text must be
# decoded the same way regardless of the platform default.
with open("input_output_pairs.json", "r", encoding="utf-8") as f:
    pairs = json.load(f)

# Convert input_output_pairs.json to training_data.jsonl:
# one {"prompt": ..., "completion": ...} JSON object per line.
with open("training_data.jsonl", "w", encoding="utf-8") as f:
    for pair in pairs:
        json_line = json.dumps({"prompt": pair["input"], "completion": pair["output"]}, ensure_ascii=False)
        f.write(json_line + "\n")

print("Data converted to 'training_data.jsonl'.")


Fine-tuning du modèle GPT-2 avec le fichier training_data.jsonl


In [None]:
from google.colab import drive
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments
from datasets import Dataset
import json

# Read the raw input/output pairs. The encoding is explicit so that decoding
# matches the UTF-8 output written below instead of the platform default.
with open("input_output_pairs.json", "r", encoding="utf-8") as f:
    pairs = json.load(f)

# Convert input_output_pairs.json to training_data.jsonl:
# one {"prompt": ..., "completion": ...} JSON object per line.
with open("training_data.jsonl", "w", encoding="utf-8") as f:
    for pair in pairs:
        json_line = json.dumps({"prompt": pair["input"], "completion": pair["output"]}, ensure_ascii=False)
        f.write(json_line + "\n")

print("Data converted to 'training_data.jsonl'.")

# Load model and tokenizer
model_name = "gpt2"
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Add padding token: the end-of-sequence token is reused as the padding token
tokenizer.pad_token = tokenizer.eos_token

# Load dataset
dataset = Dataset.from_json("training_data.jsonl")
# 10% validation split; the fixed seed keeps the train/test partition identical
# across kernel restarts so runs are comparable.
dataset = dataset.train_test_split(test_size=0.1, seed=42)

# Preprocess data
def preprocess_function(examples, max_length=128):
    """Tokenize a batch of prompt/completion pairs for causal-LM fine-tuning.

    Each pair is joined as "<prompt> ### <completion>" followed by the
    tokenizer's end-of-sequence token, then truncated or padded to
    ``max_length`` tokens.

    Args:
        examples: Batch mapping with equally long "prompt" and "completion"
            lists (the shape passed by ``Dataset.map(..., batched=True)``).
        max_length: Number of tokens every example is truncated/padded to.

    Returns:
        The tokenizer output with an added "labels" entry: a copy of
        "input_ids" in which every padded position (attention mask 0) is
        replaced by -100.
    """
    prompts = examples["prompt"]
    completions = examples["completion"]
    # The padding token is the EOS token, and padding is masked out of the
    # labels below. An explicit EOS is therefore appended to the text so that
    # one real (attended) end-of-sequence token remains as a training target.
    combined = [
        f"{prompt} ### {completion}{tokenizer.eos_token}"
        for prompt, completion in zip(prompts, completions)
    ]
    # No return_tensors here: Dataset.map stores plain lists, so building
    # framework tensors for each batch is unnecessary.
    tokenized = tokenizer(
        combined,
        truncation=True,
        padding="max_length",
        max_length=max_length,
    )
    # -100 is the label value the loss ignores; without this masking the loss
    # would also be computed on every padding position.
    tokenized["labels"] = [
        [token_id if attended else -100 for token_id, attended in zip(ids, mask)]
        for ids, mask in zip(tokenized["input_ids"], tokenized["attention_mask"])
    ]
    return tokenized

# Apply preprocess_function to both splits, one batch of examples at a time
tokenized_dataset = dataset.map(preprocess_function, batched=True)

# Define training arguments (disable W&B)
training_args = TrainingArguments(
    output_dir="./fine_tuned_model_chatbot",  # Output directory for model checkpoints
    overwrite_output_dir=True,
    num_train_epochs=50,
    per_device_train_batch_size=16,
    save_steps=500,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=100,
    learning_rate=5e-5,
    report_to="none",  # Disable W&B
)

# Initialize Trainer
# NOTE(review): an eval_dataset is passed, but the TrainingArguments above set
# no evaluation schedule; confirm whether evaluation actually runs during training.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],  # Optional: If you want evaluation
)

# Train the model
trainer.train()

# Save the fine-tuned model and tokenizer locally (same directory as output_dir)
model.save_pretrained("./fine_tuned_model_chatbot")
tokenizer.save_pretrained("./fine_tuned_model_chatbot")

print("Fine-tuned model saved locally in './fine_tuned_model_chatbot'.")


Push du modèle vers Hugging Face

In [None]:
# Step 6: Hugging Face authentication (only needed once)
from huggingface_hub import notebook_login
notebook_login()  # interactive login from within the notebook

In [None]:
from huggingface_hub import create_repo

repo_name = "fine_tuned_model_chatbot"  # 🔁 choose your own repository name
# exist_ok=True keeps this cell re-runnable: without it the call raises once
# the repository already exists. Use `private=False` to make the repo public.
create_repo(repo_name, private=True, exist_ok=True)


In [None]:
# Upload the model and the tokenizer to the Hub repository
# "fine_tuned_model_chatbot" (requires the login performed earlier).
model.push_to_hub("fine_tuned_model_chatbot")
tokenizer.push_to_hub("fine_tuned_model_chatbot")


In [None]:
# 🚀 INSTALLATIONS (si nécessaire sur Colab)
!pip install transformers pydantic fuzzywuzzy python-Levenshtein unidecode

# 🚀 IMPORTS
from transformers import AutoTokenizer, AutoModelForCausalLM
from pydantic import BaseModel
from fuzzywuzzy import fuzz
import torch
import unidecode
import re
import string
import json

# 🚀 Load the fine-tuned model
# Replace with your own path if needed (for example: "/content/mon_modele/")
model_name = "toumix/fine_tuned_model_chatbot"

# (Optional) Use the GPU when one is available, otherwise stay on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Load the tokenizer and the model, placing the model on the chosen device
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

# ✅ Cleaning / normalization helper
def normalize(text):
    """Return ``text`` in a canonical form suitable for keyword matching.

    The text is lower-cased and passed through ``unidecode.unidecode``; every
    character of ``string.punctuation`` is replaced by a space; runs of
    whitespace collapse to a single space and both ends are trimmed.
    """
    punctuation_to_space = str.maketrans({mark: " " for mark in string.punctuation})
    lowered = unidecode.unidecode(text.lower())
    without_punctuation = lowered.translate(punctuation_to_space)
    # split() with no argument splits on whitespace runs and ignores leading
    # and trailing whitespace, so re-joining yields single-spaced, trimmed text.
    return " ".join(without_punctuation.split())

# ✅ Fuzzy-matching helper
def fuzzy_match_any(text: str, choices: list, threshold: int = 70):
    """Tell whether ``text`` fuzzily matches at least one entry of ``choices``.

    A choice matches when ``fuzz.partial_ratio(text, choice)`` is greater than
    or equal to ``threshold``. Returns False when ``choices`` is empty.
    """
    for candidate in choices:
        score = fuzz.partial_ratio(text, candidate)
        if score >= threshold:
            return True
    return False

# ✅ Load the medical dictionary
# Make sure the JSON file is present in Colab: /content/separate_dictionaries_updated.json
# The encoding is explicit so that reading the file does not depend on the
# platform's default text encoding.
with open("/content/separate_dictionaries_updated.json", "r", encoding="utf-8") as f:
    medical_dict = json.load(f)

# ✅ Normalize diseases and symptoms
# Entries that normalize to an empty string are dropped: "" is a substring of
# every string, so keeping one would make any `term in prompt` substring check
# succeed for every question.
disease_names = [
    name
    for name in (normalize(disease) for disease in medical_dict.get("disease_dict", []))
    if name
]
symptom_names = [
    name
    for symptoms in medical_dict.get("symptoms_dict", {}).values()
    for name in (normalize(sym) for sym in symptoms)
    if name
]

# ✅ Indicator keywords, grouped by question type.
# The phrases are passed through normalize() before being matched.
keywords_by_type = {
    "symptoms": [
        "symptom", "symptoms", "sign", "signs", "disorder", "illness", "condition", "i feel",
        "i have", "i’m experiencing", "i am experiencing", "i suffer from", "i feel pain",
        "my body hurts", "i feel unwell", "i feel bad", "i am not feeling well",
        "what am i experiencing", "what is wrong with me", "i'm feeling", "something is wrong with"
    ],
    "precaution": [
        "prevent", "prevention", "how to avoid", "how can i avoid", "how can i prevent",
        "protection", "protect", "how to stay safe", "stay safe", "risk reduction",
        "precaution", "precautions", "how to reduce risk", "safety measures",
        "avoid getting", "what to do to avoid", "how to be careful", "what should i do to protect"
    ],
    "diet": [
        "diet", "nutrition", "eating", "meal", "meals", "what to eat", "what should i eat",
        "what food", "what can i eat", "dietary", "foods to avoid", "diet plan",
        "recommend food", "avoid food", "nutritional advice", "what is allowed to eat",
        "diet recommendation", "can i eat", "healthy food", "food habits"
    ],
    "medication": [
        "medication", "medicine", "drug", "treatment", "remedy", "therapy",
        "what can i take", "what should i take", "what is prescribed", "prescription",
        "dosage", "dose", "cure", "take for", "pill", "tablet", "pharmaceutical",
        "what is the best treatment", "medications", "how to treat", "how to cure"
    ]
}

# ✅ Smart validation function
def is_valid_medical_question(prompt: str) -> bool:
    """Decide whether ``prompt`` looks like an in-scope medical question.

    The normalized prompt must contain, as a substring or as a fuzzy match,
    both an indicator keyword from ``keywords_by_type`` and a term from
    ``disease_names`` / ``symptom_names``.

    Args:
        prompt: Raw user question.

    Returns:
        True when both a keyword and a medical term are found, else False.
    """
    prompt_clean = normalize(prompt)

    # Flatten and normalize all keywords in one pass. A comprehension avoids
    # sum(lists, []), which re-copies the accumulated list for every category.
    all_keywords_clean = [
        normalize(keyword)
        for keywords in keywords_by_type.values()
        for keyword in keywords
    ]

    keyword_found = (
        any(kw in prompt_clean for kw in all_keywords_clean) or
        fuzzy_match_any(prompt_clean, all_keywords_clean, threshold=70)
    )
    # Both conditions are required, so without a keyword the answer is already
    # False and the scan of the whole medical dictionary can be skipped.
    if not keyword_found:
        return False

    medical_terms_clean = disease_names + symptom_names
    return (
        any(term in prompt_clean for term in medical_terms_clean) or
        fuzzy_match_any(prompt_clean, medical_terms_clean, threshold=70)
    )

# ✅ Request model
class ChatRequest(BaseModel):
    """Payload of a single chat request, defined as a pydantic model."""
    # The user's question text
    question: str

# ✅ Response generation
def generate_response(prompt: str) -> str:
    """Generate text for ``prompt`` with the loaded model.

    The prompt is tokenized (truncated to 128 tokens), the encoded inputs are
    moved to ``device``, up to 100 new tokens are generated, and the first
    returned sequence is decoded with special tokens skipped.
    """
    encoded = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=128)

    # Move every encoded input to the model's device (GPU when available)
    on_device = {}
    for name, tensor in encoded.items():
        on_device[name] = tensor.to(device)

    generated = model.generate(
        **on_device,
        max_new_tokens=100,
        pad_token_id=tokenizer.eos_token_id,
    )
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

# ✅ Chat entry point
def chat(request: ChatRequest):
    """Answer a chat request, refusing questions that fail validation.

    Returns a dict with a single "response" key: the generated text when
    ``is_valid_medical_question`` accepts the question, otherwise a fixed
    refusal message.
    """
    question = request.question
    if is_valid_medical_question(question):
        return {"response": generate_response(question)}

    refusal = "❌ Sorry, I only answer medical questions that contain a valid keyword and a known disease or symptom."
    return {"response": refusal}

# ✅ TRY THE CHAT
# Example call: wrap a question in a ChatRequest, run it through chat(),
# and print the text stored under the "response" key.
user_question = "I have a sore throat and a cough, what should I take?"
request = ChatRequest(question=user_question)
response = chat(request)
print(response["response"])


Prédiction avec le modèle déjà pré-entraîné

In [None]:
# Save the current model and tokenizer into the local directory
# "my_medical_chatbot_model".
model.save_pretrained("my_medical_chatbot_model")
tokenizer.save_pretrained("my_medical_chatbot_model")


In [None]:
import shutil
from IPython.display import FileLink

# Create the archive: zip the "my_medical_chatbot_model" directory into
# "my_medical_chatbot_model.zip"
shutil.make_archive("my_medical_chatbot_model", 'zip', "my_medical_chatbot_model")

# Generate a link to download the archive (displayed as the cell's output)
FileLink("my_medical_chatbot_model.zip")


In [None]:
import pickle
from transformers import AutoModelForCausalLM, AutoTokenizer

# Save the model and the tokenizer together in a single pickle file.
# NOTE(review): reading this file back requires pickle.load, which can execute
# arbitrary code, so only load it from a trusted source.
# NOTE(review): the pickle presumably only loads in an environment with
# compatible library versions; confirm before relying on it for distribution.
with open("medical_chatbot_model.pkl", "wb") as f:
    pickle.dump({
        "model": model,
        "tokenizer": tokenizer
    }, f)
