# Importation des libraries

In [1]:
import pandas as pd

from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma


from langchain_community.llms import Ollama
from langchain.chains import RetrievalQA


import tkinter as tk
from tkinter import scrolledtext
from langchain_ollama import OllamaLLM
import os

# Charger les fichiers CSV

In [None]:

data_1 = pd.read_csv('/home/cytech/test/Rag_covid/data/data_preprocessing/Age_Preprocessing.csv')
data_2 = pd.read_csv('/home/cytech/test/Rag_covid/data/data_preprocessing/Diabetes_Preprocessing.csv')
data_3 = pd.read_csv('/home/cytech/test/Rag_covid/data/data_preprocessing/Overweight_or_obese_Preprocessing.csv')


In [None]:

# Fusionner les données
combined_data = pd.concat([data_1, data_2, data_3], ignore_index=True)

# Fonction segmenter_texte et build_query

In [4]:
# Fonction pour segmenter un document en morceaux de 512 tokens
def segmenter_texte(texte, longueur_max=512):
    tokens = texte.split()
    segments = []
    for i in range(0, len(tokens), longueur_max):
        segment = ' '.join(tokens[i:i + longueur_max])
        segments.append(segment)
    return segments

In [None]:
# Votre instruction par défaut
default_instruction = ("You're an empathetic doctor who knows how to synthesize things so that patients simply understand. You have access to numerous scientific journals (data in the form of embedding). A patient, frightened about his current disease, covid 19, asks you a question about a factor he thinks may or may not be at risk. You have to give him a summary answer, based on the abstracts you have from scientific journals. Simply explain whether or not this factor is a risk in terms of the severity of the virus or its lethality. Don't hesitate to be understanding and gentle. Patients can be stressed and worried. Here's the patient's")

# Fonction pour construire la requête complète en ajoutant l'instruction par défaut
def build_query(user_query):
    return default_instruction + "\n" + user_query

In [7]:
# Appliquer la segmentation à chaque document de la colonne 'context'
documents = combined_data['context'].dropna().tolist()
segmented_docs = []
for doc in documents:
    segmented_docs.extend(segmenter_texte(doc))

# Initialiser le modèle d'embedding avec une dimension réduite

In [None]:
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/static-retrieval-mrl-en-v1",
    model_kwargs={'device': 'cpu', 'truncate_dim': 1024}
)

  embedding_model = HuggingFaceEmbeddings(
  from .autonotebook import tqdm as notebook_tqdm


# Créer l’index Chroma dans un nouveau dossier

In [None]:
persist_dir = "./chroma_static_mrl"
vectorstore = Chroma.from_texts(segmented_docs, embedding_model, persist_directory=persist_dir)


In [10]:
# Créer le système de récupération
retriever = vectorstore.as_retriever(search_kwargs={"k": 2})


# Initialiser le modèle LLaMA

In [None]:
llm = Ollama(model="llama3.2:3b")


  llm = Ollama(model="llama3.2:3b")


In [12]:
# Construire la chaîne QA
qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)

# Exemple de requête utilisateur

In [None]:
user_query = "Give me figures on covid and overweight"

# Construction de la requête complète
query = build_query(user_query)

# Exécution de la chaîne QA avec la requête complète
response = qa_chain.run(query)
print(response)

I'm so glad you asked, and I want to assure you that we're going to take a look at some research together to understand more about this topic.

From what I've found in scientific journals, being overweight or obese has been linked to a higher risk of severe illness from COVID-19. Studies have shown that people with a body mass index (BMI) of 30 or higher are more likely to develop pneumonia and acute respiratory distress syndrome (ARDS), which can be serious complications.

In terms of mortality rates, research suggests that people who are overweight or obese may also be at a slightly increased risk of dying from COVID-19. However, it's essential to note that the overall risk is still relatively low, even for those with higher BMIs.

A study published in the New England Journal of Medicine found that among patients hospitalized with COVID-19, those who were obese had a 1.8 times higher risk of death compared to those with a normal weight.

Another study published in the International J

In [None]:
docs = retriever.get_relevant_documents(query)
print("📝 Docs récupérés :")
for d in docs:
    print(d.page_content[:300])  # Affiche les premiers caractères

# --- Partie interface de chat avec historique ---
# Création de la fenêtre principale

In [None]:
root = tk.Tk()
root.title("Chat avec LLM")

# Widget Text pour afficher la conversation
output_text = tk.Text(root, wrap=tk.WORD, height=20, width=80)
output_text.pack(padx=10, pady=10)

# Widget Entry pour saisir le message de l'utilisateur
entry = tk.Entry(root, width=80)
entry.pack(padx=10, pady=(0,10))

# Instruction par défaut à ajouter à chaque requête
default_instruction = ("You're an empathetic doctor who knows how to synthesize things so that patients simply understand. You have access to numerous scientific journals (data in the form of embedding). A patient, frightened about his current disease, covid 19, asks you a question about a factor he thinks may or may not be at risk. You have to give him a summary answer, based on the abstracts you have from scientific journals. Simply explain whether or not this factor is a risk in terms of the severity of the virus or its lethality. Don't hesitate to be understanding and gentle. Patients can be stressed and worried. Here's the patient")

# Initialiser l'historique du chat avec le message système
chat_history = [{"role": "system", "content": default_instruction}]

# Pour le chat, on peut utiliser le même modèle (ou en instancier un nouveau)
llm_chat = llm

def get_ai_response():
    """Récupère la réponse du modèle et garde l'historique."""
    user_input = entry.get().strip()
    if not user_input:
        output_text.insert(tk.END, "Veuillez entrer un message.\n")
        return

    # Ajouter l'entrée utilisateur à l'historique
    chat_history.append({"role": "user", "content": user_input})

    # Construire le prompt formaté en intégrant l'historique complet
    formatted_prompt = "\n".join([f"{msg['role']}: {msg['content']}" for msg in chat_history])

    # Obtenir la réponse du modèle
    response = llm_chat.invoke(formatted_prompt)

    # Ajouter la réponse de l'assistant à l'historique
    chat_history.append({"role": "assistant", "content": response})

    # Afficher la conversation dans le widget de sortie
    output_text.insert(tk.END, f"Utilisateur : {user_input}\nIA : {response}\n\n")

    # Effacer le champ de saisie
    entry.delete(0, tk.END)

# Bouton pour envoyer le message
send_button = tk.Button(root, text="Envoyer", command=get_ai_response)
send_button.pack(padx=10, pady=(0,10))

# Lancer la boucle principale de l'interface Tkinter
root.mainloop()