<a href="https://colab.research.google.com/github/Mohamed-Hassan-81429315/AI_Project/blob/main/Chat_Bot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [156]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer, util
import torch
import re

In [157]:
class ReliableArabicRAGChatbot:
    """
    Professional Arabic Data-Driven Chatbot using Sentence Transformers.
    Optimized for Excel-based institutional datasets.
    """

    def __init__(self, model_name="paraphrase-multilingual-MiniLM-L12-v2"):
        self.model = SentenceTransformer(model_name)
        self.data = None
        self.embeddings = None

        self.knowledge_columns = [
            "اسم الجهة",
            "اسم المسوول",
            "الية التعاون",
            "حالة التعاون",
            "ملاحظات"
        ]

    # --------------------------------------------------
    # Text Cleaning
    # --------------------------------------------------
    def preprocess_text(self, text):
        if not isinstance(text, str):
            return ""

        text = text.strip().lower()

        text = re.sub(r"[\u064B-\u0652]", "", text)

        text = re.sub(r"[إأآ]", "ا", text)
        text = re.sub(r"ى", "ي", text)
        text = re.sub(r"ة", "ه", text)

        text = re.sub(r"[^0-9a-zA-Z\u0600-\u06FF\s]", " ", text)
        text = re.sub(r"\s+", " ", text)

        return text.strip()

    # --------------------------------------------------
    # Load & Validate Dataset
    # --------------------------------------------------
    def load_dataset(self, file_path):
        try:
            df = pd.read_excel(file_path)

            df.columns = df.columns.str.strip()

            if "الية التعاون " in df.columns:
                df.rename(columns={"الية التعاون ": "الية التعاون"}, inplace=True)


            df.dropna(how="all", inplace=True)


            df = df[self.knowledge_columns]


            df.fillna("غير متوفر", inplace=True)

            df["answer_text"] = df.apply(
                lambda row: f"""
اسم الجهة: {row['اسم الجهة']}
اسم المسئول: {row['اسم المسوول']}
آلية التعاون: {row['الية التعاون']}
حالة التعاون: {row['حالة التعاون']}
ملاحظات: {row['ملاحظات']}
""".strip(),
                axis=1
            )


            df["clean_text"] = df["answer_text"].apply(self.preprocess_text)

            self.data = df.reset_index(drop=True)

            print(f"✅ Dataset loaded successfully: {len(self.data)} records")
            return True

        except Exception as e:
            print(f" Dataset loading failed: {e}")
            return False

    # --------------------------------------------------
    # Build Vector Store
    # --------------------------------------------------
    def build_vector_database(self):
        if self.data is None:
            raise ValueError("Load dataset first.")

        print(" Generating embeddings...")
        self.embeddings = self.model.encode(
            self.data["clean_text"].tolist(),
            convert_to_tensor=True,
            show_progress_bar=True
        )
        print(" Vector database ready")

    # --------------------------------------------------
    # Chat / Retrieval
    # --------------------------------------------------
    def chat(self, query, threshold=0.45):
        query_clean = self.preprocess_text(query)
        query_embedding = self.model.encode(query_clean, convert_to_tensor=True)

        hits = util.semantic_search(
            query_embedding,
            self.embeddings,
            top_k=1
        )

        best_hit = hits[0][0]
        score = best_hit["score"]
        idx = best_hit["corpus_id"]

        if score < threshold:
            return " لا توجد بيانات دقيقة للإجابة على هذا السؤال."

        return self.data.iloc[idx]["answer_text"]



In [158]:
bot = ReliableArabicRAGChatbot()

data_path = "/content/sample.xlsx"

if bot.load_dataset(data_path):
    bot.build_vector_database()

    query = "ما هي ملاحظات التعاون مع الجهة السناب؟"
    print("\n Question:", query)
    print(" Answer:\n", bot.chat(query))


✅ Dataset loaded successfully: 4 records
 Generating embeddings...


  df.fillna("غير متوفر", inplace=True)


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

 Vector database ready

 Question: ما هي ملاحظات التعاون مع الجهة السناب؟
 Answer:
  لا توجد بيانات دقيقة للإجابة على هذا السؤال.


In [159]:
# df.info()

In [160]:
# df.columns = df.columns.str.strip()
# df = df.dropna(how="all").fillna("غير متوفر")

In [161]:
# def get_best_match(question):
#     q_vec = vectorizer.transform([question])
#     similarity = (X @ q_vec.T).toarray()
#     idx = similarity.argmax()
#     score = similarity[idx][0]
#     return idx, score