# **Ynet_QA_RAG_ChatBot**

In [None]:
!pip install gradio langchain langchain-community beautifulsoup4 transformers faiss-cpu sentence-transformers

In [2]:
import gradio as gr
import requests
import xml.etree.ElementTree as ET
from bs4 import BeautifulSoup
import json
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
# Module-level state shared by the indexing, answering and UI code.
# All three names are rebound by initialize_index().
vectorstore = None  # FAISS index over the article chunks; None until initialize_index() has run
articles = []  # scraped articles, each a dict with 'url', 'text' and 'title' keys
titles_text = ""  # newline-separated bullet list of article titles shown in the UI

# Upper bound (seconds) for every HTTP request so a stalled server cannot hang the app.
_REQUEST_TIMEOUT_SECONDS = 15


def _find_article_body(node):
    """Return the first non-empty ``articleBody`` string in parsed JSON-LD, or "".

    JSON-LD payloads may be a single object, a list of objects, or an object
    that wraps its entities in ``@graph``; all three shapes are searched.
    """
    if isinstance(node, dict):
        body = node.get("articleBody")
        if isinstance(body, str) and body.strip():
            return body
        return _find_article_body(node.get("@graph"))
    if isinstance(node, list):
        for child in node:
            body = _find_article_body(child)
            if body:
                return body
    return ""


def _extract_article_text(soup):
    """Return the article text from JSON-LD metadata, falling back to <p> tags."""
    for script in soup.find_all("script", type="application/ld+json"):
        raw = script.string
        if not raw:
            continue
        try:
            body = _find_article_body(json.loads(raw))
        except (json.JSONDecodeError, TypeError):
            # Malformed metadata is not fatal: try the next script, then the fallback.
            continue
        if body:
            return body
    paragraphs = soup.select("article p") or soup.find_all("p")
    return ' '.join(p.get_text(strip=True) for p in paragraphs)


def scrape_articles_from_rss(rss_url="https://www.ynet.co.il/Integration/StoryRss2.xml", max_articles=5):
    """Download an RSS feed and scrape the text of its first articles.

    Args:
        rss_url: URL of the RSS feed to read.
        max_articles: Maximum number of feed items to visit. ``None`` visits
            every item; zero or a negative value returns ``[]`` without
            touching the network.

    Returns:
        A list of dicts with the keys ``'url'``, ``'text'`` and ``'title'``,
        one per article whose extracted text is longer than 200 characters.

    Raises:
        requests.RequestException: If the feed itself cannot be fetched or
            answers with an HTTP error status.
        xml.etree.ElementTree.ParseError: If the feed is not well-formed XML.

    Failures on individual article pages are reported with ``print`` and the
    article is skipped; they never abort the whole scrape.
    """
    if max_articles is not None and max_articles <= 0:
        return []

    headers = {"User-Agent": "Mozilla/5.0"}
    response = requests.get(rss_url, headers=headers, timeout=_REQUEST_TIMEOUT_SECONDS)
    # Surface HTTP errors directly instead of as a confusing XML parse error.
    response.raise_for_status()
    root = ET.fromstring(response.content)
    items = root.findall(".//item")[:max_articles]

    scraped = []
    for item in items:
        link = (item.findtext("link") or "").strip()
        if not link:
            print("⚠️ Skipped feed item without a link")
            continue

        title = item.findtext("title")
        if not title or not title.strip():
            # Covers both a missing <title> element and an empty one.
            title = "ללא כותרת"

        try:
            article_resp = requests.get(link, headers=headers, timeout=_REQUEST_TIMEOUT_SECONDS)
            # Do not index the body of an error page as if it were an article.
            article_resp.raise_for_status()
            soup = BeautifulSoup(article_resp.content, 'html.parser')
            article_body = _extract_article_text(soup)
        except Exception as e:
            # Deliberate best-effort boundary: one bad page must not stop the rest.
            print(f"❌ Error scraping {link}: {e}")
            continue

        if len(article_body.strip()) > 200:
            scraped.append({'url': link, 'text': article_body, 'title': title})
        else:
            print(f"⚠️ Skipped short article: {link}")
    return scraped

def initialize_index():
    """Scrape the feed and rebuild the module-level search index.

    On success the globals ``vectorstore``, ``articles`` and ``titles_text``
    are replaced together. If nothing could be scraped the function prints a
    warning and returns without touching them, so a failed refresh keeps the
    previous index instead of failing inside FAISS on an empty document list.
    """
    global vectorstore, articles, titles_text

    print("🔄 Scraping articles...")
    fresh_articles = scrape_articles_from_rss(max_articles=5)
    print(f"✅ Scraped {len(fresh_articles)} articles.")

    if not fresh_articles:
        # FAISS.from_documents cannot build an index from zero documents.
        print("⚠️ No articles scraped; keeping the existing index.")
        return

    docs = [
        Document(page_content=a['text'], metadata={'url': a['url'], 'title': a['title']})
        for a in fresh_articles
    ]
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
    split_docs = splitter.split_documents(docs)

    print("🔄 Creating embeddings...")
    embedding_model = HuggingFaceEmbeddings(model_name="avichr/heBERT")
    fresh_store = FAISS.from_documents(split_docs, embedding_model)

    # Publish the new state only after every step succeeded, so the three
    # globals never describe different sets of articles.
    vectorstore = fresh_store
    articles = fresh_articles
    titles_text = "\n".join(f"• {a['title']}" for a in fresh_articles)
    print("✅ Vectorstore ready.")

def load_qa_model():
    """Create the extractive question-answering pipeline.

    Loads the tokenizer and model weights for the
    ``deepset/xlm-roberta-large-squad2`` checkpoint and wraps them in a
    Transformers ``question-answering`` pipeline.

    Returns:
        The ready-to-call pipeline object.
    """
    checkpoint = "deepset/xlm-roberta-large-squad2"
    return pipeline(
        "question-answering",
        tokenizer=AutoTokenizer.from_pretrained(checkpoint),
        model=AutoModelForQuestionAnswering.from_pretrained(checkpoint),
    )

def answer_question_gradio(query):
    """Answer a user question from the most similar indexed article chunk.

    Args:
        query: The question text typed by the user.

    Returns:
        A ``(answer, url)`` tuple. ``url`` is the source article of the chunk
        the answer was extracted from. When no answer can be produced the
        first element is a Hebrew message for the user and ``url`` is ``""``.
    """
    # The QA pipeline rejects an empty question, so answer that case ourselves.
    if not query or not query.strip():
        return "אנא הקלד שאלה.", ""
    # The index is None until initialize_index() has run successfully.
    if vectorstore is None:
        return "האינדקס עדיין לא מוכן. נסה לרענן את החדשות.", ""

    question = query.strip()
    related_docs = vectorstore.similarity_search(question, k=1)
    if not related_docs:
        return "לא נמצאו מאמרים רלוונטיים.", ""

    top_doc = related_docs[0]
    context = top_doc.page_content
    url = top_doc.metadata.get('url', '')

    result = qa_pipe({
        "question": question,
        "context": context[:1000]  # cap the context passed to the model
    })

    return result['answer'], url

def refresh_articles():
    """Rebuild the index and return the headline list for display.

    Returns:
        The module-level ``titles_text`` string as it stands after
        ``initialize_index()`` has run.
    """
    initialize_index()
    # Read the global only after the call, since initialize_index() rebinds it.
    current_titles = titles_text
    return current_titles

# Initial setup
# Both calls run when this cell/module is executed, before the UI is built:
# initialize_index() scrapes the feed and fills the module-level index state,
# and qa_pipe is the global pipeline that answer_question_gradio() calls.
initialize_index()
qa_pipe = load_qa_model()

# UI
# Gradio Blocks layout: a page heading, then one row with two columns —
# the headline list with its refresh button (scale=1) and the
# question/answer/source widgets with the submit button (scale=2).
with gr.Blocks() as demo:
    gr.Markdown("<h2 style='text-align:right; direction: rtl;'>❓ שאלות ותשובות על חדשות Ynet בעברית</h2>")

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("<h3 style='text-align:right; direction: rtl;'>כותרות המאמרים:</h3>")
            # Initial value is the titles_text computed by the setup call above;
            # later updates arrive through the refresh button handler.
            titles_box = gr.Textbox(value=titles_text, label="", interactive=False, lines=10, elem_id="titles_box")
            refresh_btn = gr.Button("רענן חדשות")

        with gr.Column(scale=2):
            query_input = gr.Textbox(label="הקלד את שאלתך כאן", lines=3, elem_id="query_input")
            answer_output = gr.Textbox(label="תשובה", lines=5, elem_id="answer_output", interactive=False)
            source_output = gr.Textbox(label="כתובת המקור של המאמר", interactive=False, elem_id="source_output")
            submit_btn = gr.Button("שאל")

    # The (answer, url) tuple returned by answer_question_gradio fills the
    # answer and source boxes, in that order.
    submit_btn.click(
        fn=answer_question_gradio,
        inputs=[query_input],
        outputs=[answer_output, source_output]
    )

    # refresh_articles takes no inputs; its return value replaces the headline list.
    refresh_btn.click(
        fn=refresh_articles,
        inputs=[],
        outputs=[titles_box]
    )

    # Right-to-left styling for the widgets, selected by the elem_id values set above.
    # NOTE(review): the stylesheet is attached by assigning the attribute after
    # construction rather than via a css= argument — confirm the installed
    # Gradio version honours this.
    demo.css = """
    #query_input textarea, #answer_output textarea, #source_output textarea, #titles_box textarea {
        direction: rtl !important;
        text-align: right !important;
        font-size: 18px !important;
        font-family: Arial, sans-serif !important;
    }
    label {
        text-align: right !important;
        direction: rtl !important;
        width: 100%;
        font-size: 18px !important;
        font-family: Arial, sans-serif !important;
    }
    #query_input, #answer_output, #source_output, #titles_box {
        margin-left: auto !important;
        margin-right: 0 !important;
        display: block !important;
        width: 100% !important;
    }
    """

# Start the Gradio app only when this code runs as the main module.
if __name__ == "__main__":
    demo.launch()
