In [3]:
import os
import pickle
import gradio as gr
from langchain.chains import RetrievalQA
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredURLLoader
from langchain.vectorstores import FAISS
from langchain.prompts import PromptTemplate
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFacePipeline
from transformers import pipeline
import torch

# Path of the pickled FAISS vector store written by process_urls() and read
# back by answer_query().
FAISS_FILE = "faiss_store.pkl"

# Load text generation pipeline using a lightweight T5 model
# Use the first CUDA GPU when one is available, otherwise fall back to CPU (-1).
device = 0 if torch.cuda.is_available() else -1
hf_pipeline = pipeline(
    "text2text-generation",
    model="MBZUAI/LaMini-Flan-T5-783M",
    device=device,
    max_new_tokens=256,
    # Greedy decoding keeps answers deterministic. `temperature` is deliberately
    # not set: it only applies when sampling, and passing it together with
    # do_sample=False makes recent transformers releases emit a warning.
    do_sample=False,
)
# Wrap the transformers pipeline so LangChain chains can use it as an LLM.
llm = HuggingFacePipeline(pipeline=hf_pipeline)

# Prompt handed to the QA chain. It restricts the model to the retrieved
# context and tells it to answer "I don't know" instead of guessing.
# NOTE(review): the template mentions "the exact car model" — confirm that this
# domain-specific restriction is intended for a general news assistant.
_QA_TEMPLATE = """
You are a helpful assistant. Answer the question strictly using the context. 
If the answer is not about the exact car model mentioned in the question, say "I don't know".

Context: {context}

Question: {question}
Answer:
"""

custom_prompt = PromptTemplate(
    template=_QA_TEMPLATE,
    input_variables=["context", "question"],
)

# Function to load and process news articles from URLs
def process_urls(url1, url2, url3, progress=gr.Progress()):
    """Download the given articles, embed them and persist the vector store.

    Args:
        url1, url2, url3: Article URLs taken from the UI textboxes. Blank or
            whitespace-only values are ignored, so the second and third URL
            are effectively optional.
        progress: Gradio progress tracker used to report each pipeline stage.

    Returns:
        A human-readable status message describing the outcome.
    """
    import re

    # Empty textboxes arrive as "" — drop them (and duplicates, keeping order)
    # so the loader is only asked to fetch real URLs.
    urls = list(dict.fromkeys(u.strip() for u in (url1, url2, url3) if u and u.strip()))
    if not urls:
        return "Please provide at least one article URL."

    loader = UnstructuredURLLoader(urls=urls)

    progress(0.1, desc="Loading articles")
    docs = loader.load()
    if not docs:
        # Nothing was fetched (e.g. every URL failed); building an index from
        # an empty list would fail further down.
        return "No content could be loaded from the provided URLs."

    # Basic cleaning to remove unwanted boilerplate from news content.
    # The trailing lazy `.*?` matches the empty string, so only the literal
    # phrase "Story continues below" is removed, not the rest of the line.
    boilerplate = re.compile(
        r"(Remove Ad|Story continues below.*?|Reuters|Advertisement)",
        flags=re.IGNORECASE,
    )
    for doc in docs:
        doc.page_content = boilerplate.sub("", doc.page_content)

    # Split documents into manageable chunks for embedding
    progress(0.4, desc="Splitting content into chunks")
    splitter = RecursiveCharacterTextSplitter(chunk_size=400, chunk_overlap=80)
    splits = splitter.split_documents(docs)
    if not splits:
        return "The loaded articles did not contain any usable text."

    # Create vector embeddings using a pre-trained MiniLM model
    progress(0.6, desc="Generating embeddings")
    embedding = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
    vectorstore = FAISS.from_documents(splits, embedding)

    # Save the vectorstore to disk
    # NOTE(security): the store is persisted with pickle; the resulting file
    # must only ever be read back from a trusted location.
    progress(0.9, desc="Saving processed data")
    with open(FAISS_FILE, "wb") as f:
        pickle.dump(vectorstore, f)

    progress(1.0, desc="Completed processing")
    return "Articles processed and saved successfully."

# Function to query the processed articles using the LLM
def answer_query(question):
    """Answer a question from the previously processed articles.

    Args:
        question: The user's question as typed into the UI.

    Returns:
        A ``(answer, sources)`` tuple of strings. ``sources`` lists the
        distinct source URLs of the retrieved chunks, one per line, in
        retrieval order. When the question is blank or no articles have been
        processed yet, ``answer`` carries a hint for the user and ``sources``
        is empty.
    """
    if not question or not question.strip():
        return "Please enter a question.", ""

    if not os.path.exists(FAISS_FILE):
        return "Please process the articles first.", ""

    # NOTE(security): pickle.load can execute arbitrary code. This is only
    # acceptable because FAISS_FILE is produced locally by process_urls();
    # never point it at a file from an untrusted source.
    with open(FAISS_FILE, "rb") as f:
        vectorstore = pickle.load(f)

    # The number of chunks to retrieve has to go through `search_kwargs`;
    # a bare `k=3` keyword is not what the retriever reads its top-k from.
    retriever = vectorstore.as_retriever(
        search_type="similarity",
        search_kwargs={"k": 3},
    )
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        retriever=retriever,
        return_source_documents=True,
        chain_type_kwargs={"prompt": custom_prompt}
    )

    result = qa_chain.invoke({"query": question.strip()})
    answer = result["result"]

    # dict.fromkeys de-duplicates while keeping retrieval order (a set would
    # shuffle the list between runs); .get tolerates chunks without a source.
    unique_sources = dict.fromkeys(
        str(doc.metadata.get("source", "unknown source"))
        for doc in result["source_documents"]
    )
    sources = "\n".join(unique_sources)
    return answer, sources

# Gradio UI
# Two-tab layout: tab 1 collects up to three article URLs and runs
# process_urls(); tab 2 takes a question and runs answer_query().
with gr.Blocks(
    # Custom CSS for the title banner, tab panels, buttons and textboxes.
    css="""
    #title-container {
        text-align: center;
        padding: 2rem 1rem;
        background: linear-gradient(135deg, #eef6ff, #ffffff);
        border-radius: 20px;
        margin-bottom: 25px;
        box-shadow: 0 6px 24px rgba(0, 0, 0, 0.06);
    }

    #title-container h1 {
        font-size: 3rem;
        font-weight: 800;
        color: #1a365d;
        margin-bottom: 0.4rem;
    }

    #title-container p {
        font-size: 1.2rem;
        color: #2d3748;
        max-width: 820px;
        margin: 0 auto;
        line-height: 1.7;
    }

    .tab-panel {
        background-color: #ffffff;
        padding: 24px;
        border-radius: 16px;
        box-shadow: 0 2px 12px rgba(0, 0, 0, 0.05);
        margin-top: 10px;
    }

    .gr-button {
        background: linear-gradient(135deg, #3182ce, #63b3ed) !important;
        color: white !important;
        font-weight: bold;
        border-radius: 8px !important;
        padding: 10px 20px !important;
    }

    .gr-textbox label {
        font-weight: 600;
        color: #1a202c;
    }

    .gr-textbox textarea, .gr-textbox input {
        border-radius: 6px !important;
        border: 1px solid #cbd5e0 !important;
    }
    """,
    theme=gr.themes.Default(primary_hue="blue")
) as demo:

    # Title banner; the #title-container id is styled by the CSS above.
    gr.Markdown("""
        <div id="title-container">
            <h1>🧠 SmortBot</h1>
            <p>
                Turn news into knowledge in seconds! Paste article links, ask your questions, and get smart answers with sources. <br>
                Powered by LangChain, HuggingFace, and FAISS — all wrapped in a user-friendly Gradio interface.
            </p>
        </div>
    """)

    # Tab 1: URL inputs, a trigger button and a read-only status box.
    with gr.Tab("🔗 Step 1: Load Article URLs"):
        with gr.Row(elem_classes="tab-panel"):
            with gr.Column():
                url1 = gr.Textbox(label="🔍 News URL 1", placeholder="Paste article link here...", lines=1)
                url2 = gr.Textbox(label="🔍 News URL 2", placeholder="Optional second link...", lines=1)
                url3 = gr.Textbox(label="🔍 News URL 3", placeholder="Optional third link...", lines=1)
                process_btn = gr.Button("🚀 Process Articles")
                process_status = gr.Textbox(label="📄 Status", interactive=False)

        # Clicking the button passes the three URL values to process_urls and
        # shows its returned status string in the status box.
        process_btn.click(process_urls, inputs=[url1, url2, url3], outputs=process_status)

    # Tab 2: question input plus read-only answer and sources boxes.
    with gr.Tab("❓ Step 2: Ask a Question"):
        with gr.Row(elem_classes="tab-panel"):
            question = gr.Textbox(label="🤔 Your Question", placeholder="Ask something related to the articles", lines=2)
        with gr.Row(elem_classes="tab-panel"):
            answer = gr.Textbox(label="💡 SmortBot’s Answer", lines=4, interactive=False)
        with gr.Row(elem_classes="tab-panel"):
            sources = gr.Textbox(label="🔗 Sources Used", lines=3, interactive=False)
        ask_btn = gr.Button("💬 Get Answer")
        # answer_query returns an (answer, sources) pair, mapped onto the two
        # output textboxes in that order.
        ask_btn.click(answer_query, inputs=question, outputs=[answer, sources])

# Start the Gradio server for the interface defined above.
demo.launch()




* Running on local URL:  http://127.0.0.1:7862
* To create a public link, set `share=True` in `launch()`.


