In [8]:
!pip install transformers gradio pymupdf #used dependencies



In [9]:
from transformers import pipeline

# Text-to-text generation pipeline used here for generative question answering
gen_qa = pipeline(
    "text2text-generation",
    model="google/flan-t5-base",
    max_length=256,
)

# Reference passage about transformers in NLP; it is embedded verbatim in every prompt
context = """
Transformers are a type of deep learning architecture introduced in the paper "Attention is All You Need" by Vaswani et al. in 2017.
They have revolutionized natural language processing (NLP) by enabling models to process entire sequences of text at once using self-attention mechanisms, instead of relying on sequential processing like RNNs or LSTMs.

A key innovation in transformers is the attention mechanism, which allows the model to weigh the relevance of different words in a sequence, regardless of their position. This results in better understanding of context and relationships in text.

Transformers form the foundation of models such as BERT, GPT, T5, and RoBERTa, which are pre-trained on large corpora and then fine-tuned for specific tasks like sentiment analysis, text classification, translation, and question answering.

Because of their scalability and performance, transformers have also been adopted in fields beyond NLP, such as computer vision, protein folding, and generative modeling.
"""

# Questions asked about the passage, in the order they are printed
questions = [
    "What is a transformer in NLP?",
    "Who introduced the transformer model and when?",
    "What is the attention mechanism in transformers?",
    "Which models are based on the transformer architecture?",
    "Are transformers used outside of NLP?",
]


def build_prompt(question):
    """Return the instruction prompt that embeds the shared `context` and `question`."""
    return f"Answer the question based on the context:\nContext: {context}\nQuestion: {question}"


# Query the model once per question and print each Q/A pair followed by a blank line
for question in questions:
    prompt = build_prompt(question)
    generations = gen_qa(prompt)
    answer = generations[0]['generated_text']
    print(f"Q: {question}", f"A: {answer}\n", sep="\n")


Device set to use cpu


Q: What is a transformer in NLP?
A: enabling models to process entire sequences of text at once using self-attention mechanisms, instead of relying on sequential processing like RNNs or LSTMs

Q: Who introduced the transformer model and when?
A: Vaswani et al.

Q: What is the attention mechanism in transformers?
A: allows the model to weigh the relevance of different words in a sequence, regardless of their position

Q: Which models are based on the transformer architecture?
A: BERT, GPT, T5, and RoBERTa

Q: Are transformers used outside of NLP?
A: transformers have also been adopted in fields beyond NLP



In [13]:
import gradio as gr
import fitz  # PyMuPDF
import re
from transformers import pipeline
import math

# Language choices offered in the UI dropdowns, mapped to the model name that
# get_qa_pipeline() hands to the "question-answering" pipeline.
qa_models = {
    # Same model name get_qa_pipeline() falls back to for an unknown choice
    "English": "deepset/roberta-base-squad2",
    "Multilingual": "deepset/xlm-roberta-large-squad2",
}

# Clean text extracted from PDFs or TXT
def clean_text(text):
    """Collapse every run of whitespace (line breaks included) into one space.

    Returns the resulting string with leading and trailing whitespace removed.
    """
    # `\s+` also matches newlines, so a single substitution flattens line breaks too.
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()

# Extract text from PDF or TXT file
def extract_text(file):
    """Extract whitespace-normalized text from an uploaded PDF or TXT file.

    Args:
        file: ``None``, a path string, or an object exposing the file path as
            ``.name`` (what the upload handler in this notebook passes in).

    Returns:
        The cleaned text, ``""`` when ``file`` is ``None``, or a message starting
        with "⚠️" when the file type is unsupported or reading fails. Callers
        detect failures by that prefix, so errors are returned, not raised.
    """
    if file is None:
        return ""
    # Accept plain path strings as well as upload objects carrying the path in `.name`.
    fname = file if isinstance(file, str) else file.name
    # Compare extensions case-insensitively so "REPORT.PDF" is not rejected.
    lowered = fname.lower()
    if lowered.endswith(".pdf"):
        try:
            # The context manager closes the document even if a page fails to parse.
            with fitz.open(fname) as doc:
                text = "".join(page.get_text() for page in doc)
            return clean_text(text)
        except Exception as e:
            return f"⚠️ PDF extraction failed: {str(e)}"
    elif lowered.endswith(".txt"):
        try:
            with open(fname, "r", encoding="utf-8") as f:
                return clean_text(f.read())
        except Exception as e:
            return f"⚠️ TXT read failed: {str(e)}"
    else:
        return "⚠️ Unsupported file type. Please upload PDF or TXT."

# Simple whitespace splitter that chunks text into overlapping windows.
# Sizes are counted in words, not model tokens.
def chunk_text(text, chunk_size=400, overlap=50):
    """Split ``text`` into overlapping chunks of at most ``chunk_size`` words.

    Args:
        text: Text to split; words are separated on any whitespace.
        chunk_size: Maximum number of words per chunk (must be positive).
        overlap: Number of words shared between consecutive chunks
            (must be smaller than ``chunk_size``).

    Returns:
        List of chunk strings with words re-joined by single spaces; ``[]`` when
        ``text`` contains no words.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
            smaller than ``chunk_size`` (the window would never advance).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")

    words = text.split()
    step = chunk_size - overlap  # overlap to maintain context continuity
    chunks = []
    for start in range(0, len(words), step):
        end = start + chunk_size
        chunks.append(" ".join(words[start:end]))
        # Once a window reaches the end of the text, any further window would
        # only repeat a suffix of this one, so stop here.
        if end >= len(words):
            break
    return chunks

# QA pipelines already built in this kernel session, keyed by model name, so a
# model is loaded once instead of on every question. Cached pipelines stay in
# memory until the kernel restarts or this cell is re-run.
_qa_pipeline_cache = {}


# Load (or reuse) the QA pipeline matching the language choice
def get_qa_pipeline(language):
    """Return the question-answering pipeline for ``language``.

    Args:
        language: A key of ``qa_models``; any other value falls back to
            "deepset/roberta-base-squad2".

    Returns:
        The "question-answering" pipeline for the selected model. It is built on
        first use and the same instance is returned on later calls.
    """
    model_name = qa_models.get(language, "deepset/roberta-base-squad2")
    if model_name not in _qa_pipeline_cache:
        _qa_pipeline_cache[model_name] = pipeline("question-answering", model=model_name)
    return _qa_pipeline_cache[model_name]

# Shared by both answer_* handlers: score every chunk and keep the best answer
def _answer_from_context(context, question, language):
    """Run extractive QA over overlapping chunks of ``context``.

    The context is split with ``chunk_text`` (400 words, 50 overlap), the QA
    pipeline for ``language`` is run on every chunk, and the answer with the
    highest score is kept (the first one wins on ties).

    Returns:
        Markdown with the best answer and its confidence, or a "⚠️ Error: ..."
        message if loading the model or running inference raises.
    """
    try:
        qa = get_qa_pipeline(language)
        chunks = chunk_text(context, chunk_size=400, overlap=50)

        best_answer = None
        best_score = -math.inf
        for chunk in chunks:
            result = qa(question=question, context=chunk)
            if result['score'] > best_score:
                best_score = result['score']
                best_answer = result['answer']

        return f"### Answer:\n{best_answer}\n\n*Confidence: {best_score:.2f}*"
    except Exception as e:
        return f"⚠️ Error: {str(e)}"


# Answer question based on file(s) with chunking for long texts
def answer_files_question(files, question, language):
    """Answer ``question`` from the combined text of the uploaded ``files``.

    Args:
        files: Uploaded files (each is handed to ``extract_text``); may be
            ``None`` or empty.
        question: The user's question; ``None`` or blank is treated as missing.
        language: Key used by ``get_qa_pipeline`` to select the model.

    Returns:
        Markdown with the best answer and its confidence, or a message starting
        with "⚠️" describing what went wrong.
    """
    # `or ""` guards against a None question instead of crashing on .strip().
    if not files or not (question or "").strip():
        return "⚠️ Please upload at least one file and enter a question."

    texts = []
    for file in files:
        text = extract_text(file)
        # extract_text reports failures in-band; surface the first one as-is.
        if text.startswith("⚠️"):
            return text
        texts.append(text)

    combined_context = " ".join(texts).strip()
    if not combined_context:
        return "⚠️ Uploaded files are empty or couldn't extract text."

    return _answer_from_context(combined_context, question, language)


# Answer question based on pasted context text with chunking
def answer_context_question(context, question, language):
    """Answer ``question`` from pasted ``context`` text.

    Args:
        context: Raw pasted text; ``None`` or blank is treated as missing.
        question: The user's question; ``None`` or blank is treated as missing.
        language: Key used by ``get_qa_pipeline`` to select the model.

    Returns:
        Markdown with the best answer and its confidence, or a message starting
        with "⚠️" describing what went wrong.
    """
    if not (context or "").strip() or not (question or "").strip():
        return "⚠️ Please enter context and a question."
    return _answer_from_context(clean_text(context), question, language)

# Gradio UI: two side-by-side panels (file upload / pasted context), each with
# its own question box, model selector, submit button and answer area.
# - The browser tab title is set with `title=`; a <script> placed in gr.HTML is
#   rendered as static HTML and never executed.
# - Every styled component gets an `elem_id` matching the ids used by the CSS
#   at the bottom; without them none of those rules select anything.
with gr.Blocks(theme=gr.themes.Default(), title="DocDigest Multilingual") as demo:
    gr.Markdown("<h2 style='text-align:center; color:#4A90E2;'>📚 DocDigest - Multilingual QA & Multi-File Support</h2>")

    with gr.Row():
        with gr.Column():
            gr.Markdown("### Upload one or more PDF/TXT files")
            file_input = gr.File(label="Upload PDF or TXT files", file_types=['.pdf', '.txt'], file_count="multiple", elem_id="file_input")
            file_question_input = gr.Textbox(label="Ask a question about the uploaded files", placeholder="Type your question here...", lines=2, elem_id="file_question_input")
            file_language = gr.Dropdown(choices=list(qa_models.keys()), value="English", label="Select Language for QA Model")
            file_submit_btn = gr.Button("Get Answer for Files", elem_id="file_submit_btn")
            file_answer_output = gr.Markdown(elem_id="file_answer_output")

        with gr.Column():
            gr.Markdown("### Or paste context text here")
            context_input = gr.Textbox(label="Paste context text", placeholder="Paste your text here...", lines=10, elem_id="context_input")
            context_question_input = gr.Textbox(label="Ask a question about the context", placeholder="Type your question here...", lines=2, elem_id="context_question_input")
            context_language = gr.Dropdown(choices=list(qa_models.keys()), value="English", label="Select Language for QA Model")
            context_submit_btn = gr.Button("Get Answer for Context", elem_id="context_submit_btn")
            context_answer_output = gr.Markdown(elem_id="context_answer_output")

    file_submit_btn.click(fn=answer_files_question, inputs=[file_input, file_question_input, file_language], outputs=file_answer_output)
    context_submit_btn.click(fn=answer_context_question, inputs=[context_input, context_question_input, context_language], outputs=context_answer_output)

    # Custom styling. The button rules list both `#id` and `#id button` so they
    # match whether the elem_id lands on the <button> itself or on a wrapper.
    gr.HTML(
        """
        <style>
        body {
            background-color: #f0f4f8;
            font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
        }
        #file_input input[type="file"] {
            width: 100% !important;
            padding: 8px;
            border-radius: 6px;
            border: 1px solid #ccc;
            box-sizing: border-box;
        }
        #file_question_input, #context_input, #context_question_input {
            width: 100% !important;
            font-size: 14px !important;
            border-radius: 6px !important;
            border: 1px solid #ccc !important;
            padding: 8px !important;
            box-sizing: border-box;
        }
        #file_submit_btn, #context_submit_btn,
        #file_submit_btn button, #context_submit_btn button {
            background-color: #4A90E2 !important;
            color: white !important;
            font-weight: 600 !important;
            font-size: 16px !important;
            border-radius: 6px !important;
            padding: 10px 20px !important;
            margin-top: 10px !important;
            width: 100% !important;
            border: none !important;
            cursor: pointer !important;
            transition: background-color 0.3s ease;
        }
        #file_submit_btn:hover, #context_submit_btn:hover,
        #file_submit_btn button:hover, #context_submit_btn button:hover {
            background-color: #357ABD !important;
        }
        #file_answer_output, #context_answer_output {
            background-color: #fff !important;
            padding: 15px !important;
            border-radius: 6px !important;
            min-height: 120px !important;
            font-size: 15px !important;
            color: #333 !important;
            white-space: pre-wrap !important;
            border: 1px solid #ddd !important;
            margin-top: 10px !important;
        }
        </style>
        """
    )

# share=True additionally serves the app through a temporary public URL.
demo.launch(share=True)


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://28737dc382458dbdd1.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


