In [None]:
# 📌 1. Install dependencies (Run in Colab)
!pip install flask flask-cors pyngrok transformers sentencepiece pymupdf

# 📌 2. Imports
from flask import Flask, request, jsonify
from flask_cors import CORS
from pyngrok import ngrok
from transformers import BartTokenizer, BartForConditionalGeneration
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch, os, fitz
from dotenv import load_dotenv


# Load environment variables via python-dotenv, then read the ngrok auth
# token from the environment. NGROK_TOKEN is None when the variable is unset.
load_dotenv()
NGROK_TOKEN = os.environ.get("NGROK_AUTH_TOKEN")

# 📌 3. Load models
# Run on the GPU when one is visible to torch, otherwise fall back to the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

# Summarization model (BART) and its tokenizer.
bart_tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
bart_model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")
bart_model = bart_model.to(device)

# Question/answer generation model (FLAN-T5) and its tokenizer.
t5_tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-large")
t5_model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-large")
t5_model = t5_model.to(device)

# 📌 4. Flask setup
# Directory where uploaded PDFs are written; created up front if missing.
UPLOAD_FOLDER = "/content/uploads"
os.makedirs(UPLOAD_FOLDER, exist_ok=True)

# Flask application with CORS enabled through flask-cors.
app = Flask(__name__)
CORS(app)

# 📌 5. PDF text extractor
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at *pdf_path*, concatenated.

    Args:
        pdf_path: Filesystem path of the PDF to read.

    Returns:
        A single string made of each page's ``get_text()`` output joined
        with no separator, in page order.
    """
    # The context manager closes the document (and its underlying file
    # handle) even when extracting a page raises.
    with fitz.open(pdf_path) as doc:
        return "".join(page.get_text() for page in doc)

# 📌 6. Text chunking
def chunk_text(text, max_words=800):
    """Split *text* into consecutive chunks of at most *max_words* words.

    Words are whatever ``str.split()`` yields, so any run of whitespace is a
    separator and each chunk is re-joined with single spaces.

    Args:
        text: The string to split.
        max_words: Maximum number of words per chunk.

    Returns:
        A list of chunk strings in original order; empty when *text*
        contains no words.
    """
    tokens = text.split()
    pieces = []
    for start in range(0, len(tokens), max_words):
        window = tokens[start:start + max_words]
        pieces.append(" ".join(window))
    return pieces

# 📌 7. Summarizer
def summarize_with_bart(text, max_chunks=5):
    """Summarize *text* chunk by chunk with the BART model.

    The text is split with ``chunk_text`` (default chunk size), and each of
    the first *max_chunks* chunks is summarized independently. Chunks beyond
    that limit are NOT summarized, so very long inputs are only partially
    covered.

    Args:
        text: The text to summarize.
        max_chunks: Maximum number of leading chunks to summarize. Defaults
            to 5, the limit that was previously hard-coded.

    Returns:
        The per-chunk summaries joined by ``"\\n\\n---\\n\\n"``; an empty
        string when *text* contains no words.
    """
    summaries = []
    for chunk in chunk_text(text)[:max_chunks]:
        # Truncate to 1024 tokens so an over-long chunk cannot exceed the
        # length passed to the tokenizer.
        inputs = bart_tokenizer(
            chunk, return_tensors="pt", truncation=True, max_length=1024
        ).to(device)
        summary_ids = bart_model.generate(
            inputs.input_ids,
            # Forward the tokenizer's mask explicitly rather than leaving
            # generate() to infer it.
            attention_mask=inputs.attention_mask,
            max_length=300,
            min_length=100,
            num_beams=4,
            length_penalty=2.0,
            early_stopping=True
        )
        decoded = bart_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        summaries.append(decoded.strip())
    return "\n\n---\n\n".join(summaries)

# 📌 8. Q&A generator
def generate_qna(text):
    """Generate question/answer pairs from *text* with the FLAN-T5 model.

    The text is split into 400-word chunks and only the first three chunks
    are used; each one is sent to the model with a prompt asking for a single
    numbered question and answer.

    Args:
        text: Source text to generate questions from.

    Returns:
        The decoded model outputs, one per processed chunk, joined by blank
        lines; an empty string when *text* contains no words.
    """
    qa_blocks = []
    passages = chunk_text(text, max_words=400)[:3]
    for number, passage in enumerate(passages, start=1):
        prompt = f"""
You are an academic assistant.

From the following content, generate exactly ONE factual question and answer.

Format:
Q{number}: [question]
A{number}: [answer]

Text:
{passage}

Generate now:
"""
        encoded = t5_tokenizer(prompt, return_tensors="pt", truncation=True)
        input_ids = encoded.input_ids.to(device)
        generated = t5_model.generate(input_ids, max_length=512, num_beams=4)
        answer = t5_tokenizer.decode(generated[0], skip_special_tokens=True)

        # If the model labelled its output Q1/A1 anyway, renumber it to match
        # this chunk's position.
        question_tag = f"Q{number}:"
        answer_tag = f"A{number}:"
        answer = answer.replace("Q1:", question_tag).replace("A1:", answer_tag)
        qa_blocks.append(answer)
    return "\n\n".join(qa_blocks)

# 📌 9. PDF Processing
@app.route("/process", methods=["POST"])
def process_pdf():
    """Handle a PDF upload: extract its text, summarize it and generate Q&A.

    Expects a multipart form upload with the PDF under the ``file`` field.

    Returns:
        200 with ``{"summary": ..., "qa": ...}`` on success,
        400 with ``{"error": ...}`` when no file or no usable filename is sent,
        500 with ``{"error": ...}`` when saving or processing fails.
    """
    if 'file' not in request.files:
        return jsonify({"error": "No file provided"}), 400
    file = request.files['file']

    # The filename is client-controlled: keep only its last path component
    # (treating backslashes as separators too) so a name such as
    # "../../etc/passwd" cannot escape UPLOAD_FOLDER.
    filename = os.path.basename((file.filename or "").replace("\\", "/"))
    if filename in ("", ".", ".."):
        return jsonify({"error": "Invalid or missing filename"}), 400
    save_path = os.path.join(UPLOAD_FOLDER, filename)

    try:
        # Saving is inside the try so a disk error is reported as JSON like
        # every other processing failure.
        file.save(save_path)
        text = extract_text_from_pdf(save_path)
        summary = summarize_with_bart(text)
        qa = generate_qna(text)
        return jsonify({"summary": summary, "qa": qa})
    except Exception as e:
        return jsonify({"error": str(e)}), 500

# 📌 10. Text Summary
@app.route("/process_text", methods=["POST"])
def process_text():
    """Summarize raw text and generate Q&A for it.

    Expects a JSON object body with a string ``text`` field.

    Returns:
        200 with ``{"summary": ..., "qa": ...}`` on success,
        400 with ``{"error": ...}`` when the body is not a JSON object, has
        no ``text`` key, or ``text`` is not a string,
        500 with ``{"error": ...}`` when processing fails.
    """
    # silent=True yields None for a missing or malformed JSON body instead of
    # aborting the request, so the client always gets this handler's JSON error.
    data = request.get_json(silent=True)
    if not isinstance(data, dict) or 'text' not in data:
        return jsonify({"error": "No text provided"}), 400

    text = data['text']
    if not isinstance(text, str):
        return jsonify({"error": "'text' must be a string"}), 400

    try:
        summary = summarize_with_bart(text)
        qa = generate_qna(text)
        return jsonify({"summary": summary, "qa": qa})
    except Exception as e:
        return jsonify({"error": str(e)}), 500

# 📌 11. Start Ngrok
# Register the auth token with pyngrok before opening the tunnel; previously
# the token was read from the environment but never applied.
ngrok_token = os.getenv("NGROK_AUTH_TOKEN")
if ngrok_token:
    ngrok.set_auth_token(ngrok_token)

# Open a tunnel to the local port that Flask serves on below.
public_url = ngrok.connect(5000)
print("🔗 Public URL:", public_url)

# 📌 12. Run Flask
app.run(port=5000)


Collecting flask-cors
  Downloading flask_cors-6.0.1-py3-none-any.whl.metadata (5.3 kB)
Collecting pyngrok
  Downloading pyngrok-7.2.12-py3-none-any.whl.metadata (9.4 kB)
Collecting pymupdf
  Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading flask_cors-6.0.1-py3-none-any.whl (13 kB)
Downloading pyngrok-7.2.12-py3-none-any.whl (26 kB)
Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m36.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyngrok, pymupdf, flask-cors
Successfully installed flask-cors-6.0.1 pymupdf-1.26.3 pyngrok-7.2.12


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

🔗 Public URL: NgrokTunnel: "https://6a8e69c3cc2c.ngrok-free.app" -> "http://localhost:5000"
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug:127.0.0.1 - - [26/Jul/2025 12:57:13] "OPTIONS /process_text HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [26/Jul/2025 12:57:19] "POST /process_text HTTP/1.1" 200 -
