<a href="https://colab.research.google.com/github/MdAfzalkhan20112000/EHRS-USING-LLM/blob/main/Welcome_To_Colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ===============================
# 📄 Lightweight CPU File Summarizer
# ===============================

!pip install gradio PyPDF2 python-docx

import os, io, tempfile
import PyPDF2, docx
import gradio as gr
from collections import Counter
import re
import heapq

# -------------------------------
# File Readers
# -------------------------------
def read_txt(path_or_bytes):
    """Return the text held in a filesystem path or a raw byte buffer.

    Args:
        path_or_bytes: Either ``bytes``/``bytearray`` containing the file
            contents, or anything ``open()`` accepts as a path.

    Returns:
        The content decoded as UTF-8; bytes that are not valid UTF-8 are
        dropped rather than raising.
    """
    if not isinstance(path_or_bytes, (bytes, bytearray)):
        # Treat the argument as a path and let open() do the decoding.
        with open(path_or_bytes, "r", encoding="utf-8", errors="ignore") as handle:
            contents = handle.read()
        return contents
    return path_or_bytes.decode("utf-8", errors="ignore")

def read_pdf(path_or_bytes):
    """Extract the text of every page of a PDF and join the pages with newlines.

    Args:
        path_or_bytes: Either ``bytes``/``bytearray`` holding the PDF, or a
            value ``PyPDF2.PdfReader`` accepts directly (e.g. a path).

    Returns:
        One string with each page's extracted text separated by ``"\\n"``.
        Pages for which extraction yields nothing contribute an empty string.
    """
    # Raw bytes are wrapped in an in-memory stream; anything else is handed
    # to PdfReader unchanged.
    if isinstance(path_or_bytes, (bytes, bytearray)):
        source = io.BytesIO(path_or_bytes)
    else:
        source = path_or_bytes
    pdf = PyPDF2.PdfReader(source)
    return "\n".join([pg.extract_text() or "" for pg in pdf.pages])

def read_docx(path_or_bytes):
    """Return the paragraph text of a .docx document, one paragraph per line.

    Args:
        path_or_bytes: Either ``bytes``/``bytearray`` holding the document,
            or a value ``docx.Document`` accepts directly (e.g. a path).

    Returns:
        The text of every paragraph in the document body joined with
        ``"\\n"``.
    """
    if isinstance(path_or_bytes, (bytes, bytearray)):
        # docx.Document accepts a file-like object, so the bytes can be
        # parsed straight from memory. This avoids writing a temporary file
        # that would be left behind on disk if parsing raised.
        source = io.BytesIO(path_or_bytes)
    else:
        source = path_or_bytes
    doc = docx.Document(source)
    return "\n".join(para.text for para in doc.paragraphs)

def extract_text_from_file(file_obj):
    """Extract plain text from an uploaded file in any of the supported forms.

    The input may be a path (``str`` or ``os.PathLike``), an object exposing
    a ``name`` attribute that points at a file on disk, a file-like object
    with ``read()``, or raw ``bytes``/``bytearray``.

    Args:
        file_obj: The uploaded file in one of the forms above.

    Returns:
        The extracted text.

    Raises:
        ValueError: If ``file_obj`` is ``None``.
    """
    if file_obj is None:
        raise ValueError("No file provided")

    # Prefer an explicit ``name`` attribute; otherwise accept the object
    # itself when it is a path, so plain string paths are dispatched on
    # their extension instead of always being read as text.
    path = getattr(file_obj, "name", None)
    if path is None and isinstance(file_obj, (str, os.PathLike)):
        path = os.fspath(file_obj)

    # ``name`` can be a non-string (e.g. an integer file descriptor); only
    # string paths can be dispatched on extension.
    if isinstance(path, str) and path and os.path.exists(path):
        lowered = path.lower()
        if lowered.endswith(".pdf"):
            return read_pdf(path)
        if lowered.endswith(".docx"):
            return read_docx(path)
        # .txt and any unrecognised extension are read as plain text.
        return read_txt(path)

    # No usable on-disk path: fall back to the object's content.
    has_read = hasattr(file_obj, "read")
    data = file_obj.read() if has_read else file_obj

    if isinstance(data, (bytes, bytearray)):
        # Sniff the format from the leading magic bytes.
        if data[:4] == b"%PDF":
            return read_pdf(data)
        if data[:2] == b"PK":  # ZIP container signature, used by .docx
            return read_docx(data)
        return read_txt(data)

    if has_read and isinstance(data, str):
        # A text-mode stream already yielded decoded content; it must not
        # be reinterpreted as a filename.
        return data

    return read_txt(str(data))

# -------------------------------
# Simple Frequency-Based Summarizer
# -------------------------------
def summarize_text(text, max_sentences=5):
    """Build an extractive summary from the highest-scoring sentences.

    Each sentence is scored by summing the whole-document frequency of the
    words it contains. The ``max_sentences`` best sentences are kept and
    emitted in the order they appear in the text, so the summary reads in
    the same sequence as the source.

    Args:
        text: The text to summarize.
        max_sentences: Maximum number of sentences in the summary.

    Returns:
        The summary, or a notice followed by the whitespace-normalized text
        when it has no more than ``max_sentences`` sentences.
    """
    # Collapse whitespace runs and trim the ends so trailing whitespace
    # cannot produce a phantom empty sentence.
    text = re.sub(r"\s+", " ", text).strip()
    sentences = [s for s in re.split(r"(?<=[.!?]) +", text) if s]

    if len(sentences) <= max_sentences:
        return "⚠️ Text too short for summarization:\n\n" + text

    # Word frequencies over the whole document.
    freq = Counter(re.findall(r"\w+", text.lower()))

    # Repeated sentences are scored once, keeping their first position.
    unique = list(dict.fromkeys(sentences))
    scores = [
        sum(freq[word] for word in re.findall(r"\w+", sent.lower()))
        for sent in unique
    ]

    # Select by score, then restore document order for readability.
    best = heapq.nlargest(max_sentences, range(len(unique)), key=scores.__getitem__)
    return " ".join(unique[i] for i in sorted(best))

# -------------------------------
# Gradio Interface
# -------------------------------
def process_file(file):
    """Gradio callback: extract the uploaded file's text and summarize it.

    Args:
        file: The value supplied by the file-upload component, or ``None``
            when nothing was uploaded.

    Returns:
        The summary on success, otherwise a human-readable notice or error
        message. This function never raises, so the UI always has text to
        display.
    """
    if file is None:
        # Submitting without an upload is a user mistake, not a read error.
        return "⚠️ Please upload a file first."
    try:
        text = extract_text_from_file(file)
        if not text or not text.strip():
            # e.g. a document whose pages yield no extractable text.
            return "⚠️ No extractable text found in the file."
        return summarize_text(text)
    except Exception as exc:  # UI boundary: report the failure, don't crash
        return f"❌ Error reading file: {exc}"

title = "📄 CPU-based File Summarizer"
desc = "Upload a .txt, .pdf, or .docx file. Works fully on CPU (no GPU needed)."

# Wire the UI: a single file upload feeds process_file, whose returned
# string is shown in a multi-line text box.
demo = gr.Interface(
    title=title,
    description=desc,
    fn=process_file,
    inputs=gr.File(file_types=[".txt", ".pdf", ".docx"], label="Upload File"),
    outputs=gr.Textbox(lines=12, label="Summary"),
)

# ✅ share=True is what makes the app reachable from a Colab session.
# NOTE(review): presumably this exposes a public link — confirm that is
# acceptable for the documents being uploaded.
demo.launch(share=True)
