In [1]:
import os
from dotenv import load_dotenv
import requests
from collections import Counter
import re
from PyPDF2 import PdfReader
import docx2txt
from pathlib import Path

# 1. Universal Secret Loader

In [None]:

def load_secret(key):
    """Return the value of the environment variable named ``key``.

    Only the process environment is consulted (via ``os.getenv``); this
    function does not read a ``.env`` file itself.

    Args:
        key: Name of the environment variable to look up.

    Returns:
        The variable's value.

    Raises:
        ValueError: If the variable is unset or set to an empty string.
    """
    value = os.getenv(key)
    # Guard clause: an empty string is treated the same as a missing variable.
    if not value:
        raise ValueError(f"Key '{key}' not found in environment variables")
    return value

# Load all required API keys and URLs

In [None]:

# One (endpoint URL, API key) pair per external service. load_secret raises
# ValueError on the first missing variable, so a bad configuration fails here
# rather than at the first API call.
OCR_API_URL, OCR_API_KEY = load_secret("OCR_API_URL"), load_secret("OCR_API_KEY")
NER_API_URL, NER_API_KEY = load_secret("NER_API_URL"), load_secret("NER_API_KEY")
SUMMARY_API_URL, SUMMARY_API_KEY = load_secret("SUMMARY_API_URL"), load_secret("SUMMARY_API_KEY")
CLASSIFY_API_URL, CLASSIFY_API_KEY = load_secret("CLASSIFY_API_URL"), load_secret("CLASSIFY_API_KEY")

# Set headers

In [None]:

# Bearer-token Authorization headers, one dict per service that is called
# with a `headers=` argument (NER, summarizer, classifier).
ner_headers = dict(Authorization=f"Bearer {NER_API_KEY}")
sum_headers = dict(Authorization=f"Bearer {SUMMARY_API_KEY}")
classify_headers = dict(Authorization=f"Bearer {CLASSIFY_API_KEY}")

# 2. OCR Function (Image to Text)

In [None]:

def extract_text_from_image(path, timeout=60):
    """Send an image file to the OCR endpoint and return the recognised text.

    The file is uploaded as multipart form data together with the API key
    and the language code ``'eng'``.

    Args:
        path: Path of the image file to upload.
        timeout: Seconds to wait for the OCR service before giving up.
            Without a timeout ``requests`` can block indefinitely.

    Returns:
        ``ParsedResults[0]['ParsedText']`` from the JSON reply, or a string
        starting with ``"OCR Error: "`` when the request fails or the reply
        cannot be decoded / does not have the expected structure.

    Raises:
        OSError: If ``path`` cannot be opened (unchanged from the original).
    """
    with open(path, 'rb') as f:
        try:
            response = requests.post(
                OCR_API_URL,
                files={'filename': f},
                data={'apikey': OCR_API_KEY, 'language': 'eng'},
                timeout=timeout,
            )
        except requests.RequestException as e:
            # Network failures are reported in the same string form as a
            # malformed reply instead of crashing the caller.
            return f"OCR Error: {str(e)}"
    try:
        result = response.json()
        return result['ParsedResults'][0]['ParsedText']
    except Exception as e:
        # Deliberate best-effort: any decode or shape problem becomes an
        # error string rather than an exception.
        return f"OCR Error: {str(e)}"

# 3. Text Extractor

In [None]:

def extract_text(path):
    """Extract plain text from a file, dispatching on its extension.

    Supported extensions (case-insensitive): ``.pdf``, ``.docx``, ``.txt``,
    ``.jpg``, ``.jpeg`` and ``.png``.

    Args:
        path: Path to the input file.

    Returns:
        The extracted text. For any other extension the literal string
        ``"Unsupported file format."`` is returned (not raised).
    """
    file_ext = Path(path).suffix.lower()
    if file_ext == '.pdf':
        reader = PdfReader(path)
        # Extract each page exactly once; the original called extract_text()
        # twice per page (once to filter, once to collect).
        page_texts = (page.extract_text() for page in reader.pages)
        # Pages that yield no text (None or "") are skipped.
        return "\n".join(page_text for page_text in page_texts if page_text)
    elif file_ext == '.docx':
        return docx2txt.process(path)
    elif file_ext == '.txt':
        with open(path, 'r', encoding='utf-8') as f:
            return f.read()
    elif file_ext in ['.jpg', '.jpeg', '.png']:
        return extract_text_from_image(path)
    return "Unsupported file format."

# 4. NER API Integration

In [None]:

def call_ner_api(text, timeout=60):
    """POST ``text`` to the NER endpoint and return the decoded JSON reply.

    Args:
        text: Text to analyse; sent as ``{"inputs": text}``.
        timeout: Seconds to wait for the service. Without a timeout
            ``requests`` can block indefinitely.

    Returns:
        Whatever ``response.json()`` yields; the shape is not validated here.

    Raises:
        requests.RequestException: On connection failure or timeout.
    """
    payload = {"inputs": text}
    response = requests.post(
        NER_API_URL, headers=ner_headers, json=payload, timeout=timeout
    )
    return response.json()

# 5. Summarizer API Integration

In [None]:

def call_summarizer_api(text, min_len=40, max_len=150, timeout=60):
    """POST ``text`` to the summarizer endpoint and return the decoded JSON.

    Args:
        text: Text to summarise; sent as ``"inputs"``.
        min_len: Forwarded as the ``min_length`` parameter.
        max_len: Forwarded as the ``max_length`` parameter.
        timeout: Seconds to wait for the service. Without a timeout
            ``requests`` can block indefinitely.

    Returns:
        Whatever ``response.json()`` yields; the shape is not validated here.

    Raises:
        requests.RequestException: On connection failure or timeout.
    """
    payload = {
        "inputs": text,
        # Sampling is disabled in the request parameters.
        "parameters": {"min_length": min_len, "max_length": max_len, "do_sample": False}
    }
    response = requests.post(
        SUMMARY_API_URL, headers=sum_headers, json=payload, timeout=timeout
    )
    return response.json()

# 6. Classifier API Integration

In [None]:

def query_zero_shot(text, labels, timeout=60):
    """POST ``text`` and candidate ``labels`` to the classifier endpoint.

    Args:
        text: Text to classify; sent as ``"inputs"``.
        labels: Candidate labels; sent as ``parameters.candidate_labels``.
        timeout: Seconds to wait for the service. Without a timeout
            ``requests`` can block indefinitely.

    Returns:
        Whatever ``response.json()`` yields; the shape is not validated here.

    Raises:
        requests.RequestException: On connection failure or timeout.
    """
    payload = {"inputs": text, "parameters": {"candidate_labels": labels}}
    response = requests.post(
        CLASSIFY_API_URL, headers=classify_headers, json=payload, timeout=timeout
    )
    return response.json()

# 7. Metadata Generator Functions

In [None]:
def extract_title(text):
    """Pick a title line from ``text`` using a simple heuristic.

    Blank lines are ignored and surrounding whitespace is stripped. The first
    remaining line that is 6 to 99 characters long and starts with an
    uppercase character is returned. If no line qualifies, the first
    non-blank line is returned; if there are no non-blank lines at all, the
    result is ``"Unknown Title"``.
    """
    candidates = list(filter(None, map(str.strip, text.strip().split("\n"))))
    if not candidates:
        return "Unknown Title"
    # Fall back to the first non-blank line when nothing passes the heuristic.
    return next(
        (
            candidate
            for candidate in candidates
            if 5 < len(candidate) < 100 and candidate[0].isupper()
        ),
        candidates[0],
    )

def extract_author(text):
    """Guess the author of ``text``.

    Strategy:
      1. Look line by line for a byline such as ``By Jane Doe``,
         ``Written by Jane Doe`` or ``Author: Jane Doe`` and return the
         capitalised multi-word name that follows.
      2. Otherwise send the text to the NER service (``call_ner_api``) and
         return the ``word`` of the first entity labelled ``"PER"``.

    Returns:
        The author name, or ``"Unknown Author"`` when nothing is found or
        the NER reply has an unexpected shape.
    """
    byline = re.compile(r"\b(By|Written by|Author:)\s+([A-Z][a-z]+(?:\s+[A-Z][a-z]+)+)")
    for line in text.strip().split('\n'):
        match = byline.search(line.strip())
        if match:
            return match.group(2)

    results = call_ner_api(text)

    # Accept two reply shapes:
    #   * [{"entities": [...]}]  - the shape the original code expected
    #   * [{...}, {...}]         - a flat list of entity dicts
    # NOTE(review): the flat form is what Hugging Face token-classification
    # endpoints are documented to return - confirm against NER_API_URL.
    # Anything else (e.g. an error dict) falls through to "Unknown Author",
    # replacing the former bare `except: pass`.
    entities = []
    if isinstance(results, list) and results:
        first = results[0]
        if isinstance(first, dict) and isinstance(first.get("entities"), list):
            entities = first["entities"]
        else:
            entities = results

    for ent in entities:
        if not isinstance(ent, dict):
            continue
        if ent.get("entity_group", ent.get("entity")) == "PER":
            word = ent.get("word")
            # Skip PER entities without a usable word rather than returning None.
            if word:
                return word
    return "Unknown Author"

def detect_category_from_chunks(text, candidate_labels=None, max_chunk_size=1000):
    """Classify ``text`` chunk by chunk and aggregate the top-label scores.

    The text is cut into consecutive slices of ``max_chunk_size`` characters.
    Each slice is sent to ``query_zero_shot``; when the reply is a dict
    containing both ``"labels"`` and ``"scores"``, the first score is added
    to the running total of the first label. Other replies are ignored.

    Args:
        text: Text to classify.
        candidate_labels: Labels to choose from; a built-in list of ten
            categories is used when ``None``.
        max_chunk_size: Number of characters per slice.

    Returns:
        ``(best_category, scores)`` where ``scores`` maps label to summed
        score and ``best_category`` is the label with the highest total, or
        ``"Unknown"`` when no reply was usable.
    """
    labels = candidate_labels
    if labels is None:
        labels = ["Finance", "Health", "Education", "Politics", "Technology", "History", "Philosophy", "Biography", "Science", "Fiction"]

    totals = Counter()
    for start in range(0, len(text), max_chunk_size):
        reply = query_zero_shot(text[start:start + max_chunk_size], labels)
        if not isinstance(reply, dict):
            continue
        if "labels" not in reply or "scores" not in reply:
            continue
        # Only the first label/score pair of each reply is counted.
        totals[reply['labels'][0]] += reply['scores'][0]

    if not totals:
        return "Unknown", dict(totals)
    winner, _total = totals.most_common(1)[0]
    return winner, dict(totals)

def generate_summary(text, min_len=40, max_len=150, chunk_size=3000):
    """Summarise ``text`` chunk by chunk and join the partial summaries.

    Args:
        text: Text to summarise.
        min_len: Forwarded to ``call_summarizer_api`` for every chunk.
        max_len: Forwarded to ``call_summarizer_api`` for every chunk.
        chunk_size: Number of characters per chunk (previously hard-coded
            to 3000).

    Returns:
        The per-chunk summaries joined by single spaces. A chunk whose reply
        does not look like ``[{"summary_text": <str>}]`` contributes the
        marker ``"[Summary Failed]"`` instead. Empty input yields ``""``.
    """
    pieces = []
    for start in range(0, len(text), chunk_size):
        response = call_summarizer_api(text[start:start + chunk_size], min_len, max_len)
        try:
            pieces.append(response[0]['summary_text'].strip())
        except (KeyError, IndexError, TypeError, AttributeError):
            # Only shape problems in the reply are tolerated; the former bare
            # `except:` also swallowed KeyboardInterrupt and SystemExit.
            pieces.append("[Summary Failed]")
    return " ".join(pieces).strip()

def generate_metadata(text):
    """Build the metadata dict for ``text``.

    Runs, in order, title extraction, author extraction, summarisation and
    category detection, and returns their results under the keys ``title``,
    ``author``, ``summary``, ``category`` and ``category_scores``.
    """
    title = extract_title(text)
    author = extract_author(text)
    summary = generate_summary(text)
    category, category_scores = detect_category_from_chunks(text)
    return {
        "title": title,
        "author": author,
        "summary": summary,
        "category": category,
        "category_scores": category_scores,
    }

# Example Usage

In [12]:
# Relative directory whose contents are listed in the cells below.
base_dir = 'data'

In [14]:
# Explicit line magic: works even when automagic is disabled or a variable
# named `cd` exists. Note that re-running this cell moves up one more level.
%cd ..

c:\Users\Harshal Sharma\Desktop\meta-gen


In [16]:
# Names of the entries directly inside base_dir; the bare name on the last
# line displays the list.
files = os.listdir(base_dir)
files

['uploads']

In [19]:
# Despite the singular name, `file` is the list of entry names inside the
# first entry of base_dir.
file = os.listdir(os.path.join(base_dir, files[0]))
file

['.keep', '1750787978_Design_Fabrication_and_Flight_Demonstration_of_a_R.pdf']

In [24]:
# Path to the entry at index 1 of the listing of the first sub-directory.
example_path = os.path.join(base_dir, files[0], file[1])

In [25]:
# Display the assembled path.
example_path

'data\\uploads\\1750787978_Design_Fabrication_and_Flight_Demonstration_of_a_R.pdf'

In [26]:
# Extract the raw text of the example document.
text = extract_text(example_path)

In [27]:
# Build the metadata dict (title, author, summary, category, category_scores).
metadata = generate_metadata(text)

In [28]:
# Display the full metadata dict.
metadata

{'title': 'J. Aerosp. Technol. Manag., São José dos Campos, Vol.6, No 1, pp.19-27, Jan.-Mar., 2014',
 'author': 'Unknown Author',
 'summary': 'A remotely controlled airship was designed, Fabricated and demonstrated within a tight timespan of under a month after receiving the go-ahead. The main design requirement was to be able to operate from a Helipad located at an altitude of 6,572 feet AMSL under ISA+20 deg.C. Images of the terrain below were recorded during the flight and transmitted in real-time to a ground based system using an onboard telemetry system. Airships are aerodynamically shaped bodies filled with a “Lighter-Than-Air” (LTA) gas that displaces the ambient air, which results in a net upward force due to buoyancy. A remotely controlled (RC) airship is perhaps much more suitable than a remotely controlled aircraft for aerial surveillance due to its long endurance loiter and lower fuel consumption. The RC airship was designed to carry a camera that could take high resolution

In [29]:
# Display only the generated summary.
metadata['summary']

'A remotely controlled airship was designed, Fabricated and demonstrated within a tight timespan of under a month after receiving the go-ahead. The main design requirement was to be able to operate from a Helipad located at an altitude of 6,572 feet AMSL under ISA+20 deg.C. Images of the terrain below were recorded during the flight and transmitted in real-time to a ground based system using an onboard telemetry system. Airships are aerodynamically shaped bodies filled with a “Lighter-Than-Air” (LTA) gas that displaces the ambient air, which results in a net upward force due to buoyancy. A remotely controlled (RC) airship is perhaps much more suitable than a remotely controlled aircraft for aerial surveillance due to its long endurance loiter and lower fuel consumption. The RC airship was designed to carry a camera that could take high resolution photographs of the terrain below and transmit them in-real time. It had to operate in very cold weather at a high altitude and the envelope m