## **MARS PROJECT : AUTOMATED METADATA GENERATION**

### By Karan Sardar ( 22118035 ) MT 4th Yr

### Installation of Important Libraries

In [None]:
!pip install groq pytesseract python-docx pdf2image langdetect spacy PyPDF2 pymupdf scikit-learn --quiet
!python -m spacy download en_core_web_sm --quiet

Collecting groq
  Downloading groq-0.28.0-py3-none-any.whl.metadata (15 kB)
Downloading groq-0.28.0-py3-none-any.whl (130 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m130.2/130.2 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: groq
Successfully installed groq-0.28.0


### Importing libraries

In [46]:
import os
from groq import Groq
import pytesseract
from PIL import Image
from PyPDF2 import PdfReader
from pdf2image import convert_from_path
import docx
from langdetect import detect
import spacy
import json
import fitz
from datetime import datetime
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
import re

### Uploading File in txt, docx or pdf format

In [47]:
from google.colab import files
uploaded = files.upload()
file_path = list(uploaded.keys())[0]

Saving Biobank-Cohort- sustainability guidelines.pdf to Biobank-Cohort- sustainability guidelines (1).pdf


### Groq Setup

In [48]:
GROQ_API_KEY = "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"
client = Groq(api_key=GROQ_API_KEY)
nlp = spacy.load("en_core_web_sm")


### Text Extraction from file

In [49]:
def extract_text_from_file(file_path):
    ext = file_path.split('.')[-1].lower()
    text = ""

    if ext == 'pdf':
        try:
            doc = fitz.open(file_path)
            for page in doc:
                text += page.get_text()
            doc.close()

            if len(text.strip()) < 50:
                images = convert_from_path(file_path)
                for image in images:
                    text += pytesseract.image_to_string(image, lang='eng')
        except:
            pdf = PdfReader(file_path)
            for page in pdf.pages:
                text += page.extract_text() or ""

    elif ext == 'docx':
        doc = docx.Document(file_path)
        text = '\n'.join([para.text for para in doc.paragraphs])

    elif ext == 'txt':
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()

    return text.strip()

text = extract_text_from_file(file_path)
print(f"Extracted {len(text)} characters")







Extracted 65147 characters


### Enhanced keyword extraction using TF-IDF + NER

In [50]:
def extract_enhanced_keywords(text, max_keywords=15):
    # text cleaning
    text_clean = re.sub(r'[^\w\s]', ' ', text.lower())

    # TF-IDF based keywords
    vectorizer = TfidfVectorizer(max_features=100, stop_words='english', ngram_range=(1, 3))
    tfidf_matrix = vectorizer.fit_transform([text_clean])
    feature_names = vectorizer.get_feature_names_out()
    tfidf_scores = tfidf_matrix.toarray()[0]

    #top TF-IDF keywords
    tfidf_keywords = [(feature_names[i], tfidf_scores[i]) for i in tfidf_scores.argsort()[-20:][::-1]]

    #Named entities as keywords
    doc = nlp(text)
    entities = [(ent.text.lower(), ent.label_) for ent in doc.ents if len(ent.text) > 2]

    all_keywords = []
    for kw, score in tfidf_keywords:
        if len(kw) > 2 and score > 0.01:
            all_keywords.append(kw)

    # important entities
    for ent, label in entities:
        if label in ['PERSON', 'ORG', 'GPE', 'PRODUCT', 'EVENT'] and ent not in all_keywords:
            all_keywords.append(ent)

    return all_keywords[:max_keywords]

keywords = extract_enhanced_keywords(text)

### Enhanced summarization using LLama 3.1 LLM

In [51]:
def create_intelligent_summary(text, max_length=300):
    if len(text.strip()) < 100:
        return text.strip()
    chunk_size = 8000
    chunks = [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]

    summaries = []

    for i, chunk in enumerate(chunks[:3]):
        try:
            response = client.chat.completions.create(
                messages=[
                    {
                        "role": "system",
                        "content": "You are an expert at creating concise, informative summaries. Extract the key points and main ideas from the text."
                    },
                    {
                        "role": "user",
                        "content": f"Summarize this text in 2-3 sentences, focusing on the main points:\n\n{chunk}"
                    }
                ],
                model="llama-3.1-8b-instant",
                max_tokens=150,
                temperature=0.3
            )
            summaries.append(response.choices[0].message.content.strip())
        except Exception as e:
            print(f"Error summarizing chunk {i+1}: {e}")
            sentences = chunk.split('. ')[:3]
            summaries.append('. '.join(sentences) + '.')

    # Combining summaries
    combined_summary = ' '.join(summaries)

    # If summary is still too long, summarizing the summaries
    if len(combined_summary) > max_length and len(summaries) > 1:
        try:
            response = client.chat.completions.create(
                messages=[
                    {
                        "role": "system",
                        "content": "Create a final concise summary from these partial summaries."
                    },
                    {
                        "role": "user",
                        "content": f"Combine these summaries into one coherent summary (max 2-3 sentences):\n\n{combined_summary}"
                    }
                ],
                model="llama-3.1-8b-instant",
                max_tokens=100,
                temperature=0.3
            )
            return response.choices[0].message.content.strip()
        except:
            return combined_summary[:max_length] + "..."

    return combined_summary


summary = create_intelligent_summary(text)

### Extracting Topics and themes from file

In [52]:
def extract_topics_and_themes(text, keywords):
    try:
        kw_text = ", ".join(keywords[:10])
        response = client.chat.completions.create(
            messages=[
                {
                    "role": "system",
                    "content": "You are an expert at identifying main topics and themes in documents. Based on the keywords and content, identify 3-5 main topics/themes."
                },
                {
                    "role": "user",
                    "content": f"Based on these keywords: {kw_text}\n\nAnd this text sample: {text[:1000]}...\n\nIdentify the main topics/themes (return as a simple list):"
                }
            ],
            model="llama-3.1-8b-instant",
            max_tokens=100,
            temperature=0.4
        )
        topics_text = response.choices[0].message.content.strip()
        topics = [topic.strip('- ').strip() for topic in topics_text.split('\n') if topic.strip()]
        return topics[:5]
    except Exception as e:
        print(f"Error extracting topics: {e}")
        return keywords[:5]

topics = extract_topics_and_themes(text, keywords)

### Extracting Title, Named entities and language detection

In [53]:
doc = nlp(text)
entities = {}
for ent in doc.ents:
    if ent.label_ not in entities:
        entities[ent.label_] = []
    entities[ent.label_].append(ent.text)

for label in entities:
    entities[label] = list(set(entities[label]))[:10]

# Detecting language
try:
    language = detect(text)
except:
    language = "unknown"

# Extracting potential title
title = "Untitled"
lines = text.split('\n')
for line in lines[:10]:
    line = line.strip()
    if 10 < len(line) < 150 and not line.isdigit():
        title = line
        break




### Readability Metrics for Statistical Metadata

In [54]:
sentences = len(re.findall(r'[.!?]+', text))
words = len(text.split())
avg_words_per_sentence = words / max(sentences, 1)


### Final Metadata

In [55]:
metadata = {
    "filename": file_path,
    "title": title,
    "summary": summary,
    "topics_and_themes": topics,
    "keywords": keywords,
    "language": language,
    "statistics": {
        "word_count": words,
        "character_count": len(text),
        "sentence_count": sentences,
        "avg_words_per_sentence": round(avg_words_per_sentence, 2)
    },
    "named_entities": entities,
    "created_at": datetime.now().isoformat(),
    "extraction_method": "Enhanced with Groq LLM"
}

In [56]:
print("METADATA RESULTS")
print("="*60)
print(f"Title: {metadata['title']}")
print(f"\nFileName: :{metadata['filename']}")
print(f"\nSummary:\n{metadata['summary']}")
print(f"\nMain Topics: {', '.join(metadata['topics_and_themes'])}")
print(f"\nTop Keywords: {', '.join(metadata['keywords'][:10])}")
print(f"\nLanguage: {metadata['language']}")
print(f"\nStatistics: {metadata['statistics']}")
print(f"\nCreated at: {metadata['created_at']}")
print(f"\nNamed Entities: {metadata['named_entities']} types found")


METADATA RESULTS
Title: Guidelines for sustaining DBT- Bio-Banks and Cohorts

FileName: :Biobank-Cohort- sustainability guidelines (1).pdf

Summary:
Here's a concise summary of the main points in 2-3 sentences:

The Department of Biotechnology (DBT) aims to establish a strong governance structure and oversight mechanism to safeguard the scientific relevance and financial sustainability of bio-banks, cohorts, and demographic sites in India. A proposed governance model includes a national executive committee, site-specific governance committees, and annual reviews to ensure long-term sustainability, accountability, and transparency. The guidelines outline eligibility criteria and procedures for access to and sharing of samples and

Main Topics: Based on the keywords and content, here are the 3-5 main topics/themes:, 1. Bio-banks and cohorts, 2. Research and data management, 3. Governance and sustainability

Top Keywords: data, bio, repository, banks, bio banks, cohorts, samples, research

### Saving in JSON format

In [57]:
json_path = os.path.splitext(file_path)[0] + "_enhanced_metadata.json"
with open(json_path, "w", encoding='utf-8') as f:
    json.dump(metadata, f, indent=2, ensure_ascii=False)