Install required packages

In [1]:
import subprocess
import sys

# Third-party Python packages installed into the kernel's interpreter.
packages = [
    'gradio',
    'PyPDF2',
    'python-docx',
    'openpyxl',
    'pytesseract',
    'Pillow',
    'transformers',
    'torch',
    'nltk',
    'spacy',
    'textstat',
    'langdetect',
    'pyngrok'
]

print("Installing required packages...")
# A single pip invocation lets the dependency resolver consider every
# requirement together (one process start-up instead of thirteen) and still
# raises CalledProcessError if anything fails to install.
subprocess.check_call([sys.executable, "-m", "pip", "install", *packages])

# Install additional system dependencies for OCR
# NOTE(review): apt-get assumes a Debian-based host with root privileges
# (the captured output further down mentions Colab) - confirm before running elsewhere.
subprocess.check_call(["apt-get", "update"])
subprocess.check_call(["apt-get", "install", "-y", "tesseract-ocr"])

print("All packages installed successfully!")

Installing required packages...
All packages installed successfully!


Import all necessary libraries

In [2]:
import datetime
import hashlib
import io
import json
import mimetypes
import os
import re
import subprocess
import sys
import tempfile
from collections import Counter
from pathlib import Path

import docx
import gradio as gr
import nltk
import openpyxl
import PyPDF2
import pytesseract
import spacy
import textstat
from langdetect import detect
from PIL import Image

Download required NLTK data

In [3]:
# Fetch NLTK resources into the local nltk_data directory.
# get_text_statistics() calls nltk.sent_tokenize(), which presumably relies on
# the punkt tokenizer data downloaded here - TODO confirm which of
# 'punkt' / 'punkt_tab' the installed NLTK version actually loads.
# NOTE(review): no code visible in this notebook refers to stopwords,
# averaged_perceptron_tagger, wordnet or omw-1.4 by name; verify they are needed.
print("Downloading NLTK data...")
nltk.download('punkt_tab')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
# Last expression of the cell: its return value is what the cell displays.
nltk.download('omw-1.4')

Downloading NLTK data...


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

Load spaCy model

In [4]:
# Bind the module-level `nlp` pipeline used by identify_key_entities() and
# extract_keywords().
print("Loading spaCy model...")
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # Loading failed with OSError: download the model with the same interpreter
    # that runs this kernel, then retry the load once.
    # `subprocess` and `sys` are imported in the package-installation cell.
    subprocess.check_call([sys.executable, "-m", "spacy", "download", "en_core_web_sm"])
    nlp = spacy.load("en_core_web_sm")

# Text extraction functions, one per supported file format

# PDF text extraction
def extract_text_from_pdf(file_path):
    """Return the text of every page of a PDF, joined by newlines.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The concatenated page text with surrounding whitespace stripped.
        On any failure nothing is raised; a message starting with
        "Error extracting PDF: " is returned instead.
    """
    try:
        with open(file_path, 'rb') as pdf_stream:
            reader = PyPDF2.PdfReader(pdf_stream)
            # Pages are read while the stream is still open.
            page_chunks = [page.extract_text() + "\n" for page in reader.pages]
        return "".join(page_chunks).strip()
    except Exception as err:
        return f"Error extracting PDF: {str(err)}"

# DOCX text extraction
def extract_text_from_docx(file_path):
    """Return the text of all paragraphs of a DOCX file, one per line.

    Args:
        file_path: Path of the .docx document.

    Returns:
        Paragraph texts joined by newlines and stripped of surrounding
        whitespace, or a message starting with "Error extracting DOCX: "
        if anything goes wrong.
    """
    try:
        document = docx.Document(file_path)
        body = "".join(para.text + "\n" for para in document.paragraphs)
        return body.strip()
    except Exception as err:
        return f"Error extracting DOCX: {str(err)}"

# Excel text extraction
def extract_text_from_excel(file_path):
    """Flatten every sheet of a workbook into plain text.

    Each sheet contributes a "Sheet: <name>" line followed by one line per
    row that has any non-blank content; cells are separated by single spaces
    and empty cells become empty strings.

    Args:
        file_path: Path of the workbook to load with openpyxl.

    Returns:
        The flattened text, stripped of surrounding whitespace, or a message
        starting with "Error extracting Excel: " on failure.
    """
    try:
        workbook = openpyxl.load_workbook(file_path)
        chunks = []
        for name in workbook.sheetnames:
            chunks.append(f"Sheet: {name}\n")
            for cells in workbook[name].iter_rows(values_only=True):
                rendered = " ".join("" if value is None else str(value) for value in cells)
                # Rows made only of empty cells are skipped.
                if rendered.strip():
                    chunks.append(rendered + "\n")
        return "".join(chunks).strip()
    except Exception as err:
        return f"Error extracting Excel: {str(err)}"

# Image OCR text extraction
def extract_text_from_image(file_path):
    """Extract text from an image using Tesseract OCR.

    Args:
        file_path: Path of the image file to open with Pillow.

    Returns:
        The recognised text with surrounding whitespace stripped, or a
        message starting with "Error extracting image text: " if the image
        cannot be opened or OCR fails.
    """
    try:
        # The context manager guarantees the underlying file handle is
        # released even when OCR raises.
        with Image.open(file_path) as image:
            text = pytesseract.image_to_string(image)
        return text.strip()
    except Exception as e:
        return f"Error extracting image text: {str(e)}"

# TXT file extraction
def extract_text_from_txt(file_path):
    """Read a UTF-8 plain-text file and return its stripped content.

    Args:
        file_path: Path of the text file.

    Returns:
        The file content with surrounding whitespace stripped, or a message
        starting with "Error extracting TXT: " if the file cannot be read or
        is not valid UTF-8.
    """
    try:
        # 'utf-8-sig' decodes plain UTF-8 exactly like 'utf-8' but also drops
        # a leading byte-order mark, which would otherwise
        # surface as a stray '\ufeff' at the start of the extracted text.
        with open(file_path, 'r', encoding='utf-8-sig') as file:
            return file.read().strip()
    except Exception as e:
        return f"Error extracting TXT: {str(e)}"

Loading spaCy model...


Main text extraction function

In [5]:
def extract_text_from_file(file_path):
    """Dispatch to the extractor that matches the file's extension.

    The extension is compared case-insensitively.

    Args:
        file_path: Path of the file whose text should be extracted.

    Returns:
        Whatever the selected extractor returns, or the literal string
        "Unsupported file format" when no extractor handles the extension.
    """
    suffix = Path(file_path).suffix.lower()

    if suffix == '.pdf':
        return extract_text_from_pdf(file_path)
    if suffix == '.docx':
        return extract_text_from_docx(file_path)
    if suffix in ('.xlsx', '.xls'):
        return extract_text_from_excel(file_path)
    if suffix in ('.png', '.jpg', '.jpeg', '.tiff', '.bmp'):
        return extract_text_from_image(file_path)
    if suffix == '.txt':
        return extract_text_from_txt(file_path)

    return "Unsupported file format"

Semantic content identification functions

In [6]:
def identify_key_entities(text):
    """List the named entities spaCy finds in the text.

    Only the first 1,000,000 characters are passed to the pipeline.

    Args:
        text: Text to analyse.

    Returns:
        A list of dicts, one per entity in document order, with the keys
        'text', 'label' and 'description' (the result of spacy.explain for
        the label).
    """
    parsed = nlp(text[:1000000])
    return [
        {
            'text': span.text,
            'label': span.label_,
            'description': spacy.explain(span.label_),
        }
        for span in parsed.ents
    ]

def extract_keywords(text, num_keywords=10):
    """Return the most frequent lemmas in the text.

    Only the first 1,000,000 characters are analysed. Stop words,
    punctuation, whitespace tokens and tokens of two characters or fewer
    are ignored; the remaining tokens are counted by lower-cased lemma.

    Args:
        text: Text to analyse.
        num_keywords: Maximum number of (lemma, count) pairs to return.

    Returns:
        A list of (lemma, frequency) tuples, most frequent first.
    """
    parsed = nlp(text[:1000000])

    counts = Counter()
    for tok in parsed:
        if tok.is_stop or tok.is_punct or tok.is_space:
            continue
        if len(tok.text) <= 2:
            continue
        counts[tok.lemma_.lower()] += 1

    return counts.most_common(num_keywords)

def analyze_document_structure(text):
    """Compute simple line-based structural indicators for a text.

    Args:
        text: Text to inspect; it is split on '\\n'.

    Returns:
        A dict with:
          - 'total_lines': number of lines after splitting.
          - 'non_empty_lines': lines containing non-whitespace characters.
          - 'average_line_length': mean character count per line.
          - 'has_headers': True if any of the first 20 lines is upper-case
            and has at most five words.
          - 'has_numbered_sections': True if any stripped line starts with
            digits followed by a dot.
          - 'has_bullet_points': True if any stripped line starts with
            '•', '-' or '*'.
    """
    lines = text.split('\n')
    line_count = len(lines)

    non_empty = 0
    total_chars = 0
    numbered = False
    bulleted = False
    for raw in lines:
        total_chars += len(raw)
        stripped = raw.strip()
        if stripped:
            non_empty += 1
        if not numbered and re.match(r'^\d+\.', stripped):
            numbered = True
        if not bulleted and stripped.startswith(('•', '-', '*')):
            bulleted = True

    # Header detection only looks at the top of the document.
    header_like = False
    for candidate in lines[:20]:
        if candidate.isupper() and len(candidate.split()) <= 5:
            header_like = True
            break

    return {
        'total_lines': line_count,
        'non_empty_lines': non_empty,
        'average_line_length': total_chars / line_count if line_count else 0,
        'has_headers': header_like,
        'has_numbered_sections': numbered,
        'has_bullet_points': bulleted,
    }

 Content analysis functions

In [7]:
def analyze_readability(text):
    """Compute readability metrics for a text with textstat.

    Args:
        text: Text to score.

    Returns:
        An empty dict when the text is empty or whitespace-only. Otherwise a
        dict with 'flesch_reading_ease', 'flesch_kincaid_grade',
        'automated_readability_index', 'coleman_liau_index' and
        'reading_time_minutes' (estimated reading time in minutes, rounded
        to two decimals).
    """
    if not text.strip():
        return {}

    # textstat.reading_time() reports its estimate in seconds (characters *
    # ms_per_char / 1000); convert so the value matches the
    # 'reading_time_minutes' key and the "minutes" label shown to the user.
    reading_seconds = textstat.reading_time(text, ms_per_char=14.69)

    readability_scores = {
        'flesch_reading_ease': textstat.flesch_reading_ease(text),
        'flesch_kincaid_grade': textstat.flesch_kincaid_grade(text),
        'automated_readability_index': textstat.automated_readability_index(text),
        'coleman_liau_index': textstat.coleman_liau_index(text),
        'reading_time_minutes': round(reading_seconds / 60, 2)
    }

    return readability_scores

def detect_language(text):
    """Detect the language of the text with langdetect.

    Args:
        text: Text to classify; only the first 1000 characters are used.

    Returns:
        The value returned by langdetect's detect(), or "unknown" when the
        text is empty/whitespace-only or detection fails.
    """
    if not text.strip():
        return "unknown"
    try:
        return detect(text[:1000])  # Use first 1000 chars for detection
    except Exception:
        # Best-effort: detection failures fall back to "unknown", but
        # KeyboardInterrupt/SystemExit are no longer swallowed as they were
        # with a bare `except:`.
        return "unknown"

def get_text_statistics(text):
    """Count characters, words, sentences and paragraphs in a text.

    Words are whitespace-separated tokens, sentences come from
    nltk.sent_tokenize, and paragraphs are non-blank chunks separated by a
    blank line ('\\n\\n').

    Args:
        text: Text to measure.

    Returns:
        A dict with 'character_count', 'character_count_no_spaces' (only the
        ' ' character is removed), 'word_count', 'sentence_count',
        'paragraph_count', 'average_words_per_sentence' and
        'average_characters_per_word'. Averages are 0 when their
        denominator would be zero.
    """
    tokens = text.split()
    sentence_list = nltk.sent_tokenize(text)

    n_words = len(tokens)
    n_sentences = len(sentence_list)
    paragraphs = [chunk for chunk in text.split('\n\n') if chunk.strip()]

    if sentence_list:
        words_per_sentence = n_words / n_sentences
    else:
        words_per_sentence = 0

    if tokens:
        chars_per_word = sum(len(tok) for tok in tokens) / n_words
    else:
        chars_per_word = 0

    return {
        'character_count': len(text),
        'character_count_no_spaces': len(text.replace(' ', '')),
        'word_count': n_words,
        'sentence_count': n_sentences,
        'paragraph_count': len(paragraphs),
        'average_words_per_sentence': words_per_sentence,
        'average_characters_per_word': chars_per_word,
    }

File information extraction

In [8]:
def get_file_info(file_path):
    """Collect filesystem metadata and a content hash for a file.

    Args:
        file_path: Path of the file to describe.

    Returns:
        A dict with 'filename', 'file_extension' (lower-cased),
        'file_size_bytes', 'file_size_mb' (rounded to two decimals),
        'created_date' and 'modified_date' (ISO-8601 strings), 'mime_type'
        (guessed from the name, may be None) and 'file_hash' (MD5 hex digest
        of the content). On any failure a dict with the single key 'error'
        is returned instead of raising.
    """
    try:
        file_stat = os.stat(file_path)
        file_info = {
            'filename': os.path.basename(file_path),
            'file_extension': Path(file_path).suffix.lower(),
            'file_size_bytes': file_stat.st_size,
            'file_size_mb': round(file_stat.st_size / (1024 * 1024), 2),
            # NOTE: st_ctime is the creation time on Windows but the time of
            # the last metadata change on Unix, so 'created_date' is only an
            # approximation there.
            'created_date': datetime.datetime.fromtimestamp(file_stat.st_ctime).isoformat(),
            'modified_date': datetime.datetime.fromtimestamp(file_stat.st_mtime).isoformat(),
            'mime_type': mimetypes.guess_type(file_path)[0]
        }

        # Generate file hash for uniqueness (not for security). The file is
        # streamed in 1 MiB chunks so large uploads are never held in memory
        # in one piece.
        digest = hashlib.md5()
        with open(file_path, 'rb') as f:
            for chunk in iter(lambda: f.read(1024 * 1024), b''):
                digest.update(chunk)
        file_info['file_hash'] = digest.hexdigest()

        return file_info
    except Exception as e:
        return {'error': f"Error getting file info: {str(e)}"}

Main metadata generation function

In [9]:
def generate_metadata(file_path, text_content):
    """Assemble the full metadata record for one document.

    Prints a progress line, runs every analysis helper on the extracted
    text, and combines their results into a single dict.

    Args:
        file_path: Path of the source file (used for file information).
        text_content: Text previously extracted from that file.

    Returns:
        A dict with the keys 'generation_timestamp', 'file_information',
        'content_statistics', 'language', 'readability_scores',
        'document_structure', 'keywords', 'named_entities' (first 20 only),
        'content_preview' (first 500 characters, with "..." appended when
        truncated) and 'metadata_version'.
    """
    print("Generating metadata...")

    info = get_file_info(file_path)
    stats = get_text_statistics(text_content)
    lang = detect_language(text_content)
    scores = analyze_readability(text_content)
    layout = analyze_document_structure(text_content)
    top_terms = extract_keywords(text_content)
    found_entities = identify_key_entities(text_content)

    # Only texts longer than 500 characters are truncated and marked.
    if len(text_content) > 500:
        preview = text_content[:500] + "..."
    else:
        preview = text_content

    keyword_records = []
    for word, freq in top_terms:
        keyword_records.append({'word': word, 'frequency': freq})

    return {
        'generation_timestamp': datetime.datetime.now().isoformat(),
        'file_information': info,
        'content_statistics': stats,
        'language': lang,
        'readability_scores': scores,
        'document_structure': layout,
        'keywords': keyword_records,
        'named_entities': found_entities[:20],
        'content_preview': preview,
        'metadata_version': '1.0',
    }


Gradio interface functions

In [10]:
def process_file(file):
    """Process an uploaded file and build the summary and metadata JSON.

    Args:
        file: The value delivered by the upload component - either an object
            exposing the path as `.name` or a plain path string. None means
            nothing was uploaded.

    Returns:
        A (summary, metadata_json) tuple of strings. `summary` is Markdown
        for display; `metadata_json` is the pretty-printed metadata. When
        there is no upload, extraction fails, the format is unsupported, or
        any exception occurs, `summary` carries the message and
        `metadata_json` is "{}".
    """
    if file is None:
        return "Please upload a file first.", "{}"

    try:
        # Work with both file-like upload objects and bare path strings.
        file_path = getattr(file, "name", file)

        # Extract text from file
        text_content = extract_text_from_file(file_path)

        # Every extractor reports failures as "Error extracting <kind>: ...".
        # Matching that full prefix (rather than just "Error") keeps documents
        # whose own text happens to begin with "Error" from being rejected.
        if text_content.startswith("Error extracting ") or text_content == "Unsupported file format":
            return text_content, "{}"

        # Generate metadata
        metadata = generate_metadata(file_path, text_content)

        # Format metadata as JSON string for display
        metadata_json = json.dumps(metadata, indent=2, ensure_ascii=False)

        file_info = metadata['file_information']
        stats = metadata['content_statistics']
        reading_time = metadata['readability_scores'].get('reading_time_minutes', 0)
        keyword_lines = [f"- {kw['word']} ({kw['frequency']})" for kw in metadata['keywords'][:5]]

        # Build the summary line by line at column 0. Markdown treats lines
        # indented by four or more spaces as a code block, and the interpolated
        # multi-line values (keywords, preview) start at column 0, so an
        # indented template cannot be dedented uniformly. List items are used
        # because consecutive plain lines would be merged into one paragraph.
        summary = "\n".join([
            "📄 **File Analysis Complete!**",
            "",
            f"- **File:** {file_info.get('filename', 'Unknown')}",
            f"- **Size:** {file_info.get('file_size_mb', 0)} MB",
            f"- **Language:** {metadata.get('language', 'Unknown')}",
            f"- **Words:** {stats.get('word_count', 0):,}",
            f"- **Characters:** {stats.get('character_count', 0):,}",
            f"- **Reading Time:** {reading_time:.1f} minutes",
            "",
            "**Top Keywords:**",
            "",
            *keyword_lines,
            "",
            "**Content Preview:**",
            "",
            metadata['content_preview'],
        ])

        return summary, metadata_json

    except Exception as e:
        return f"Error processing file: {str(e)}", "{}"

def download_metadata(metadata_json):
    """Write the metadata JSON to a fresh file and return its path.

    Args:
        metadata_json: JSON text to save. Empty, None or "{}" means there is
            nothing to download.

    Returns:
        The path of a newly written "metadata.json" file, or None when there
        is nothing to download.
    """
    if not metadata_json or metadata_json == "{}":
        return None

    # Each call gets its own temporary directory: a single fixed path would
    # be overwritten whenever two sessions generate metadata at the same
    # time, handing one user another user's file. The file keeps the name
    # "metadata.json" so the download has a sensible filename.
    # NOTE: these directories are not removed here; they live until the
    # system temp directory is cleaned.
    out_dir = tempfile.mkdtemp(prefix="metadata_")
    temp_file = os.path.join(out_dir, "metadata.json")
    with open(temp_file, 'w', encoding='utf-8') as f:
        f.write(metadata_json)

    return temp_file

Create Gradio interface

In [11]:
# Build the Gradio Blocks UI and bind it to the module-level name `interface`,
# which the launch cell uses. Nothing is served until launch() is called.
print("Creating web interface...")

# Custom CSS for better styling
# `.main-header` and `.upload-area` are attached to components below through
# their `elem_classes` arguments.
custom_css = """
.gradio-container {
    font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
}
.main-header {
    text-align: center;
    color: #2c3e50;
    margin-bottom: 2rem;
}
.upload-area {
    border: 2px dashed #3498db;
    border-radius: 10px;
    padding: 2rem;
    text-align: center;
    background-color: #f8f9fa;
}
"""

# Create the interface
with gr.Blocks(css=custom_css, title="Automated Metadata Generation System") as interface:

    # Header
    gr.Markdown("""
    # 🤖 Automated Metadata Generation System

    Upload any document (PDF, DOCX, Excel, Images, TXT) and get comprehensive metadata analysis including:
    - **Content Statistics** (word count, readability scores)
    - **Semantic Analysis** (keywords, entities)
    - **Document Structure** analysis
    - **Language Detection**
    - **File Information** and more!
    """, elem_classes=["main-header"])

    # Two-column layout: upload controls on the left (scale=1), results on
    # the right (scale=2).
    with gr.Row():
        with gr.Column(scale=1):
            # File upload
            # NOTE(review): extract_text_from_file() also handles '.tiff' and
            # '.bmp', but they are not listed in file_types here - confirm
            # whether the omission is intentional.
            file_input = gr.File(
                label="📁 Upload Document",
                file_types=[".pdf", ".docx", ".xlsx", ".xls", ".txt", ".png", ".jpg", ".jpeg"],
                elem_classes=["upload-area"]
            )

            # Process button
            process_btn = gr.Button("🔍 Generate Metadata", variant="primary", size="lg")

        with gr.Column(scale=2):
            # Results display
            result_display = gr.Markdown(label="📊 Analysis Results")

    # Metadata JSON output (hidden by default)
    with gr.Accordion("🔧 Raw Metadata (JSON)", open=False):
        metadata_output = gr.Textbox(
            label="Complete Metadata",
            lines=20,
            max_lines=30,
            show_copy_button=True
        )

    # Download section
    # Despite its name, `download_btn` is a File component; it is used as the
    # output of download_metadata() below.
    with gr.Row():
        download_btn = gr.File(label="💾 Download Metadata JSON", visible=True)

    # Event handlers
    # process_file() returns (summary, metadata_json), mapped in that order
    # onto the Markdown panel and the JSON textbox.
    process_btn.click(
        process_file,
        inputs=[file_input],
        outputs=[result_display, metadata_output]
    )

    # Auto-generate download file when metadata is updated
    # download_metadata() returns a file path, or None when the textbox is
    # empty or holds "{}".
    metadata_output.change(
        download_metadata,
        inputs=[metadata_output],
        outputs=[download_btn]
    )

    # Example section
    gr.Markdown("""
    ## 📝 Supported File Types:
    - **PDF** documents
    - **Microsoft Word** (.docx)
    - **Excel** spreadsheets (.xlsx, .xls)
    - **Text** files (.txt)
    - **Images** with text (.png, .jpg, .jpeg) - OCR enabled

    ## 🎯 Features:
    - ✅ Automatic text extraction
    - ✅ Language detection
    - ✅ Readability analysis
    - ✅ Keyword extraction
    - ✅ Named entity recognition
    - ✅ Document structure analysis
    - ✅ Downloadable metadata
    """)


Creating web interface...


 Launch the interface

In [12]:
print("Launching the web interface...")
print("The interface will be available at the URL shown below.")
print("You can upload documents and generate metadata automatically!")
# Announce readiness before launch(): with debug=True the call blocks until
# the server is stopped, so anything after it only runs once the interface
# has already shut down.
print("System is now running! Upload a document to generate metadata.")

# Launch with public sharing enabled
# NOTE(security): share=True publishes a public URL and server_name="0.0.0.0"
# listens on all network interfaces; anyone with the link can upload files to
# this machine. Disable sharing when the notebook is not meant to be public.
interface.launch(
    share=True,  # This creates a public URL
    server_name="0.0.0.0",
    server_port=7860,
    show_error=True,
    debug=True
)

print("System is now running! Upload a document to generate metadata.")

Launching the web interface...
The interface will be available at the URL shown below.
You can upload documents and generate metadata automatically!
Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://9419117d641efa9fa5.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Generating metadata...
Generating metadata...
Keyboard interruption in main thread... closing server.
Killing tunnel 0.0.0.0:7860 <> https://9419117d641efa9fa5.gradio.live
System is now running! Upload a document to generate metadata.
