In [None]:
# Step 1: Mount Google Drive
# Exposes Drive under /content/drive; the PDF directory used at the bottom of
# this cell lives below that mount point. `google.colab` exists only in Colab.
from google.colab import drive
drive.mount('/content/drive')

# Step 2: Install Required Libraries
# Lines starting with `!` are IPython shell escapes, so this cell must be run
# from a notebook, not as a plain Python script.
# tesseract-ocr is the system OCR engine; pytesseract is its Python wrapper.
!apt-get install -y tesseract-ocr
!pip install pytesseract pdfplumber tabula-py

# Step 3: Import Libraries
import os
import re
import tabula
import pdfplumber
import pytesseract
from PIL import Image
import io
import pandas as pd
import spacy
import nltk
import string
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk import pos_tag
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import Word2Vec
from transformers import AutoTokenizer, TFAutoModel
import tensorflow as tf
import numpy as np

# Download necessary NLTK data
# Resources are fetched in the order listed; already-present packages are
# reported by NLTK as up to date.
for nltk_resource in ('punkt', 'stopwords', 'averaged_perceptron_tagger', 'wordnet'):
    nltk.download(nltk_resource)

# Load Spacy model for Named Entity Recognition
try:
    nlp = spacy.load('en_core_web_sm')
except:
    !python -m spacy download en_core_web_sm
    nlp = spacy.load('en_core_web_sm')

# Load transformer model and tokenizer
# Both objects are module-level globals read by get_transformer_embeddings().
# They are created once here so the checkpoint is not reloaded on every call.
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
transformer_model = TFAutoModel.from_pretrained(model_name)

# Preprocessing function for legal documents
def preprocess_text(text):
    """Return the lemmas of ``text`` joined by single spaces.

    The text is parsed with the module-level spaCy pipeline ``nlp``; tokens
    flagged as stop words or punctuation are dropped.
    """
    kept_lemmas = []
    for tok in nlp(text):
        if tok.is_stop or tok.is_punct:
            continue
        kept_lemmas.append(tok.lemma_)
    return " ".join(kept_lemmas)

# Bag of Words representation
def get_bow(texts):
    """Fit a fresh ``CountVectorizer`` on ``texts`` and return the fitted
    document-term matrix (one row per input text)."""
    return CountVectorizer().fit_transform(texts)

# TF-IDF representation
def get_tfidf(texts):
    """Fit a fresh ``TfidfVectorizer`` on ``texts`` and return the fitted
    TF-IDF matrix (one row per input text)."""
    return TfidfVectorizer().fit_transform(texts)

# Word2Vec representation
def get_word2vec(texts):
    """Embed each text as the mean of its Word2Vec word vectors.

    A Word2Vec model is trained from scratch on the whitespace-split
    ``texts`` themselves (100 dimensions, window 5, every word kept).

    Returns:
        A NumPy array with one row per text. A text with no known words is
        represented by a zero vector of the model's dimensionality.
    """
    tokenized_docs = [doc.split() for doc in texts]
    w2v = Word2Vec(tokenized_docs, vector_size=100, window=5, min_count=1, workers=4)

    def _document_vector(tokens):
        # Average the vectors of the tokens the model knows about.
        known = [w2v.wv[tok] for tok in tokens if tok in w2v.wv]
        if not known:
            return np.zeros(w2v.vector_size)
        return np.mean(known, axis=0)

    return np.array([_document_vector(tokens) for tokens in tokenized_docs])

# Transformer Embeddings representation
def get_transformer_embeddings(texts):
    """Embed each text as the mean of its transformer token states.

    Uses the module-level ``tokenizer`` and ``transformer_model``. Texts are
    padded to a common length and truncated by the tokenizer, so only the
    leading part of a long document contributes to its embedding.

    Args:
        texts: Sequence of strings to embed.

    Returns:
        A NumPy array with one row per text and one column per hidden unit.
    """
    inputs = tokenizer(list(texts), return_tensors="tf", padding=True, truncation=True)

    # Pass the attention mask so real tokens do not attend to padding.
    outputs = transformer_model(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
    )
    hidden = outputs.last_hidden_state

    # Masked mean pooling: padding positions are zeroed out and excluded from
    # the denominator, so shorter texts in a batch are not diluted by padding.
    mask = tf.expand_dims(tf.cast(inputs["attention_mask"], hidden.dtype), axis=-1)
    summed = tf.reduce_sum(hidden * mask, axis=1)
    token_counts = tf.reduce_sum(mask, axis=1)
    token_counts = tf.maximum(token_counts, tf.ones_like(token_counts))  # avoid 0/0
    return (summed / token_counts).numpy()

# Compare different text representation methods
def compare_representations(texts):
    """Report the mean pairwise cosine similarity under each representation.

    For Bag of Words, TF-IDF, Word2Vec and transformer embeddings (in that
    order) the documents are vectorised, the full cosine-similarity matrix is
    computed, and the entries strictly above the diagonal (each distinct
    document pair once) are averaged.

    Args:
        texts: List of document strings.

    Returns:
        A DataFrame with columns "Method" and "Mean Cosine Similarity", one
        row per method. With fewer than two documents a message is printed
        and an empty DataFrame is returned.
    """
    data = []

    if len(texts) < 2:
        print("Need at least two documents to compare representations.")
        return pd.DataFrame(data)

    # (label, vectoriser) pairs, evaluated top to bottom.
    methods = (
        ("Bag of Words", get_bow),
        ("TF-IDF", get_tfidf),
        ("Word2Vec", get_word2vec),
        ("Transformer Embeddings", get_transformer_embeddings),
    )

    for label, vectorise in methods:
        similarity = cosine_similarity(vectorise(texts))
        # k=1 skips the diagonal (self-similarity).
        pair_scores = similarity[np.triu_indices_from(similarity, k=1)]
        data.append({
            "Method": label,
            "Mean Cosine Similarity": np.mean(pair_scores),
        })

    return pd.DataFrame(data)

# Function for text normalization
def text_normalization(text):
    """Lower-case ``text`` and strip ASCII punctuation, digits and extra
    whitespace.

    Punctuation characters (``string.punctuation``) and digit runs are
    deleted outright, not replaced by spaces; afterwards every whitespace run
    collapses to a single space and the ends are trimmed.
    """
    punctuation_table = str.maketrans("", "", string.punctuation)
    cleaned = text.lower().translate(punctuation_table)
    without_digits = re.sub(r'\d+', '', cleaned)
    return re.sub(r'\s+', ' ', without_digits).strip()

# Function to lemmatize tokens
def lemmatize_tokens(tokens):
    """Lemmatize ``tokens`` with WordNet, using each token's POS tag.

    The tokens are POS-tagged with ``nltk.pos_tag``; every Treebank tag is
    mapped to a WordNet category by ``get_wordnet_pos`` before lemmatizing.

    Returns:
        A list of lemmas in the same order as ``tokens``.
    """
    wnl = WordNetLemmatizer()
    lemmas = []
    for token, treebank_tag in pos_tag(tokens):
        lemmas.append(wnl.lemmatize(token, get_wordnet_pos(treebank_tag)))
    return lemmas

# Function to get WordNet POS tags
def get_wordnet_pos(treebank_tag):
    """Map a Treebank POS tag to the matching WordNet POS constant.

    Tags starting with 'J', 'V' or 'R' map to adjective, verb and adverb
    respectively; everything else (including 'N' tags) maps to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return wordnet_pos
    return wordnet.NOUN

# Function for Named Entity Recognition (NER) using Spacy
def named_entity_recognition(text):
    """Return ``(entity_text, entity_label)`` tuples for every entity the
    module-level spaCy pipeline ``nlp`` finds in ``text``."""
    return [(span.text, span.label_) for span in nlp(text).ents]

# Function to remove stopwords
def remove_stopwords(words):
    """Return ``words`` without NLTK's English stop words, order preserved.

    Matching is exact (case-sensitive) against the NLTK list.
    """
    english_stopwords = frozenset(stopwords.words('english'))
    kept = []
    for candidate in words:
        if candidate in english_stopwords:
            continue
        kept.append(candidate)
    return kept

# Function to extract tables using Tabula-py and capture context around the table
def extract_tables_and_context(pdf_path, page_text, page_num, previous_page_text=None, lines_above=3, lines_below=3):
    """Extract the tables tabula finds on one page, plus nearby text lines.

    Args:
        pdf_path: Path of the PDF file.
        page_text: Text already extracted from this page (may be empty/None).
        page_num: 1-based page number handed to tabula.
        previous_page_text: Text of the preceding page, if any. When given,
            it supplies the "above" context of the first table on the page.
        lines_above: Number of context lines to keep above a table.
            Values below zero are treated as zero.
        lines_below: Number of context lines to keep below a table.
            Values below zero are treated as zero.

    Returns:
        A list with one dict per table, holding "table_number" (1-based),
        "table_data" (the object tabula returned for that table),
        "context_above" and "context_below" (newline-joined strings).
        An empty list when tabula finds nothing or raises.
    """
    try:
        # NOTE(review): lattice and stream are alternative tabula extraction
        # modes; confirm which one takes effect when both are requested.
        tables = tabula.read_pdf(pdf_path, pages=page_num, multiple_tables=True, lattice=True, stream=True)
    except Exception as exc:
        # Best effort: a page tabula cannot handle simply yields no tables.
        print(f"Error extracting tables from page {page_num} of {pdf_path}: {exc}")
        tables = []

    # Negative window sizes would turn the slices below into nonsense.
    lines_above = max(0, lines_above)
    lines_below = max(0, lines_below)

    all_lines = (page_text or "").splitlines()
    previous_page_lines = previous_page_text.splitlines() if previous_page_text else []

    table_list = []
    for i, table in enumerate(tables or []):
        if i == 0 and previous_page_lines:
            # Tail of the previous page. Computed with an explicit start index
            # because `lines[-0:]` would return the whole page, not nothing.
            tail_start = max(0, len(previous_page_lines) - lines_above)
            context_above = "\n".join(previous_page_lines[tail_start:])
        else:
            # NOTE(review): the table's real position in the page text is not
            # known here; the table index `i` is used as a rough line offset.
            context_above = "\n".join(all_lines[max(0, i - lines_above):i])

        # Same approximation: assume the table spans len(table) text lines.
        below_start = i + len(table)
        context_below = "\n".join(all_lines[below_start:below_start + lines_below])

        table_list.append({
            "table_number": i + 1,
            "table_data": table,
            "context_above": context_above,
            "context_below": context_below
        })
    return table_list

# Function to extract PDF content with OCR and tables with context
def extract_pdf_content_with_ocr(pdf_path, lines_above=3, lines_below=3, ocr_resolution=300):
    """Extract the text of every page, falling back to OCR, plus tables.

    Pages with an embedded text layer are read with pdfplumber and scanned
    for tables via ``extract_tables_and_context``. Pages without extractable
    text are rendered to an image and passed through Tesseract; no table
    extraction is attempted on those pages.

    Args:
        pdf_path: Path of the PDF file.
        lines_above: Context lines to keep above each table.
        lines_below: Context lines to keep below each table.
        ocr_resolution: Resolution (DPI) used when rendering a page for OCR.

    Returns:
        ``(full_text, table_context_data)``: the text of all pages, each
        followed by a blank line, and a list of
        ``{"page_number": int, "tables": [...]}`` dicts for pages on which
        tables were found.
    """
    page_texts = []
    table_context_data = []
    previous_page_text = None

    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            print(f"Processing Page {page_num} of {pdf_path}")
            page_text = page.extract_text()

            if page_text:
                # Extract tables and context from this page
                tables_with_context = extract_tables_and_context(
                    pdf_path, page_text, page_num, previous_page_text, lines_above, lines_below
                )
                if tables_with_context:
                    table_context_data.append({
                        "page_number": page_num,
                        "tables": tables_with_context
                    })
            else:
                print(f"No extractable text found on Page {page_num} of {pdf_path}. Using OCR.")
                # PageImage.original is already a PIL image, so it goes to
                # Tesseract directly; wrapping it in BytesIO raises TypeError.
                page_image = page.to_image(resolution=ocr_resolution).original
                page_text = pytesseract.image_to_string(page_image)

            page_texts.append(page_text)
            # Remember this page (OCR'd or not) as context for the next one.
            previous_page_text = page_text or None

    # Join once at the end instead of growing a string page by page.
    full_text = "".join(f"{text}\n\n" for text in page_texts)
    return full_text, table_context_data

# Function to process PDF files and extract tables with context
def process_files(pdf_directory):
    """Extract and pre-process every PDF in ``pdf_directory``.

    Files are selected by a case-insensitive ``.pdf`` suffix and handled in
    sorted name order so repeated runs produce the same sequence.

    Args:
        pdf_directory: Directory to scan (not recursive).

    Returns:
        A list with one dict per PDF containing "file_name",
        "normalized_text", "lemmatized_words", "named_entities" and
        "table_context_data".
    """
    # os.listdir order is arbitrary; sort for reproducible output.
    file_names = sorted(f for f in os.listdir(pdf_directory) if f.lower().endswith('.pdf'))
    print(f"Found {len(file_names)} PDF file(s) in {pdf_directory}: {file_names}")
    all_preprocessed_data = []

    for file_name in file_names:
        pdf_path = os.path.join(pdf_directory, file_name)
        print(f"\nProcessing file: {file_name}")

        # Extract content from PDF using OCR and Tabula
        pdf_content, table_context_data = extract_pdf_content_with_ocr(pdf_path)

        # Pre-process the content
        normalized_text = text_normalization(pdf_content)
        words = remove_stopwords(word_tokenize(normalized_text))
        lemmatized_words = lemmatize_tokens(words)

        # Run NER on the raw text: normalization lower-cases it and strips
        # punctuation and digits, which removes the casing and numeric cues
        # that entity recognition depends on (names, dates, amounts).
        named_entities = named_entity_recognition(pdf_content)

        # Store the pre-processed data
        all_preprocessed_data.append({
            "file_name": file_name,
            "normalized_text": normalized_text,
            "lemmatized_words": list(lemmatized_words),
            "named_entities": named_entities,
            "table_context_data": table_context_data  # Include tables with their context
        })

    return all_preprocessed_data

# Compare text representations using the real pre-processed data
def run_comparison_with_real_data(preprocessed_data):
    """Print the representation comparison for already pre-processed files.

    Args:
        preprocessed_data: List of dicts that each carry a
            'normalized_text' entry.

    Returns:
        None. Results (or a notice that nothing could be compared) are
        printed.
    """
    if not preprocessed_data:
        print("No preprocessed data available for comparison.")
        return

    normalized_texts = []
    for record in preprocessed_data:
        normalized_texts.append(record['normalized_text'])
    print(f"\nComparing text representations for {len(normalized_texts)} document(s).")

    df_comparison = compare_representations(normalized_texts)
    if df_comparison.empty:
        print("Comparison DataFrame is empty.")
        return

    print("\nText Representation Comparison:")
    print(df_comparison)

# Specify directories
# Folder on the mounted Google Drive that holds the input PDFs.
pdf_directory = "/content/drive/MyDrive/AIML/Capstone-Project/data/LimitedData/PDF"

# Process all files
# Extracts and pre-processes every PDF in pdf_directory; one dict per file.
all_preprocessed_data = process_files(pdf_directory)

# Run comparison using the pre-processed data
# Prints the per-method mean cosine similarity table; nothing is returned.
run_comparison_with_real_data(all_preprocessed_data)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with anothe

Found 2 PDF file(s) in /content/drive/MyDrive/AIML/Capstone-Project/data/LimitedData/PDF: ['KNOWLABS,INC_08_15_2005-EX-10-INTELLECTUAL PROPERTY AGREEMENT.PDF', 'ArmstrongFlooringInc_20190107_8-K_EX-10.2_11471795_EX-10.2_Intellectual Property Agreement.pdf']

Processing file: KNOWLABS,INC_08_15_2005-EX-10-INTELLECTUAL PROPERTY AGREEMENT.PDF
Processing Page 1 of /content/drive/MyDrive/AIML/Capstone-Project/data/LimitedData/PDF/KNOWLABS,INC_08_15_2005-EX-10-INTELLECTUAL PROPERTY AGREEMENT.PDF
Processing Page 2 of /content/drive/MyDrive/AIML/Capstone-Project/data/LimitedData/PDF/KNOWLABS,INC_08_15_2005-EX-10-INTELLECTUAL PROPERTY AGREEMENT.PDF
Processing Page 3 of /content/drive/MyDrive/AIML/Capstone-Project/data/LimitedData/PDF/KNOWLABS,INC_08_15_2005-EX-10-INTELLECTUAL PROPERTY AGREEMENT.PDF

Processing file: ArmstrongFlooringInc_20190107_8-K_EX-10.2_11471795_EX-10.2_Intellectual Property Agreement.pdf
Processing Page 1 of /content/drive/MyDrive/AIML/Capstone-Project/data/LimitedData/PDF