In [3]:
!pip install pymupdf==1.18.19


Collecting pymupdf==1.18.19
  Downloading PyMuPDF-1.18.19-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (5.1 kB)
Downloading PyMuPDF-1.18.19-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.4/6.4 MB[0m [31m22.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymupdf
  Attempting uninstall: pymupdf
    Found existing installation: PyMuPDF 1.24.10
    Uninstalling PyMuPDF-1.24.10:
      Successfully uninstalled PyMuPDF-1.24.10
Successfully installed pymupdf-1.18.19


In [11]:
# Step 1: Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Step 2: Install Required Libraries
!apt-get install tesseract-ocr
!pip install pytesseract pdfplumber tabula-py

# Step 3: Import Libraries
import os
import re
import tabula
import pdfplumber
import pytesseract
from PIL import Image
import io
import pandas as pd
import spacy
import nltk
import string
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk import pos_tag

# Download necessary NLTK data (one download call per package, in this order)
for nltk_package in ('punkt', 'stopwords', 'averaged_perceptron_tagger', 'wordnet'):
    nltk.download(nltk_package)

# Load Spacy model for Named Entity Recognition
nlp = spacy.load('en_core_web_sm')

# Function for text normalization
def text_normalization(text):
    """Return ``text`` lower-cased with punctuation and digits removed.

    The steps run in this order: lower-case the text, delete every character
    listed in ``string.punctuation``, delete every run of digits, then collapse
    each run of whitespace to a single space and strip both ends.
    Characters outside ``string.punctuation`` (for example curly quotes) are kept.
    """
    punctuation_table = str.maketrans("", "", string.punctuation)
    without_punctuation = text.lower().translate(punctuation_table)
    without_digits = re.sub(r'\d+', '', without_punctuation)
    return re.sub(r'\s+', ' ', without_digits).strip()

# Function to extract tables using Tabula-py and capture context around the table
def extract_tables_and_context(pdf_path, page_text, page_num, previous_page_text=None, lines_above=3, lines_below=3):
    """Read the tables on one PDF page with tabula and attach nearby text lines.

    Args:
        pdf_path: Path of the PDF, passed to ``tabula.read_pdf``.
        page_text: Text of the page; it is split into lines to build the context.
        page_num: Value passed as tabula's ``pages`` argument.
        previous_page_text: Text of the preceding page, or None. When it is
            non-empty, the first table's ``context_above`` is taken from the
            end of this text instead of from ``page_text``.
        lines_above: Maximum number of context lines placed before a table.
        lines_below: Maximum number of context lines placed after a table.

    Returns:
        A list with one dict per table, keyed ``"table_number"`` (1-based),
        ``"table_data"`` (the object tabula returned for that table),
        ``"context_above"`` and ``"context_below"`` (newline-joined lines).
        The list is empty when tabula returns no tables.
    """
    tables = tabula.read_pdf(pdf_path, pages=page_num, multiple_tables=True) or []
    all_lines = page_text.splitlines()
    table_list = []

    for i, table in enumerate(tables):
        # NOTE(review): `i` is the table's position in tabula's result list, not a
        # line number in page_text, and len(table) (presumably the row count when
        # tabula returns DataFrames) is used as a line offset. The slices below are
        # therefore taken near the top of the page wherever the table really sits.
        # Fixing that needs the table's position on the page, which is not
        # available here, so the offsets are left as they were.
        if i == 0 and previous_page_text:
            previous_page_lines = previous_page_text.splitlines()
            # Use an explicit start index: [-lines_above:] with lines_above == 0
            # is [-0:], which would return the whole previous page.
            tail_start = max(0, len(previous_page_lines) - lines_above)
            above_lines = previous_page_lines[tail_start:]
        else:
            above_lines = all_lines[max(0, i - lines_above):i]

        # Capture context below
        below_start = i + len(table)
        below_lines = all_lines[below_start:below_start + lines_below]

        table_list.append({
            "table_number": i + 1,
            "table_data": table,
            "context_above": "\n".join(above_lines),
            "context_below": "\n".join(below_lines)
        })
    return table_list

# Function to extract PDF content with OCR and tables with context
def extract_pdf_content_with_ocr(pdf_path, lines_above=3, lines_below=3, ocr_resolution=300):
    """Extract the text of every page of a PDF, using OCR for pages without text.

    Pages whose ``extract_text()`` result is non-empty also have their tables
    read through ``extract_tables_and_context``. Pages with no extractable text
    are rendered to an image and passed to Tesseract; no table extraction is
    attempted for those pages.

    Args:
        pdf_path: Path of the PDF to read.
        lines_above: Forwarded to ``extract_tables_and_context``.
        lines_below: Forwarded to ``extract_tables_and_context``.
        ocr_resolution: Resolution passed to ``page.to_image`` when a page has
            to be rendered for OCR.

    Returns:
        A tuple ``(full_text, table_context_data)``. ``full_text`` is every
        page's text, each followed by a blank line. ``table_context_data`` is a
        list of ``{"page_number": <1-based>, "tables": [...]}`` dicts, one per
        page that had extractable text.
    """
    text_chunks = []
    table_context_data = []
    previous_page_text = None

    with pdfplumber.open(pdf_path) as pdf:
        for page_number, page in enumerate(pdf.pages, start=1):
            print(f"Processing Page {page_number} of {pdf_path}")
            page_text = page.extract_text()

            if page_text:
                print(f"Extracted Text from Page {page_number}:\n", page_text[:500])

                # Extract tables and context from this page
                tables_with_context = extract_tables_and_context(pdf_path, page_text, page_number, previous_page_text, lines_above, lines_below)
                table_context_data.append({
                    "page_number": page_number,
                    "tables": tables_with_context
                })
            else:
                print(f"Using OCR for Page {page_number}")
                # PageImage.original is already a PIL image, not encoded bytes, so it
                # goes to Tesseract directly; wrapping it in io.BytesIO raised TypeError.
                page_image = page.to_image(resolution=ocr_resolution)
                page_text = pytesseract.image_to_string(page_image.original)

            text_chunks.append(page_text + "\n\n")
            # Remember this page (OCR pages too) so the next page's first table gets
            # context from the page immediately before it, not from an older one.
            previous_page_text = page_text

    return "".join(text_chunks), table_context_data

# Function to process PDF files and extract tables with context
def process_files(pdf_directory):
    """Extract and pre-process every PDF found directly inside ``pdf_directory``.

    Args:
        pdf_directory: Directory whose entries ending in ``.pdf`` (any letter
            case) are processed, in sorted name order.

    Returns:
        A list with one dict per PDF, keyed ``"file_name"`` (name without the
        extension), ``"normalized_text"``, ``"lemmatized_words"`` (a list),
        ``"named_entities"`` and ``"table_context_data"``.
    """
    # Sort so the result order does not depend on os.listdir's arbitrary order,
    # and match the extension case-insensitively so "REPORT.PDF" is not skipped.
    pdf_file_names = sorted(
        entry for entry in os.listdir(pdf_directory)
        if entry.lower().endswith('.pdf')
    )
    all_preprocessed_data = []

    for pdf_file_name in pdf_file_names:
        # Join the name exactly as listed; rebuilding it as stem + ".pdf" would
        # point at a non-existent file when the real extension is upper-case.
        pdf_path = os.path.join(pdf_directory, pdf_file_name)
        file_name = os.path.splitext(pdf_file_name)[0]

        # Extract content from PDF using OCR and Tabula
        pdf_content, table_context_data = extract_pdf_content_with_ocr(pdf_path)

        # Pre-process the content
        normalized_text = text_normalization(pdf_content)
        words, _sentences = tokenize_text(normalized_text)
        words = remove_stopwords(words)
        lemmatized_words = lemmatize_tokens(words)
        # Run NER on the raw text: the normalized text has been lower-cased and
        # stripped of punctuation and digits, which removes the capitalization,
        # dates and amounts that entity recognition relies on.
        named_entities = named_entity_recognition(pdf_content)

        # Store the pre-processed data
        all_preprocessed_data.append({
            "file_name": file_name,
            "normalized_text": normalized_text,
            "lemmatized_words": list(lemmatized_words),
            "named_entities": named_entities,
            "table_context_data": table_context_data  # Include tables with their context
        })

    return all_preprocessed_data

# Directory on the mounted Drive that holds the PDFs to process
pdf_directory = "/content/drive/MyDrive/AIML/Capstone-Project/data/LimitedData/PDF"

# Run the extraction and pre-processing over every PDF in that directory
all_preprocessed_data = process_files(pdf_directory)

# Show a preview of each file's results, then its tables page by page
for file_record in all_preprocessed_data:
    print(f"\nFile: {file_record['file_name']}")
    print(f"Normalized Text:\n{file_record['normalized_text'][:500]}")
    print(f"Lemmatized Words:\n{file_record['lemmatized_words'][:20]}")
    print(f"Named Entities:\n{file_record['named_entities']}")
    for page_entry in file_record['table_context_data']:
        print(f"Page {page_entry['page_number']}:")
        for table_entry in page_entry['tables']:
            print(f"Table {table_entry['table_number']}:\n{table_entry['table_data']}")
            print(f"Context Above:\n{table_entry['context_above']}")
            print(f"Context Below:\n{table_entry['context_below']}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Processing Page 1 of /content/drive/MyDrive/AIML/Capstone-Project/data/LimitedData/PDF/ArmstrongFlooringInc_20190107_8-K_EX-10.2_11471795_EX-10.2_Intellectual Property Agreement.pdf
Extracted Text from Page 1:
 Exhibit 10.2
Execution Version
INTELLECTUAL PROPERTY AGREEMENT
This INTELLECTUAL PROPERTY AGREEMENT (this “Agreement”), dated as of December 31, 2018 (the “Effective Date”) is entered into by and
between Armstrong Flooring, Inc., a Delaware corporation (“Seller”) and AFI Licensing LLC, a Delaware limited liability company (“Licensing” and
together with Seller, “Arizona”) and AHF Holding, Inc. (formerly known as Tarzan HoldCo, Inc.), a Delaware corporation (“Buyer”) and Armstrong
Hardwood Floorin
Processing Page 2 of /content/drive/MyDrive/AIML/Capstone-Project/data/LimitedData/PDF/ArmstrongFlooringInc_20190107_8-K_EX-10.2_11471795_EX-10.2_Intellectual Property Agreement.pdf
Extracted Text from Page 2:
 1. DEFINITIONS AND INTERPRETATION
1.1 Certain Definitions. As used herein, ca

In [12]:
# Export this notebook to HTML with nbconvert.
# NOTE(review): the recorded run of this cell logs "Notebook JSON is invalid" because a
# stream output carries an unexpected 'metadata' key; the HTML file was still written.
# Presumably the conversion reads the last saved copy of the .ipynb — save before running; confirm.
!jupyter nbconvert --to html Capstone_Week1_plus_suggestions.ipynb

[NbConvertApp] Converting notebook Capstone_Week1_plus_suggestions.ipynb to html
[NbConvertApp] ERROR | Notebook JSON is invalid: Additional properties are not allowed ('metadata' was unexpected)

Failed validating 'additionalProperties' in stream:

On instance['cells'][1]['outputs'][0]:
{'metadata': {'tags': None},
 'name': 'stdout',
 'output_type': 'stream',
 'text': 'Drive already mounted at /content/drive; to attempt to forcibly '
         '...'}
[NbConvertApp] Writing 674441 bytes to Capstone_Week1_plus_suggestions.html
