In [1]:
import fitz
import pandas as pd
import json
from nltk import sent_tokenize
import nltk
import re

# Fetch NLTK's 'punkt' resource at import time; presumably this is the model
# data sent_tokenize relies on -- confirm against the installed NLTK version.
nltk.download('punkt')

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file, passed to ``fitz.open``.

    Returns:
        A single string made of each page's ``get_text()`` output joined
        with no separator. Empty string for a document with no pages.
    """
    # The context manager closes the document (and its file handle) even if
    # text extraction raises part-way through.
    with fitz.open(pdf_path) as doc:
        # One join instead of repeated ``+=`` keeps this linear in text size.
        return "".join(page.get_text() for page in doc)

def clean_text(text):
    """Normalize *text* to lowercase ASCII letters separated by single spaces.

    Every character that is neither an ASCII letter nor whitespace is
    dropped (digits and punctuation included), the remainder is lowercased,
    and runs of whitespace are collapsed; leading and trailing whitespace
    is removed.
    """
    letters_and_spaces = re.sub(r'[^a-zA-Z\s]', '', text)
    words = letters_and_spaces.lower().split()
    return ' '.join(words)

def split_content(content):
    """Split *content* into sentences by delegating to NLTK's ``sent_tokenize``."""
    return sent_tokenize(content)

def create_dataframe(pdf_paths):
    """Build a sentence-level DataFrame from a collection of PDF files.

    Args:
        pdf_paths: Iterable of PDF file paths.

    Returns:
        A ``pandas.DataFrame`` with columns ``id``, ``content`` and
        ``content_type``. Each row is one cleaned sentence; ``id`` has the
        form ``pdf_<pdf number>_<sentence number>`` (both 1-based) and
        ``content_type`` is always ``"text"``. The columns are present even
        when no sentences are produced.
    """
    columns = ["id", "content", "content_type"]
    data = []

    for pdf_number, pdf_path in enumerate(pdf_paths, start=1):
        raw_text = extract_text_from_pdf(pdf_path)

        # Tokenize BEFORE cleaning: clean_text strips all punctuation, and
        # without punctuation the sentence tokenizer has no boundaries to
        # find, so the whole document would come back as a single sentence.
        cleaned = (clean_text(sentence) for sentence in split_content(raw_text))

        # Fragments made only of digits/punctuation clean down to "";
        # drop them so the dataset contains no empty rows.
        kept = [sentence for sentence in cleaned if sentence]

        data.extend(
            {
                "id": f"pdf_{pdf_number}_{sentence_number}",
                "content": sentence,
                "content_type": "text",
            }
            for sentence_number, sentence in enumerate(kept, start=1)
        )

    # Explicit columns keep the schema stable when ``data`` is empty.
    return pd.DataFrame(data, columns=columns)

def save_dataframe_to_parquet(dataframe, output_file):
    """Write *dataframe* to *output_file* in Parquet format via the pyarrow engine."""
    parquet_engine = 'pyarrow'
    dataframe.to_parquet(output_file, engine=parquet_engine)

if __name__ == "__main__":
    # Input PDFs to process; edit this list to point at your own files.
    pdf_files = [
        "data/Colliery_control_order_0_0.pdf",
        "data/cba.pdf",
        "data/DGMScircular1_2023.pdf",
        "data/Indian-Explosives-Act.pdf",
        "data/land_acquisition_act_1894.pdf",
        "data/theminesact1952.pdf",
    ]

    # Destination of the generated sentence dataset.
    output_parquet_file = "dataset.parquet"

    # Extract -> build DataFrame -> persist.
    created_dataframe = create_dataframe(pdf_files)
    save_dataframe_to_parquet(created_dataframe, output_parquet_file)

    print(f"Dataset created and saved to {output_parquet_file}")


Dataset created and saved to dataset.parquet


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/tribhangind/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
