In [2]:
import os
import pdfplumber
import pandas as pd

# Folder containing PDFs
pdf_folder = "My dataset"

# List to store extracted data
data = []

# Iterate over all PDF files
for filename in os.listdir(pdf_folder):
    if filename.endswith(".pdf"):
        file_path = os.path.join(pdf_folder, filename)

        with pdfplumber.open(file_path) as pdf:
            text_data = []
            
            # Extract text from each page
            for page in pdf.pages:
                page_text = page.extract_text()
                if page_text:  # Avoid storing empty pages
                    text_data.append(page_text.strip())

        # Structure data in dictionary format
        extracted_data = {
            "filename": filename,
            "text_sections": text_data  # Store as a list of paragraphs
        }
        
        data.append(extracted_data)

# Convert to DataFrame
df = pd.DataFrame(data)

# Save as a JSON file (better than CSV for structured text)
df.to_json("extracted_text.json", orient="records", indent=4)

# Print the first extracted document
print(df.head())

                  filename                                      text_sections
0  BILLS-118hjres119ih.pdf  [IA\n118TH CONGRESS H. J. RES. 119\n2D SESSION...
1   BILLS-118hr10473ih.pdf  [I\n118TH CONGRESS H. R. 10473\n2D SESSION\nTo...
2    BILLS-118hr1165rh.pdf  [IB\nUnion Calendar No. 673\n118TH CONGRESS H....
3    BILLS-118hr1560ih.pdf  [I\n118TH CONGRESS H. R. 1560\n1ST SESSION\nTo...
4    BILLS-118hr1810ih.pdf  [I\n118TH CONGRESS H. R. 1810\n1ST SESSION\nTo...


In [3]:
import re
import json
from transformers import GPT2Tokenizer

def clean_text(text):
    """Enhanced cleaning function with better space handling"""
    # Remove common unwanted patterns
    text = re.sub(r'VerDate\s[\w\s]+', '', text)
    text = re.sub(r'E:\\BILLS\\[A-Z0-9.]+', '', text)
    text = re.sub(r'Jkt\s\d+', '', text)
    text = re.sub(r'Frm\s\d+\sFmt\s\d+\sSfmt\s\d+', '', text)
    text = re.sub(r'\n+', ' ', text)
    text = re.sub(r'^\d+\s+', '', text)
    
    # Normalize spaces - replace multiple spaces with single space
    text = re.sub(r'\s+', ' ', text).strip()
    
    # Fix unicode characters
    text = text.replace("\u2018", "'").replace("\u2019", "'")
    text = text.replace("\u201c", '"').replace("\u201d", '"')
    text = text.replace("\u2022", '')
    
    return text

def preprocess_for_gpt(raw_data, max_tokens=1024):
    """More robust preprocessing pipeline"""
    # Initialize tokenizer
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    
    # Add special tokens for legal domain
    special_tokens = ["[DOC_SEP]", "SEC", "H.R.", "S.", "U.S.C.", "Fed. Reg.", "§"]
    tokenizer.add_special_tokens({"additional_special_tokens": special_tokens})
    
    processed_data = []
    
    for entry in raw_data:
        try:
            # Apply cleaning
            cleaned_text = clean_text(" ".join(entry["text_sections"]))
            
            # Ensure we have valid text
            if not cleaned_text or cleaned_text.isspace():
                continue
                
            # Add document separator
            formatted_text = f"{cleaned_text}[DOC_SEP]"
            
            # Tokenize and chunk
            tokens = tokenizer.tokenize(formatted_text)
            
            for i in range(0, len(tokens), max_tokens):
                chunk = tokens[i:i + max_tokens]
                
                # Skip chunks that are too short (except last one)
                if len(chunk) < 50 and i != 0:
                    continue
                    
                # Convert tokens to text more safely
                try:
                    chunk_text = tokenizer.convert_tokens_to_string(chunk)
                except KeyError:
                    # Fallback for problematic tokens
                    chunk_text = " ".join(chunk).replace("Ġ", " ")
                    
                processed_data.append({
                    "filename": entry["filename"],
                    "chunk_id": f"{entry['filename']}_{i//max_tokens}",
                    "text": chunk_text,
                    "token_count": len(chunk)
                })
        
        except Exception as e:
            print(f"Error processing {entry.get('filename', 'unknown')}: {str(e)}")
            continue
    
    return processed_data, tokenizer

# Load raw data
with open("extracted_text.json", "r") as f:
    raw_data = json.load(f)

# Process data
processed_data, tokenizer = preprocess_for_gpt(raw_data)

# Save processed data
with open("gpt_ready_data.json", "w") as f:
    json.dump({
        "metadata": {
            "tokenizer_special_tokens": tokenizer.additional_special_tokens,
            "max_tokens": 1024,
            "total_chunks": len(processed_data)
        },
        "data": processed_data
    }, f, indent=4)

# Save tokenizer for later use
tokenizer.save_pretrained("./legal_gpt_tokenizer")

print(f"Preprocessing complete. Generated {len(processed_data)} chunks.")
if processed_data:
    print("Sample chunk:")
    print(processed_data[0]["text"][:200] + "...")

Preprocessing complete. Generated 193 chunks.
Sample chunk:
IA  118 TH  CON GR ESS  H .  J .  RE S.  119  2 D  S ESSION  Prov iding  for  congressional  disapproval  under  chapter  8  of  title  5 ,  United  States  Code ,  of  the  rule  submitted  by  the  ...


In [5]:
import json

with open("gpt_ready_data.json", "r") as f:
    data = json.load(f)

# Validate each entry
for entry in data:
    assert "filename" in entry, "Missing filename!"
    assert "clean_text" in entry, "Missing clean_text!"
    assert isinstance(entry["clean_text"], str), "Text must be a string!"

AssertionError: Missing filename!