In [None]:
import os, time, json
from datetime import datetime
import boto3
import pandas as pd


## OCR Section
- In this section, we loop through all the files in our data folder to extract all of the text in each PDF. We use Google Cloud (GCP) Vision for this; however, you can switch to any other cloud OCR model of your choice. You will have to create your own GCP project and use your own credentials (GCP provides $300 in free credits for all users, for 3 months).
- To note:
     -  In this project, the professor had no use case for the first page of every PDF, so the code explicitly skips the first page (index 0 in Python's 0-based indexing).


In [None]:
## Point the Google Cloud client libraries at your service-account key file.
## Replace the placeholder below with the path to your own GCP credentials JSON.
os.environ.update(
    GOOGLE_APPLICATION_CREDENTIALS="name of your own json with GCP credentials"
)

In [None]:
import os
import io
import time
import fitz
from google.cloud import vision

# ---------------------
# CONFIG
# ---------------------

DATA_DIR = "Newsletter_OCR_LLM_Project/Data"
OUTPUT_DIR = os.path.join(DATA_DIR, "OCR_versions")

os.makedirs(OUTPUT_DIR, exist_ok=True)

PAGES_TO_SKIP = {0}       # PDF page indexes to skip (0-based)
SLEEP_BETWEEN_CALLS = 0.5 # seconds between API calls
RENDER_DPI = 300          # resolution used when rasterising a page for OCR

client = vision.ImageAnnotatorClient()


def _ocr_page(page):
    """Render one PDF page to PNG and return the text Cloud Vision finds on it.

    Returns the full-text annotation when present, otherwise the first entry
    of ``text_annotations``, otherwise an empty string.

    Raises:
        RuntimeError: if the Vision response carries an error message. Without
            this check a failed call would look like a blank page, and the
            resulting file would be skipped as "already OCR'd" on later runs.
    """
    png_bytes = page.get_pixmap(dpi=RENDER_DPI).tobytes("png")
    response = client.document_text_detection(image=vision.Image(content=png_bytes))

    if response.error.message:
        raise RuntimeError(f"Vision API error: {response.error.message}")

    if response.full_text_annotation and response.full_text_annotation.text:
        return response.full_text_annotation.text
    if response.text_annotations:
        return response.text_annotations[0].description
    return ""


# Looping files for OCR

pdf_files = sorted([f for f in os.listdir(DATA_DIR) if f.lower().endswith(".pdf")])

print(f"Found {len(pdf_files)} PDF files.\n")

for file_name in pdf_files:
    pdf_path = os.path.join(DATA_DIR, file_name)

    # Output filename: <original>_ocr.txt
    base_name = os.path.splitext(file_name)[0]
    out_path = os.path.join(OUTPUT_DIR, f"{base_name}_ocr.txt")

    # Skip if OCR file already exists
    if os.path.exists(out_path):
        print(f"Skipping (already OCR'd): {file_name}")
        continue

    print(f"Processing: {file_name}")

    all_text = []
    newsletter_page_no = 0

    # The context manager closes the PDF even if a page fails part-way through.
    with fitz.open(pdf_path) as doc:
        for pdf_index in range(len(doc)):
            if pdf_index in PAGES_TO_SKIP:
                continue

            newsletter_page_no += 1
            text = _ocr_page(doc.load_page(pdf_index))

            # Annotate for clarity: newsletter numbering excludes skipped pages,
            # the PDF page number is 1-based.
            all_text.append(
                f"\n--- Newsletter Page {newsletter_page_no} (PDF page {pdf_index + 1}) ---\n{text}"
            )

            time.sleep(SLEEP_BETWEEN_CALLS)

    # Save OCR text only once every page succeeded, so an interrupted PDF
    # leaves no file behind and is retried on the next run.
    with open(out_path, "w", encoding="utf-8") as f:
        f.write("".join(all_text))

    print(f"Saved → {out_path}\n")

print("All PDFs processed.")


In [None]:
# Have a look at your OCR output (replace the placeholder with a real file path)
path = "PATH TO OUTPUT HERE"

with open(path, encoding="utf-8") as ocr_file:
    text = ocr_file.read()
print(text)

## The output here is all in a single column, and has certain errors in hierarchy. We need to make sure that any information that is extracted or parsed carries the correct attribution (credits) for each article. To fix this, we use an LLM.

# Creating Parsed versions
### - To create the parsed versions, we write a prompt instructing the LLM to reformat the OCR version into a parsable version. We also ask for metadata extraction, so that every volume is clearly marked. Every article, review, etc. will start with metadata detailing information such as the author, the date, the type of content, and the volume and issue number. Each article then becomes parsable, because the metadata acts as a delimiter, making database creation easier.

In [None]:
# Defining the prompt:
# Instructs the LLM to restructure raw OCR text into labelled sections
# (TITLE / WRITER / ... / CONTENT) without altering the wording. The OCR text
# itself is appended to this prompt by the cells that call the model.
PROMPT = """ 
You are an archivist re-structuring OCR text from historical newsletters.

You will receive the full OCR text (from Google Cloud Vision) for an issue of the *Women Artists Newsletter*.
The OCR text is accurate but unformatted. Your job is to **reconstruct the structure without changing, summarizing, or omitting any content.**

---

### GOAL
Produce a single, plain-text file that:
- contains **every word** of the OCR text, exactly as recognized;
- merges “continued on page …” segments into one continuous article;
- organizes all sections by category (masthead, articles, calendar, advertisements);
- formats each section using a clear, simple, line-based structure.

---

### RULES

1. **Do not summarize, paraphrase, or skip content.**
   Include every line and paragraph from the OCR text verbatim. Please make sure you finish the articles. Do not summarise. I need the ocr versions to be reformatted for readable use. 

2. **Correct structure only.**
   - If a byline (e.g., “--Judy Seigel” or “Written by …”) appears misplaced, attach it to the correct article.
   - Merge multi-page articles that were split across OCR pages.
   - Preserve bullet lists, calendars, and exhibition entries exactly.
   - The final look should be smooth paragraphs, they should not look like a column, but properly formatted to look. 

3. **Categorize each section** as one of the following types:
   - MASTHEAD  
   - ARTICLE  
   - CALENDAR  
   - ADVERTISEMENT  
   - EDITORIAL
   - REVIEW
   - PANEL
   - LETTER
   - SUBSCRIPTION

4. **Output order:**
   1. Masthead / Index (if present)  
   2. Articles  
   3. Calendar / Exhibitions  
   4. Advertisements / Subscriptions  

5. **Output format:**
   Each section should be separated by one blank line and follow this simple labeled structure:

    TITLE: <exact title as printed>
    WRITER: <writer(s) if visible, otherwise "unknown">
    PAGE_NUMBERS: <newsletter page numbers>
    VOLUME: <volume number>
    ISSUE: <issue number>
    SEASON_YEAR: <season and year, e.g., April 1975>
    TYPE: <article | editorial | review | letter | calendar | advertisement | masthead | subscription | panel>
    CONTENT:
    <verbatim OCR text of this section, with full paragraphs and lists preserved>


6. **Formatting details:**
- Do not include extra symbols, brackets, or markers.
- Keep one blank line between sections.
- Preserve all paragraph breaks and spacing from the OCR text.
- Output should be plain text, not JSON or Markdown.

---

### EXAMPLE OUTPUT

TITLE: ARTISTS, DEALERS AND ECONOMICS AT A.I.R. APRIL 7TH  
WRITER: Judy Seigel  
PAGE_NUMBERS: 1–2  
VOLUME: 1  
ISSUE: 1  
SEASON_YEAR: April 1975  
TYPE: ARTICLE  
CONTENT:  
Art Dealers Rosa Esman, Betty Parsons and Virginia Zabriskie, artists Rosemarie Castoro and Laurace James, 
and moderator Maude Boltz opened the Third year of A.I.R.’s Monday evening programs.  
They offered a candid and engaging discussion of the economics of art and the realities of sustaining creative work...  

TITLE: CALENDAR–EXHIBITIONS  
WRITER: unknown  
PAGE_NUMBERS: 3–4  
VOLUME: 1  
ISSUE: 1  
SEASON_YEAR: April 1975  
TYPE: CALENDAR  
CONTENT:  
CECILE ABISH – “Shifting Concern,” Douglass College Campus, New Brunswick, NJ, through May 31.  
PATRICIA ADAMS – Paintings on unstretched canvases, Central Hall Gallery, Port Washington, NY, through April 27.  
ROSEMARIE BECK – Poindexter Gallery, 24 E. 84 St., New York, April 22–May 10.  
(…full list continues)

"""

In [None]:
# We connect to LLMs through Portkey. This cell initialises the client and sends
# one small request to confirm that your system can reach the gateway.
# If the request times out, check that you are on the NYU VPN or NYU Wi-Fi.

import os

from portkey_ai import Portkey

portkey = Portkey(
    base_url="https://ai-gateway.apps.cloud.rt.nyu.edu/v1",
    # Prefer setting PORTKEY_API_KEY in your environment so the key never lands
    # in the notebook; the placeholder is only used when the variable is unset.
    api_key=os.environ.get("PORTKEY_API_KEY", "YOUR OWN API KEY HERE"),
)

response = portkey.chat.completions.create(
    model="@gpt-4o/gpt-4o",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What is Portkey"},
    ],
    # The request parameter is lower-case `max_tokens`; the previous upper-case
    # `MAX_TOKENS` keyword was not the token limit the API recognises.
    max_tokens=512,
)

print(response.choices[0].message.content)


### Great! You got a response! 
- #### If you did not get any response, please make sure of the following:

    - You are within the NYU environment (either via WIFI or VPN)
    - You have the correct API key input in the code
    - You are using a model you have access to (you can check this in the "model catalog" section of the menu)
    - You are in the correct workspace (top left corner of the portkey page)
    - You are in the correct organisation (bottom left corner of the portkey page)
    - You are NOT using Google Colab. (Google Colab works in its own separate environment; NYU VPN/WIFI will NOT help in connecting to the correct environment)



- #### If you passed all of these checks and are still timing out or getting an unknown error, you should a) check the Portkey documentation for any important information, and b) contact the person who helped you get Portkey resources.

### Let's now run the code to make a parsable version of the OCR

In [None]:
from portkey_ai import Portkey

In [None]:
# format_newsletter_single_file.py
# Sends one OCR text file to the LLM with PROMPT (defined in an earlier cell)
# and saves the re-structured, parsable version.
import os
from portkey_ai import Portkey

# === Input and output paths ===
# NOTE(review): the OCR cell writes its files to
# "Newsletter_OCR_LLM_Project/Data/OCR_versions"; confirm INPUT_FILE points at
# the folder where your OCR output actually lives.
INPUT_FILE = "Newsletter_OCR_LLM_Project/OCR_versions/1976_06-01_Vol.2_No.3_compressed_ocr.txt"
OUTPUT_DIR = "Newsletter_OCR_LLM_Project/parsable_versions"

# Create output folder if it doesn't exist
os.makedirs(OUTPUT_DIR, exist_ok=True)

# === Portkey setup ===
portkey = Portkey(
    base_url="https://ai-gateway.apps.cloud.rt.nyu.edu/v1",
    # Prefer the PORTKEY_API_KEY environment variable; the placeholder is only
    # used when the variable is unset.
    api_key=os.environ.get("PORTKEY_API_KEY", "YOUR API KEY HERE"),
)

MODEL = "@bedrock/us.anthropic.claude-sonnet-4-20250514-v1:0"
MAX_TOKENS = 16384

# === Derive output filename ===
# "<name>_compressed_ocr.txt" and "<name>_ocr.txt" both become
# "<name>_parsable.txt". Only the trailing suffix and the real extension are
# touched, so a ".txt" or "_ocr" elsewhere in the name is left alone.
filename = os.path.basename(INPUT_FILE)

stem = os.path.splitext(filename)[0]
for ocr_suffix in ("_compressed_ocr", "_ocr"):
    if stem.endswith(ocr_suffix):
        stem = stem[: -len(ocr_suffix)]
        break
out_name = f"{stem}_parsable.txt"

output_path = os.path.join(OUTPUT_DIR, out_name)

print(f"\nProcessing → {filename}")

# === Load OCR text ===
with open(INPUT_FILE, "r", encoding="utf-8") as f:
    # Normalise Windows / old-Mac line endings to "\n".
    ocr_text = f.read().replace("\r\n", "\n").replace("\r", "\n")

# === Construct messages ===
messages = [
    {"role": "user", "content": f"{PROMPT}\n\nHere is the OCR text for the full newsletter:\n\n{ocr_text}"}
]

# === Generate completion ===
completion = portkey.chat.completions.create(
    messages=messages,
    model=MODEL,
    max_tokens=MAX_TOKENS,
)

choice = completion.choices[0]
# `content` can be None; fall back to an empty string so the write cannot fail.
readable_text = getattr(choice.message, "content", None) or ""

# The prompt demands the complete text, so a reply cut off at the token limit
# is an incomplete file and must not pass silently.
if getattr(choice, "finish_reason", None) in ("length", "max_tokens"):
    print(f"WARNING: output hit the {MAX_TOKENS}-token limit and is truncated: {filename}")

# === Save output ===
with open(output_path, "w", encoding="utf-8") as f:
    f.write(readable_text)

print(f"✔ Saved → {output_path}")
print("\nSingle file processed successfully!")


In [None]:
# format_newsletter_full_issue_txt_loop.py
# Sends every OCR text file in INPUT_DIR to the LLM with PROMPT (defined in an
# earlier cell) and saves one parsable file per issue.
import os
from portkey_ai import Portkey

# === Input and output directories ===
# NOTE(review): the OCR cell writes its files to
# "Newsletter_OCR_LLM_Project/Data/OCR_versions"; confirm INPUT_DIR points at
# the folder where your OCR output actually lives.
INPUT_DIR = "Newsletter_OCR_LLM_Project/OCR_versions"
OUTPUT_DIR = "Newsletter_OCR_LLM_Project/parsable_versions"

# Create output folder if it doesn't exist
os.makedirs(OUTPUT_DIR, exist_ok=True)

# === Portkey setup ===
portkey = Portkey(
    base_url="https://ai-gateway.apps.cloud.rt.nyu.edu/v1",
    # Prefer the PORTKEY_API_KEY environment variable; the placeholder is only
    # used when the variable is unset.
    api_key=os.environ.get("PORTKEY_API_KEY", "YOUR API KEY HERE"),
)

MODEL = "@bedrock/us.anthropic.claude-sonnet-4-20250514-v1:0"
MAX_TOKENS = 16384


def _parsable_name(ocr_filename):
    """Return the output filename for an OCR text file.

    "<name>_compressed_ocr.txt" and "<name>_ocr.txt" both map to
    "<name>_parsable.txt". Only the trailing suffix and the real extension are
    touched, so a ".txt" or "_ocr" elsewhere in the name is left alone.
    """
    stem = os.path.splitext(ocr_filename)[0]
    for ocr_suffix in ("_compressed_ocr", "_ocr"):
        if stem.endswith(ocr_suffix):
            stem = stem[: -len(ocr_suffix)]
            break
    return f"{stem}_parsable.txt"


truncated_files = []

# === Loop through all OCR files ===
# Sorted so the processing order is the same on every run and platform.
for filename in sorted(os.listdir(INPUT_DIR)):

    if not filename.endswith(".txt"):
        continue

    input_path = os.path.join(INPUT_DIR, filename)

    # Output filename: replace _ocr with _parsable
    output_path = os.path.join(OUTPUT_DIR, _parsable_name(filename))

    print(f"\nProcessing → {filename}")

    # Load OCR text, normalising Windows / old-Mac line endings to "\n"
    with open(input_path, "r", encoding="utf-8") as f:
        ocr_text = f.read().replace("\r\n", "\n").replace("\r", "\n")

    # Construct messages
    messages = [
        {"role": "user", "content": f"{PROMPT}\n\nHere is the OCR text for the full newsletter:\n\n{ocr_text}"}
    ]

    # Generate completion
    completion = portkey.chat.completions.create(
        messages=messages,
        model=MODEL,
        max_tokens=MAX_TOKENS,
    )

    choice = completion.choices[0]
    # `content` can be None; fall back to an empty string so the write cannot fail.
    readable_text = getattr(choice.message, "content", None) or ""

    # The prompt demands the complete text, so a reply cut off at the token
    # limit is an incomplete file and must not pass silently.
    if getattr(choice, "finish_reason", None) in ("length", "max_tokens"):
        print(f"WARNING: output hit the {MAX_TOKENS}-token limit and is truncated: {filename}")
        truncated_files.append(filename)

    # Save formatted output
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(readable_text)

    print(f"Saved → {output_path}")

print("\nAll files processed successfully!")

if truncated_files:
    print(f"{len(truncated_files)} file(s) were truncated and should be re-checked: {truncated_files}")


## Let's have a look at the file, and you should see the metadata extraction
- This file was then shared with a software engineer, who parsed all the different articles, authors, etc., and turned it into an online database for the given dataset (can be seen)

In [None]:
# Print one parsable file to inspect the extracted metadata blocks.
path = "Newsletter_OCR_LLM_Project/parsable_versions/Vol_4_Issue_9_parsable.txt"

with open(path, encoding="utf-8") as parsable_file:
    text = parsable_file.read()
print(text)

## Bonus use case: Indexing. 
- The original aim of this project was to create an index for each newsletter. These indexes were created directly from the OCR files. The code cells below contain the prompt used to create an index and the code to run it.

In [6]:
# Indexing prompt
# Asks the LLM for a five-part index (contributors, people covered, articles,
# calendar listings, advertisements) of a single newsletter issue. The OCR text
# is appended to this prompt as plain text by the cell that calls the model.
index_prompt_text = """
You are an archivist assistant working with OCR-extracted content from historical feminist newsletters.

Your task is to create a structured, article-wise index from the provided OCR text of a newsletter issue.

---

Your output must contain these sections, in order:

### 1. CONTRIBUTORS:
List all contributors (people who wrote or signed articles, letters, editorials, reviews, or are explicitly listed in mastheads).  
Format:
- <Name> (role or contribution, e.g., "publisher and editor," "review author of Ellen Banks," "contributing editor")  

Include contributors found in:
- Mastheads, staff boxes, editorial credits
- Signed articles, reviews, or letters  
Do **not** include people who are only mentioned in passing.  

---

### 2. PEOPLE SUBSTANTIALLY COVERED:
List individuals who are the primary focus of an article, review, or extended discussion (e.g., a featured artist, critic, or theorist).  
Format:
- <Name> (short note on how they are covered, e.g., "subject of review," "artist featured in exhibition"), in "<Article Title>"  

This section highlights people who are written about in depth.  
Do **not** include individuals who are only briefly cited or quoted.

---

### 3. ARTICLE INDEX:
List every article in the issue in order of appearance.  
For each article, provide:
- Article Title (as printed)  
- Author(s)  
- Primary subjects (artists, exhibitions, or movements discussed)  

Do not summarize the article — just capture title, author, and subjects.

---

### 4. CALENDAR EXHIBITIONS:
List all artists, galleries, or exhibition spaces that appear under calendar- or listing-type sections (e.g., “Calendar,” “Exhibitions,” “Events, Conferences & Symposia,” “Listings”).  
Format:
- <Artist or Entity Name> (<exact section heading>)

Do not duplicate these entries in the Article Index.

---

### 5. ADVERTISEMENTS:
List all persons, groups, or businesses mentioned on pages flagged as advertisements.  
Format:
- adv(<name or entity>)

---

### GENERAL INSTRUCTIONS:
- Do not organize by page number—organize by **article and section**.  
- Use the newsletter’s stated volume, issue, and season/year where needed (do not infer).  
- Alphabetize entries within each section except the Article Index (which should follow the order of appearance).  
- For names, always include a short, research-useful note on their role or coverage — avoid vague filler.  
- Output must be plain, human-readable, and copy-paste friendly.  
- End the index with "the index ends here".
"""


In [7]:
# Builds an index for one newsletter issue: sends the issue's OCR text to the
# LLM with index_prompt_text (defined in the previous cell) and saves the reply.
import os

from portkey_ai import Portkey

portkey = Portkey(
    base_url="https://ai-gateway.apps.cloud.rt.nyu.edu/v1",
    # Prefer the PORTKEY_API_KEY environment variable; the placeholder is only
    # used when the variable is unset.
    api_key=os.environ.get("PORTKEY_API_KEY", "YOUR API KEY HERE"),
)

# --- File paths ---
# Project-relative, like the other cells, so the notebook is not tied to one
# person's machine. Run it from the folder that contains the project directory.
OCR_PATH = "Newsletter_OCR_LLM_Project/OCR_versions/1976_01-01_Vol.1_No.8_compressed_ocr.txt"
OUT_FILE = "Vol_1_Issue_8_index.txt"

MODEL = "@bedrock/us.anthropic.claude-sonnet-4-20250514-v1:0"
MAX_TOKENS = 16384

# --- Load OCR text ---
with open(OCR_PATH, "r", encoding="utf-8") as f:
    # Normalise Windows / old-Mac line endings to "\n".
    ocr_text = f.read().replace("\r\n", "\n").replace("\r", "\n")

# --- Construct messages ---
messages = [
    {"role": "user", "content": f"{index_prompt_text}\n\nHere is the OCR text for the full newsletter:\n\n{ocr_text}"}
]

# --- Generate completion ---
completion = portkey.chat.completions.create(
    messages=messages,
    model=MODEL,
    max_tokens=MAX_TOKENS,
    temperature=0,
)

# --- Extract content safely ---
# `content` can be missing or None; fall back to an empty string so the write
# below cannot fail.
readable_text = getattr(completion.choices[0].message, "content", None) or ""

# --- Save as plain text ---
with open(OUT_FILE, "w", encoding="utf-8") as f:
    f.write(readable_text)

print(f"Text file saved → {OUT_FILE}")


Text file saved → Vol_1_Issue_8_index.txt


In [None]:
# Print a generated index file (replace the placeholder with a real file path).
path = "YOUR_FILE_PATH"

with open(path, encoding="utf-8") as index_file:
    text = index_file.read()
print(text)