# Passim Functionality

In [None]:
import json
from urllib.request import urlopen
import pandas as pd
import sys
import passim
import glob, itertools

### 1. The full text of the Star-Spangled Banner

In [None]:
# Lyrics of the Star-Spangled Banner as one space-separated string. The text
# contains no punctuation: apostrophes appear as spaces (e.g. "dawn s", "hail d").
# It can optionally be given to passim as an extra reference document.
ssb_fulltext = 'O say can you see by the dawn s early light What so proudly we hail d at the twilight s last gleaming whose broad stripes and bright stars through the perilous fight O er the ramparts we watch d were so gallantly streaming And the Rockets red glare the Bombs bursting in air Gave proof through the night that our flag was still there O say does that star spangled Banner yet wave O er the Land of the free and the home of the brave On the shore dimly seen through the mists of the deep Where the foe s haughty host in dread silence reposes What is that which the breeze o er the towering steep As it fitfully blows half conceals half discloses Now it catches the gleam of the morning s first beam In full glory reflected now shines on the stream Tis the star spangled banner O long may it wave O er the land of the free and the home of the brave And where is that band who so vauntingly swore That the havoc of War and the battle s confusion A home and a country should leave us no more Their blood has wash d out their foul foot steps pollution No refuge could save the hireling and slave From the terror of flight or the gloom of the grave And the star spangled banner in triumph doth wave O er the land of the free and the home of the brave O thus be it ever when freemen shall stand Between their lov d home and the war s desolation Blest with vict ry and peace may the Heav n rescued land Praise the power that hath made and preserv d us a nation Then conquer we must when our cause it is just And this be our motto In GOD is our Trust And the star spangled banner in triumph shall wave O er the land of the free and the home of the brave'

### 2. Load the toy dataset

This dataset contains newspaper pages from the Bedford Inquirer published between 1861 and 1865, scraped from Chronicling America. Each page was also hand-keyed to record whether it contains a reprinting of the Star-Spangled Banner: if the `ssb_present` column says 'yes', the corresponding page has a Star-Spangled Banner reprint.

In [None]:
# Load the page table from a CSV given by a relative path, so the file must sit
# in the notebook's working directory. Later cells read its 'url' and
# 'ssb_present' columns.
bedford_inquirer_df = pd.read_csv('bedford_inquirer1861-1865.csv')

In [None]:
# Show the table as loaded.
bedford_inquirer_df

In [None]:
# Keep only the rows whose `ssb_present` value is 'yes', then display what is left.
bedford_inquirer_df = bedford_inquirer_df.loc[bedford_inquirer_df['ssb_present'].eq('yes')]
bedford_inquirer_df

### 3. Preprocess data to be compatible with Passim

In [None]:
# Derive a document id for each page from its URL by removing the site prefix
# and the '/ocr/' segment (both are literal, non-regex replacements).
# `assign` returns a new DataFrame instead of writing a column into the current
# one; writing into a frame that came from boolean filtering of another frame
# makes pandas emit SettingWithCopyWarning. The cell is also safe to re-run.
bedford_inquirer_df = bedford_inquirer_df.assign(
    CA_index=bedford_inquirer_df['url']
    .str.replace('https://chroniclingamerica.loc.gov', '', regex=False)
    .str.replace('/ocr/', '', regex=False)
)

In [None]:
# Inspect the derived ids; they are used as the 'id' of each passim document below.
bedford_inquirer_df['CA_index']

### 4. Run Passim on the dataset

You can optionally add `ssb_fulltext` as an extra document to guide Passim toward Star-Spangled Banner (SSB) reprints, or run Passim on the newspaper pages alone.

In [None]:
# to direct passim toward SSB reprints, add ssb_fulltext transcription to docs
# docs = [{'id': 'star_spangled_banner', 'group': 'transcribed', 'text': ssb_fulltext}]

In [None]:
# This cell (re)binds `docs`, so any value given to `docs` in an earlier cell is
# discarded when the notebook is run top to bottom. To seed passim with the
# Star-Spangled Banner transcription, flip the flag below instead.
INCLUDE_SSB_TRANSCRIPTION = False

# Records to be written for passim; each is a dict with 'id', 'group' and 'text'.
docs = []
if INCLUDE_SSB_TRANSCRIPTION:
    docs.append({'id': 'star_spangled_banner', 'group': 'transcribed', 'text': ssb_fulltext})

In [None]:
# Download the OCR text behind every row's URL and append it to `docs` as a
# passim record. Failures are reported and skipped (best effort), so `docs` may
# end up with fewer entries than the table has rows.
# NOTE: the loop appends, so re-running this cell without resetting `docs`
# adds every page a second time.
# The row label is unused; it is not named `_` because rebinding `_` in a
# notebook interferes with IPython's last-output variable.
for _row_label, row in bedford_inquirer_df.iterrows():
    url = row['url']
    doc_id = row['CA_index']  # not `id`, which would shadow the builtin

    try:
        # Close the HTTP response deterministically rather than leaking it.
        with urlopen(url) as response:
            text = response.read().decode('utf-8')
    except Exception as e:
        print(f"Error fetching or decoding text from {url}: {e}")
    else:
        # 'sn83032006' is presumably the Chronicling America identifier of the
        # newspaper; TODO confirm.
        docs.append({'id': doc_id, 'group': 'sn83032006', 'text': text})

In [None]:
# Serialise the collected records as JSON Lines: one JSON object per line.
with open('in.json', 'w') as jsonl_out:
    for record in docs:
        jsonl_out.write(json.dumps(record) + '\n')

In [None]:
!rm -r out_cluster
!seriatim in.json out_cluster >& out_cluster.err

In [None]:
def read_jsonl_file(f):
    """Parse JSON Lines text into a list of Python objects.

    Args:
        f: An iterable of strings, one JSON document per item (for example an
            open text file).

    Returns:
        A list with one decoded object per non-blank line, in input order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Blank lines (such as a trailing newline) carry no record, so skip them
    # instead of letting json.loads fail on an empty string.
    return [json.loads(line) for line in f if line.strip()]

def read_jsonl(d):
    """Read every ``*.json`` file directly inside directory ``d`` as JSON Lines.

    Args:
        d: Path of the directory to scan (not searched recursively).

    Returns:
        A single list with the records of all matching files. Files are read in
        sorted name order so the result is reproducible between runs; an empty
        list is returned when nothing matches.
    """
    records = []
    for path in sorted(glob.glob(d + '/*.json')):
        # Use a context manager so each handle is closed as soon as it is read,
        # and read as UTF-8 regardless of the platform's default locale.
        with open(path, encoding='utf-8') as part:
            records.extend(read_jsonl_file(part))
    return records

In [None]:
# List what the seriatim run left in out_cluster.
!ls out_cluster

### 5. Review Passim output

Did it catch all the Star-Spangled Banner reprints?

In [None]:
# 'out_cluster/out.json' is used as a directory here: read_jsonl globs the
# '*.json' files inside it and returns all of their records as one list.
read_jsonl('out_cluster/out.json')

Passim identifies 12 (debatable) SSB reprintings; some are just fragments.