In [4]:
# https://docs.cohere.com/reference/generate

import cohere
import os

co = cohere.Client(os.environ['COHERE_API_KEY'])

prompt = """
From the following historical document text, please grab out the following items and return with the following format
[{{
  "name": "John Doe",
  "age": 30,
  "title": "Farmer"
}},
{{<ITEM 2>}}, {{Item 3>}}, ...]

Here is the text:
{text}"""


response = co.generate(
  prompt='From the following',
)
print(response)


SyntaxError: incomplete input (3923542006.py, line 8)

In [72]:
# pip install -U langchain-community cohere
## Use Langchain https://python.langchain.com/docs/integrations/llms/cohere
# https://api.python.langchain.com/en/latest/llms/langchain_community.llms.cohere.Cohere.html

#from langchain_community.llms import Cohere
# from langchain_core.prompts import PromptTemplate

from langchain_openai import OpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

import json

model = OpenAI()

system_message = """
From the following historical document text, please grab out the following items and return with the following format
[{{
  "name": "John Doe",
  "age": 30,
  "title": "Farmer"
}},
{{<ITEM 2>}}, {{Item 3>}}, ...]

Return only the list of Python dictionary objects and nothing more. Make a dictionary entry for each person mentioned.

Here is the text:
{text}"""

text = "Tim Paulson was a farmer who lived in the 1800s. He was 30 years old and lived in the state of California. He had two kids, John and Jane. John was 12 and Jane was 10. They were both students."

prompt = PromptTemplate(input_variables=['text'], template=system_message)
full_prompt = prompt.format(text=text)

response_raw = model.predict(full_prompt)

response = json.loads(response_raw)

response




[{'name': 'Tim Paulson', 'age': 30, 'title': 'Farmer'},
 {'name': 'John Paulson', 'age': 12, 'title': 'Student'},
 {'name': 'Jane Paulson', 'age': 10, 'title': 'Student'}]

In [33]:
# pip install -U langchain-community cohere
## Use Langchain https://python.langchain.com/docs/integrations/llms/cohere
# https://api.python.langchain.com/en/latest/llms/langchain_community.llms.cohere.Cohere.html

from langchain_community.llms import Cohere
from langchain_core.prompts import PromptTemplate

# from langchain_openai import OpenAI
# from langchain.prompts import PromptTemplate

import json

model = Cohere()

system_message = """
From the following historical document text, please grab out the following items and return with the following format
[{{
  "name": "John Doe",
  "age": 30,
  "title": "Farmer"
}},
{{<ITEM 2>}}, {{Item 3>}}, ...]

Return only the list of Python dictionary objects and nothing more. Make a dictionary entry for each person mentioned.

Here is the text:
{text}

NO EXTRA TEXT BUT THE LIST!!!!"""

text = "Tim Paulson was a farmer who lived in the 1800s. He was 30 years old and lived in the state of California. He had two kids, John and Jane. John was 12 and Jane was 10. They were both students."

prompt = PromptTemplate(input_variables=['text'], template=system_message)
full_prompt = prompt.format(text=text)

response_raw = model.predict(full_prompt)

# response = json.loads(response_raw)

# Regex get the text within ```json<TEXT>``` and return it as a list of dictionaries
import re
pattern = r"```json(.*)```"
match = re.findall(pattern, response_raw, re.MULTILINE | re.DOTALL)[0]

response = json.loads(match)

response








[{'name': 'Tim Paulson', 'age': 30, 'title': 'Farmer'},
 {'name': 'John', 'age': 12, 'title': 'Student'},
 {'name': 'Jane', 'age': 10, 'title': 'Student'}]

## Process contracts.csv

In [58]:
import pandas as pd

base_directory = './input'

df = pd.read_csv(f"{base_directory}/contract-records.csv")
df = df.replace("_x000D_", "", regex=True)

# remap values that have Apprenticeship Agreement to Apprenticeship Agreements
df['sub_category'] = df['sub_category'].replace('Apprenticeship Agreements', 'Apprenticeship Agreement')

df['sub_category'].value_counts()

apprenticeship_agreements = df[df['sub_category'] == 'Apprenticeship Agreement']


In [73]:
import ast

def from_same_document(document1, document2):
    system_message = """
    Your job is to determine whether the following two documents are from the same document or not. Please return a boolean value of True or False. 
    One of the key things to look out for is the same names, locations, or dates referred to in both documents.

    Document 1:
    {document1}

    Document 2:
    {document2}
    """

    prompt = PromptTemplate(input_variables=['document1', 'document2'], template=system_message)
    full_prompt = prompt.format(document1=document1['transcription_text'], document2=document2['transcription_text'])

    print(document1)

    print("\nDOC 2")
    print(document2)

    response_raw = model.predict(full_prompt)

    print(response_raw.strip())

    return ast.literal_eval(response_raw.strip())


In [74]:
apprenticeship_agreements.head()

Unnamed: 0,project_id,category,sub_category,transcription_text,document_url
0,11406,Contracts,Apprenticeship Agreement,Copy\n\nAgreement of Apprenticeship\nBu \nMrs....,https://transcription.si.edu/transcribe/11406/...
1,11406,Contracts,Apprenticeship Agreement,[H 132 ENCL] \n\nCopy\n\nAgreement of Apprenti...,https://transcription.si.edu/transcribe/11406/...
2,11406,Contracts,Apprenticeship Agreement,is agreed on the part of the party of the firs...,https://transcription.si.edu/transcribe/11406/...
3,11406,Contracts,Apprenticeship Agreement,continuance of this agreement & finally that t...,https://transcription.si.edu/transcribe/11406/...
4,15369,Contracts,Apprenticeship Agreement,"[[preprinted]]\nBureau of Refugees, Freedman a...",https://transcription.si.edu/transcribe/15369/...


In [75]:
merge_id = 0
for i in range(0, len(apprenticeship_agreements), 2):
    document1 = apprenticeship_agreements.iloc[i]
    document2 = apprenticeship_agreements.iloc[i+1] if i + 1 < len(apprenticeship_agreements) else None

    if document2 is not None:
        if from_same_document(document1, document2):
            document1['merge_id'] = merge_id
            document2['merge_id'] = merge_id
        else:
            merge_id += 1

    if i > 100:
        break


project_id                                                        11406
category                                                      Contracts
sub_category                                   Apprenticeship Agreement
transcription_text    Copy\n\nAgreement of Apprenticeship\nBu \nMrs....
document_url          https://transcription.si.edu/transcribe/11406/...
Name: 0, dtype: object

DOC 2
project_id                                                        11406
category                                                      Contracts
sub_category                                   Apprenticeship Agreement
transcription_text    [H 132 ENCL] \n\nCopy\n\nAgreement of Apprenti...
document_url          https://transcription.si.edu/transcribe/11406/...
Name: 1, dtype: object

True
project_id                                                        11406
category                                                      Contracts
sub_category                                   Apprenticeship Agreement
trans

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  document1['merge_id'] = merge_id
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  document1['merge_id'] = merge_id
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  document2['merge_id'] = merge_id
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  document2['merge_id'] = merge_id



False
project_id                                                        15369
category                                                      Contracts
sub_category                                   Apprenticeship Agreement
transcription_text    [[preprinted]]\nBureau of Refugees, Freedman a...
document_url          https://transcription.si.edu/transcribe/15369/...
Name: 4, dtype: object

DOC 2
project_id                                                        15369
category                                                      Contracts
sub_category                                   Apprenticeship Agreement
transcription_text    Jim Tew reports that Dick Holmes, Capt� bound ...
document_url          https://transcription.si.edu/transcribe/15369/...
Name: 5, dtype: object

False
project_id                                                        15369
category                                                      Contracts
sub_category                                   Apprenticeship Agreeme

IndentationError: unexpected indent (<unknown>, line 3)