In [10]:
import os
from openai import OpenAI
from dotenv import load_dotenv
# Pull variables from a local .env file into the process environment so the
# API key does not have to be hard-coded in the notebook.
load_dotenv()

# Build the OpenAI client from the OPENAI_API_KEY environment variable.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

In [12]:
# Prompt text for the Old Bailey assistant; it is passed as the `instructions`
# argument when the assistant is created. It fixes the header every answer must
# start with and gives numbered guidance (query structure, navigation, response
# format, ambiguity handling, historical context, no-result and multi-result cases).
instruction = """
EVERY answer should start with the: trial ID, uri, year and date in list format:
- TrialAccount id
- collection
- uri
- year
- date

1. Understanding the Old Bailey Database:
Context: The Old Bailey database contains records of criminal trials held at the Central Criminal Court in London, commonly known as the Old Bailey, from 1674 to 1913.
Content: The database includes detailed accounts of the proceedings, including the names of the accused, the charges, the verdicts, and the sentences.

2. Query Structure:
Specificity: Encourage users to be as specific as possible in their queries to facilitate efficient searches (e.g., specific dates, names, types of crime).
Examples:
"Find all trials involving theft in 1782."
"What was the verdict in the trial of John Doe in 1805?"
"List all female defendants tried for murder between 1800 and 1850."


3. Database Navigation:
Search Filters: Use the search filters effectively:
Date Range: Filter by specific years or periods.
Crime Type: Filter by categories such as theft, murder, assault, etc.
Verdict: Filter by verdicts such as guilty, not guilty, etc.
Defendant Details: Filter by age, gender, or name.


4. Formulating Responses:
Conciseness: Provide clear and concise answers. If the query results in a large number of records, summarize the key details.
Details: Include relevant information such as:
Date of Trial: When the trial took place.
Defendant: Name, age, and gender of the defendant.
Crime: Description of the crime.
Verdict and Sentence: The outcome of the trial and any sentence given.
Examples:
"In 1782, John Smith was tried for theft and found guilty. He was sentenced to transportation for seven years."
"Mary Jones, a 25-year-old woman, was tried for murder in 1805 and acquitted."


5. Handling Ambiguities:
Clarification Requests: If the query is ambiguous or too broad, ask for additional details to narrow down the search.
Example: "Could you specify a date range or type of crime for the search?"


6. Providing Context:
Historical Context: Where relevant, provide brief historical context or explanations to help users understand the significance of certain trials or legal terms used during the period.
Example: "Transportation was a common sentence during the 18th century, where convicts were sent to penal colonies in America or Australia."


7. Error Handling:
No Results Found: Inform the user if no records match their query and suggest alternative search criteria.
Example: "No trials were found for 'John Doe' in 1805. Could you check the spelling of the name or provide a different date?"


8. Multiple Matching records:
Give all accounts with date ranges and summaries
"""

In [13]:
# Register a new assistant that uses the prompt above and has the
# file_search tool enabled.
file_search_tool = {"type": "file_search"}
assistant = client.beta.assistants.create(
    model="gpt-3.5-turbo",
    name="oldbailey test 1",
    instructions=instruction,
    tools=[file_search_tool],
)

In [14]:
# Keep the identifying fields of the created assistant in plain variables.
assistant_id, assistant_model, assistant_name = (
    assistant.id,
    assistant.model,
    assistant.name,
)

Assistant(id='asst_h1AgxVD9To7U1Ccqz80BVJ3a', created_at=1719015992, description=None, instructions='\nEVERY answer should start with the: trial ID, uri, year and date in list format:\n- TrialAccount id\n- collection\n- uri\n- year\n- date\n\n1. Understanding the Old Bailey Database:\nContext: The Old Bailey database contains records of criminal trials held at the Central Criminal Court in London, commonly known as the Old Bailey, from 1674 to 1913.\nContent: The database includes detailed accounts of the proceedings, including the names of the accused, the charges, the verdicts, and the sentences.\n\n2. Query Structure:\nSpecificity: Encourage users to be as specific as possible in their queries to facilitate efficient searches (e.g., specific dates, names, types of crime).\nExamples:\n"Find all trials involving theft in 1782."\n"What was the verdict in the trial of John Doe in 1805?"\n"List all female defendants tried for murder between 1800 and 1850."\n\n\n3. Database Navigation:\nS

In [63]:
# Create a vector store named "OB 1"
vector_store = client.beta.vector_stores.create(name="OB 1")

In [64]:
# Ready the files for upload to OpenAI.
# Only regular files are kept: os.listdir() also returns sub-directories
# (for example Jupyter's ".ipynb_checkpoints"), and open() raises on those.
papers_dir = "JSONsessionsPapers"
file_paths = sorted(
  path for path in os.listdir(papers_dir)
  if os.path.isfile(os.path.join(papers_dir, path))
)
# The handles are opened in binary mode and stay open so a later cell can
# upload them; whoever consumes them is responsible for closing them.
file_streams = [open(os.path.join(papers_dir, path), "rb") for path in file_paths]

In [65]:
def split_into_batches(batch_size=250, streams=None):
  """Split a sequence into consecutive batches of at most ``batch_size`` items.

  Args:
    batch_size: Maximum number of items per batch; must be at least 1.
    streams: Sequence to split. When omitted, the module-level
      ``file_streams`` list is used.

  Returns:
    A list of lists that preserves the input order. Every batch is non-empty,
    so an input whose length is an exact multiple of ``batch_size`` does not
    get a trailing empty batch, and an empty input yields ``[]``.

  Raises:
    ValueError: If ``batch_size`` is smaller than 1.
  """
  if batch_size < 1:
    raise ValueError(f"batch_size must be >= 1, got {batch_size}")
  if streams is None:
    streams = file_streams

  # Stepping the start index by batch_size visits exactly the non-empty slices.
  return [
    list(streams[start:start + batch_size])
    for start in range(0, len(streams), batch_size)
  ]
# Group the opened files into batches of 100 for upload.
f = split_into_batches(batch_size=100)

In [66]:
# Upload every batch to the vector store and wait for it to be processed.
for i, file_stream in enumerate(f, start=1):
  if not file_stream:
    # Nothing to upload in an empty batch; skip the API call.
    continue
  try:
    file_batch = client.beta.vector_stores.file_batches.upload_and_poll(
      vector_store_id=vector_store.id, files=file_stream
    )
  finally:
    # The handles are not needed once the upload call has returned or failed;
    # close them so they do not stay open for the life of the kernel.
    # Re-run the cell that opens the files before retrying an upload.
    for stream in file_stream:
      stream.close()
  # Print the status and the file counts of the batch to see the result of this operation.
  print(file_batch.status)
  print(file_batch.file_counts)
  print(file_batch.to_dict())
  print(f"Done {i}")
  print("-"*50)

completed
FileCounts(cancelled=0, completed=100, failed=0, in_progress=0, total=100)
{'id': 'vsfb_0c5cfa078d4248ed96ef179225838065', 'created_at': 1719020935, 'file_counts': {'cancelled': 0, 'completed': 100, 'failed': 0, 'in_progress': 0, 'total': 100}, 'object': 'vector_store.file_batch', 'status': 'completed', 'vector_store_id': 'vs_B9uUY60RA8E2q1xb0Mt2zS45'}
Done 1
--------------------------------------------------
completed
FileCounts(cancelled=0, completed=100, failed=0, in_progress=0, total=100)
{'id': 'vsfb_78f6abc22d05495fa6d0b3d281727740', 'created_at': 1719020954, 'file_counts': {'cancelled': 0, 'completed': 100, 'failed': 0, 'in_progress': 0, 'total': 100}, 'object': 'vector_store.file_batch', 'status': 'completed', 'vector_store_id': 'vs_B9uUY60RA8E2q1xb0Mt2zS45'}
Done 2
--------------------------------------------------
completed
FileCounts(cancelled=0, completed=100, failed=0, in_progress=0, total=100)
{'id': 'vsfb_bfb1a213c41c40f0bf7a9d51fe89eecf', 'created_at': 1719020

In [67]:
# Point the assistant's file_search tool at the vector store created above.
file_search_resources = {"file_search": {"vector_store_ids": [vector_store.id]}}
assistant = client.beta.assistants.update(
    assistant_id=assistant.id,
    tool_resources=file_search_resources,
)