In [None]:
import pandas as pd
import json
from datetime import datetime
import requests
import PyPDF2
import io
from dotenv import load_dotenv
import os
from openai import OpenAI

In [None]:
# Notebook-wide configuration constants.

# Host serving the opinion PDFs; each row's `file_name` is appended to it.
URL_BASE = "https://www.ca9.uscourts.gov"

# Prefix of the local input files; a numeric suffix is appended when they are read.
FILE_NAME_BASE = "./opinions-"

# Columns retained from the raw records.
RELEVANT_COLUMNS = [
    "case_name",
    "case_num",
    "case_origin",
    "judge",
    "case_type",
    "short_date",
    "file_name",
]

# Bounds of the date filter applied to the parsed `short_date` column.
SCRAPE_START_DATE = "2018-11-1"
SCRAPE_END_DATE = "2023-11-1"

# Instruction text placed in front of every summarization request.
SUMMARIZATION_PROMPT = (
    "TASK: Summarize the following legal opinion document in 1000 words or less. "
    "In the summary, include the case name and go into detail about the opinion of "
    "the authoring judge and any precedent used or established to support the final decision."
    "\n\n"
)

# Load variables from a local .env file into the process environment so that
# later os.getenv lookups (e.g. for the API key) can find them.
load_dotenv()

In [None]:
# Read opinion files from https://cdn.ca9.uscourts.gov/assets/Public-Data.pdf
#
# Each input file is treated as JSON Lines: one JSON object per line, with the
# record of interest stored under its "Item" key.

jsons = []
for i in range(1, 5):
    file_name = FILE_NAME_BASE + str(i)
    # Explicit UTF-8 keeps decoding independent of the platform's default encoding.
    # The `with` block closes the file, so no manual close() is needed.
    with open(file_name, encoding="utf-8") as f:
        # Iterate the handle directly instead of loading every line into memory first.
        for line in f:
            # A blank line (e.g. a trailing newline at end of file) is not valid JSON; skip it.
            if not line.strip():
                continue
            line_json = json.loads(line)
            jsons.append(line_json["Item"])

In [None]:
# Build a DataFrame from the collected records, keeping only the columns of interest.

df = pd.DataFrame.from_records(jsons)[RELEVANT_COLUMNS]

In [None]:
# Format columns and limit the time range of cases


def _unwrap_string_attribute(value):
    """Return ``str(value["S"])`` for dict cells; pass any other value through unchanged."""
    return str(value["S"]) if isinstance(value, dict) else value


# Replace every cell wrapped as {"S": ...} with its plain string payload.
df = pd.DataFrame({col: df[col].apply(_unwrap_string_attribute) for col in df.columns})

df['date'] = pd.to_datetime(df['short_date'])

# The start bound is exclusive and the end bound is inclusive.
mask = (df['date'] > SCRAPE_START_DATE) & (df['date'] <= SCRAPE_END_DATE)
# Take an explicit copy so that later cells can add columns to `df` without
# writing into a slice of the unfiltered frame (SettingWithCopyWarning).
df = df.loc[mask].copy()

In [None]:
# Shared OpenAI client used by generate_case_summary. The API key is read from the
# OPEN_AI_KEY environment variable; os.getenv returns None when it is not set.
client = OpenAI(api_key=os.getenv("OPEN_AI_KEY"))

# Generate a summary (1000 words or less) for the given case.
def generate_case_summary(case_name, opinion_text, model="gpt-3.5-turbo-1106", max_prompt_tokens=16000):
    """Ask the chat model to summarize one legal opinion.

    The prompt is SUMMARIZATION_PROMPT followed by the case name and the full
    opinion text, sent as a single user message through the module-level `client`.

    Args:
        case_name: Name of the case; must be a string (it is concatenated into the prompt).
        opinion_text: Extracted text of the opinion; must be a string.
        model: Chat model identifier passed to the API.
        max_prompt_tokens: Upper bound (exclusive) on the estimated prompt size.
            Prompts estimated at or above this size are not sent.

    Returns:
        The content of the first choice returned by the API, or None when the
        prompt is estimated to be too large and no request is made.
    """
    curr_prompt = SUMMARIZATION_PROMPT + "**CASE NAME**: " + case_name + "\n**OPINION TEXT**: " + opinion_text

    # Rough size estimate: about four characters per token.
    # NOTE(review): the default budget presumably sits close to the model's total
    # context size, which also has to hold the reply - confirm there is enough headroom.
    estimated_prompt_tokens = len(curr_prompt) * 0.25
    if estimated_prompt_tokens >= max_prompt_tokens:
        # Too long to send; callers get an explicit None and can drop the row.
        return None

    messages = [{"role": "user", "content": curr_prompt}]
    response = client.chat.completions.create(
        model=model,
        messages=messages
    )

    return response.choices[0].message.content


In [None]:
# Scrape and store the opinion text and their summaries
#
# For every case: download the opinion PDF, extract its text page by page, and
# store both the text and a generated summary on the row. Rows whose download or
# PDF parsing fails are reported and skipped, so their new columns stay empty.

# Upper bound for each HTTP wait so a single stalled download cannot hang the run.
REQUEST_TIMEOUT_SECONDS = 30

for position, (index, row) in enumerate(df.iterrows()):
    # Count progress by loop position: index labels are sparse after the date filter.
    if position % 100 == 0:
        print(f"{position}/{len(df)}")

    # `file_name` holds the path of the PDF on the court's site, not the case name.
    opinion_path = row["file_name"]
    case_url = URL_BASE + opinion_path

    try:
        r = requests.get(case_url, timeout=REQUEST_TIMEOUT_SECONDS)
        if r.status_code != 200:
            print(f"Skipping {case_url}: HTTP {r.status_code}")
            continue

        reader = PyPDF2.PdfReader(io.BytesIO(r.content))
        # Join once instead of growing a string page by page; a page that yields
        # no text contributes only its separator.
        pdf_content = "".join((page.extract_text() or "") + "\n\n" for page in reader.pages)
    except (requests.RequestException, PyPDF2.errors.PdfReadError) as exc:
        # One unreachable or malformed document should not abort the whole scrape.
        print(f"Skipping {case_url}: {exc}")
        continue

    df.loc[index, 'opinion_text'] = pdf_content
    # Errors raised by the summarization call itself are not caught here and still propagate.
    df.loc[index, 'case_summary'] = generate_case_summary(row["case_name"], pdf_content)

In [None]:
# Keep only rows in which every column has a value; this removes cases that
# ended up without opinion text or without a summary.
df = df.dropna(axis=0, how="any")

In [None]:
# Persist the final table of cases as a pickle file next to the notebook.

cases_pickle_path = "./cases.pkl"
df.to_pickle(cases_pickle_path)