In [2]:
import json
import re
import numpy as np
import os

# Move the working directory up one level before importing the project-local
# `utils` module (presumably so that `import utils` resolves from the parent
# directory -- confirm against the repository layout).
# NOTE(review): os.chdir("..") is not idempotent -- every re-run of this cell
# moves the working directory up another level, and later relative paths are
# resolved against the new working directory.
os.chdir("..")
import utils

In [77]:
# <STEP 1> load cases

# TODO: using full case hits api limit quickly, is case metadata enough?
# TODO: get unlimited API access

# Query the case.law search endpoint for 'cornell university' with full case
# bodies requested, then decode the JSON payload of the reply.
# `utils.get_request_caselaw` is a project helper; its returned object must
# expose a `.json()` method.
caselaw_url = "https://api.case.law/v1/cases/?search='cornell university'&full_case=TRUE"
response = utils.get_request_caselaw(caselaw_url).json()

In [78]:
# Write the decoded API response to output.json as JSON. The path is relative,
# so the file lands in the current working directory.
with open('output.json', 'w') as out_file:
    json.dump(response, out_file)

Structure of `response` dict:
```
{  
    "count"    : total number of matching cases  
    "next"     : url to query for next 100 cases  
    "previous" : url to query for previous 100 cases  
    "results"  : [ (max length 100)
        {
            "name"          : **case name**
            "decision_date" : case date
            "citations"     : [
                {
                    "cite" : citation string for the case (reporter reference)
                    "type" : citation type (e.g. official or parallel)
                },
                ...
            ]
            "frontend_url"  : clean look at case
            "casebody"      : { (if full_case=FALSE, this field is not present)
                "status" : should equal ok
                "data"   : {
                    "opinions"    : [ (for non-supreme court cases, just one opinion)
                        {
                            "author" : name of judge
                            "type"   : indicates type of opinion (majority/dissent/etc)
                            "text"   : opinion text
                        },
                        ...
                    ]
                    "head_matter" : **case description text**
                }
            }
        },
        ...
    ]
}
```

In [79]:
# `cases` is the very same list object as response['results'] (no copy is
# made), so in-place changes to `cases` are also visible through `response`.
cases = response['results']

In [80]:
# <STEP 2> pre-processing

# get rid of non-ok cases
# Filter first instead of calling cases.remove() inside a loop over `cases`:
# removing while iterating makes the loop skip the element that follows each
# removed one, so some non-ok cases would survive. Slice assignment keeps the
# same list object, so the filtering is still an in-place change.
cases[:] = [case for case in cases if case['casebody']['status'] == 'ok']

for case in cases:
    # get rid of \n from case description
    case_data = case['casebody']['data']
    case_data['head_matter'] = case_data['head_matter'].replace("\n", " ")

In [108]:
# <STEP 3> set up data structures

# Map each case name to its description text (head_matter), keeping only cases
# whose casebody status is 'ok'.
# Note: names are dict keys, so if two cases share a name the later one
# replaces the earlier one.
cases_dict = {
    entry['name']: entry['casebody']['data']['head_matter']
    for entry in cases
    if entry['casebody']['status'] == 'ok'
}

In [114]:
# Count how often each token occurs across all case descriptions.
# A token is a maximal run of the letters a-z in the lower-cased text.
total_tokens = {}
for text in cases_dict.values():
    for token in re.findall(r"[a-z]+", text.lower()):
        total_tokens[token] = total_tokens.get(token, 0) + 1

# Keep only tokens that occur more than 10 times in total
# (a token seen exactly 10 times is dropped).
total_tokens = {
    token: count
    for token, count in total_tokens.items()
    if count > 10
}

In [115]:
# Lookup structures derived from the filtered vocabulary:
#   valid_tokens      -- set of kept tokens, for membership tests
#   token_to_idx_dict -- token -> position of that token in total_tokens
valid_tokens = set(total_tokens)
token_to_idx_dict = {token: position for position, token in enumerate(total_tokens)}

In [116]:
# 2d array of token frequency per case:
#   row    = position of the case in cases_dict (iteration order)
#   column = token index from token_to_idx_dict
# Rows are sized from cases_dict, not cases: the rows are filled by iterating
# cases_dict, which can hold fewer entries than `cases` (duplicate case names
# share one key, and non-ok cases are filtered out). Sizing from `cases` would
# leave unused all-zero rows at the bottom.
cases_tokens = np.zeros((len(cases_dict), len(total_tokens)))
for idx, case_text in enumerate(cases_dict.values()):
    for token in re.findall(r"[a-z]+", case_text.lower()):
        # skip tokens that were filtered out of the vocabulary
        if token in valid_tokens:
            cases_tokens[idx, token_to_idx_dict[token]] += 1

cases_tokens

array([[98., 98., 14., ...,  0.,  0.,  0.],
       [ 5.,  5.,  1., ...,  0.,  0.,  0.],
       [ 4.,  5.,  3., ...,  0.,  0.,  0.],
       ...,
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])