In [1]:
import json
import re
import numpy as np
import pandas as pd
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

os.chdir("..")
import utils

### Step 1: load cases

In [2]:
QUERY = "neighbor cut tree"

In [3]:
file_name = "output_{}.json".format(QUERY.replace(" ", "-"))

if os.path.exists(file_name):
    # load data
    with open(file_name, 'r') as f:
        cases = json.load(f)
else:
    # query data
    response = utils.get_request_caselaw("https://api.case.law/v1/cases/?search='{}'&full_case=TRUE".format(QUERY)).json()
    cases = response['results']
    
    i = 1 # limit to 5 requests (500 cases) because that should be more than enough
    while response['next'] and i < 5: 
        response = utils.get_request_caselaw(response['next']).json()
        cases.extend(response['results'])
        i += 1
    
    with open(file_name, 'w') as f:
        json.dump(cases, f)

Structure of `response` dict:
```
{  
    "count"    : total number of matching cases  
    "next"     : url to query for next 100 cases  
    "previous" : url to query for previous 100 cases  
    "results"  : [ (max length 100)
        {
            "name"          : **case name**
            "decision_date" : case date
            "citations"     : [
                {
                    "cite" : name of law
                    "type" : ??
                },
                ...
            ]
            "frontend_url"  : clean look at case
            "casebody"      : { (if full_case=FALSE, this field is not present)
                "status" : should equal ok
                "data"   : {
                    "opinions"    : [ (for non-supreme court cases, just one opinion)
                        {
                            "author" : name of judge
                            "type"   : indicates type of opinion (majority/dissent/etc)
                            "text"   : opinion text
                        },
                        ...
                    ]
                    "head_matter" : **case description text**
            }
        },
        ...
    ]
}
```

### Step 2: pre-processing

In [7]:
for case in cases:
    # get rid of non-ok cases
    if case['casebody']['status'] != 'ok':
        cases.remove(case)
        continue

    # get rid of \n from case description
    case['casebody']['data']['head_matter'] = case['casebody']['data']['head_matter'].replace("\n", " ")
    
case['casebody']

{'data': None, 'status': 'error_limit_exceeded'}

In [5]:
# enforce case ordering and also remove duplicates
case_names, case_texts = zip(*[(case['name'], case['casebody']['data']['head_matter']) for case in cases])
case_names = list(case_names)
case_texts = list(case_texts)

TypeError: 'NoneType' object is not subscriptable

### Step 3: find good measure of case similarity

In [None]:
vec = TfidfVectorizer(min_df=.05, max_df=0.8, max_features=5000, stop_words='english', norm='l2')
tfidf_matrix = vec.fit_transform(case_texts + [QUERY]).toarray()
tokens = vec.get_feature_names()
print("tokens considered by vectorizer:")
print(tokens)

In [None]:
# compute cosine similarity
query_vec = tfidf_matrix[-1]
scores = [cosine_similarity(query_vec.reshape(1,-1), doc_vec.reshape(1,-1))[0][0] for doc_vec in tfidf_matrix[:-1]]
cosine_similarities = [(k,v, case_names.index(k)) for k,v in zip(case_names, scores)]
print(query_vec)

In [None]:
# display results
results = pd.DataFrame(cosine_similarities, columns=['case_name', 'score', 'original_rank']).sort_values('score', ascending=False).reset_index(drop=True)
results.head(20)