In [1]:
import json
import re
import numpy as np
import pandas as pd
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# NOTE: this moves the kernel's working directory up one level. It is not
# idempotent -- re-running this cell moves up one more level each time -- so
# run it exactly once per kernel session. Relative paths used later in the
# notebook (e.g. the JSON cache file) resolve against the new directory.
# NOTE(review): presumably `utils` lives in the parent directory, which is why
# it is imported only after the chdir -- confirm.
os.chdir("..")
import utils

### Step 1: load cases

In [2]:
# Free-text search phrase. It is used three ways below: to name the local
# cache file, as the `search` term sent to the case-law API, and as the query
# document that every case description is scored against.
QUERY = "neighbor cut tree"

In [3]:
# The cache file is named after the query, with spaces turned into dashes.
file_name = "output_{}.json".format(QUERY.replace(" ", "-"))

if not os.path.exists(file_name):
    # No cached copy yet: fetch the first page of results from the API.
    response = utils.get_request_caselaw("https://api.case.law/v1/cases/?search='{}'&full_case=TRUE".format(QUERY)).json()
    cases = response['results']

    # Follow the pagination links for at most 4 further pages, i.e. 5 requests
    # (up to 500 cases) in total, which should be more than enough.
    for _ in range(4):
        if not response['next']:
            break
        response = utils.get_request_caselaw(response['next']).json()
        cases.extend(response['results'])

    # Persist the combined results so later runs can skip the API entirely.
    with open(file_name, 'w') as cache_file:
        json.dump(cases, cache_file)
else:
    # A previous run already saved the results for this query; reuse them.
    with open(file_name, 'r') as cache_file:
        cases = json.load(cache_file)

Structure of `response` dict:
```
{  
    "count"    : total number of matching cases  
    "next"     : url to query for next 100 cases  
    "previous" : url to query for previous 100 cases  
    "results"  : [ (max length 100)
        {
            "name"          : **case name**
            "decision_date" : case date
            "citations"     : [
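                {
                    "cite" : citation string identifying where the case is reported
                    "type" : kind of citation (e.g. official vs. parallel; TODO confirm against the API docs)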
                },
                ...
            ]
            "frontend_url"  : clean look at case
            "casebody"      : { (if full_case=FALSE, this field is not present)
                "status" : should equal ok
                "data"   : {
                    "opinions"    : [ (for non-supreme court cases, just one opinion)
                        {
                            "author" : name of judge
                            "type"   : indicates type of opinion (majority/dissent/etc)
                            "text"   : opinion text
                        },
                        ...
                    ]
                    "head_matter" : **case description text**
                }
            }
        },
        ...
    ]
}
```

### Step 2: pre-processing

In [4]:
# Keep only the cases whose body came back with status 'ok'.
# The filter builds a new list instead of calling cases.remove() inside a loop
# over `cases`: removing from a list while iterating over it makes the iterator
# skip the element that follows each removal, so some cases would be neither
# status-checked nor cleaned.
cases = [case for case in cases if case['casebody']['status'] == 'ok']

for case in cases:
    # get rid of \n from case description (updates the case dict in place)
    case_data = case['casebody']['data']
    case_data['head_matter'] = case_data['head_matter'].replace("\n", " ")

In [5]:
# Build two parallel lists -- case names and case description texts -- in the
# same order as `cases`, so position i in either list refers to cases[i].
# No de-duplication happens here: repeated names or texts are kept as-is.
# Two comprehensions are used rather than zip(*...) so that an empty `cases`
# list yields two empty lists instead of raising while unpacking.
case_names = [case['name'] for case in cases]
case_texts = [case['casebody']['data']['head_matter'] for case in cases]

### Step 3: find good measure of case similarity

In [6]:
# Fit TF-IDF on the case descriptions plus the query itself. The query is
# appended last, so it becomes the final row of the matrix and is expressed in
# the same feature space as the cases. Note that it therefore also takes part
# in the document-frequency counts.
# min_df / max_df drop terms found in fewer than 5% or more than 80% of the
# documents; rows are L2-normalised.
vec = TfidfVectorizer(min_df=.05, max_df=0.8, max_features=5000, stop_words='english', norm='l2')
tfidf_matrix = vec.fit_transform(case_texts + [QUERY]).toarray()

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer its replacement when available and fall back for older installs, so
# the cell runs on either side of that change. tolist() keeps `tokens` a plain
# list of str, as before.
if hasattr(vec, "get_feature_names_out"):
    tokens = vec.get_feature_names_out().tolist()
else:
    tokens = vec.get_feature_names()
print("tokens considered by vectorizer:")
print(tokens)

tokens considered by vectorizer:
['10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '27', '2d', '30', 'act', 'action', 'adjoining', 'affirmed', 'al', 'alleged', 'app', 'appeal', 'appeals', 'appellant', 'appellants', 'appellee', 'appellees', 'april', 'argued', 'assistant', 'asst', 'attorney', 'attorneys', 'atty', 'brief', 'brown', 'ca', 'case', 'cases', 'cause', 'certain', 'charge', 'charles', 'chief', 'circuit', 'city', 'company', 'corporation', 'counsel', 'county', 'court', 'cut', 'cutting', 'damages', 'david', 'dec', 'december', 'decided', 'defendant', 'defendants', 'denied', 'department', 'did', 'dist', 'district', 'div', 'division', 'does', 'error', 'et', 'evidence', 'fact', 'facts', 'february', 'feet', 'filed', 'gen', 'general', 'george', 'ground', 'having', 'held', 'injury', 'james', 'jan', 'january', 'jj', 'john', 'jones', 'jr', 'judge', 'judges', 'judgment', 'july', 'june', 'jury', 'justice', 'land', 'lands', 'law', 'liable', 'line'

In [7]:
# compute cosine similarity
# The query was appended last when the vectorizer was fitted, so the final row
# is the query vector and every row before it is a case, in `case_names` order.
query_vec = tfidf_matrix[-1]
doc_vecs = tfidf_matrix[:-1]

# Score all cases with one batched call instead of one call per row. The guard
# keeps the "no cases" situation returning an empty list, because
# cosine_similarity rejects an input with zero rows.
scores = list(cosine_similarity(query_vec.reshape(1, -1), doc_vecs)[0]) if len(doc_vecs) else []

# original_rank is the case's position in the API result order. It comes from
# enumerate() rather than case_names.index(name): index() returns the first
# match, so cases sharing a name would all get the rank of the first one (and
# the lookup is quadratic over the whole list).
cosine_similarities = [(name, score, rank) for rank, (name, score) in enumerate(zip(case_names, scores))]

In [8]:
# Tabulate the (name, score, original position) triples, order them from most
# to least similar to the query, and renumber the rows so the index reads as
# the new rank.
unsorted_scores = pd.DataFrame(cosine_similarities, columns=['case_name', 'score', 'original_rank'])
results = (
    unsorted_scores
    .sort_values('score', ascending=False)
    .reset_index(drop=True)
)

# Show the 20 best matches.
results.head(20)

Unnamed: 0,case_name,score,original_rank
0,"George Evans vs. Mayer Tree Service, Inc., & o...",0.439336,0
1,"Charles B. Gibson, Respondent, v. Elizabeth B....",0.354077,5
2,"Benjamin F. Edsall, Resp’t, v. John L. Howell,...",0.353357,3
3,EDSALL v. HOWELL,0.344158,4
4,"Benjamine F. Edsall, Respondent, v. John L. Ho...",0.343977,6
5,GRUBBS v. UNITED STATES,0.321304,40
6,The Chickasaw; O’Neil et al. v. Memphis & W. R...,0.294853,134
7,W. L. Keirn v. Lloyd Warfield,0.28365,14
8,"JAMES GODFREY, by JOHN GODFREY, His Next Frien...",0.277071,29
9,S. H. Buckingham v. T. G. Elliott,0.243214,2
