In [145]:
import requests
import os
import utils
import json
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from numpy import linalg as LA
import math

# For text summarization
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from collections import Counter
from nltk.tokenize import word_tokenize, sent_tokenize
import string 

# Display Pretty JSON

In [2]:
def pretty_json(dictionary):   
    formatted_json = json.dumps(dictionary, indent=4, sort_keys=True) # type 'str'
    print(formatted_json)

# Using the CAPAPI

The Caselaw Access Project API, also known as CAPAPI, serves all official US court cases published in books from 1658 to 2018. The collection includes over six million cases scanned from the Harvard Law Library shelves.

In [152]:
"""
Returns either 'appeals' or 'trial'
"""
def get_court(case_json):
    text = case_json["casebody"]["data"]["opinions"][0]['text']
    stem = PorterStemmer()
    for sentence in sent_tokenize(text):
        for word in word_tokenize(sentence):
            root = stem.stem(word)
            if root == stem.stem('appell') or root == stem.stem('remanded'):
                return 'appeals'
    return 'trial'
                

"""
case_json: request response object
Returns court id
"""
def get_name(case_json):
    return case_json["name"]

"""
case_json: request response object
Returns court id
"""
def get_id(case_json):
    return case_json["id"]


"""
case_json: request response object
Returns full-text of case
"""
def get_full_text(case_json):
    return case_json["casebody"]["data"]["opinions"][0]['text']

### Retreive single case by ID

In [153]:
single_case = utils.get_request_caselaw('https://api.case.law/v1/cases/1589229/?full_case=true').json() # actually a 'dict'
pretty_json(single_case)
get_court(single_case)

{
    "casebody": {
        "data": {
            "attorneys": [
                "Harold Sheats, Martin H. Peabody, for plaintiff in error.",
                "Wilson, Branch ,& Barwick, M. Cook Barwick, Thomas S. Bentley, contra."
            ],
            "corrections": "",
            "head_matter": "38245.\nFULTON COUNTY CIVIL COURT v. ELZEY.\nDecided April 14, 1960.\nHarold Sheats, Martin H. Peabody, for plaintiff in error.\nWilson, Branch ,& Barwick, M. Cook Barwick, Thomas S. Bentley, contra.",
            "judges": [
                "Townsend, Carlisle, 'and Frankum, JJ., concur."
            ],
            "opinions": [
                {
                    "author": "Gardner, Presiding Judge.",
                    "text": "Gardner, Presiding Judge.\nThis court has, on innumerable occasions, held that where an employee is injured in the scope of his employment and the evidence before the State Board of Workmen\u2019s Compensation reflects that such was the case, an appellate c

'appeals'

### Phrase Search

In [154]:
case_json = utils.get_request_caselaw('https://api.case.law/v1/cases/?search="car accident"&full_case=true').json()
case_json

{'count': 15015,
 'next': 'https://api.case.law/v1/cases/?cursor=eyJwIjogWzEwLjg0NDQ2MywgMTIwOTYwMDAwMDAwMCwgMzY2ODE4OF19&full_case=true&search=%22car+accident%22',
 'previous': None,
 'results': [{'id': 586727,
   'url': 'https://api.case.law/v1/cases/586727/',
   'name': 'ANTONIO FABIANO, A MINOR, BY HIS NEXT FRIEND GIUSTINA FABIANO AND GIUSTINA FABIANO, PLAINTIFFS-APPELLEES, v. ROBERT BERCKES, DEFENDANT-APPELLANT',
   'name_abbreviation': 'Fabiano ex rel. Fabiano v. Berckes',
   'decision_date': '1923-07-14',
   'docket_number': '',
   'first_page': '406',
   'last_page': '407',
   'citations': [{'type': 'official', 'cite': '1 N.J. Misc. 406'}],
   'volume': {'volume_number': '1',
    'barcode': '32044078458874',
    'url': 'https://api.case.law/v1/volumes/32044078458874/'},
   'reporter': {'full_name': 'New Jersey Miscellaneous Reports',
    'id': 324,
    'url': 'https://api.case.law/v1/reporters/324/'},
   'court': {'name_abbreviation': 'N.J.',
    'slug': 'nj-3',
    'id': 24657

### Full-text Search

In [155]:
utils.get_request_caselaw('https://api.case.law/v1/cases/?search=university published false article libel&full_case=true&ordering=decision_date').json()

{'count': 981,
 'next': 'https://api.case.law/v1/cases/?cursor=eyJwIjogWy01NTc4ODQ4MDAwMDAsIDExMzM5MDM2XX0%3D&full_case=true&ordering=decision_date&search=university+published+false+article+libel',
 'previous': None,
 'results': [{'id': 6597062,
   'url': 'https://api.case.law/v1/cases/6597062/',
   'name': 'UNITED STATES v. COOPER',
   'name_abbreviation': 'United States v. Cooper',
   'decision_date': '1800-04-30',
   'docket_number': '',
   'first_page': '631',
   'last_page': '646',
   'citations': [{'type': 'official', 'cite': '25 F. Cas. 631'},
    {'type': 'parallel', 'cite': 'Whart. St. Tr. 659'}],
   'volume': {'volume_number': '25',
    'barcode': 'NOTALEPH001463',
    'url': 'https://api.case.law/v1/volumes/NOTALEPH001463/'},
   'reporter': {'full_name': 'Federal Cases',
    'id': 942,
    'url': 'https://api.case.law/v1/reporters/942/'},
   'court': {'name_abbreviation': 'C.C.D. Pa.',
    'slug': 'ccd-pa',
    'id': 9999,
    'name': 'United States Circuit Court for the Dis

In [156]:
test_case_response.json()

NameError: name 'test_case_response' is not defined

### Add a Date Range Filter

In [None]:
'&decision_date_min=1990-12-30&decision_date_max=1995-12-30'

# Useful Data Structures

In [9]:
# The parts of the response we are interested in are the case name, case text, judge opinions, 
# and laws / regulations cited in the case.

data = list()
for result in case_json['results']:
    
    case_id = get_id(result)
    case_name = get_name(result)
    case_text = get_full_text(result)
    
    result_dict = {'case_id' : case_id, 'case_name' : case_name, 'text' : case_text}
    data.append(result_dict)

In [151]:
trial_count = 0
appeals_count = 0
for result in case_json['results']:
    court = get_court(result)
    if court == 'appeals':
        appeals_count += 1
    else:
        trial_count += 1
        
print("Number of trial cases: %d" % trial_count)
print("Number of appeals cases: %d" % appeals_count)

Number of trial cases: 54
Number of appeals cases: 46


In [11]:
num_cases = case_json['count']
num_cases

15015

In [12]:
print("Loaded {} case transcripts".format(num_cases))
print("Each case transcript is a dictionary with the following keys...")
print(data[0].keys())


# Here, we will assign an index for each movie_id. This index will help us access data in numpy matrices.
case_id_to_index = {case_id:index for index, case_id in enumerate([d['case_id'] for d in data])}

# We will also need a dictionary maping movie names to movie ids
case_name_to_id = {name:case_id for name, case_id in zip([d['case_name'] for d in data],
                                                     [d['case_id'] for d in data])}
case_id_to_name = {v:k for k,v in case_name_to_id.items()}

# and because it might be useful...
case_name_to_index = {name:case_id_to_index[case_name_to_id[name]] for name in [d['case_name'] for d in data]}
case_index_to_name = {v:k for k,v in case_name_to_index.items()}

Loaded 15015 case transcripts
Each case transcript is a dictionary with the following keys...
dict_keys(['case_id', 'case_name', 'text'])


# Judgement Metric

In [69]:
"""
Returns true if a string contains a digit character, False otherwise
word: a str
"""
def contains_digit(line):
    return any(char.isdigit() for char in line)

def contains_punctuation(line):
    return any(char in string.punctuation for char in line)
    

"""
Returns a term-frequency dict with (term, frequency) key-value pairs
text_string: a str to create the term-freq dict from
"""
def create_tf_dict(text_string):
    
    # Remove stop words
    text_string = text_string.lower()
    stop_words = set(stopwords.words("english"))
    
    words = word_tokenize(text_string)
    
    # Reduce words to their root form
    stem = PorterStemmer()
    
    # Create dictionary for the word frequency table
    tf_dict = dict()
    for wd in words:
        # Remove puncutation by turning puncutation to ''
        wd = wd.translate(str.maketrans('', '', string.punctuation))
        # Stem
        wd = stem.stem(wd)
        
        if wd in stop_words or wd == '':
            continue
        if wd in tf_dict:
            tf_dict[wd] += 1
        else:
            tf_dict[wd] = 1
    
    return tf_dict


In [144]:
stem = PorterStemmer()
word = stem.stem('appeal')
word

'appeal'

In [96]:
def create_sentence_scores(sentences, tf_dict, n_chars=10):   

    # Algorithm for scoring a sentence by its words
    sentence_weight_dict = dict()

    for sentence in sentences:
        num_words = (len(word_tokenize(sentence)))
        num_words_minus_stop_words = 0
        first_n_chars = sentence[:n_chars]
        
        for word in tf_dict:
        
            if word in sentence.lower():
                num_words_minus_stop_words += 1
                
                if not (contains_digit(first_n_chars) or contains_punctuation(first_n_chars)):

                    if first_n_chars in sentence_weight_dict:
                        sentence_weight_dict[first_n_chars] += tf_dict[word]
                    else:
                        sentence_weight_dict[first_n_chars] = tf_dict[word]
        
        if not (contains_digit(first_n_chars) or contains_punctuation(first_n_chars)):
            # Additive smoothing to avoid infinities
            sentence_weight_dict[first_n_chars] = ((sentence_weight_dict[first_n_chars]+1) / (num_words_minus_stop_words+1))
      
    return sentence_weight_dict


In [97]:
def mean_sentence_score(sentence_weight_dict):
   
    # Calculating the average score for the sentences
    sum_weights = 0
    for sentence in sentence_weight_dict:
        sum_weights += sentence_weight_dict[sentence]

    # Getting sentence average value from source text
    average_score = (sum_weights / len(sentence_weight_dict))

    return average_score

In [98]:
def create_summary(sentences, sentence_weight, threshold, n_chars=10):
    sentence_counter = 0
    article_summary = ''

    for sentence in sentences:
        if sentence[:n_chars] in sentence_weight and sentence_weight[sentence[:n_chars]] >= (threshold):
            article_summary += " " + sentence
            sentence_counter += 1

    return article_summary

In [132]:
def case_summary(case_text):
    
    # creating a tf dictionary
    tf_dictionary = create_tf_dict(case_text)

    # tokenize sentences
    sentences = sent_tokenize(case_text)

    # algorithm for scoring a sentence by its words
    sentence_scores = create_sentence_scores(sentences, tf_dictionary)

    #getting the threshold
    threshold = mean_sentence_score(sentence_scores)

    #producing the summary
    article_summary = create_summary(sentences, sentence_scores, 1.3 * threshold)

    return article_summary

# Text Summarization

In [133]:
from IPython.display import display, Markdown, Latex

In [136]:
## Original Text
text = data[3]['text']
display(Markdown(text))

|,THOMAS F. DALEY, Judge.
Appellant, Direct General Insurance Company of Louisiana (Direct General), filed a Petition for Concursus in this matter and deposited into the registry of the Court the sum of One Hundred Thousand ($100,000.00) Dollars, representing the “per person” limits of the policy of automobile liability insurance it issued to Delecia Allen, in connection with a motor vehicle accident that occurred in St. James Parish on September 26, 2001, which accident resulted in the death of Eugene Keller.
The biological children and heirs of Eugene Keller, Jr. (Kellers), filed an Answer to the concursus petition and a Reconven-tional Demand in which they claimed that they were entitled to the entire Three Hundred Thousand ($300,000.00) Dollars “per accident” limits of the policy issued by Direct General.
Both Direct General and the Kellers each filed a Motion for Summary Judgment on the issue of whether the “per person” limits of One Hundred Thousand ($100,000.00) Dollars or the “per accident” limits of Three Hundred | .¡Thousand ($300,000.00) Dollars applied to the claims of the Kellers under the policy.
On May 16, 2002, the trial court entered judgment granting the Motion for Summary Judgment of the Kellers and denying the Motion for Summary Judgment of Direct General, in effect finding that the Three Hundred Thousand ($300,000.00) Dollars “per accident” limits of the policy applied.
In the Reasons for Judgment issued on June 17, 2002, the trial judge stated, “the three children of Eugene Keller, Jr. each suffered individual injuries, including the loss of services, as a result of the death of their father.” Further, the trial court stated that “each child suffered bodily injury, as defined by the policy, caused by one car accident.” The judge ordered Direct General Insurance Company to pay policy limits of Three Hundred Thousand ($300,000.00) Dollars to the Kellers.
Direct General appeals the decision of the trial court. Specifically, the trial court’s Order that Direct General pay policy limits of Three Hundred Thousand ($300,000.00) Dollars to the Kellers. Direct General requests that the trial court’s decision to deny its Motion for Summary Judgment be reversed, that the trial court’s Order that Direct General pay policy limits of Three Hundred Thousand ($300,000.00) Dollars to the Kellers be reversed, and the Direct General Motion for Summary Judgment be granted, finding the applicable limits of the policy to be One Hundred Thousand ($100,000.00) Dollars.
The policy of automobile liability insurance issued by Direct General to Delecia Allen provides:
Definitions
Damages means the cost of compensating those who suffer bodily injury or property damage from a car accident.
| ^Liability
Bodily Injury Covered By This Insurance. This insurance covers bodily injury, including loss of services, sickness, disease or death that results from the injury, caused by a car accident and suffered by any person. Bodily injury does not mean a person’s emotional injury or mental anguish which resulted from witnessing an injury to another person or which otherwise resulted from the injury to another person.
Limits of Liability
The limit of liability shown in the declarations for “each person” for bodily injury liability is our maximum limit of liability for all damages for bodily injury to any one person, including damages for care, loss of services or death, arising out of bodily injury sustained by any one person in any one car accident. Subject to this limit for “each person” the limit of liability shown on the declarations for “each accident for bodily injury liability is our maximum limit of liability for all damages resulting from bodily injury sustained by two or more persons in any one car accident.
The accident, which forms the basis of this suit, involved bodily injury to and the death of Eugene Keller, Jr.
Appellants argue that according to the precise terms of the insurance policy the limit for “each person applies to all claims resulting from Eugene Keller, Jr.’s bodily injury.” They point out that the policy specifically states that bodily injury does not mean a person’s emotional injury or mental anguish which resulted from witnessing an injury to another person or which otherwise resulted from the injury to another person. Appellants assert that the damages that the Kellers children are entitled to all involve emotional or mental anguish that resulted from injury to Mr. Keller.
The Kellers children argue that because the terms of the policy include “loss of services” within the definition of “Bodily Injury Covered”, that each of Mr. Keller’s children had an individual claim. The policy states, “This insurance covers bodily injury, including loss of services, sickness, disease or death that results from the injury, caused by a car accident and suffered by any person.”
|sThe Kellers assert that the three children of Eugene Keller suffered individual injuries as a result of his death. These injuries were caused by one car accident; therefore, the policy provides coverage for the injuries the children suffered, i.e., “bodily injury, including loss of services, sickness, disease, or death that resulted from the car accident, as the children’s injuries fit within the policy provision of injuries “suffered by any person”.
In this case, each child suffered bodily injury as defined by the policy, (ie. Loss of services) because of a car accident. All three of the children’s injuries occurred or arose from the car accident that killed their father. These individual injuries to the children entitle each of them to a per person limit on the policy, because of the following provision: “limit liability for all damages resulting from bodily injury sustained by two or more persons in any one car accident.”
Where the language of a policy provision is subject to two reasonable interpretations, then the interpretation, which favors coverage, must be applied, Crabtree v. State Farm Ins. Co., 632 So.2d 736 (La.1994), 745. Since the policy provisions in Crabtree are essentially the same as the provisions in this case the same analysis leads us to the conclusion that under the particular language of this policy each child is entitled to a per person limit of One Hundred Thousand ($100,000.00) Dollars for a total of Three Hundred Thousand ($300,000.00) Dollars.
All parties agree that these plaintiffs must establish the amount of damages to which each is entitled. Thus, the trial court’s ruling on the Motion for Summary Judgment is affirmed, the trial court award of damages is reversed, and the matter is remanded back to the trial court where the case will proceed to trial, so plaintiffs can prove the amount of damages each child suffered as a result of their individual injuries.
AFFIRMED IN PART; REVERSED IN PART; AND REMANDED.

In [135]:
x = case_summary(text)
display(Markdown(x))

 The Fund’s sub-rogation interests, to the full extent of benefits paid or due as a result of the occurrence causing the injury or illness, shall next be deducted. The Fund’s sub-rogation interest will extend to all amounts recovered irrespective of how they are denominated in the settlement of judgment. Relevant here, the Plan also provides for future medical expenses as follows:
Section 12.3 Settlement or Recovery
[...]
Once a settlement is reached, additional bills cannot be submitted with respect to the same injury. The extent of her injury is disputed; Plaintiff contends that the fall left an “open wound,” but the Fund disputes this characterization, pointing to journal entries stating that it resulted in a “nice scratch.” (PL’s 56.1 ¶23; Def.’s 56.1 Resp. Relevant here, the EOBs included the following explanation for the denial: “No benefits- released—services related* to closed third party liability file” and “No further plan benefits for related illness/injury due to previous injury for which- settlement was obtained.” (Defi’s 56.1 Resp.' The spinal cord stimulating system was explanted on September 28, 2015. The Plan reviewed the letter but did not change its assessment that the Claims were “[djirectly related- to the implant placed in 2013,” and thus not covered by the Plan. In its submission to the Medical Review Institute of America, the Fund wrote that “the claimant indicates she had a fall down the stairs that caused the battery pack infection,” noted in uppercase, text. The spinal cord stimulator (SCS) was- implanted specifically to treat the patient’s pain that developed due to her CRPS. The Trustees denied the appeal and the Fund issued a ’final internal adverse benefit determination, which concluded that “complications from the explantation of the spinal cord stimulator (SCS) and subsequent admission to Rush University are related to the implanted SCS placed bn 9/27/13 and the motor vehicle accident (MVA) on 4/20/13.” (Pl.’s’56.1 Resp. The denial letter cites to Section 12.3 of the Plan excluding charges for the “same injury” and states that Sadowski’s personal injury attorney was warned that the Plan would pay no benefits for “injuries attributed to” the Car Accident after settlement. The denial explains that Sa-dowski “developed an infection of the implant/battery pack and on September 28, 2015 went to the Ambulatory Surgical center at Rush for removal of the Stimulator unit.... Because the charges were directly related to the implant (covered by a settled third party claim) [sic] The Claims associated with the removal of the implant and subsequent hospitalization, were denied according to section 12;3 of the plan.” (Id.) The denial relies heavily on Dr. Kit-telberger’s medical review, concluding that the injuries were “related.” (Id.) Sadowski’s attorney indicates there was a fall down the stairs which caused the infection,” but it does not address Dr. Lube-now’s letter, statements at the hospital, or Sadowski’s journal entries. Standard of Review under ERISA
“A court reviews a plan administrator’s denial of benefits de novo unless the plan gives the administrator discretionary1 authority to determine eligibility for benefits.” Hackett v. Xerox Corp. An administrator’s determination may be arbi-. These documents are “given primary effect and strictly -enforced and plan administrators must adhere to ‘the bright-line requirement to follow plan documents in distributing benefits.’” Young v. Verizon’s Bell Atl. Where administrators “controvert the plain meaning of [an ERISA] plan, their actions are arbitrary and capricious.” Swaback v. Am. Laborers’ Pension Fund v. Heinz, 541 U.S. 739, 743, 124 S.Ct. The question before the Court is whether the Fund’s denial of Sadowski’s Claims under Section 12.3 of the Plan was arbitrary and capricious. The Fund found that “[t]he complications from the explan-tation of the spinal cord stimulator (SCS) and subsequent admission to Rush University [were] related to the implanted SCS placed on 9/27/13 and the motor vehicle accident (MVA) on 4/20/13.” (Def.’s 56.1 ¶ 20; PL’s 56.1 Resp. It is unclear whether the Fund’s denial of benefits was based on (1) its interpretation of the Plan language that the medical expenses at issue were “with respect to the same injury” because the Claims involved the spinal cord stimulator, or (2) its factual conclusion that the infection was not caused by Sadowski’s fall down the stairs, but rather by the Car Accident. “Those principles require that Plan terms be interpreted in ‘an ordinary and popular sense, as they would, be understood by a person of average intelligence and experience.’ ” Id. The ordinary and popular meaning of “same” is a narrow one, being synonymous with identical, and therefore Section 12.3’s “same injury” language is fairly read as excluding bills with respect to an “identical” “hurt or loss sustained by” Sadowski in the Car Accident. Although this Court will uphold an administrator’s decision if it “is based on a reasonable explanation of relevant plan documents,” an interpretation that contradicts the plain meaning of a Plan’s language does not meet this threshold. The Plan’s “same injur/’ language necessarily excludes a separate, independent injury from the exclusion provision. Injuries resulting from two separate, independent events are mot reasonably read as the “same.” See, Schane v. Int’l Bhd. He settled a worker’s compensation claim and a third-party claim related to the car accident. The administrator denied the plaintiffs claim for benefits based on the plan’s workers’ compensation exclusion. The plain meaning, of Section 12.3’s “same injury” language is to exclude medical expenses causally related to the Car Accident. Laborers’ Pension Fund, 541 U.S. at 743, 124 S.Ct. subsequent, inde pendent injury, that affected Sadowski’s spinal cord stimulator. This reads “same injury” far too broadly. The Fund’s denial will be upheld if it “offer[s] a reasoned explanation, based on the evidence, for a particular outcome.” Edwards, 639 F.3d at 360 (quoting Hess, 274 F.3d at 461). In the face of this evidentiary record, the Fund argues that the evidence “does not mandate a conclusion that the fall was the sole cause of the infection.” (See, Def.’s Mem. The Fund must “articulate a rational connection” between the evidence and its determination that the Car Accident caused the infection rather than her fall. The Fund “admits that [Sadowski] slipped and fell down the stairs” approximately a month before the surgery. The Fund “admits that Ms. Sadowski’s husband told an admitting physician about ‘possibly minor trauma to the area prior to the onset of fevers and pain’” during Sadowski’s admission to Rush University’s Intensive Care Unit, as recorded in contemporaneous medical records. On the face of his report, the expert’s reasoning does not rely on medical evidence. The Fund’s conclusion that the infection was caused by the Car Accident several years prior—given the evidence provided by Sadowski, the lack of evidence to the contrary, and the insufficient reasoning underpinning Dr. Kittelberger’s report— fails to “offer a reasoned explanation, based on the evidence, for a particular outcome.” Edwards, 639 F.3d at 360 (quoting Hess, 274 F.3d at 461). After considering all of the evidence before the Fund’s Trustees, the Fund’s decision that Sadow-ski’s Claims were caused by the Car Accident was arbitrary and capricious. ⅜ ⅝
In sum, the Fund acted arbitrarily and capriciously by interpreting the Plan’s “same injury” language to include medical expenses caused by a separate, independent injury and/or by failing to provide a rational basis for rejecting Sadowski’s factual evidence concerning the cause of the infection. Public policy supports this Court’s conclusion. It is unlikely that future injuries to Sadowski’s spinal cord stimulator caused by a separate, independent injury would reach the threshold of “reasonably certain[ty]” required for an award of damages against the car accident tort-feasor.

In [131]:
n_feats = 5000
doc_by_vocab = np.empty([len(data), n_feats])

def build_vectorizer(max_features, stop_words, max_df=0.8, min_df=10, norm='l2'):
    """Returns a TfidfVectorizer object with the above preprocessing properties.
    
    Note: This function may log a deprecation warning. This is normal, and you
    can simply ignore it.
    
    Params: {max_features: Integer,
             max_df: Float,
             min_df: Float,
             norm: String,
             stop_words: String}
    Returns: TfidfVectorizer
    """
    # YOUR CODE HERE
    vectorizer = TfidfVectorizer(max_features = max_features, stop_words = stop_words, min_df = min_df, max_df = max_df, norm = norm)
    return vectorizer