In [6]:
import requests
import os
import utils
import json
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from numpy import linalg as LA
import math
from IPython.display import display, Markdown, Latex
# For text summarization
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from collections import Counter
from nltk.tokenize import word_tokenize, sent_tokenize
import string 

# Load Output.json File

In [7]:
with open("output.json") as f:
    data = json.loads(f.readlines()[0])
data

[{'case_name': 'Thayer vs. Wright',
  'case_summary': 'Thayer vs. Wright. If one place a fence upon a school house lot within the exterior line, the trustees ot the district may lawfully remove and place it upon the line. And if they should convert the materials to their own use, they would not be liable to an action at the suit of the party who had placed the fence upon the lot, because, being a fixture, the fence would belong to the owner of the soil. Semble. Per Bronson, C. J. Error to the Orleans common pleas. Thayer sued Wright before a justice in trover for a quantity of rails; and the case was substantially as follows: In 1835, the plaintiff leased a lot 62 feet by 40, in the northwest corner of his farm, where two roads met, to the trustees of school district No. 24, in Murray and Barry. The lease was in perpetuity; but the property was to revert when it ceased to be used for a school house. A school house was built on the south side of the lot, fronting the east, with a privy 

# Display Pretty JSON

In [2]:
def pretty_json(dictionary):   
    formatted_json = json.dumps(dictionary, indent=4, sort_keys=True) # type 'str'
    print(formatted_json)

# Using the CAPAPI

The Caselaw Access Project API, also known as CAPAPI, serves all official US court cases published in books from 1658 to 2018. The collection includes over six million cases scanned from the Harvard Law Library shelves.

### Retreive single case by ID

In [183]:
#single_case = utils.get_request_caselaw('https://api.case.law/v1/cases/1589229/?full_case=true').json() # actually a 'dict'

### Phrase Search

In [184]:
#case_json = utils.get_request_caselaw('https://api.case.law/v1/cases/?search="car accident"&full_case=true').json()

### Full-text Search

In [164]:
#utils.get_request_caselaw('https://api.case.law/v1/cases/?search=university published false article libel&full_case=true&ordering=decision_date').json()

### Add a Date Range Filter

In [166]:
#'&decision_date_min=1990-12-30&decision_date_max=1995-12-30'

'&decision_date_min=1990-12-30&decision_date_max=1995-12-30'

In [34]:
def get_appeals_decision(case_dict):
    text = case_json["casebody"]["data"]["opinions"][0]['text']
    stem = PorterStemmer()
    num_sentences = len(sent_tokenize(text))
    appeals_decisions = [stem.stem('remanded'),  stem.stem('reversed'), stem.stem('affirmed')]
    curr_sentence = 0
    for sentence in sent_tokenize(text):
        for word in word_tokenize(sentence):
            root = stem.stem(word)


"""
Returns either 'appeals' or 'trial' for a single case
case_dict: dict with fields case_name, case_summary, score
percent_threshold: percentage through text to look for key appeals case words
"""
def get_court(case_dict, percent_threshold=0.0):
    text = case_dict['case_summary']
    stem = PorterStemmer()
    num_sentences = len(sent_tokenize(text))
    appeals_decisions = [stem.stem('remanded'),  stem.stem('reversed'), stem.stem('affirmed')]
    curr_sentence = 0
    for sentence in sent_tokenize(text):
        for word in word_tokenize(sentence):
            root = stem.stem(word)
            percentage_through_text = float(curr_sentence) / float(num_sentences)
            
            if (root == stem.stem('appell')
                or (root in appeals_decisions and percentage_through_text > percent_threshold) ):
                return 'appeals'
            
        curr_sentence += 1
    return 'trial'


"""
Returns dict {'appeals' : list<case_dict>, 'trial'; list<case_dict>}
Each case_dict has fields case_name, case_summary, score
case_json: list of dicts with fields case_name, case_summary, score
"""
def classify_cases(case_json):
    classified_cases_dict = {'appeals': list(), 'trial' : list()}
    
    for case_dict in case_json:
        classified_cases_dict[get_court(case_dict)].append(case_dict)
    return classified_cases_dict
            


#################################################################################################################
#### USED WITH JSON RETURNED BY CAPAPI NOT THE JSON RETURNED USING NIKHILS RANKED RESULTS #######################
#################################################################################################################
                

"""
case_json: dict with fields case_name, case_summary, score
Returns name of case: str
"""
def get_name(case_json):
    return case_json["name"]

"""
case_json: dict with fields case_name, case_summary, score
Returns court id: int
"""
def get_id(case_json):
    return case_json["id"]


"""
case_json: dict with fields case_name, case_summary, score
Returns full-text of case: str
"""
def get_full_text(case_json):
    return case_json["casebody"]["data"]["opinions"][0]['text']

# Useful Data Structures

In [35]:
"""
print("Loaded {} case transcripts".format(num_cases))
print("Each case transcript is a dictionary with the following keys...")
print(data[0].keys())


# Here, we will assign an index for each movie_id. This index will help us access data in numpy matrices.
case_id_to_index = {case_id:index for index, case_id in enumerate([d['case_id'] for d in data])}

# We will also need a dictionary maping movie names to movie ids
case_name_to_id = {name:case_id for name, case_id in zip([d['case_name'] for d in data],
                                                     [d['case_id'] for d in data])}
case_id_to_name = {v:k for k,v in case_name_to_id.items()}

# and because it might be useful...
case_name_to_index = {name:case_id_to_index[case_name_to_id[name]] for name in [d['case_name'] for d in data]}
case_index_to_name = {v:k for k,v in case_name_to_index.items()}
"""

'\nprint("Loaded {} case transcripts".format(num_cases))\nprint("Each case transcript is a dictionary with the following keys...")\nprint(data[0].keys())\n\n\n# Here, we will assign an index for each movie_id. This index will help us access data in numpy matrices.\ncase_id_to_index = {case_id:index for index, case_id in enumerate([d[\'case_id\'] for d in data])}\n\n# We will also need a dictionary maping movie names to movie ids\ncase_name_to_id = {name:case_id for name, case_id in zip([d[\'case_name\'] for d in data],\n                                                     [d[\'case_id\'] for d in data])}\ncase_id_to_name = {v:k for k,v in case_name_to_id.items()}\n\n# and because it might be useful...\ncase_name_to_index = {name:case_id_to_index[case_name_to_id[name]] for name in [d[\'case_name\'] for d in data]}\ncase_index_to_name = {v:k for k,v in case_name_to_index.items()}\n'

# Text Summarization

In [36]:
trial_count = 0
appeals_count = 0

for result in data:
    court = get_court(result)
    if court == 'appeals':
        appeals_count += 1
    else:
        trial_count += 1

print("Number of trial cases: %d" % trial_count)
print("Number of appeals cases: %d" % appeals_count)
print("\n")

Number of trial cases: 110
Number of appeals cases: 390




In [37]:
trials = classify_cases(data)['trial']

In [48]:
"""
Returns the average length of cases in characters
case_json: dict with fields case_name, case_summary, score
"""
def mean_case_length(case_json):
    
    lengths_list = list()
    for case in case_json:
        length_of_text = len(case['case_summary'])
        lengths_list.append(length_of_text)
    return np.mean(lengths_list)

"""
Returns the std dev of length of cases in characters
case_json: dict with fields case_name, case_summary, score
"""
def std_dev_case_length(case_json):
    lengths_list = list()
    for case in case_json:
        length_of_text = len(case['case_summary'])
        lengths_list.append(length_of_text)
    return np.std(lengths_list)


In [39]:
import random
i = random.randint(0, len(trials))
print("random case len: %d" % len(trials[i]['case_summary']))
print("avg case len: %d" % average_case_length(trials))

random case len: 14978
avg case len: 4162


In [127]:
"""
Returns a summary with only sentences that includes stemmed key verdict words
Precondition: every dict is a dictionary representing a trial
trials_dict: a dict with fields case_name, case_summary, score 
key_words: a list of str
"""
def get_trial_verdict_helper(trials_dict, key_words=['verdict', 'judgment']):
    text = trials_dict['case_summary'] # this is not a summary of the trial text, it is full-text
    sentences = sent_tokenize(text)
    verdict_summary = ''
    stemmed_verdict_words = [stem.stem(w) for w in key_words]
    
    for sentence in sentences:
        words = word_tokenize(sentence)
        
        stemmed_words = [stem.stem(w) for w in words]
        
     
        for stemmed_word in stemmed_words:
            if stemmed_word in stemmed_verdict_words:
                verdict_summary += ' ' + sentence
                break
                
    return verdict_summary

In [128]:
"""
Returns a dictionary with (case_name, verdict_summary) K-V pairs
Precondition: every dict is a dictionary representing a trial
trials_json: a list of dicts
"""
def get_trial_verdict_summary(trials_json):
    case_name_verdict_dict = dict()
    avg_trial_length = average_case_length(trials_json)
    for trial_dict in trials_json:
        text = trial_dict['case_summary'] # this is not a summary of the trial text, it is full-text
        preliminary_summary = get_trial_verdict_helper(trial_dict)
        num_sentences = len(sent_tokenize(preliminary_summary))
        count = 0

        last_sentences = ''
        for sentence in sent_tokenize(preliminary_summary):
            if count / num_sentences > 0.0:
                last_sentences += ' ' + sentence
            count += 1

        case_name_verdict_dict[trial_dict['case_name']] = last_sentences
    return case_name_verdict_dict

"""
Returns a dictionary with (case_name, verdict_summary) K-V pairs
Precondition: every dict is a dictionary representing an appeals case
appeals_json: a list of dicts
"""
def get_appeals_verdict_summary(appeals_json):
    pass # Will implement this later, must likely a boolean search for words like REMANDED, AFFIRMED, and REVERSED

In [8]:
"""
Returns true if a string contains a digit character, False otherwise
word: a str
"""
def contains_digit(line):
    return any(char.isdigit() for char in line)

def contains_punctuation(line):
    return any(char in string.punctuation for char in line)
    

"""
Returns a term-frequency dict with (term, frequency) key-value pairs
text_string: a str to create the term-freq dict from
"""
def create_tf_dict(text_string):
    
    # Remove stop words
    text_string = text_string.lower()
    stop_words = set(stopwords.words("english"))
    
    words = word_tokenize(text_string)
    
    # Reduce words to their root form
    stem = PorterStemmer()
    
    # Create dictionary for the word frequency table
    tf_dict = dict()
    for wd in words:
        # Remove puncutation by turning puncutation to ''
        wd = wd.translate(str.maketrans('', '', string.punctuation))
        # Stem
        wd = stem.stem(wd)
        
        if wd in stop_words or wd == '':
            continue
        if wd in tf_dict:
            tf_dict[wd] += 1
        else:
            tf_dict[wd] = 1
    
    return tf_dict


In [9]:
stem = PorterStemmer()
word = stem.stem('appeal')
word

'appeal'

In [10]:
"""
Returns dict with (sentence, score) key-value pairs
sentences: list of sentences
tf_dict: term frequency dict mapping words to num occurrences in document
"""
def create_sentence_scores(sentences, tf_dict, n_chars=10):   
    sentence_weight_dict = dict()

    for sentence in sentences:
        num_words = (len(word_tokenize(sentence)))
        num_words_minus_stop_words = 0
        first_n_chars = sentence[:n_chars]
        
        for word in tf_dict:
        
            if word in sentence.lower():
                num_words_minus_stop_words += 1
                
                if not (contains_digit(first_n_chars) or contains_punctuation(first_n_chars)):

                    if first_n_chars in sentence_weight_dict:
                        sentence_weight_dict[first_n_chars] += tf_dict[word]
                    else:
                        sentence_weight_dict[first_n_chars] = tf_dict[word]
        
        if not (contains_digit(first_n_chars) or contains_punctuation(first_n_chars)):
            # Additive smoothing to avoid divide by 0
            sentence_weight_dict[first_n_chars] = ((sentence_weight_dict[first_n_chars]+1) / (num_words_minus_stop_words+1))
      
    return sentence_weight_dict


In [11]:
"""
Returns average sentence scores in a document
sentence_weight_dict: dict with (sentence, score) key-value pairs
"""
def mean_sentence_score(sentence_weight_dict):
   
    # Calculating the average score for the sentences
    sum_weights = 0
    for sentence in sentence_weight_dict:
        sum_weights += sentence_weight_dict[sentence]

    # Getting sentence average value from source text
    average_score = (sum_weights / len(sentence_weight_dict))

    return average_score

In [40]:
def create_summary(sentences, sentence_weight, threshold, n_chars=10):
    sentence_counter = 0
    article_summary = ''
    num_sentences = len(sentences)
    
    for sentence in sentences:
        
        if sentence[:n_chars] in sentence_weight and sentence_weight[sentence[:n_chars]] >= (threshold):
            article_summary += " " + sentence
        
        sentence_counter += 1

    return article_summary

In [61]:
def case_summary(case_text, multiplier=1.3):
    
    # creating a tf dictionary
    tf_dictionary = create_tf_dict(case_text)

    # tokenize sentences
    sentences = sent_tokenize(case_text)

    # algorithm for scoring a sentence by its words
    sentence_scores = create_sentence_scores(sentences, tf_dictionary)

    #getting the threshold
    threshold = mean_sentence_score(sentence_scores)

    #producing the summary
    case_summary = create_summary(sentences, sentence_scores, multiplier * threshold)

    return case_summary

In [63]:
i = 23
print(data[i]['case_summary'])

x = len(data[i]['case_summary'])
mean = mean_case_length(data)
std_dev = mean_case_length(data)

case_summary(data[i]['case_summary'])

Griffin's Appeal. 1. Where there is no dedication of land to public use by the owner, use of the same by the public jointly with the owner and by his sufferance, does not establish a right in the public by dedication, no matter how long such joint use is continued. Dedication is a matter of intention, ana when clearly proved, is as complete in one day as in twenty-one years. 2. In absence of opposing proof, long continued use by the public is evidence of an intent to dedicate, but it is not conclusive, and always yields to satisfactory contrary proof. 3. Where the owner of land, bordering on a public highway, sets his fence back from the highway for his own convenience, and uses the intervening space until his death, for private purposes, a bill for an injunction will not lie, by the municipal authorities, to restrain a subse quent owner of said property, fifteen years after the death of the former owner, from setting back the fence, and again including said intervening space within th

' The defendant’s answer denied that the fence at the point in dispute had been fixed and permanent as alleged in the bill; and averred that it had been frequently changed; that Judson Clark, a former owner, had thrown back the fence at said point which was in front of his dwelling, so as to make an unoccupied space there between the street and his house, where he could tie horses and leave wagons when not in use ; that the space so thrown open had not been dedicated to public use, and was the space now fenced in by the present owner of the Clark homestead, and finally that the defendant was not obstructing the street, but was building his fence upon the line established by the city authorities of Scranton. Soon after Mr. Clark purchased, he moved the road fence back on the side next his house, leaving an open space between the fence and the road bed, some twenty-five or thirty feet in width. This space was so used by Mr. Clark up to the time of his death in 1860. The defendant now see