In [92]:
import pandas as pd
import numpy as np
import regex as re
import pickle

from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer

from sklearn.base import TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.model_selection import GridSearchCV
import pyLDAvis.sklearn
import pyLDAvis
pyLDAvis.enable_notebook()

import gensim
from gensim.summarization import keywords, summarize, mz_keywords
from gensim.models import doc2vec


In [3]:
data = pd.read_csv('cases_IL_12k_raw.csv')

In [4]:
data.shape

(12000, 5)

In [5]:
data.head()

Unnamed: 0,case_id,jurisdiction,opinion,court_name,decision_year
0,3269062,Illinois,Mr. JUSTICE DOWNING\ndelivered the opinion of ...,Illinois Appellate Court,1979
1,2683250,Illinois,Mr. Chief Justice Scott\ndelivered the opinion...,Illinois Supreme Court,1875
2,2673531,Illinois,Mr. Justice Dickey\ndelivered the opinion of t...,Illinois Supreme Court,1876
3,3125662,Illinois,Mr. JUSTICE McGLOON\ndelivered the opinion of ...,Illinois Appellate Court,1981
4,3501926,Illinois,JUSTICE BARRY\ndelivered the opinion of the co...,Illinois Appellate Court,1986


In [6]:
data.tail()

Unnamed: 0,case_id,jurisdiction,opinion,court_name,decision_year
11995,5541954,Illinois,Mr. Justice Ryner\ndelivered the opinion of th...,Illinois Appellate Court,1930
11996,3493372,Illinois,JUSTICE HOPF\ndelivered the opinion of the cou...,Illinois Appellate Court,1985
11997,5542319,Illinois,Mr. Presiding Justice Scanlan\ndelivered the o...,Illinois Appellate Court,1931
11998,5270789,Illinois,Mr. Justice DeYoung\ndelivered the opinion of ...,Illinois Supreme Court,1932
11999,2838413,Illinois,Mr. Chief Justice Klingbiel\ndelivered the opi...,Illinois Supreme Court,1965


In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   case_id        12000 non-null  int64 
 1   jurisdiction   12000 non-null  object
 2   opinion        12000 non-null  object
 3   court_name     12000 non-null  object
 4   decision_year  12000 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 468.9+ KB


In [11]:
data.describe()

Unnamed: 0,case_id,decision_year
count,12000.0,12000.0
mean,3755000.0,1935.074083
std,1635113.0,42.115841
min,25372.0,1830.0
25%,2676440.0,1901.0
50%,3320570.0,1930.0
75%,5218119.0,1977.0
max,12255620.0,2011.0


# Extracting more information

In [24]:
judge_name = []

for item in data.opinion:
    judge_name.append( item.split('\n')[0].lower()) 
    
data['judge_name'] = judge_name

In [116]:
article_citations = []

for item in data.opinion:
    
    item_cite = []
    
    try:
        item_cite.extend(re.findall(r'\(\d{4}\),\s\d+\s\w+.\s\dd\s\d+', item))
      
    except ExplicitException:
        pass
    
    try:
        item_cite.extend(re.findall(r'\(\d{4}\),\s\d+\s\w+.\s\w+.\s\w+\s\d+', item))

    except ExplicitException:
        pass
    
    try:
        item_cite.extend(re.findall(r'\(\d{4}\),\s\d+\s\w+.\s\w+.\s\w+,\s\w+', item))

    except ExplicitException:
        pass
    
    try:
        item_cite.extend(re.findall(r'\d+\s\w+.\s\d+', item))
        
    except: 
        pass
    
    article_citations.append(item_cite)
    
        
data['citations'] = article_citations

In [118]:
data.tail()

Unnamed: 0,case_id,jurisdiction,opinion,court_name,decision_year,judge_name,citations
11995,5541954,Illinois,Mr. Justice Ryner\ndelivered the opinion of th...,Illinois Appellate Court,1930,mr. justice ryner,"[30 Cyc. 106, 51 Cal. 511, 46 Mich. 236, 280 I..."
11996,3493372,Illinois,JUSTICE HOPF\ndelivered the opinion of the cou...,Illinois Appellate Court,1985,justice hopf,"[(1976), 85 Misc. 2d 891, (1965), 33 Ill. 2d 3..."
11997,5542319,Illinois,Mr. Presiding Justice Scanlan\ndelivered the o...,Illinois Appellate Court,1931,mr. presiding justice scanlan,[]
11998,5270789,Illinois,Mr. Justice DeYoung\ndelivered the opinion of ...,Illinois Supreme Court,1932,mr. justice deyoung,"[339 Ill. 28, 86 id. 107, 307 Ill. 556, 187 id..."
11999,2838413,Illinois,Mr. Chief Justice Klingbiel\ndelivered the opi...,Illinois Supreme Court,1965,mr. chief justice klingbiel,"[372 Ill. 336, 216 Ill. 354, 217 Cal. 517, 167..."


# Cleaning Text Data

In [171]:
class Datacleaner(TransformerMixin):
    

    def fit(self, raw_text):
        return self
    
    def clean(self, raw_text):
        
        tokenizer = RegexpTokenizer(r'\w+')
        lemmatizer = WordNetLemmatizer()
        
        clean_text = BeautifulSoup(raw_text).get_text()
        lower_cases = clean_text.lower()
        tokens = tokenizer.tokenize(lower_cases)
        lemms = [lemmatizer.lemmatize(word) for word in tokens]
        words = [word for word in lemms if word not in stopwords.words('english')]
        final_words = ' '.join(words)
        
        return final_words
    
    def clean_col(self, col):
        
        clean_list = []
        n = 0
        for item in col:
            n += 1
            if n % 25 == 0:
                print(n)
            clean_list.append( self.clean(item) )
            
        return clean_list
        
       

In [172]:
cleaner = Datacleaner()

data['clean_opinion'] = cleaner.clean_col(data.opinion)

25
50
75
100
125
150
175
200
225
250
275
300
325
350
375
400
425
450
475
500
525
550
575
600
625
650
675
700
725
750
775
800
825
850
875
900
925
950
975
1000
1025
1050
1075
1100
1125
1150
1175
1200
1225
1250
1275
1300
1325
1350
1375
1400
1425
1450
1475
1500
1525
1550
1575
1600
1625
1650
1675
1700
1725
1750
1775
1800
1825
1850
1875
1900
1925
1950
1975
2000
2025
2050
2075
2100
2125
2150
2175
2200
2225
2250
2275
2300
2325
2350
2375
2400
2425
2450
2475
2500
2525
2550
2575
2600
2625
2650
2675
2700
2725
2750
2775
2800
2825
2850
2875
2900
2925
2950
2975
3000
3025
3050
3075
3100
3125
3150
3175
3200
3225
3250
3275
3300
3325
3350
3375
3400
3425
3450
3475
3500
3525
3550
3575
3600
3625
3650
3675
3700
3725
3750
3775
3800
3825
3850
3875
3900
3925
3950
3975
4000
4025
4050
4075
4100
4125
4150
4175
4200
4225
4250
4275
4300
4325
4350
4375
4400
4425
4450
4475
4500
4525
4550
4575
4600
4625
4650
4675
4700
4725
4750
4775
4800
4825
4850
4875
4900
4925
4950
4975
5000
5025
5050
5075
5100
5125
5150
5175
5200
52

In [174]:
# data.to_csv('clean_opinions.csv')

In [4]:
data = pd.read_csv('clean_opinions.csv', index_col=0)

In [5]:
data.head()

Unnamed: 0,case_id,jurisdiction,opinion,court_name,decision_year,judge_name,citations,clean_opinion
0,3269062,Illinois,Mr. JUSTICE DOWNING\ndelivered the opinion of ...,Illinois Appellate Court,1979,mr. justice downing,"['(1967), 37 Ill. 2d 494', '(1976), 64 Ill. 2d...",mr justice downing delivered opinion court pla...
1,2683250,Illinois,Mr. Chief Justice Scott\ndelivered the opinion...,Illinois Supreme Court,1875,mr. chief justice scott,"['53 Ill. 61', '56 ib. 163', '66 ib. 288', '68...",mr chief justice scott delivered opinion court...
2,2673531,Illinois,Mr. Justice Dickey\ndelivered the opinion of t...,Illinois Supreme Court,1876,mr. justice dickey,[],mr justice dickey delivered opinion court wa a...
3,3125662,Illinois,Mr. JUSTICE McGLOON\ndelivered the opinion of ...,Illinois Appellate Court,1981,mr. justice mcgloon,"['(1966), 70 Ill. App. 2d 97', '778 and 1244',...",mr justice mcgloon delivered opinion court con...
4,3501926,Illinois,JUSTICE BARRY\ndelivered the opinion of the co...,Illinois Appellate Court,1986,justice barry,"['(1978), 74 Ill. 2d 172', '(1967), 37 Ill. 2d...",justice barry delivered opinion court petition...


# Bag of Words (Also finding corpus specfic stopwords)

In [70]:
cvec = CountVectorizer(max_features=50, strip_accents='unicode')
non_dense_count_df = cvec.fit_transform(data.clean_opinion)
count_df = pd.DataFrame(cvec.fit_transform(data.clean_opinion).todense(), columns=cvec.get_feature_names())

In [71]:
most_common_words = dict(count_df.sum().sort_values()[::-1])

In [72]:
stop_words = list(stopwords.words('english'))

sw = list(most_common_words.keys())

stop_words.extend(sw)

In [73]:
stop_words.remove('property')
stop_words.remove('contract')
stop_words[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

# Summarization & Keywords

In [10]:
data.head()

Unnamed: 0,case_id,jurisdiction,opinion,court_name,decision_year,judge_name,citations,clean_opinion
0,3269062,Illinois,Mr. JUSTICE DOWNING\ndelivered the opinion of ...,Illinois Appellate Court,1979,mr. justice downing,"['(1967), 37 Ill. 2d 494', '(1976), 64 Ill. 2d...",mr justice downing delivered opinion court pla...
1,2683250,Illinois,Mr. Chief Justice Scott\ndelivered the opinion...,Illinois Supreme Court,1875,mr. chief justice scott,"['53 Ill. 61', '56 ib. 163', '66 ib. 288', '68...",mr chief justice scott delivered opinion court...
2,2673531,Illinois,Mr. Justice Dickey\ndelivered the opinion of t...,Illinois Supreme Court,1876,mr. justice dickey,[],mr justice dickey delivered opinion court wa a...
3,3125662,Illinois,Mr. JUSTICE McGLOON\ndelivered the opinion of ...,Illinois Appellate Court,1981,mr. justice mcgloon,"['(1966), 70 Ill. App. 2d 97', '778 and 1244',...",mr justice mcgloon delivered opinion court con...
4,3501926,Illinois,JUSTICE BARRY\ndelivered the opinion of the co...,Illinois Appellate Court,1986,justice barry,"['(1978), 74 Ill. 2d 172', '(1967), 37 Ill. 2d...",justice barry delivered opinion court petition...


In [11]:
keywords(data['clean_opinion'][0], split=True, ratio=.03)

['court plaintiff',
 'property',
 'defendant',
 'contract',
 'gala',
 'broker',
 'agreement sale motel',
 'evidence',
 'evident',
 'kotrich',
 'day']

In [12]:
kw = []
n = -1
for item in data.clean_opinion:
    n += 1
    if n % 50 == 0:
        print(f'Fetching keywords for #{n}')
    if len(item) > 100:
        try:
            kw.append(keywords(item, split=True, words=5))
        except:
            kw.append(keywords(item, split=True, ratio=.1))

    else:
        kw.append(item)


Fetching keywords for #0
Fetching keywords for #50
Fetching keywords for #100
Fetching keywords for #150
Fetching keywords for #200
Fetching keywords for #250
Fetching keywords for #300
Fetching keywords for #350
Fetching keywords for #400
Fetching keywords for #450
Fetching keywords for #500
Fetching keywords for #550
Fetching keywords for #600
Fetching keywords for #650
Fetching keywords for #700
Fetching keywords for #750
Fetching keywords for #800
Fetching keywords for #850
Fetching keywords for #900
Fetching keywords for #950
Fetching keywords for #1000
Fetching keywords for #1050
Fetching keywords for #1100
Fetching keywords for #1150
Fetching keywords for #1200
Fetching keywords for #1250
Fetching keywords for #1300
Fetching keywords for #1350
Fetching keywords for #1400
Fetching keywords for #1450
Fetching keywords for #1500
Fetching keywords for #1550
Fetching keywords for #1600
Fetching keywords for #1650
Fetching keywords for #1700
Fetching keywords for #1750
Fetching keywor

In [15]:
data['keywords'] = kw

In [28]:
summarize(data.opinion[0], word_count=250).replace('\n', '')

'Admitted into evidence was a letter dated October 28, 1971, addressed to plaintiff and signed by Robert Galas, in which Galas stated that on September 10, 1971, he telephoned plaintiff’s office regarding an ad he had seen in the paper on July 18, 1971; that plaintiff told him information about the property would be mailed to him as no information could be given on the phone; that he received plaintiff’s letter dated September 14, 1971, which informed him that the motel for sale was the LaGrange Motel; that he had been informed several months earlier by his own real estate broker, William Brash, that this motel was for sale; that at that time he discounted the LaGrange Motel because of his financial situation; that on October 1, 1971, he was able to secure additional finances; that he called Brash and they again discussed the LaGrange Motel; that on October 7, 1971, through Brash he made an offer on the motel; that on the same date he learned that plaintiff tried to reach him while he 

In [36]:
summs = []

for item in data.opinion:
    if len(item) > 50:
        try:
            summs.append(summarize(item, word_count=250).replace('\n', ''))
        except:
            summs.append(summarize(item, ratio=.5).replace('\n', ''))
    else:
        summs.append(item)


In [38]:
data['summs'] = summs

In [99]:
# data.to_csv('data/law_df.csv')

In [40]:
data.head()

Unnamed: 0,case_id,jurisdiction,opinion,court_name,decision_year,judge_name,citations,clean_opinion,keywords,summs
0,3269062,Illinois,Mr. JUSTICE DOWNING\ndelivered the opinion of ...,Illinois Appellate Court,1979,mr. justice downing,"['(1967), 37 Ill. 2d 494', '(1976), 64 Ill. 2d...",mr justice downing delivered opinion court pla...,"[plaintiff, property, agreement, defendant, co...",Admitted into evidence was a letter dated Octo...
1,2683250,Illinois,Mr. Chief Justice Scott\ndelivered the opinion...,Illinois Supreme Court,1875,mr. chief justice scott,"['53 Ill. 61', '56 ib. 163', '66 ib. 288', '68...",mr chief justice scott delivered opinion court...,"[sale, property, party, trustee, mortgage]","Unless, therefore, the mortgage had been relea..."
2,2673531,Illinois,Mr. Justice Dickey\ndelivered the opinion of t...,Illinois Supreme Court,1876,mr. justice dickey,[],mr justice dickey delivered opinion court wa a...,"[justice, court, judgment, action appellant]",delivered the opinion of the Court:This was an...
3,3125662,Illinois,Mr. JUSTICE McGLOON\ndelivered the opinion of ...,Illinois Appellate Court,1981,mr. justice mcgloon,"['(1966), 70 Ill. App. 2d 97', '778 and 1244',...",mr justice mcgloon delivered opinion court con...,"[candidate, election, elected, elect, city, re...","Shortly thereafter, petitioners challenged the..."
4,3501926,Illinois,JUSTICE BARRY\ndelivered the opinion of the co...,Illinois Appellate Court,1986,justice barry,"['(1978), 74 Ill. 2d 172', '(1967), 37 Ill. 2d...",justice barry delivered opinion court petition...,"[petitioner, commission, reported, report, com...",For the petitioner’s 20% permanent and complet...


# Doc2Vec

In [67]:
data = pd.read_csv('data/law_df.csv', index_col=0)

In [5]:
def tag_docs(documents):
    
    for i, line in enumerate(documents):
        tokens = gensim.utils.simple_preprocess(line)
        yield gensim.models.doc2vec.TaggedDocument(tokens, [i])

In [6]:
train_data = list(tag_docs(data.clean_opinion))

In [7]:
model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=40)

In [8]:
model.build_vocab(train_data)

In [9]:
model.train(train_data, total_examples=model.corpus_count, epochs=model.epochs)

In [10]:
model

<gensim.models.doc2vec.Doc2Vec at 0x15a75d860>

In [11]:
# model.save('models/d2v_model')

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [12]:
example = train_data[0]
example

TaggedDocument(words=['mr', 'justice', 'downing', 'delivered', 'opinion', 'court', 'plaintiff', 'john', 'kokinis', 'licensed', 'real', 'estate', 'broker', 'brought', 'action', 'defendant', 'edward', 'rita', 'kotrich', 'recover', 'commission', 'allegedly', 'due', 'term', 'exclusive', 'listing', 'agreement', 'sale', 'motel', 'plaintiff', 'appeal', 'finding', 'entered', 'circuit', 'court', 'cook', 'county', 'favor', 'defendant', 'close', 'plaintiff', 'case', 'issue', 'review', 'whether', 'trial', 'court', 'action', 'wa', 'correct', 'raise', 'question', 'whether', 'finding', 'wa', 'manifest', 'weight', 'evidence', 'defendant', 'owned', 'lagrange', 'motel', 'countryside', 'illinois', 'june', 'signing', 'document', 'entitled', 'north', 'side', 'real', 'estate', 'board', 'cooperative', 'listing', 'service', 'agreement', 'employed', 'plaintiff', 'exclusive', 'agent', 'advertise', 'offer', 'sale', 'motel', 'price', 'defendant', 'agreed', 'pay', 'plaintiff', 'real', 'estate', 'brokerage', 'commi

In [16]:
sample_vector = model.infer_vector(example.words)

In [45]:
n = 0
for item in model.docvecs.most_similar([sample_vector])[1:]:
    n += 1

    print(f'Most Similar Case # {n}: \nCase #{item[0]}\nSimilarity Score: {item[1]}\nKeywords: {data.keywords[item[0]]}\n\nCase Summary:\n{data.summs[item[0]]}\n\n')

Most Similar Case # 1: 
Case #8004
Similarity Score: 0.8341332077980042
Keywords: ['sale', 'plaintiff', 'defendant', 'commission', 'listing', 'listed']

Case Summary:
In issue is whether plaintiff’s services as an independent contractor in defendant’s office pursuant to a written agreement had been terminated, voluntarily or involuntarily, or whether she had agreed to be replaced by defendant as a “listing associate” before she became entitled to a commission on the “sale” of a particular property.At the time the Walters signed a listing agreement for the houses in defendant’s office, the sale of a 135-acre farm in Kane County, owned one-half by an estate in which Mrs. Walters was the executor and one-half by Mrs. Walters was discussed.Rather, he argues that paragraph 13 is unambiguous and requires that the listing sales associate be in the brokerage office as of the signing of the formal contract of sale and purchase of the farm, on September 12, 1975; and that her association had bee

In [50]:
polarity_scores = []

for i in range(0, len(data)):
    vector = model.infer_vector(train_data[i].words)[0]
    polarity_scores.append(vector)

In [51]:
data['d2v_scores'] = polarity_scores

In [68]:
data.head(1)

Unnamed: 0,case_id,jurisdiction,opinion,court_name,decision_year,judge_name,citations,clean_opinion,keywords,summs,d2v_scores
0,3269062,Illinois,Mr. JUSTICE DOWNING\ndelivered the opinion of ...,Illinois Appellate Court,1979,mr. justice downing,"['(1967), 37 Ill. 2d 494', '(1976), 64 Ill. 2d...",mr justice downing delivered opinion court pla...,"['plaintiff', 'property', 'agreement', 'defend...",Admitted into evidence was a letter dated Octo...,0.329142


# LDA Topic Modeling

In [74]:
cvec = CountVectorizer(strip_accents='unicode',
                        stop_words=stop_words) #stop_words include legal corpus specific 

In [77]:
non_dense_count_df = cvec.fit_transform(data.clean_opinion)

In [106]:
# pickle.dump(cvec, open('models/cvec', 'wb'))

In [80]:
count_df = pd.DataFrame(non_dense_count_df.todense(), columns=cvec.get_feature_names())
count_df.head()

Unnamed: 0,00,000,0000,000001,000020,0002,00030,00035,0006,0008,...,zydlo,zygadlo,zygmunt,zygo,zygoma,zygomatic,zyprexa,zywicki,zzot,ætna
0,0,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [89]:
model = LDA(n_components=5, random_state=42)

In [90]:
model.fit(non_dense_count_df)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=5, n_jobs=None,
                          perp_tol=0.1, random_state=42, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [91]:
pyLDAvis.sklearn.prepare(model, non_dense_count_df, cvec)

In [93]:
# pickle.dump(model, open('models/lda_model', 'wb'))

In [94]:
col_names = ['topic1', 'topic2', 'topic3', 'topic4', 'topic5']
lda_preds = pd.DataFrame(model.transform(non_dense_count_df), columns=col_names)
lda_preds.head()

Unnamed: 0,topic1,topic2,topic3,topic4,topic5
0,0.000132,0.000131,0.135816,0.12861,0.735311
1,0.000114,0.172791,0.000113,0.735667,0.091316
2,0.00175,0.901875,0.092902,0.001741,0.001732
3,0.000139,0.776169,0.208412,0.015138,0.000142
4,0.090903,0.000276,0.69188,0.000275,0.216666


In [97]:
lda_topic_preds = []

for i in range(0, len(lda_preds)):
    sims = sorted(enumerate(lda_preds.iloc[i], 1), key=lambda x: x[1], reverse=True)
    lda_topic_preds.append(sims[0][0])
    
data['lda_preds'] = lda_topic_preds

In [108]:
data.court_name.sort_values().unique()

array(['Illinois Appellate Court', 'Illinois Circuit Court',
       'Illinois Court of Claims', 'Illinois Supreme Court'], dtype=object)

In [104]:
def ldamodel(query):
    '''
    input: user query
    output: lda topic it belongs to
    '''
    count_query = cvec.transform(query)
    topic_likelihood = model.transform(count_query)[0]
    topic = sorted(enumerate(topic_likelihood, 1), key=lambda x: x[1], reverse=True)[0][0]
    
    return topic

In [105]:
ldamodel(['realestate property mortgage fine'])

4