# Data Modeling on Entire Corpus

This notebook trains models for the final application based on the batch training performed in the data_modeling_on_subsection_of_data notebook. This notebook can be seperated into the following eight sections:

* [Citation Extraction](#CE)<a href='#CE'>
    
* [Text Cleaning](#CTD)<a href='#CTD'>
    
* [Bag of words](#BOW)<a href='#BOW'>
    
* [Summaries and Keywords](#SK)<a href='#SK'>
    
* [Doc2Vec](#D2V)<a href='#D2V'>

* [Latent Dirochlet Allocation](#LDA)<a href='#LDA'>
    
* [Hierarchical Latent Dirochlet Allocation](#HLDA)<a href='#HLDA'>
    
* [Final Revisions](#FR)<a href='#FR'>

In [1]:
import pandas as pd
import numpy as np
import regex as re
import pickle
from collections import defaultdict

from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer

from sklearn.base import TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.model_selection import GridSearchCV
import pyLDAvis.sklearn
import pyLDAvis
pyLDAvis.enable_notebook()

import gensim
from gensim.summarization import keywords, summarize, mz_keywords
from gensim.models import doc2vec


scipy.sparse.sparsetools is a private module for scipy.sparse, and should not be used.
  _deprecated()


In [2]:
data = pd.read_csv('data/legal_dataframe.csv', index_col=0)

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
data.shape

(183149, 13)

In [4]:
data.head()

Unnamed: 0,case_id,name,name_abbreviation,decision_date,first_page,last_page,case_citation_name,volume,reporter,court,jurisdiction,casebody,text
0,2747110,"The People of the State of Illinois, Plaintiff...",People v. Tobin,1771-10-12,538,543,2 Ill. App. 3d 538,2,"Illinois Appellate Court Reports, Third Series",Illinois Appellate Court,Illinois,"{'status': 'ok', 'data': {'opinions': [{'type'...",Mr. PRESIDING JUSTICE EBERSPACHER\ndelivered t...
1,435537,"James A. Whitesides and others, Plaintiffs in ...",Whitesides v. People,1819-12,21,22,1 Ill. 21,1,Illinois Reports,Illinois Supreme Court,Illinois,"{'status': 'ok', 'data': {'opinions': [{'type'...",Opinion of the Court. This was a criminal pros...
2,435638,"Amos Chipps, Appellant, v. Thomas Yancey, Appe...",Chipps v. Yancey,1819-12,19,19,1 Ill. 19,1,Illinois Reports,Illinois Supreme Court,Illinois,"{'status': 'ok', 'data': {'opinions': [{'type'...",Opinion of the Court.\n*\nThis was an action o...
3,435690,"Jonathan Taylor, Appellant, v. Michael Sprinkl...",Taylor v. Sprinkle,1819-12,17,18,1 Ill. 17,1,Illinois Reports,Illinois Supreme Court,Illinois,"{'status': 'ok', 'data': {'opinions': [{'type'...",Opinion of the Court.\n*\nThis was an action o...
4,435710,"François Coleen and Abraham Claypole, Appellan...",Coleen v. Figgins,1819-12,19,20,1 Ill. 19,1,Illinois Reports,Illinois Supreme Court,Illinois,"{'status': 'ok', 'data': {'opinions': [{'type'...",Opinion of the Court.\n†\nIt appears from the ...


In [5]:
data.tail()

Unnamed: 0,case_id,name,name_abbreviation,decision_date,first_page,last_page,case_citation_name,volume,reporter,court,jurisdiction,casebody,text
183144,4289128,"GENERAL MOTORS CORPORATION et al., Appellees, ...",General Motors Corp. v. Pappas,2011-05-19,163,189,242 Ill. 2d 163,242,"Illinois Reports, Second Series",Illinois Supreme Court,Illinois,"{'status': 'ok', 'data': {'opinions': [{'type'...",JUSTICE THEIS\ndelivered the judgment of the c...
183145,7328627,"HARRY BALOUGH, Plaintiff-Appellee, v. NORTHEAS...",Balough v. Northeast Illinois Regional Commute...,2011-05-19,750,780,409 Ill. App. 3d 750,409,"Illinois Appellate Court Reports, Third Series",Illinois Appellate Court,Illinois,"{'status': 'ok', 'data': {'opinions': [{'type'...",JUSTICE PUCINSKI\ndelivered the judgment of th...
183146,7328860,"In re CHARLES H., a Person Found Subject to In...",People v. Charles H.,2011-05-20,1047,1058,409 Ill. App. 3d 1047,409,"Illinois Appellate Court Reports, Third Series",Illinois Appellate Court,Illinois,"{'status': 'ok', 'data': {'opinions': [{'type'...",JUSTICE POPE\ndelivered the judgment of the co...
183147,7328871,"THE PEOPLE OF THE STATE OF ILLINOIS, Plaintiff...",People v. Isaacson,2011-05-20,1079,1086,409 Ill. App. 3d 1079,409,"Illinois Appellate Court Reports, Third Series",Illinois Appellate Court,Illinois,"{'status': 'ok', 'data': {'opinions': [{'type'...",JUSTICE TURNER\ndelivered the judgment of the ...
183148,7328886,"In re ESTATE OF RONALD D. WEEKS, Deceased (Dav...",Hammer v. People ex rel. Madigan,2011-05-20,1101,1114,409 Ill. App. 3d 1101,409,"Illinois Appellate Court Reports, Third Series",Illinois Appellate Court,Illinois,"{'status': 'ok', 'data': {'opinions': [{'type'...",PRESIDING JUSTICE KNECHT\ndelivered the judgme...


In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 183149 entries, 0 to 183148
Data columns (total 13 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   case_id             183149 non-null  int64 
 1   name                183149 non-null  object
 2   name_abbreviation   183149 non-null  object
 3   decision_date       183149 non-null  object
 4   first_page          183149 non-null  object
 5   last_page           183149 non-null  object
 6   case_citation_name  183149 non-null  object
 7   volume              183149 non-null  int64 
 8   reporter            183149 non-null  object
 9   court               183149 non-null  object
 10  jurisdiction        183149 non-null  object
 11  casebody            183149 non-null  object
 12  text                183149 non-null  object
dtypes: int64(2), object(11)
memory usage: 19.6+ MB


In [7]:
data.describe()

Unnamed: 0,case_id,volume
count,183149.0,183149.0
mean,3710537.0,159.69201
std,1617843.0,114.108253
min,25332.0,1.0
25%,2700087.0,54.0
50%,3305630.0,145.0
75%,5229687.0,250.0
max,12255940.0,415.0


# Extracting Citations
<a id='CE'></a>

In [14]:
article_citations = []

for item in data.text:
    
    item_cite = []
    
    try:
        item_cite.extend(re.findall(r'\(\d{4}\),\s\d+\s\w+.\s\dd\s\d+', item))
      
    except ExplicitException:
        pass
    
    try:
        item_cite.extend(re.findall(r'\(\d{4}\),\s\d+\s\w+.\s\w+.\s\w+\s\d+', item))

    except ExplicitException:
        pass
    
    try:
        item_cite.extend(re.findall(r'\(\d{4}\),\s\d+\s\w+.\s\w+.\s\d{2,3}', item))

    except ExplicitException:
        pass
    
    try:
        item_cite.extend(re.findall(r'\d+\s\D{2,4}.\s\d{2,5}', item))
        
    except: 
        pass
    
    article_citations.append(list(set(item_cite)))
    if len(article_citations) % 10000 == 0:
        print(len(article_citations))
        
data['citations'] = article_citations

10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000


In [21]:
data.citations = data.citations.apply(lambda y: np.nan if (type(y) == list and len(y) == 0) else y)

# Clean Text Data
<a id='CTD'></a>

In [27]:
class Datacleaner(TransformerMixin):
    

    def fit(self, raw_text):
        return self
    
    def clean(self, raw_text):
        
        tokenizer = RegexpTokenizer(r'\w+')
        lemmatizer = WordNetLemmatizer()
        
        clean_text = BeautifulSoup(raw_text).get_text()
        lower_cases = clean_text.lower()
        tokens = tokenizer.tokenize(lower_cases)
        lemms = [lemmatizer.lemmatize(word) for word in tokens]
        words = [word for word in lemms if word not in stopwords.words('english')]
        final_words = ' '.join(words)
        
        return final_words
    
    def clean_col(self, col):
        
        clean_list = []
        n = 0
        for item in col:
            n += 1
            if n % 10000 == 0:
                print(n)
            clean_list.append( self.clean(item) )
            
        return clean_list
        

In [32]:
cleaner = Datacleaner()

data['clean_text'] = cleaner.clean_col(data.text)

10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000


In [36]:
data.clean_text[0][:100]

'mr presiding justice eberspacher delivered opinion court defendant tobin wa convicted jury crime bur'

In [2]:
# data = pd.read_csv('data/backup.csv', index_col=0)

  interactivity=interactivity, compiler=compiler, result=result)


# Bag of Words (Also finding corpus specfic stopwords)
<a id='BOW'></a>

In [41]:
cvec = CountVectorizer(max_features=50, strip_accents='unicode')
non_dense_count_df = cvec.fit_transform(data.clean_text)
count_df = pd.DataFrame(cvec.fit_transform(data.clean_text).todense(), columns=cvec.get_feature_names())

In [42]:
most_common_words = dict(count_df.sum().sort_values()[::-1])

In [43]:
stop_words = list(stopwords.words('english'))

sw = list(most_common_words.keys())[:29]

In [44]:
stop_words.extend(sw)

# Summarization & Keywords
<a id='SK'></a>

In [55]:
data.head()

Unnamed: 0,case_id,name,name_abbreviation,decision_date,first_page,last_page,case_citation_name,volume,reporter,court,jurisdiction,casebody,text,citations,clean_text,keywords
0,2747110,"The People of the State of Illinois, Plaintiff...",People v. Tobin,1771-10-12,538,543,2 Ill. App. 3d 538,2,"Illinois Appellate Court Reports, Third Series",Illinois Appellate Court,Illinois,"{'status': 'ok', 'data': {'opinions': [{'type'...",Mr. PRESIDING JUSTICE EBERSPACHER\ndelivered t...,"[1 and 119, 89 S.Ct. 2100, 395 U.S. 959]",mr presiding justice eberspacher delivered opi...,"[sentenced, sentence, sentencing, court defend..."
1,435537,"James A. Whitesides and others, Plaintiffs in ...",Whitesides v. People,1819-12,21,22,1 Ill. 21,1,Illinois Reports,Illinois Supreme Court,Illinois,"{'status': 'ok', 'data': {'opinions': [{'type'...",Opinion of the Court. This was a criminal pros...,,opinion court wa criminal prosecution riot pla...,"[indictment, form, criminal, shall, year]"
2,435638,"Amos Chipps, Appellant, v. Thomas Yancey, Appe...",Chipps v. Yancey,1819-12,19,19,1 Ill. 19,1,Illinois Reports,Illinois Supreme Court,Illinois,"{'status': 'ok', 'data': {'opinions': [{'type'...",Opinion of the Court.\n*\nThis was an action o...,"[2 Dall, 302]",opinion court wa action debt judgment rendered...,"[action, court, judgment, plea, nil]"
3,435690,"Jonathan Taylor, Appellant, v. Michael Sprinkl...",Taylor v. Sprinkle,1819-12,17,18,1 Ill. 17,1,Illinois Reports,Illinois Supreme Court,Illinois,"{'status': 'ok', 'data': {'opinions': [{'type'...",Opinion of the Court.\n*\nThis was an action o...,,opinion court wa action covenant fifth plea st...,"[plea, court, post, failure, illinois]"
4,435710,"François Coleen and Abraham Claypole, Appellan...",Coleen v. Figgins,1819-12,19,20,1 Ill. 19,1,Illinois Reports,Illinois Supreme Court,Illinois,"{'status': 'ok', 'data': {'opinions': [{'type'...",Opinion of the Court.\n†\nIt appears from the ...,,opinion court appears record cause writ issued...,"[court appears, scam, appearance, appearing, p..."


In [52]:
keywords(data['clean_text'][0], split=True, ratio=.03)

['sentenced',
 'sentence',
 'sentencing',
 'court defendant tobin',
 'building',
 'ill',
 'state',
 'evidence',
 'stated car',
 'authority',
 'door']

In [53]:
kw = []
n = -1
for item in data.clean_text:
    n += 1
    if n % 10000 == 0:
        print(f'Fetching keywords for #{n}')
    if len(item) > 100:
        try:
            kw.append(keywords(item, split=True, words=5))
        except:
            kw.append(keywords(item, split=True, ratio=.1))

    else:
        kw.append(item)


Fetching keywords for #0
Fetching keywords for #10000
Fetching keywords for #20000
Fetching keywords for #30000
Fetching keywords for #40000
Fetching keywords for #50000
Fetching keywords for #60000
Fetching keywords for #70000
Fetching keywords for #80000
Fetching keywords for #90000
Fetching keywords for #100000
Fetching keywords for #110000
Fetching keywords for #120000
Fetching keywords for #130000
Fetching keywords for #140000
Fetching keywords for #150000
Fetching keywords for #160000
Fetching keywords for #170000
Fetching keywords for #180000


In [54]:
data['keywords'] = kw

In [57]:
summarize(data.text[0], word_count=250).replace('\n', '')

'The court entered judgment upon the verdict and sentenced the defendant to a fifteen to twenty-five year term in the Illinois State Penitentiary.The defendant has appealed from that judgment and raised the following issues: (1) The State failed to prove lack of authority to enter the premises; (2) The State failed to prove intent to commit a theft; (3) The court erred in allowing testimony concerning the arrest of Sherri Tobin, her possession of a firearm and evidence concerning defendant’s possession of a firearm; (4) The sentence was excessive.The facts giving rise to this case are as follows: On the night of February 9, 1969, at about 11:00 P.M. the defendant, in the company of Sherri Tobin, Daniel Stout, Michael Hume and Eddie Dunn was in an automobile driven by defendant in the vicinity of the Oliver C.He stated the car stopped by the agency and the driver, identified as Tobin, jumped out and ran across the street and kicked the agency door.In regard to the question of intent the

In [7]:
summs = []
n = 0

for item in data.text:
    n+= 1
    if n % 10000 == 0:
        print(f'Fetching summary for #{n}')
        
    if len(item) > 1000:
        try:
            summs.append(summarize(item, word_count=250).replace('\n', ''))
        except:
            summs.append(summarize(item, ratio=.5).replace('\n', ''))
    else:
        summs.append(item)


Fetching summary for #10000
Fetching summary for #20000
Fetching summary for #30000
Fetching summary for #40000
Fetching summary for #50000
Fetching summary for #60000
Fetching summary for #70000
Fetching summary for #80000
Fetching summary for #90000
Fetching summary for #100000
Fetching summary for #110000
Fetching summary for #120000
Fetching summary for #130000
Fetching summary for #140000
Fetching summary for #150000
Fetching summary for #160000
Fetching summary for #170000
Fetching summary for #180000


In [8]:
data['summs'] = summs

In [13]:
data.summs.fillna(data.text, inplace=True)

# Doc2Vec
<a id='D2V'></a>

In [14]:
def tag_docs(documents):
    
    for i, line in enumerate(documents):
        tokens = gensim.utils.simple_preprocess(line)
        yield gensim.models.doc2vec.TaggedDocument(tokens, [i])

In [15]:
train_data = list(tag_docs(data.clean_text))

In [16]:
model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=40)

In [17]:
model.build_vocab(train_data)

In [18]:
model.train(train_data, total_examples=model.corpus_count, epochs=model.epochs)

In [19]:
model

<gensim.models.doc2vec.Doc2Vec at 0x567f153c8>

In [20]:
example = train_data[0]

In [21]:
sample_vector = model.infer_vector(example.words)
sample_vector

array([ 1.5324306 ,  2.1731114 ,  0.5358954 ,  0.69633293,  0.59287786,
       -0.56913793, -0.8112813 , -0.91343445,  0.36389783,  1.1947542 ,
        1.4784077 ,  1.4262766 ,  0.78247553,  0.53290284, -0.0711604 ,
       -0.9184037 ,  0.47452053,  0.6324937 , -1.0646518 ,  0.2243761 ,
        0.04560428, -0.43814626, -0.4730711 ,  1.1807055 , -1.2522606 ,
        1.5006284 ,  1.4920942 , -1.4339331 , -1.7539495 ,  1.9458628 ,
       -0.58788145, -0.3910977 ,  0.274218  ,  0.8812365 , -2.36569   ,
        1.3564054 , -1.8362895 , -0.05379141, -0.6475554 , -0.37915578,
        0.5082969 ,  0.65028775,  0.5514563 , -1.1354003 , -0.6905312 ,
       -0.07784279, -1.0563327 , -0.5240961 ,  1.1317965 , -0.56958365],
      dtype=float32)

In [22]:
n = 0
for item in model.docvecs.most_similar([sample_vector])[1:]:
    n += 1

    print(f'Most Similar Case # {n}: \nCase #{item[0]}\nSimilarity Score: {item[1]}\nKeywords: {data.keywords[item[0]]}\n\nCase Summary:\n{data.summs[item[0]]}\n\n')

Most Similar Case # 1: 
Case #153648
Similarity Score: 0.7980687618255615
Keywords: ['defendant', 'sentenced', 'sentence', 'sentencing', 'court', 'said', 'trial']

Case Summary:
On appeal, defendant maintains: (1) he was not proved guilty beyond a reasonable doubt; (2) there was no probable cause to arrest defendant, and, therefore, the court erred in denying defendant’s motion to suppress items seized without a warrant and statements made subsequent to the arrest; (3) defendant was deprived of his right to a fair trial by comments made by the State during closing argument; and (4) the trial court erred in sentencing defendant to consecutive terms of imprisonment.He said he then conducted a search of the area and found a green Army duffel bag, an aluminum suitcase, a green toolbox, and a six pack of beer in a railroad “subway ditch,” which was approximately 100 yards from the place where defendant was being detained.He said he then, at 5:15 a.m., instructed Officer Beason to place defe

In [23]:
polarity_scores = []

for i in range(0, len(data)):
    vector = model.infer_vector(train_data[i].words)[0]
    polarity_scores.append(vector)

In [24]:
data['d2v_scores'] = polarity_scores

In [25]:
data.head(1)

Unnamed: 0,case_id,name,name_abbreviation,decision_date,first_page,last_page,case_citation_name,volume,reporter,court,jurisdiction,casebody,text,citations,clean_text,keywords,summs,d2v_scores,lda_preds
0,2747110,"The People of the State of Illinois, Plaintiff...",People v. Tobin,1771-10-12,538,543,2 Ill. App. 3d 538,2,"Illinois Appellate Court Reports, Third Series",Illinois Appellate Court,Illinois,"{'status': 'ok', 'data': {'opinions': [{'type'...",Mr. PRESIDING JUSTICE EBERSPACHER\ndelivered t...,"['1 and 119', '89 S.Ct. 2100', '395 U.S. 959']",mr presiding justice eberspacher delivered opi...,"['sentenced', 'sentence', 'sentencing', 'court...",The court entered judgment upon the verdict an...,1.3064,4


# LDA Topic Modeling
<a id='LDA'></a>

In [26]:
data.head()

Unnamed: 0,case_id,name,name_abbreviation,decision_date,first_page,last_page,case_citation_name,volume,reporter,court,jurisdiction,casebody,text,citations,clean_text,keywords,summs,d2v_scores,lda_preds
0,2747110,"The People of the State of Illinois, Plaintiff...",People v. Tobin,1771-10-12,538,543,2 Ill. App. 3d 538,2,"Illinois Appellate Court Reports, Third Series",Illinois Appellate Court,Illinois,"{'status': 'ok', 'data': {'opinions': [{'type'...",Mr. PRESIDING JUSTICE EBERSPACHER\ndelivered t...,"['1 and 119', '89 S.Ct. 2100', '395 U.S. 959']",mr presiding justice eberspacher delivered opi...,"['sentenced', 'sentence', 'sentencing', 'court...",The court entered judgment upon the verdict an...,1.3064,4
1,435537,"James A. Whitesides and others, Plaintiffs in ...",Whitesides v. People,1819-12,21,22,1 Ill. 21,1,Illinois Reports,Illinois Supreme Court,Illinois,"{'status': 'ok', 'data': {'opinions': [{'type'...",Opinion of the Court. This was a criminal pros...,,opinion court wa criminal prosecution riot pla...,"['indictment', 'form', 'criminal', 'shall', 'y...","This was a criminal prosecution for a riot, ag...",1.451599,4
2,435638,"Amos Chipps, Appellant, v. Thomas Yancey, Appe...",Chipps v. Yancey,1819-12,19,19,1 Ill. 19,1,Illinois Reports,Illinois Supreme Court,Illinois,"{'status': 'ok', 'data': {'opinions': [{'type'...",Opinion of the Court.\n*\nThis was an action o...,"['2 Dall, 302']",opinion court wa action debt judgment rendered...,"['action', 'court', 'judgment', 'plea', 'nil']",Opinion of the Court.This was an action of deb...,0.455963,3
3,435690,"Jonathan Taylor, Appellant, v. Michael Sprinkl...",Taylor v. Sprinkle,1819-12,17,18,1 Ill. 17,1,Illinois Reports,Illinois Supreme Court,Illinois,"{'status': 'ok', 'data': {'opinions': [{'type'...",Opinion of the Court.\n*\nThis was an action o...,,opinion court wa action covenant fifth plea st...,"['plea', 'court', 'post', 'failure', 'illinois']",Opinion of the Court.This was an action of cov...,0.99483,3
4,435710,"François Coleen and Abraham Claypole, Appellan...",Coleen v. Figgins,1819-12,19,20,1 Ill. 19,1,Illinois Reports,Illinois Supreme Court,Illinois,"{'status': 'ok', 'data': {'opinions': [{'type'...",Opinion of the Court.\n†\nIt appears from the ...,,opinion court appears record cause writ issued...,"['court appears', 'scam', 'appearance', 'appea...","It appears from the record in this cause, that...",0.522237,3


In [45]:
cvec = CountVectorizer(strip_accents='unicode',
                       min_df=5,
                        stop_words=stop_words) #stop_words include legal corpus specific 

In [39]:
non_dense_count_df = cvec.fit_transform(data.clean_text)

In [31]:
# count_df = pd.DataFrame(non_dense_count_df.todense(), columns=cvec.get_feature_names())
# count_df.head()

In [40]:
lda_model = LDA(n_components=5, random_state=42)

In [41]:
lda_model.fit(non_dense_count_df)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=5, n_jobs=None,
                          perp_tol=0.1, random_state=42, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [42]:
pyLDAvis.sklearn.prepare(lda_model, non_dense_count_df, cvec)

In [35]:
col_names = ['topic1', 'topic2', 'topic3', 'topic4', 'topic5']
lda_preds = pd.DataFrame(lda_model.transform(non_dense_count_df), columns=col_names)
lda_preds.head()

Unnamed: 0,topic1,topic2,topic3,topic4,topic5
0,0.136311,0.041874,0.039036,0.782521,0.000259
1,0.001306,0.001298,0.484313,0.511779,0.001304
2,0.002259,0.002252,0.991028,0.002232,0.002229
3,0.175706,0.001527,0.81971,0.001532,0.001525
4,0.148262,0.000771,0.849429,0.000769,0.000769


In [36]:
lda_topic_preds = []

for i in range(0, len(lda_preds)):
    sims = sorted(enumerate(lda_preds.iloc[i], 1), key=lambda x: x[1], reverse=True)
    lda_topic_preds.append(sims[0][0])
    
data['lda_preds'] = lda_topic_preds

In [37]:
def ldamodel(query):
    '''
    input: user query
    output: lda topic it belongs to
    '''
    count_query = cvec.transform(query)
    topic_likelihood = lda_model.transform(count_query)[0]
    topic = sorted(enumerate(topic_likelihood, 1), key=lambda x: x[1], reverse=True)[0][0]
    
    return topic

In [38]:
ldamodel(['realestate property mortgage fine'])

3

In [39]:
data.head()

Unnamed: 0,case_id,name,name_abbreviation,decision_date,first_page,last_page,case_citation_name,volume,reporter,court,jurisdiction,casebody,text,citations,clean_text,keywords,summs,d2v_scores,lda_preds
0,2747110,"The People of the State of Illinois, Plaintiff...",People v. Tobin,1771-10-12,538,543,2 Ill. App. 3d 538,2,"Illinois Appellate Court Reports, Third Series",Illinois Appellate Court,Illinois,"{'status': 'ok', 'data': {'opinions': [{'type'...",Mr. PRESIDING JUSTICE EBERSPACHER\ndelivered t...,"['1 and 119', '89 S.Ct. 2100', '395 U.S. 959']",mr presiding justice eberspacher delivered opi...,"['sentenced', 'sentence', 'sentencing', 'court...",The court entered judgment upon the verdict an...,0.283677,4
1,435537,"James A. Whitesides and others, Plaintiffs in ...",Whitesides v. People,1819-12,21,22,1 Ill. 21,1,Illinois Reports,Illinois Supreme Court,Illinois,"{'status': 'ok', 'data': {'opinions': [{'type'...",Opinion of the Court. This was a criminal pros...,,opinion court wa criminal prosecution riot pla...,"['indictment', 'form', 'criminal', 'shall', 'y...","This was a criminal prosecution for a riot, ag...",-0.579995,4
2,435638,"Amos Chipps, Appellant, v. Thomas Yancey, Appe...",Chipps v. Yancey,1819-12,19,19,1 Ill. 19,1,Illinois Reports,Illinois Supreme Court,Illinois,"{'status': 'ok', 'data': {'opinions': [{'type'...",Opinion of the Court.\n*\nThis was an action o...,"['2 Dall, 302']",opinion court wa action debt judgment rendered...,"['action', 'court', 'judgment', 'plea', 'nil']",Opinion of the Court.This was an action of deb...,-0.733031,3
3,435690,"Jonathan Taylor, Appellant, v. Michael Sprinkl...",Taylor v. Sprinkle,1819-12,17,18,1 Ill. 17,1,Illinois Reports,Illinois Supreme Court,Illinois,"{'status': 'ok', 'data': {'opinions': [{'type'...",Opinion of the Court.\n*\nThis was an action o...,,opinion court wa action covenant fifth plea st...,"['plea', 'court', 'post', 'failure', 'illinois']",Opinion of the Court.This was an action of cov...,-0.508408,3
4,435710,"François Coleen and Abraham Claypole, Appellan...",Coleen v. Figgins,1819-12,19,20,1 Ill. 19,1,Illinois Reports,Illinois Supreme Court,Illinois,"{'status': 'ok', 'data': {'opinions': [{'type'...",Opinion of the Court.\n†\nIt appears from the ...,,opinion court appears record cause writ issued...,"['court appears', 'scam', 'appearance', 'appea...","It appears from the record in this cause, that...",-0.371394,3


In [38]:
# data.to_csv('backup.csv')

In [47]:
# pickle.dump(lda_model, open('models/lda_final_model', 'wb'))
# pickle.dump(cvec, open('models/final_cvec_model', 'wb'))
# pickle.dump(model, open('models/d2v_final_model', 'wb'))

# Hierarchical LDA
<a id='HLDA'></a>

In [4]:
df = pd.read_csv('../data/final_legal_df.csv', index_col=0)

In [5]:
lda_1 = df[df.lda_preds == 1]
lda_1.head()

Unnamed: 0,case_id,decision_date,case_citation_name,court,citations,clean_text,keywords,summs,d2v_scores,lda_preds
105,435706,1827,1 Ill. 236,Illinois Supreme Court,,per curiam appellant failed file transcript re...,"['court', 'appeal', 'appellant', 'file transcr...",Per Curiam.\nThe appellant having failed to fi...,0.77706,1
296,2485505,1836,2 Ill. 327,Illinois Supreme Court,,per curiam motion denied motion denied,per curiam motion denied motion denied,Per Curiam:\nThe motion is denied.\nMotion den...,0.193602,1
517,2474939,1840,3 Ill. 454,Illinois Supreme Court,,smith justice delivered opinion court opinion ...,"['witness', 'continuance cause', 'subpoena', '...","Smith, Justice,delivered the opinion of the-Co...",1.114745,1
538,2476824,1840,3 Ill. 566,Illinois Supreme Court,,per curiam rule served copy return insufficien...,per curiam rule served copy return insufficien...,Per curiam:\nThe rule should have been served ...,0.6042,1
561,2478069,1840,3 Ill. 369,Illinois Supreme Court,,wilson chief justice granting motion could ben...,"['attorney', 'motion', 'right', 'permitting', ...","Wilson, Chief Justice:\nThe granting of this m...",0.730376,1


In [6]:
cvec = pickle.load(open('../models/final_cvec_model', 'rb'))
cvec

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=5,
                ngram_range=(1, 1), preprocessor=None,
                stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours',
                            'ourselves', 'you', "you're", "you've", "you'll",
                            "you'd", 'your', 'yours', 'yourself', 'yourselves',
                            'he', 'him', 'his', 'himself', 'she', "she's",
                            'her', 'hers', 'herself', 'it', "it's", 'its',
                            'itself', ...],
                strip_accents='unicode', token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [7]:
lda1_cvec = cvec.transform(lda_1.clean_text)

In [8]:
lda_model_1 = LDA(n_components=4, random_state=42)

In [9]:
lda_model_1.fit(lda1_cvec)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=4, n_jobs=None,
                          perp_tol=0.1, random_state=42, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [10]:
pyLDAvis.sklearn.prepare(lda_model_1, lda1_cvec, cvec)

In [11]:
col_names = ['topic1a', 'topic1b', 'topic1c', 'topic1d']
lda_preds1 = pd.DataFrame(lda_model_1.transform(lda1_cvec), columns=col_names)
lda_preds1.head()

Unnamed: 0,topic1a,topic1b,topic1c,topic1d
0,0.972667,0.008968,0.00906,0.009305
1,0.889857,0.036169,0.036399,0.037575
2,0.632198,0.361437,0.00317,0.003195
3,0.939978,0.019778,0.019819,0.020425
4,0.880273,0.006522,0.106976,0.006229


In [12]:
lda_topic_preds = []

for i in range(0, len(lda_1)):
    sims = sorted(enumerate(lda_preds1.iloc[i], 1), key=lambda x: x[1], reverse=True)
    lda_topic_preds.append(sims[0][0])
    
lda_1['lda_sub_group'] = lda_topic_preds

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


In [13]:
lda_2 = df[df.lda_preds == 2]
lda_2.head()

Unnamed: 0,case_id,decision_date,case_citation_name,court,citations,clean_text,keywords,summs,d2v_scores,lda_preds
1312,2574848,1849,11 Ill. 35,Illinois Supreme Court,,opinion treat c j action wa brought recover da...,"['warranty', 'defendant', 'instructed', 'instr...","Opinion by Treat, C.This action was brought to...",0.981997,2
1601,2581350,1851,13 Ill. 271,Illinois Supreme Court,['3 Pick. 380'],treat c j wa action case slander word laid dec...,"['malice', 'defendant', 'law', 'lawful', 'plai...",The court refused an instruction asked by the ...,3.236579,2
1802,2585158,1853,14 Ill. 324,Illinois Supreme Court,,catón j diversified character commerce questio...,"['contracted', 'contracting', 'custom', 'usage...",While the convenience of commerce may require ...,2.695893,2
2256,2592403,1855,17 Ill. 272,Illinois Supreme Court,['1 to p. 154'],scates c j question whether law agency ha corr...,"['instruction', 'agent act', 'agency', 'acting...",Power to act generally in a particular busines...,1.982165,2
2415,438554,1857,18 Ill. 488,Illinois Supreme Court,['2 ibid. 440'],skinner j wa action case railroad company comm...,"['company', 'carrying grain', 'reasonable', 'r...",This was an action on the case against the rai...,2.216215,2


In [18]:
lda2_cvec = cvec.transform(lda_2.clean_text)
lda_model_2 = LDA(n_components=3, random_state=42)
lda_model_2.fit(lda2_cvec)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=3, n_jobs=None,
                          perp_tol=0.1, random_state=42, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [19]:
pyLDAvis.sklearn.prepare(lda_model_2, lda2_cvec, cvec)

In [20]:
col_names = ['topic2a', 'topic2b', 'topic2c']
lda_preds2 = pd.DataFrame(lda_model_2.transform(lda2_cvec), columns=col_names)
lda_preds2.head()

lda_topic_preds = []

for i in range(0, len(lda_2)):
    sims = sorted(enumerate(lda_preds2.iloc[i], 5), key=lambda x: x[1], reverse=True)
    lda_topic_preds.append(sims[0][0])
    
lda_2['lda_sub_group'] = lda_topic_preds

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


In [22]:
lda_3 = df[df.lda_preds == 3]
lda_3.head()

lda3_cvec = cvec.fit_transform(lda_3.clean_text)
lda_model_3 = LDA(n_components=4, random_state=42)
lda_model_3.fit(lda3_cvec)

pyLDAvis.sklearn.prepare(lda_model_3, lda3_cvec, cvec)

In [24]:
col_names = ['topic3a', 'topic3b', 'topic3c', 'topic3d']
lda_preds3 = pd.DataFrame(lda_model_3.transform(lda3_cvec), columns=col_names)
lda_preds3.head()

lda_topic_preds = []

for i in range(0, len(lda_3)):
    sims = sorted(enumerate(lda_preds3.iloc[i], 8), key=lambda x: x[1], reverse=True)
    lda_topic_preds.append(sims[0][0])
    
lda_3['lda_sub_group'] = lda_topic_preds

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


In [25]:
lda_4 = df[df.lda_preds == 4]
lda_4.head()

lda4_cvec = cvec.fit_transform(lda_4.clean_text)
lda_model_4 = LDA(n_components=3, random_state=42)
lda_model_4.fit(lda4_cvec)

pyLDAvis.sklearn.prepare(lda_model_4, lda4_cvec, cvec)

In [27]:
col_names = ['topic4a', 'topic4b', 'topic4c']
lda_preds4 = pd.DataFrame(lda_model_4.transform(lda4_cvec), columns=col_names)
lda_preds4.head()

lda_topic_preds = []

for i in range(0, len(lda_4)):
    sims = sorted(enumerate(lda_preds4.iloc[i], 12), key=lambda x: x[1], reverse=True)
    lda_topic_preds.append(sims[0][0])
    
lda_4['lda_sub_group'] = lda_topic_preds

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


In [30]:
lda_5 = df[df.lda_preds == 5]
lda_5.head()

lda5_cvec = cvec.fit_transform(lda_5.clean_text)
lda_model_5 = LDA(n_components=3, random_state=42)
lda_model_5.fit(lda5_cvec)

pyLDAvis.sklearn.prepare(lda_model_5, lda5_cvec, cvec)

In [31]:
col_names = ['topic5a', 'topic5b', 'topic5c']
lda_preds5 = pd.DataFrame(lda_model_5.transform(lda5_cvec), columns=col_names)
lda_preds5.head()

lda_topic_preds = []

for i in range(0, len(lda_5)):
    sims = sorted(enumerate(lda_preds5.iloc[i], 15), key=lambda x: x[1], reverse=True)
    lda_topic_preds.append(sims[0][0])
    
lda_5['lda_sub_group'] = lda_topic_preds

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


In [32]:
lda_df = pd.concat([lda_1, lda_2, lda_3, lda_4, lda_5])

In [33]:
lda_df['lda_sub_group'].value_counts()

10    25569
9     19517
8     17778
11    16093
6     15442
15    12101
1      8998
17     8847
12     8829
3      8805
14     8671
13     7091
4      6546
7      5922
5      4900
2      4174
16     3866
Name: lda_sub_group, dtype: int64

# LDA sub-topics:

Topic 1: complaint, department, school, district, employee, administrative, statute

Topic 2: petition, child, marriage, property, support

Topic 3: child, parent, custody, mental, care, minor

Topic 4: sentence, conviction, guilty, plea, probation, record

Topic 5: complaint, damage, claim, agreetment

Topic 6: insurance, insurer, company, coverage, liability, claim 

Topic 7: city, ordinance, commission, building, lease, zoning

Topic 8: estate, deed, property, trust, mortgage, land

Topic 9: company, bank, stock, corporation, contact, business

Topic 10: error, filed, bill, record, petition

Topic 11: tax, contract, propery, count, land, board, assessment

Topic 12: testimony, testified, witness, police, jury, victim

Topic 13: jury, sentence, guilty, murder, criminal, record

Topic 14: officer, police, car, search, arrest, found, vehicle, warrant, possession

Topic 15: car, negligence, injury, accident

Topic 16: claimant, injury, doctor, accident, employee, work, industrial, medical, compensation, hospital, claim

Topic 17: street, city, property, company, damage, building, company, owner, foot, road, land

In [54]:
data = lda_df.sort_index()

# Final Revisions to Dataframe
<a id='FR'></a>

In [55]:
data.head(2)

Unnamed: 0,case_id,decision_date,case_citation_name,court,citations,clean_text,keywords,summs,d2v_scores,lda_preds,lda_sub_group
0,2747110,1771,2 Ill. App. 3d 538,Illinois Appellate Court,"['1 and 119', '89 S.Ct. 2100', '395 U.S. 959']",mr presiding justice eberspacher delivered opi...,"['sentenced', 'sentence', 'sentencing', 'court...",The court entered judgment upon the verdict an...,1.3064,4,13
1,435537,1819,1 Ill. 21,Illinois Supreme Court,,opinion court wa criminal prosecution riot pla...,"['indictment', 'form', 'criminal', 'shall', 'y...","This was a criminal prosecution for a riot, ag...",1.451599,4,12


In [None]:
new_date = []

for item in data.decision_date:
    new_date.append(re.findall(r'\d{4}', item)[0])

In [32]:
data['decision_date'] = new_date

In [33]:
data['decision_date'] = data.decision_date.astype(int)

In [34]:
data = data[['case_id', 'decision_date', 'case_citation_name', 'court', 'citations', 'clean_text', 'keywords', 'summs', 'd2v_scores', 'lda_preds']]

In [35]:
print(df.shape)
print(data.shape)

(183149, 10)
(183149, 19)


In [36]:
data.head(2)

Unnamed: 0,case_id,decision_date,case_citation_name,court,citations,clean_text,keywords,summs,d2v_scores,lda_preds
0,2747110,1771,2 Ill. App. 3d 538,Illinois Appellate Court,"['1 and 119', '89 S.Ct. 2100', '395 U.S. 959']",mr presiding justice eberspacher delivered opi...,"['sentenced', 'sentence', 'sentencing', 'court...",The court entered judgment upon the verdict an...,1.3064,4
1,435537,1819,1 Ill. 21,Illinois Supreme Court,,opinion court wa criminal prosecution riot pla...,"['indictment', 'form', 'criminal', 'shall', 'y...","This was a criminal prosecution for a riot, ag...",1.451599,4


In [57]:
# data.to_csv('../data/final_legal_df.csv')