In [132]:
import pandas as pd
import numpy as np 
import nltk
import sklearn
import matplotlib.pyplot as plt
import re
from bs4 import BeautifulSoup

In [133]:
df = pd.read_csv('./data/data/justice.csv')

In [134]:
df["disposition"].value_counts()

reversed/remanded            1073
affirmed                     1007
reversed                      585
vacated/remanded              430
reversed in-part/remanded      61
none                           41
reversed in-part               22
vacated                         7
vacated in-part/remanded        5
Name: disposition, dtype: int64

In [135]:
df.isnull().sum()

Unnamed: 0              0
ID                      0
name                    0
href                    0
docket                  0
term                    0
first_party             1
second_party            1
facts                   0
facts_len               0
majority_vote           0
minority_vote           0
first_party_winner     15
decision_type           7
disposition            72
issue_area            142
dtype: int64

In [136]:
df["issue_area"].value_counts()

Criminal Procedure      859
Civil Rights            568
Economic Activity       542
First Amendment         353
Judicial Power          342
Due Process             128
Federalism              125
Privacy                  70
Unions                   60
Federal Taxation         51
Attorneys                37
Miscellaneous            20
Private Action            4
Interstate Relations      2
Name: issue_area, dtype: int64

In [137]:
df.dropna(inplace=True)

In [138]:
df.head(3)

Unnamed: 0.1,Unnamed: 0,ID,name,href,docket,term,first_party,second_party,facts,facts_len,majority_vote,minority_vote,first_party_winner,decision_type,disposition,issue_area
1,1,50613,Stanley v. Illinois,https://api.oyez.org/cases/1971/70-5014,70-5014,1971,"Peter Stanley, Sr.",Illinois,<p>Joan Stanley had three children with Peter ...,757,5,2,True,majority opinion,reversed/remanded,Civil Rights
2,2,50623,Giglio v. United States,https://api.oyez.org/cases/1971/70-29,70-29,1971,John Giglio,United States,<p>John Giglio was convicted of passing forged...,495,7,0,True,majority opinion,reversed/remanded,Due Process
3,3,50632,Reed v. Reed,https://api.oyez.org/cases/1971/70-4,70-4,1971,Sally Reed,Cecil Reed,"<p>The Idaho Probate Code specified that ""male...",378,7,0,True,majority opinion,reversed/remanded,Civil Rights


## Clean up the data

drop unnecessary columns

In [139]:
df.drop(columns=['Unnamed: 0', 'href'])
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3098 entries, 1 to 3302
Data columns (total 16 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Unnamed: 0          3098 non-null   int64 
 1   ID                  3098 non-null   int64 
 2   name                3098 non-null   object
 3   href                3098 non-null   object
 4   docket              3098 non-null   object
 5   term                3098 non-null   object
 6   first_party         3098 non-null   object
 7   second_party        3098 non-null   object
 8   facts               3098 non-null   object
 9   facts_len           3098 non-null   int64 
 10  majority_vote       3098 non-null   int64 
 11  minority_vote       3098 non-null   int64 
 12  first_party_winner  3098 non-null   object
 13  decision_type       3098 non-null   object
 14  disposition         3098 non-null   object
 15  issue_area          3098 non-null   object
dtypes: int64(5), object(11)


## Cleaning up the facts column

In [140]:
# combining name and fact column
df['fact'] = df.apply(lambda row: row['name'] + ' ' + row['facts'], axis=1)

df.head(5)

Unnamed: 0.1,Unnamed: 0,ID,name,href,docket,term,first_party,second_party,facts,facts_len,majority_vote,minority_vote,first_party_winner,decision_type,disposition,issue_area,fact
1,1,50613,Stanley v. Illinois,https://api.oyez.org/cases/1971/70-5014,70-5014,1971,"Peter Stanley, Sr.",Illinois,<p>Joan Stanley had three children with Peter ...,757,5,2,True,majority opinion,reversed/remanded,Civil Rights,Stanley v. Illinois <p>Joan Stanley had three ...
2,2,50623,Giglio v. United States,https://api.oyez.org/cases/1971/70-29,70-29,1971,John Giglio,United States,<p>John Giglio was convicted of passing forged...,495,7,0,True,majority opinion,reversed/remanded,Due Process,Giglio v. United States <p>John Giglio was con...
3,3,50632,Reed v. Reed,https://api.oyez.org/cases/1971/70-4,70-4,1971,Sally Reed,Cecil Reed,"<p>The Idaho Probate Code specified that ""male...",378,7,0,True,majority opinion,reversed/remanded,Civil Rights,Reed v. Reed <p>The Idaho Probate Code specifi...
4,4,50643,Miller v. California,https://api.oyez.org/cases/1971/70-73,70-73,1971,Marvin Miller,California,"<p>Miller, after conducting a mass mailing cam...",305,5,4,True,majority opinion,vacated/remanded,First Amendment,"Miller v. California <p>Miller, after conducti..."
5,5,50644,Kleindienst v. Mandel,https://api.oyez.org/cases/1971/71-16,71-16,1971,"Richard G. Kleindienst, Attorney General of th...","Ernest E. Mandel, et al.",<p>Ernest E. Mandel was a Belgian professional...,2282,6,3,True,majority opinion,reversed,First Amendment,Kleindienst v. Mandel <p>Ernest E. Mandel was ...


In [141]:
# creating a function to remove html tags
for index, row in df.iterrows():
    soup = BeautifulSoup(row["fact"], 'html.parser')
    text = (soup.get_text()).lower()
    cleaned_text = re.sub(r'[^\w\s]', "", text.replace("\n", ""))
    df.at[index, 'fact'] = cleaned_text


#### Removing unneeded columns 

In [142]:
#remove redudant columns
df.drop(columns=['Unnamed: 0', 'href', 'facts', 'name'], inplace=True)

#set index
df.set_index('ID')
df.head(3)

Unnamed: 0,ID,docket,term,first_party,second_party,facts_len,majority_vote,minority_vote,first_party_winner,decision_type,disposition,issue_area,fact
1,50613,70-5014,1971,"Peter Stanley, Sr.",Illinois,757,5,2,True,majority opinion,reversed/remanded,Civil Rights,stanley v illinois joan stanley had three chil...
2,50623,70-29,1971,John Giglio,United States,495,7,0,True,majority opinion,reversed/remanded,Due Process,giglio v united states john giglio was convict...
3,50632,70-4,1971,Sally Reed,Cecil Reed,378,7,0,True,majority opinion,reversed/remanded,Civil Rights,reed v reed the idaho probate code specified t...


In [143]:
df['fact'][1]

'stanley v illinois joan stanley had three children with peter stanley  the stanleys never married but lived together off and on for 18 years  when joan died the state of illinois took the children  under illinois law unwed fathers were presumed unfit parents regardless of their actual fitness and their children became wards of the state  peter appealed the decision arguing that the illinois law violated the equal protection clause of the fourteenth amendment because unwed mothers were not deprived of their children without a showing that they were actually unfit parents  the illinois supreme court rejected stanleys equal protection claim holding that his actual fitness as a parent was irrelevant because he and the childrens mother were unmarried'