In [1]:
from datasets import load_dataset
import pandas as pd
import numpy as np
import plotly.express as px
import os
import re

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
scotus_dataset = load_dataset("coastalcph/fairlex", name="scotus")
scotus_dataset

Found cached dataset fairlex (/home/kyle/.cache/huggingface/datasets/coastalcph___fairlex/scotus/1.0.0/b755f714459ab788a8e3f9167fe7463f79981775296915d36ac10fc58ea93737)
100%|██████████| 3/3 [00:00<00:00, 655.70it/s]


DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'decision_direction', 'respondent_type'],
        num_rows: 7417
    })
    test: Dataset({
        features: ['text', 'label', 'decision_direction', 'respondent_type'],
        num_rows: 931
    })
    validation: Dataset({
        features: ['text', 'label', 'decision_direction', 'respondent_type'],
        num_rows: 914
    })
})

In [3]:
train_frame = scotus_dataset['train'].to_pandas()
display(train_frame.head())
display(train_frame.value_counts('label'))

validation_frame = scotus_dataset['validation'].to_pandas()
display(validation_frame.head())
display(validation_frame.value_counts('label'))

test_frame = scotus_dataset['test'].to_pandas()
display(test_frame.head())
display(test_frame.value_counts('label'))

Unnamed: 0,text,label,decision_direction,respondent_type
0,United States Supreme Court MICHIGAN NAT. BANK...,9,0,3
1,United States Supreme Court NEW YORK v. CATHED...,2,1,2
2,United States Supreme Court HAZEN PAPER CO. v....,1,0,1
3,United States Supreme Court SIMPSON v. FLORIDA...,0,1,3
4,United States Supreme Court MINISTRY OF DEFENS...,7,1,0


label
0     1570
7     1569
1     1219
8     1073
2      601
9      333
6      323
3      292
10     276
4       95
5       66
Name: count, dtype: int64

Unnamed: 0,text,label,decision_direction,respondent_type
0,United States Supreme Court FTC v. BORDEN CO.(...,7,1,0
1,United States Supreme Court BOESCHE v. UDALL(1...,7,1,3
2,United States Supreme Court DOREMUS v. BOARD O...,8,0,3
3,United States Supreme Court NLRB v. IRON WORKE...,6,0,2
4,United States Supreme Court UNITED STATES v. H...,2,0,2


label
0     193
7     192
1     154
8     127
2      73
9      49
10     40
3      36
6      27
5      12
4      11
Name: count, dtype: int64

Unnamed: 0,text,label,decision_direction,respondent_type
0,United States Supreme Court SCHNEIDER v. SMITH...,2,1,3
1,United States Supreme Court BELLOTTI v. BAIRD(...,8,0,0
2,United States Supreme Court EMSPAK v. UNITED S...,0,1,3
3,United States Supreme Court INS v. CARDOZA-FON...,1,1,1
4,United States Supreme Court CALIFANO v. AZNAVO...,1,0,3


label
0     215
7     182
1     162
8     114
2      76
6      48
3      47
9      35
10     33
5      10
4       9
Name: count, dtype: int64

In [4]:
import re
year_pattern = r"\((\d{4})\)"
legal_text = train_frame.iloc[4]["text"]
print(legal_text)
int(re.search(year_pattern, legal_text).group(1))

United States Supreme Court MINISTRY OF DEFENSE AND SUPPORT FOR THE ARMED FORCES OF THE ISLAMIC REPUBLIC OF IRAN v. DARIUSH ELAHI(2006) No. 04-1095 Argued: Decided: February 21, 2006 </s> Per Curiam. </s> A private citizen seeks to attach an asset belonging to Iran's Ministry of Defense in order to help satisfy a judgment for money damages.  The question raised is whether the Foreign Sovereign Immunities Act of 1976 (FSIA or Act), 28 U.S.C. §1602 et seq. (2000 ed. and Supp. III), forbids that attachment. </s> The judgment for money damages consists of a default judgment against the Islamic Republic of Iran (for about $300 million) that the private citizen, Dariush Elahi, obtained in a federal-court lawsuit claiming that the Republic had murdered his brother.  Elahi v. Islamic Republic of Iran, 124 F.Supp. 2d 97, 103 (DC 2000).  The asset is an arbitration award (against a third party), which Iran's Ministry of Defense obtained in Switzerland.  Ministry of Defense and Support for Armed 

2006

In [10]:
year_pattern = r"\((\d{4})\)"
all_data = pd.concat([train_frame, validation_frame, test_frame])
all_data["year"] = all_data["text"].apply(lambda x: int(re.search(year_pattern, x).group(1)))

# order frame by year ascending
all_data = all_data.sort_values(by="year", ascending=True)
display(all_data)

px.histogram(all_data, x="year", color="label")


Unnamed: 0,text,label,decision_direction,respondent_type,year
1839,United States Supreme Court AMERICAN FEDERATIO...,6,0,1,1912
5115,United States Supreme Court VANSTON BONDHOLDER...,7,1,0,1946
4210,United States Supreme Court UNITED STATES v. A...,1,1,0,1946
6633,United States Supreme Court UNITED STATES v. B...,0,0,0,1946
4198,United States Supreme Court U.S. v. RUZICKA(19...,7,1,0,1946
...,...,...,...,...,...
850,United States Supreme Court RIVERA v. ILLINOIS...,0,0,3,2009
6106,United States Supreme Court ENTERGY CORP. v. R...,7,0,2,2009
3245,United States Supreme Court MELENDEZ-DIAZ v. M...,0,1,3,2009
3377,United States Supreme Court BOYLE v. UNITED ST...,0,0,3,2009


In [11]:
# get first 80% of the frame as the training set and the latter 20% as the test set. Add all entries from the largest year in the training set to the test set.
train_size = int(len(all_data) * 0.8)
train_data = all_data.iloc[:train_size]
test_data = all_data.iloc[train_size:]

display(train_data)
display(test_data)


Unnamed: 0,text,label,decision_direction,respondent_type,year
1839,United States Supreme Court AMERICAN FEDERATIO...,6,0,1,1912
5115,United States Supreme Court VANSTON BONDHOLDER...,7,1,0,1946
4210,United States Supreme Court UNITED STATES v. A...,1,1,0,1946
6633,United States Supreme Court UNITED STATES v. B...,0,0,0,1946
4198,United States Supreme Court U.S. v. RUZICKA(19...,7,1,0,1946
...,...,...,...,...,...
6076,United States Supreme Court UNITED STATES v. R...,0,0,2,1991
5885,United States Supreme Court UNITED STATES v. G...,7,1,2,1991
4294,United States Supreme Court CLARK v. ROEMER(19...,1,1,0,1991
124,United States Supreme Court OHIO v. HUERTAS(19...,8,0,0,1991


Unnamed: 0,text,label,decision_direction,respondent_type,year
6065,United States Supreme Court LITTON FINANCIAL P...,6,0,3,1991
73,"United States Supreme Court MASTER, MATES & PI...",6,0,0,1991
1131,United States Supreme Court UNITED STATES v. C...,10,0,2,1991
3296,United States Supreme Court MICHIGAN v. LUCAS(...,0,0,1,1991
4382,United States Supreme Court LEATHERS v. MEDLOC...,2,0,3,1991
...,...,...,...,...,...
850,United States Supreme Court RIVERA v. ILLINOIS...,0,0,3,2009
6106,United States Supreme Court ENTERGY CORP. v. R...,7,0,2,2009
3245,United States Supreme Court MELENDEZ-DIAZ v. M...,0,1,3,2009
3377,United States Supreme Court BOYLE v. UNITED ST...,0,0,3,2009


In [13]:
test_data = all_data[all_data["year"] >= 2000]
display(test_data)

# get all entries not in the test set
train_data = all_data[~all_data.index.isin(test_data.index)]
display(train_data)


Unnamed: 0,text,label,decision_direction,respondent_type,year
6471,United States Supreme Court GEIER et al. v. AM...,9,1,2,2000
6127,United States Supreme Court MOBIL OIL EXPLORAT...,7,0,3,2000
3611,United States Supreme Court BOND v. UNITED STA...,0,1,3,2000
3927,United States Supreme Court JONES v. UNITED ST...,0,1,3,2000
2185,"United States Supreme Court EDWARDS, WARDEN v....",0,0,1,2000
...,...,...,...,...,...
850,United States Supreme Court RIVERA v. ILLINOIS...,0,0,3,2009
6106,United States Supreme Court ENTERGY CORP. v. R...,7,0,2,2009
3245,United States Supreme Court MELENDEZ-DIAZ v. M...,0,1,3,2009
3377,United States Supreme Court BOYLE v. UNITED ST...,0,0,3,2009


Unnamed: 0,text,label,decision_direction,respondent_type,year
1839,United States Supreme Court AMERICAN FEDERATIO...,6,0,1,1912
5115,United States Supreme Court VANSTON BONDHOLDER...,7,1,0,1946
4210,United States Supreme Court UNITED STATES v. A...,1,1,0,1946
6633,United States Supreme Court UNITED STATES v. B...,0,0,0,1946
4198,United States Supreme Court U.S. v. RUZICKA(19...,7,1,0,1946
...,...,...,...,...,...
2306,"United States Supreme Court CLINTON, PRESIDENT...",8,0,1,1999
4112,United States Supreme Court CLEVELAND v. POLI...,1,1,1,1999
139,"United States Supreme Court BUCKLEY, SECRETARY...",2,1,2,1999
5892,United States Supreme Court GRUPO MEXICANO de ...,8,0,0,1999


In [14]:
# Relabel samples in classes {First Amendment (2), Due Process (3)} as Civil Rights (1).
# Relabel samples in Unions (6) as Economic Activity (7).

display(train_data["label"].value_counts())
train_data["label"] = train_data["label"].replace({2: 1, 3: 1, 6: 7})
display(train_data["label"].value_counts())



label
7     1679
0     1646
1     1304
8     1136
2      630
6      369
9      342
3      324
10     313
4       85
5       76
Name: count, dtype: int64



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



label
1     2258
7     2048
0     1646
8     1136
9      342
10     313
4       85
5       76
Name: count, dtype: int64

In [None]:
train_data.to_csv("../datasets/scotus_train.csv", index=False)
test_data.to_csv("../datasets/scotus_test.csv", index=False)