In [1]:
import os
from glob import glob
import dask
import dask.array as da
import dask.bag as db
import dask.dataframe as dd
from dask import delayed
import pandas as pd
from distributed import Client
from dask_jobqueue import SLURMCluster
from IPython.display import display
import matplotlib.pyplot as plt
import time
import numpy as np
import pyarrow
from dask.diagnostics import ProgressBar
import time
import csv
from tqdm import tqdm
import io
import sys
from dask.diagnostics import ProgressBar




# Set LOCAL to True for single-machine execution while developing
# Set LOCAL to False for cluster execution
LOCAL = False


if LOCAL:
    # Create a single-machine dask client with default settings
    client = Client()
else:    
    # Create a dask SLURM cluster and a dask client connected to it
    # Logging outputs will be stored in /scratch/{your-netid}
    
    cluster = SLURMCluster(
                           # Memory and core limits should be sufficient here
                           memory='64GB', cores=8,

                            # Ensure that Dask uses the correct version of Python on the cluster
                            # (the interpreter path is keyed on the locally imported dask version)
                            python='/scratch/work/public/dask/{}/bin/python'.format(dask.__version__),                           
                           
                            # Place the output logs in an accessible location
                            # NOTE: os.environ[...] raises KeyError when SLURM_JOB_USER is not set
                            job_extra=['--export=NONE --output=/scratch/{}/slurm-%j.out'.format(os.environ['SLURM_JOB_USER'])]
    )

    # NOTE(review): 'slurm' is presumably a site-specific job submission wrapper - confirm
    cluster.submit_command = 'slurm'
    # NOTE(review): asks the cluster to scale to 200; presumably a worker count - confirm
    cluster.scale(200) 

    # Render the cluster widget, then attach a client to the cluster
    display(cluster)
    client = Client(cluster)

# Show the client summary in the notebook output (runs in both modes)
display(client)





  from distributed.utils import tmpfile


Tab(children=(HTML(value='<div class="jp-RenderedHTMLCommon jp-RenderedHTML jp-mod-trusted jp-OutputArea-outpu…

0,1
Connection method: Cluster object,Cluster type: dask_jobqueue.SLURMCluster
Dashboard: http://10.32.33.52:8787/status,

0,1
Dashboard: http://10.32.33.52:8787/status,Workers: 0
Total threads: 0,Total memory: 0 B

0,1
Comm: tcp://10.32.33.52:33983,Workers: 0
Dashboard: http://10.32.33.52:8787/status,Total threads: 0
Started: Just now,Total memory: 0 B


In [3]:
# Column name -> pandas dtype for the opinions CSV.
# The key order doubles as the header of the filtered opinions CSV
# (the filtering cell uses list(opinion_schema.keys()) as its fieldnames),
# so the tuple below must stay in this exact order.
opinion_schema = dict.fromkeys(
    (
        'resource_uri',
        'id',
        'absolute_url',
        'cluster_id',
        'cluster',
        'author_id',
        'author',
        'joined_by',
        'date_created',
        'date_modified',
        'author_str',
        'per_curiam',
        'joined_by_str',
        'type',
        'sha1',
        'page_count',
        'download_url',
        'local_path',
        'plain_text',
        'html',
        'html_lawbox',
        'html_columbia',
        'html_anon_2020',
        'xml_harvard',
        'html_with_citations',
        'extracted_by_ocr',
        'opinions_cited',
    ),
    'object',
)
# Three columns are declared as 'str' rather than 'object'; updating existing
# keys keeps their position in the dict.
opinion_schema.update(per_curiam='str', page_count='str', extracted_by_ocr='str')


# Column name -> pandas dtype for the opinion-clusters CSV.
# The key order doubles as the header of the filtered clusters CSV
# (the filtering cell uses list(opinion_clusters_schema.keys()) as its
# fieldnames), so the list below must stay in this exact order.
opinion_clusters_schema = dict.fromkeys(
    [
        "resource_uri",
        "id",
        "absolute_url",
        "panel",
        "non_participating_judges",
        "docket_id",
        "docket",
        "sub_opinions",
        "citations",
        "date_created",
        "date_modified",
        "judges",
        "date_filed",
        "date_filed_is_approximate",
        "slug",
        "case_name_short",
        "case_name",
        "case_name_full",
        "scdb_id",
        "scdb_decision_direction",
        "scdb_votes_majority",
        "scdb_votes_minority",
        "source",
        "procedural_history",
        "attorneys",
        "nature_of_suit",
        "posture",
        "syllabus",
        "headnotes",
        "summary",
        "disposition",
        "history",
        "other_dates",
        "cross_reference",
        "correction",
        "citation_count",
        "precedential_status",
        "date_blocked",
        "blocked",
        "filepath_json_harvard",
        "arguments",
        "headmatter",
    ],
    "string",
)
# The only non-string column; assigning to an existing key keeps its position.
opinion_clusters_schema["citation_count"] = "int"


In [5]:
# Write Cluster Data
# Stream the full clusters CSV row by row and copy only the rows that have a
# non-empty 'scdb_id' into a second CSV, so the large input never has to fit
# in memory.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
# Raise the csv module's per-field size cap so very long fields do not abort the read.
csv.field_size_limit(sys.maxsize)

input_csv_file = '/vast/amr10211/opinions-cluster-data.csv' #Update File Path
output_csv_file = '/vast/amr10211/opinions-cluster-data-sc.csv' #Update File Path

# Header (and column order) of the output file
fieldnames = list(opinion_clusters_schema.keys())

# The input is opened first so a missing/unreadable input fails before the
# output file is truncated. Both files use newline='' as the csv module
# requires; without it, newlines embedded inside quoted fields are not
# interpreted correctly on read.
with open(input_csv_file, 'r', newline='', encoding='latin1') as f, \
        open(output_csv_file, 'w', newline='', encoding='latin1') as csvfile:
    reader = csv.DictReader(f)

    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    for row in tqdm(reader):
        # Truthiness rejects both "" and None; DictReader fills the missing
        # fields of a short row with None, which `!= ""` would let through.
        if row['scdb_id']:
            # Write the filtered row to the new CSV file
            writer.writerow(row)

8961877it [02:15, 65948.61it/s] 


In [13]:
# Load the filtered cluster table (the '-sc' CSV) into a DataFrame.
# No dtype is passed, so pandas infers each column's type.
cluster_df = pd.read_csv(
    filepath_or_buffer='/vast/amr10211/opinions-cluster-data-sc.csv',  # Update File Path
)

In [14]:
# Pull the 'id' column out of the cluster_df DataFrame as a plain Python list
supreme_court_case_cluster_ids = cluster_df['id'].to_list()

In [14]:
# Filter and Write Opinion Data
# Stream the full opinions CSV row by row and copy only the rows whose
# 'cluster_id' is one of supreme_court_case_cluster_ids into a second CSV.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
# Raise the csv module's per-field size cap so very long fields do not abort the read.
csv.field_size_limit(sys.maxsize)

input_csv_file = '/vast/amr10211/opinions-data.csv' #Update File Path
output_csv_file = '/vast/amr10211/opinions-data-sc.csv' #Update File Path

# Header (and column order) of the output file
fieldnames = list(opinion_schema.keys())

# Build the lookup once, outside the loop:
#  * a set gives constant-time membership tests, instead of scanning the whole
#    id list for every one of the millions of input rows;
#  * csv.DictReader yields every field as a str, so the ids are normalised to
#    str too. If pandas inferred a numeric dtype for cluster_df['id'], the raw
#    values would never compare equal to the CSV strings.
sc_cluster_id_lookup = {str(cluster_id) for cluster_id in supreme_court_case_cluster_ids}

# The input is opened first so a missing/unreadable input fails before the
# output file is truncated. Both files use newline='' as the csv module
# requires; without it, newlines embedded inside quoted fields are not
# interpreted correctly on read.
with open(input_csv_file, 'r', newline='', encoding='latin1') as f, \
        open(output_csv_file, 'w', newline='', encoding='latin1') as csvfile:
    reader = csv.DictReader(f)

    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    for row in tqdm(reader):
        if row['cluster_id'] in sc_cluster_id_lookup:
            # Write the filtered row to the new CSV file
            writer.writerow(row)


9592141it [1:30:01, 1775.70it/s]


In [5]:
# Load the filtered opinions table (the '-sc' CSV), forcing the column dtypes
# declared in opinion_schema instead of letting pandas infer them.
filtered_df = pd.read_csv(
    filepath_or_buffer='/vast/amr10211/opinions-data-sc.csv',  # Update File Path
    dtype=opinion_schema,
)

In [6]:
# (rows, columns) of the filtered opinions table
filtered_df.shape

(45508, 27)

In [7]:
# Non-null value count per column; columns showing 0 are entirely empty in this subset
filtered_df.count()

resource_uri               0
id                     45508
absolute_url               0
cluster_id             45508
cluster                    0
author_id              21989
author                     0
joined_by                  0
date_created           45508
date_modified          45508
author_str             34550
per_curiam             45508
joined_by_str              0
type                   45508
sha1                   28111
page_count              1099
download_url            3444
local_path              1103
plain_text              1099
html                   26459
html_lawbox            25484
html_columbia              0
html_anon_2020             0
xml_harvard            35053
html_with_citations    28106
extracted_by_ocr       45508
opinions_cited             0
dtype: int64

In [21]:
# Null-overlap summary for a pair of text columns of filtered_df.

# Pair under comparison; other candidate columns:
# ['plain_text', 'html', 'html_lawbox', 'xml_harvard', 'html_with_citations']
columns_to_check = ['html', 'xml_harvard']

# One boolean column per checked field: True where the value is missing
null_masks = filtered_df[columns_to_check].isna()

# Number of missing fields in each row, computed once and reused below
nulls_per_row = null_masks.sum(axis=1)

# Row-level flags: exactly one field missing / every field missing / none missing
one_null_others_not = nulls_per_row.eq(1)
all_null = null_masks.all(axis=1)
all_populated = nulls_per_row.eq(0)

# Summing a boolean Series counts its True entries
count_one_null_others_not, count_all_null, count_all_populated = (
    flag.sum() for flag in (one_null_others_not, all_null, all_populated)
)

print(columns_to_check[0], 'vs', columns_to_check[1])
print(f"Rows where one column is null while the other is populated: {count_one_null_others_not}")
print(f"Rows where both columns are null: {count_all_null}")
print(f"Rows where both columns are populated: {count_all_populated}")

html vs xml_harvard
Rows where one column is null while the other is populated: 27268
Rows where both columns are null: 1118
Rows where both columns are populated: 17122


In [22]:
# Null-overlap summary across all five text fields of filtered_df.

columns_to_check = ['plain_text', 'html', 'html_lawbox', 'xml_harvard', 'html_with_citations']

# One boolean column per checked field: True where the value is missing
null_masks = filtered_df[columns_to_check].isna()

# Number of missing fields in each row, computed once and reused below
nulls_per_row = null_masks.sum(axis=1)

# Exactly one of the checked fields is missing
one_null_others_not = nulls_per_row.eq(1)
count_one_null_others_not = one_null_others_not.sum()

# Every checked field is missing
all_null = null_masks.all(axis=1)
count_all_null = all_null.sum()

# No checked field is missing
all_populated = nulls_per_row.eq(0)
count_all_populated = all_populated.sum()

print('All text fields:', columns_to_check)
print(f"Rows where one column is null while the others are populated: {count_one_null_others_not}")
print(f"Rows where all columns are null: {count_all_null}")
print(f"Rows where all columns are populated: {count_all_populated}")

All text fields: ['plain_text', 'html', 'html_lawbox', 'xml_harvard', 'html_with_citations']
Rows where one column is null while the others are populated: 16116
Rows where all columns are null: 4
Rows where all columns are populated: 0


In [17]:
# Load the filtered cluster table (the '-sc' CSV) into a DataFrame.
# No dtype is passed, so pandas infers each column's type.
cluster_df = pd.read_csv(
    filepath_or_buffer='/vast/amr10211/opinions-cluster-data-sc.csv',  # Update File Path
)

In [18]:
# (rows, columns) of the filtered cluster table
cluster_df.shape

(28111, 42)

In [19]:
# Non-null value count per column; columns showing 0 are entirely empty in this subset
cluster_df.count()

resource_uri                     0
id                           28111
absolute_url                     0
panel                            0
non_participating_judges         0
docket_id                    28111
docket                           0
sub_opinions                     0
citations                        0
date_created                 28111
date_modified                28111
judges                       26941
date_filed                   28111
date_filed_is_approximate    28111
slug                         28111
case_name_short              22086
case_name                    28111
case_name_full               24433
scdb_id                      28111
scdb_decision_direction      28100
scdb_votes_majority          28111
scdb_votes_minority          28111
source                       28111
procedural_history               0
attorneys                    23455
nature_of_suit                   0
posture                          0
syllabus                      8409
headnotes           

In [21]:
# Number of clusters per scdb_decision_direction value (rows with a missing value are not counted)
cluster_df.groupby('scdb_decision_direction')['scdb_decision_direction'].count()

scdb_decision_direction
1.0    11625
2.0    12764
3.0     3711
Name: scdb_decision_direction, dtype: int64