In [1]:
import os
from glob import glob
import dask
import dask.array as da
import dask.bag as db
import dask.dataframe as dd
from dask import delayed
import pandas as pd
from distributed import Client
from dask_jobqueue import SLURMCluster
from IPython.display import display
import matplotlib.pyplot as plt
import time
import numpy as np
import pyarrow
from dask.diagnostics import ProgressBar
import time
import csv
from tqdm import tqdm
from tqdm.notebook import tqdm
from IPython.display import display
import io
import sys
from dask.diagnostics import ProgressBar


# Execution-mode switch:
#   LOCAL = True  -> single-machine execution while developing
#   LOCAL = False -> cluster execution through SLURM
LOCAL = False


if not LOCAL:
    # Build a SLURM-backed dask cluster and attach a client to it.

    # Ensure that Dask uses the correct version of Python on the cluster:
    # the interpreter path is keyed on the locally installed dask version.
    worker_python = '/scratch/work/public/dask/{}/bin/python'.format(dask.__version__)

    # Place the output logs in an accessible location: SLURM output goes to
    # /scratch/<SLURM_JOB_USER>/slurm-<job id>.out. Raises KeyError if the
    # SLURM_JOB_USER environment variable is not set.
    slurm_job_flags = [
        '--export=NONE --output=/scratch/{}/slurm-%j.out'.format(os.environ['SLURM_JOB_USER'])
    ]

    cluster = SLURMCluster(
        # Memory and core limits should be sufficient here
        memory='64GB',
        cores=8,
        python=worker_python,
        job_extra=slurm_job_flags,
    )

    # Override the command used to submit jobs, then request a scale of 200.
    cluster.submit_command = 'slurm'
    cluster.scale(200)

    display(cluster)
    client = Client(cluster)
else:
    # Single-machine dask client
    client = Client()

display(client)



  from distributed.utils import tmpfile


Tab(children=(HTML(value='<div class="jp-RenderedHTMLCommon jp-RenderedHTML jp-mod-trusted jp-OutputArea-outpu…

0,1
Connection method: Cluster object,Cluster type: dask_jobqueue.SLURMCluster
Dashboard: http://10.32.35.51:8787/status,

0,1
Dashboard: http://10.32.35.51:8787/status,Workers: 0
Total threads: 0,Total memory: 0 B

0,1
Comm: tcp://10.32.35.51:35609,Workers: 0
Dashboard: http://10.32.35.51:8787/status,Total threads: 0
Started: Just now,Total memory: 0 B


In [2]:
# Column -> dtype mapping for the opinions dataset. The key order matters:
# it is used as the CSV header (fieldnames) when the filtered opinions file
# is written.
opinion_schema = dict.fromkeys(
    [
        'resource_uri',
        'id',
        'absolute_url',
        'cluster_id',
        'cluster',
        'author_id',
        'author',
        'joined_by',
        'date_created',
        'date_modified',
        'author_str',
        'per_curiam',
        'joined_by_str',
        'type',
        'sha1',
        'page_count',
        'download_url',
        'local_path',
        'plain_text',
        'html',
        'html_lawbox',
        'html_columbia',
        'html_anon_2020',
        'xml_harvard',
        'html_with_citations',
        'extracted_by_ocr',
        'opinions_cited',
    ],
    'object',
)
# Three columns are declared as 'str' instead of 'object'. Updating keys that
# already exist keeps their original position in the mapping.
opinion_schema.update({
    'per_curiam': 'str',
    'page_count': 'str',
    'extracted_by_ocr': 'str',
})


# Column -> dtype mapping for the opinion-clusters dataset. It is passed as
# `dtype` to pd.read_csv, and its key order is used as the CSV header
# (fieldnames) when the filtered cluster files are written.
opinion_clusters_schema = dict.fromkeys(
    [
        "resource_uri",
        "id",
        "absolute_url",
        "panel",
        "non_participating_judges",
        "docket_id",
        "docket",
        "sub_opinions",
        "citations",
        "date_created",
        "date_modified",
        "judges",
        "date_filed",
        "date_filed_is_approximate",
        "slug",
        "case_name_short",
        "case_name",
        "case_name_full",
        "scdb_id",
        "scdb_decision_direction",
        "scdb_votes_majority",
        "scdb_votes_minority",
        "source",
        "procedural_history",
        "attorneys",
        "nature_of_suit",
        "posture",
        "syllabus",
        "headnotes",
        "summary",
        "disposition",
        "history",
        "other_dates",
        "cross_reference",
        "correction",
        "citation_count",
        "precedential_status",
        "date_blocked",
        "blocked",
        "filepath_json_harvard",
        "arguments",
        "headmatter",
    ],
    "string",
)
# citation_count is the only non-string column. Assigning to an existing key
# keeps its original position in the mapping.
opinion_clusters_schema["citation_count"] = "int"


In [None]:
# Write cluster data for the lower courts
#
# Streams the clusters CSV one row at a time (the file is never loaded into
# memory as a whole) and copies to the output CSV only the rows that have an
# empty `scdb_id` and a `date_filed` on or after 1930-01-01.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
# Raise the csv module's per-field size limit so very large fields can be read.
csv.field_size_limit(sys.maxsize)

input_csv_file = '/vast/amh9750/clusters-data.csv' #Update File Path
output_csv_file = '/vast/amh9750/opinions-cluster-data-lc.csv' #Update File Path

# Create a new CSV file for writing
with open(output_csv_file, 'w', newline='', encoding='latin1') as csvfile:

    # The output header follows the column order of opinion_clusters_schema.
    # DictWriter raises ValueError if a row carries a key outside fieldnames.
    fieldnames = list(opinion_clusters_schema.keys())
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    # newline='' hands raw line endings to the csv module. Without it, Python's
    # universal-newline translation rewrites '\r\n' and '\r' inside quoted
    # fields before the reader sees them, so embedded newlines are not
    # interpreted (or preserved) correctly.
    with open(input_csv_file, 'r', newline='', encoding='latin1') as f:
        reader = csv.DictReader(f)

        for row in tqdm(reader):
            # Dates are compared as strings; this is only correct for
            # zero-padded ISO dates (YYYY-MM-DD) -- assumed, confirm against
            # the data. An empty date_filed compares as < '1930-01-01' and is
            # therefore dropped here.
            if row['scdb_id'] == "" and row['date_filed'] >= '1930-01-01':
                # Write the filtered row to the new CSV file
                writer.writerow(row)

In [None]:
# Write cluster data for filtering for the cluster ids we DON'T want
# i.e. clusters that are either supreme court or before 1930
#
# Streams the clusters CSV one row at a time and copies to the output CSV the
# rows that have a non-empty `scdb_id` OR a `date_filed` before 1930-01-01.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
# Raise the csv module's per-field size limit so very large fields can be read.
csv.field_size_limit(sys.maxsize)

input_csv_file = '/vast/amh9750/clusters-data.csv' #Update File Path: Clusters Datset
output_csv_file = '/vast/amh9750/opinions-cluster-data-filtering.csv' #Update File Path: Clusters Datset Filtering

# Create a new CSV file for writing
with open(output_csv_file, 'w', newline='', encoding='latin1') as csvfile:

    # The output header follows the column order of opinion_clusters_schema.
    # DictWriter raises ValueError if a row carries a key outside fieldnames.
    fieldnames = list(opinion_clusters_schema.keys())
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    # newline='' hands raw line endings to the csv module. Without it, Python's
    # universal-newline translation rewrites '\r\n' and '\r' inside quoted
    # fields before the reader sees them, so embedded newlines are not
    # interpreted (or preserved) correctly.
    with open(input_csv_file, 'r', newline='', encoding='latin1') as f:
        reader = csv.DictReader(f)

        for row in tqdm(reader):
            # Dates are compared as strings; this is only correct for
            # zero-padded ISO dates (YYYY-MM-DD) -- assumed, confirm against
            # the data. An empty date_filed compares as < '1930-01-01', so
            # rows without a filing date end up in this exclusion file.
            if row['scdb_id'] != "" or row['date_filed'] < '1930-01-01':
                # Write the filtered row to the new CSV file
                writer.writerow(row)

In [3]:
# Load the exclusion-list clusters CSV into a pandas DataFrame, with column
# dtypes taken from opinion_clusters_schema.
cluster_df_filter = pd.read_csv(
    '/vast/amh9750/opinions-cluster-data-filtering.csv',
    dtype=opinion_clusters_schema,
)

In [4]:
# Pull every value of the 'id' column out of the filtering frame and coerce
# each one to a plain Python string, giving a list of cluster ids to exclude.
filter_court_case_cluster_ids = [
    str(cluster_id) for cluster_id in cluster_df_filter['id'].tolist()
]

In [5]:
# Check size of filter list: the number of cluster ids collected for
# exclusion (a plain list length, so any duplicate ids are counted too).
len(filter_court_case_cluster_ids)

1642080

In [6]:
# Turn the id list into a set so membership tests are hash lookups instead
# of linear scans over the list.
filter_set = {*filter_court_case_cluster_ids}

In [7]:
# Filter and write opinion data for lower courts from 1930 onwards
#
# Streams the opinions CSV one row at a time and copies to the output CSV the
# rows whose `cluster_id` is NOT in `filter_set` and that have a documented
# author (a non-empty `author_id` or `author_str`).
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
# Raise the csv module's per-field size limit so very large fields can be read.
csv.field_size_limit(sys.maxsize)

input_csv_file = '/vast/amh9750/opinions-data.csv' #Update File Path: Opinions Text Dataset
output_csv_file = '/vast/amh9750/opinions-data-lc.csv' #Update File Path: Opinions Text Dataset Lower Courts

# Create a new CSV file for writing
with open(output_csv_file, 'w', newline='', encoding='latin1') as csvfile:

    # The output header follows the column order of opinion_schema.
    # DictWriter raises ValueError if a row carries a key outside fieldnames.
    fieldnames = list(opinion_schema.keys())
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    # newline='' hands raw line endings to the csv module. Without it, Python's
    # universal-newline translation rewrites '\r\n' and '\r' inside quoted
    # fields before the reader sees them, so embedded newlines are not
    # interpreted (or preserved) correctly.
    with open(input_csv_file, 'r', newline='', encoding='latin1') as f:
        reader = csv.DictReader(f)

        for row in tqdm(reader):
            # Take only opinions that have a documented author and are not in the filter set.
            # csv.DictReader yields strings, so filter_set must hold string ids.
            has_author = row['author_id'] != "" or row['author_str'] != ""
            if (row['cluster_id'] not in filter_set) and has_author:
                # Write the filtered row to the new CSV file
                writer.writerow(row)

0it [00:00, ?it/s]