In [None]:
# Mount Google Drive at /content/drive and list the project's shared-drive
# folder so the mount can be checked by eye against the listing below.
from google.colab import drive # Needed the first time the notebook is run; may be commented out afterwards
drive.mount('/content/drive') # Needed the first time the notebook is run; may be commented out afterwards
!ls "/content/drive/Shareddrives/Advanced ML Project Spring 2021"

Mounted at /content/drive
 01b_cnn_offline.ipynb			   naive_bayes.ipynb
'2 - Upgraded Sentiment Analysis.ipynb'   'Nathan Lit Review.gdoc'
 charlie_midterm_presentation_backup.mp4   notebooks
'Data sources.gdoc'			  'Project To-Do List.gdoc'
'Final presentation.gslides'		  'Proposal ideas.gdoc'
'Final report.gdoc'			  'Proposal (Rough Draft).gdoc'
 intermediate_data			   raw_data
 LSTM.ipynb				   RNN.ipynb
'Mid-quarter presentation.gslides'	   scraped_data
 naive_bayes_final.ipynb


**Functions to load data (descriptions in docstrings)**

In [None]:
# follows https://github.com/walkerdb/supreme_court_transcripts
import os
import urllib.request, json
import numpy as np
import pandas as pd
import string

# Root of the project's shared Google Drive folder. The functions below read
# from its raw_data/ subfolder and write to scraped_data/ and intermediate_data/.
SHARED_DRIVE_BASE_FOLDER = "/content/drive/Shareddrives/Advanced ML Project Spring 2021/"
# Name of the outcome/label column; the trailing comment records an alternative column name.
LABEL_VAR = 'partyWinning' # 'caseDisposition'
# Name of the column/key that holds the oral-argument transcript text.
TEXT_VAR = 'text'

def get_oral_argument_text(term, docket_number):
    '''
    Given a term (year) and a docket_number, get the oral argument (text string)
    for a case from oyez.org.

    Two requests are made: one for the case record, and one for the first
    entry of the case's 'oral_argument_audio' list, whose JSON holds the
    transcript. Missing or empty pieces of either payload are treated as
    "nothing to return" instead of raising.

    Inputs:
        term: string or int: 4-digit year
        docket_number: string or int: case docket number
    Returns:
        (str) full text of oral argument transcript, with every text block
        prefixed by a single space (so a non-empty result starts with a
        space), or None when the case has no oral argument audio or no
        transcript
    '''
    # https://stackoverflow.com/questions/12965203/how-to-get-json-from-webpage-into-python-script
    case_url = f"https://api.oyez.org/cases/{term}/{docket_number}"
    with urllib.request.urlopen(case_url) as response:
        case_data = json.loads(response.read().decode())

    # .get() so that a record without the key is handled like an empty list.
    audio_entries = case_data.get('oral_argument_audio')
    if not audio_entries:
        print("this case has no oral arguments")
        print(term, docket_number)
        return None

    # Only the first audio entry is followed, e.g.
    # https://api.oyez.org/case_media/oral_argument_audio/14026
    with urllib.request.urlopen(audio_entries[0]['href']) as response:
        transcript_json = json.loads(response.read().decode())

    transcript = transcript_json.get('transcript')
    if not transcript:
        print("this case has no transcript")
        print(term, docket_number)
        return None

    # Collect the pieces and join once instead of growing a string with +=.
    # `or []` skips sections/turns whose list is missing or null.
    pieces = []
    for section in transcript.get('sections') or []:
        for turn in section.get('turns') or []:
            for block in turn.get('text_blocks') or []:
                text = block.get("text")
                if text is not None:
                    pieces.append(" " + text)
    return "".join(pieces)

In [None]:
def get_scotus_oral_arguments(year):
    '''
    Web-scrapes all oral arguments for a given year from oyez.org.
    In addition to returning the list of dictionaries of cases, it is outputted
    to a JSON in the scraped_data folder. If that JSON already exists for the
    year, it is loaded and returned without any web requests.
    Inputs:
        year: int or string: 4-digit year

    Returns:
        list of dictionaries of cases, of the form:
        [{"year": year,
        "docket_number": docket_number,
        "text": text},
        ...]
        where text is None for cases without a transcript
    '''
    scraped_data_folder = SHARED_DRIVE_BASE_FOLDER + "scraped_data/"
    cache_path = scraped_data_folder + f"scotus_oral_arguments_{year}.json"

    # Test for the file itself: os.listdir() would raise if the folder has not
    # been created yet.
    if os.path.isfile(cache_path):
        print("year already scraped, loading from json from scraped_data folder")
        with open(cache_path) as fp:
            return json.load(fp)

    print("year not yet scraped, currently scraping (takes several minutes)")
    year_url = f"https://api.oyez.org/cases?per_page=0&filter=term:{year}"

    with urllib.request.urlopen(year_url) as url:
        year_data = json.loads(url.read().decode())

    cases = []
    for case in year_data:
        docket_number = case['docket_number']
        cases.append({"year": year,
                      "docket_number": docket_number,
                      "text": get_oral_argument_text(year, docket_number)})

    # Write data to JSON for future use (creating the folder on first use)
    os.makedirs(scraped_data_folder, exist_ok=True)
    with open(cache_path, 'w') as f:
        json.dump(cases, f)

    return cases

In [None]:
def merge_oral_argument_text_with_outcome_labels(oral_argument_list):
    '''
    Merge oral argument text with a case outcome variable, joining on docket
    number. Does an inner join, but first runs a left join with a merge
    indicator and prints its counts, so the number of oral arguments that
    don't match to an outcome (see left_only) is logged.
    Inputs:
        oral_argument_list: list of dictionaries of cases, of the form:
            [{"year": year,
            "docket_number": docket_number,
            "text": text},
            ...]
    Outputs:
        (Pandas df): merged dataframe of oral arguments and outcome variable
    '''
    decision_by_docket = pd.read_csv(
        SHARED_DRIVE_BASE_FOLDER + "raw_data/SCDB_2020_01_caseCentered_Docket.csv",
        engine='python')
    decision_by_docket = decision_by_docket.astype({'partyWinning': float})

    raw_arguments = pd.DataFrame(oral_argument_list)
    # Drop cases with missing text, then drop duplicate cases, keeping the
    # first (currently just 1 2007 case). Chained, non-inplace calls avoid
    # mutating a filtered slice in place.
    oral_argument_df = (
        raw_arguments[raw_arguments[TEXT_VAR].notna()]
        .drop_duplicates(subset=['docket_number'])
    )

    # Arguments shared by both merges; validate='1:m' makes pandas raise if a
    # docket_number is repeated on the oral-argument side.
    merge_kwargs = dict(validate='1:m',
                        left_on='docket_number', right_on='docket')

    # Left join is only used for the match report.
    match_report = pd.merge(oral_argument_df, decision_by_docket,
                            how='left', indicator=True, **merge_kwargs)
    print("number of oral arguments that matched:")
    print(match_report['_merge'].value_counts(dropna=False))

    return pd.merge(oral_argument_df, decision_by_docket,
                    how='inner', **merge_kwargs)

In [None]:
def convert_merged_df_to_tuple_list(merged_df, label_var, text_var):
    '''
    Turn the label and text columns of a merged dataframe into a list of
    (label, text) tuples, one per row and in row order, for use in NLP
    modeling.

    Inputs:
        merged_df: (Pandas df) oral arguments merged with outcome variable
        label_var: (str) name of outcome/label variable in merged_df
        text_var: (str) name of text variable in merged_df (usually "text")
    Returns:
        list of tuples of form [(label, text),
                                ...]
    '''
    # https://stackoverflow.com/questions/9758450/pandas-convert-dataframe-to-array-of-tuples
    wanted_columns = [label_var, text_var]
    label_and_text = merged_df.loc[:, wanted_columns]
    # name=None yields plain tuples; index=False leaves the index out of them.
    rows = label_and_text.itertuples(index=False, name=None)
    return [row for row in rows]

In [None]:
def balance(data, target):
    '''
    Balances classification data by oversampling: every row of the majority
    class is kept, and the rows of all other classes are pooled and resampled
    with replacement (via np.random.randint) until that pool has as many rows
    as the majority class.

    Note that with more than two classes the non-majority classes are balanced
    as one pool, not individually. Prints the majority count and the class
    counts of the result.

    Inputs:
        data: Pandas dataframe with (possibly unbalanced) outcomes
        target: (str) name of target/outcome variable
    Returns:
        Pandas dataframe: majority-class rows followed by the resampled rows.
        An empty input, or an input with a single class, is returned as a
        copy without resampling.
    '''
    # Work on the value_counts Series directly; reshaping it with
    # reset_index()/rename() depends on pandas-version-specific column names.
    class_counts = data[target].value_counts()
    if class_counts.empty:
        return data.copy()

    # idxmax() picks one label even when classes tie, and works for any dtype.
    majority_label = class_counts.idxmax()
    majority_count = int(class_counts.max())
    print("count of majority party")
    print(majority_count)

    is_majority = data[target] == majority_label
    base = data[is_majority]
    minority_party = data[~is_majority]
    if minority_party.empty:
        # Nothing to resample from; np.random.randint(0, 0, ...) would raise.
        return base.copy()

    positions = np.random.randint(0, len(minority_party), size=majority_count)
    sample = minority_party.iloc[positions]
    new_data = pd.concat([base, sample])
    print(new_data[target].value_counts())
    return new_data

In [None]:
def get_minority_party(data, target):
    '''
    Returns only the resampled non-majority rows: the rows whose target
    differs from the most frequent class are pooled and drawn with
    replacement (via np.random.randint) until there are as many of them as
    there are majority-class rows.

    Inputs:
        data: Pandas dataframe with (possibly unbalanced) outcomes
        target: (str) name of target/outcome variable
    Returns:
        Pandas dataframe of the resampled non-majority rows. An empty input
        is returned as a copy; if every row belongs to one class, an empty
        dataframe with the same columns is returned.
    '''
    # Work on the value_counts Series directly; reshaping it with
    # reset_index()/rename() depends on pandas-version-specific column names.
    class_counts = data[target].value_counts()
    if class_counts.empty:
        return data.copy()

    # idxmax() picks one label even when classes tie, and works for any dtype.
    majority_label = class_counts.idxmax()
    majority_count = int(class_counts.max())
    print("count of majority party")
    print(majority_count)

    minority_party = data[data[target] != majority_label]
    if minority_party.empty:
        # Nothing to resample from; np.random.randint(0, 0, ...) would raise.
        return minority_party.copy()

    positions = np.random.randint(0, len(minority_party), size=majority_count)
    return minority_party.iloc[positions]


In [None]:
def preprocess_data(start_year, end_year, label_var, text_var, split=False,
                    balance_outcomes=False):
    '''
    Run all data extraction & preprocessing steps, saving output to files in
    the shared drive's intermediate_data folder for future use.

    Files written (Y = "{start_year}-{end_year}"):
        tuples_Y.json: (label, text) pairs with the raw text
        split=False: data_Y.csv, or data_balanced_Y.csv when balancing
        split=True: train/validate/test_Y.csv, or
            train/validate/test_balanced_Y.csv when balancing
    The CSVs hold the label (integral values stored as float) and the
    lower-cased, punctuation-stripped text.

    Inputs:
        start_year, end_year: ints of starting year and ending year of requested
            court data
        label_var: (str) name of column containing outcome/label variable
        text_var: (str) name of column containing text variable
        split: (bool) whether to train-test-validate split the data
            (shuffled with a fixed seed, 60% / 20% / 20%)
        balance_outcomes: (bool) whether to balance the data on the label_var
            (each split is balanced separately when split is True)
    Returns:
        Pandas dataframe with columns label_var, text_var, 'docket',
            'docket_number', and 'year' (text is not lower-cased or stripped)
    '''
    oral_arguments_list = []
    for year in range(start_year, end_year+1):
        print("NOW GETTING YEAR", year)
        oral_arguments_year = get_scotus_oral_arguments(year)
        print("year", year, "has length", len(oral_arguments_year))
        oral_arguments_list += oral_arguments_year

    print("len(oral_arguments_list)")
    print(len(oral_arguments_list))
    merged_df = merge_oral_argument_text_with_outcome_labels(oral_arguments_list)
    print("merged_df.shape")
    print(merged_df.shape)
    # Make sure that there are no duplicate docket_numbers (that cases aren't
    # duplicated when merging). `.any()` is required here: `.all() == False`
    # only fails when every single row is a duplicate.
    assert not merged_df['docket_number'].duplicated().any(), \
        "duplicate docket_number values after merging with outcome labels"

    intermediate_folder = SHARED_DRIVE_BASE_FOLDER + "intermediate_data/"
    year_range = f"{start_year}-{end_year}"

    # Write tuple list to JSON for future use
    tuple_list = convert_merged_df_to_tuple_list(merged_df, label_var, text_var)
    with open(intermediate_folder + f"tuples_{year_range}.json", 'w') as f:
        json.dump(tuple_list, f)

    # .copy() so that later column assignments (here and by callers) act on
    # independent frames rather than on slices of merged_df.
    intermediate_df = merged_df[[label_var, text_var, 'docket', 'docket_number', 'year']].copy()
    final_df = merged_df[[label_var, text_var]].copy()

    # Labels go through int first and are then stored as float.
    final_df[label_var] = final_df[label_var].astype(int).astype(float)

    # Strip punctuation (https://stackoverflow.com/questions/265960/best-way-to-strip-punctuation-from-a-string)
    # The translation table is built once instead of once per row.
    punctuation_table = str.maketrans('', '', string.punctuation)
    final_df[text_var] = final_df[text_var].apply(
        lambda x: x.lower().translate(punctuation_table))

    if split:
        # https://stackoverflow.com/questions/38250710/how-to-split-data-into-3-sets-train-validation-and-test
        shuffled = final_df.sample(frac=1, random_state=42)
        train_end = int(.6*len(final_df))
        validate_end = int(.8*len(final_df))
        partitions = [("train", shuffled.iloc[:train_end]),
                      ("validate", shuffled.iloc[train_end:validate_end]),
                      ("test", shuffled.iloc[validate_end:])]
    else:
        partitions = [("data", final_df)]

    # Branch on the balance_outcomes flag; testing the name `balance` would
    # test the function object, which is always truthy.
    suffix = "_balanced" if balance_outcomes else ""
    for name, partition in partitions:
        output = balance(partition, label_var) if balance_outcomes else partition
        output.to_csv(intermediate_folder + f"{name}{suffix}_{year_range}.csv",
                      index=True)
    return intermediate_df

**Run pre-processing for years 2001-2019, outputting files to the shared drive's intermediate_data folder**

In [None]:
# Pre-process 2001-2019 as a single (unsplit) dataset with balancing requested,
# then display the returned dataframe.
test_multiple_years = preprocess_data(
    start_year=2001, end_year=2019,
    label_var=LABEL_VAR, text_var=TEXT_VAR,
    split=False, balance_outcomes=True,
)
test_multiple_years

NOW GETTING YEAR 2001
year already scraped, loading from json from scraped_data folder
year 2001 has length 84
NOW GETTING YEAR 2002
year already scraped, loading from json from scraped_data folder
year 2002 has length 85
NOW GETTING YEAR 2003
year already scraped, loading from json from scraped_data folder
year 2003 has length 81
NOW GETTING YEAR 2004
year already scraped, loading from json from scraped_data folder
year 2004 has length 80
NOW GETTING YEAR 2005
year already scraped, loading from json from scraped_data folder
year 2005 has length 89
NOW GETTING YEAR 2006
year already scraped, loading from json from scraped_data folder
year 2006 has length 77
NOW GETTING YEAR 2007
year already scraped, loading from json from scraped_data folder
year 2007 has length 75
NOW GETTING YEAR 2008
year already scraped, loading from json from scraped_data folder
year 2008 has length 83
NOW GETTING YEAR 2009
year already scraped, loading from json from scraped_data folder
year 2009 has length 87
N

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


count of majority party
873
1
873
sample.head()
      partyWinning                                               text
1031           0.0   well hear argument first this morning in case...
1283           0.0   well hear argument first this morning in case...
165            0.0   well hear argument now in no 02891 the centra...
917            0.0   we will hear argument first this morning in c...
857            0.0   well hear argument first this morning in case...
1.0    873
0.0    871
2.0      2
Name: partyWinning, dtype: int64


Unnamed: 0,partyWinning,text,docket,docket_number,year
0,0.0,We'll hear argument first this morning in Num...,00-507,00-507,2001
1,1.0,"We'll hear argument next in Number 00-1853, A...",00-1853,00-1853,2001
2,1.0,We'll hear argument now in Number oh oh ten e...,00-1089,00-1089,2001
3,1.0,We'll hear argument next in Number oh oh twel...,00-1250,00-1250,2001
4,1.0,"Mr. Chief Justice, and may it please the Cour...",00-927,00-927,2001
...,...,...,...,...,...
1317,1.0,We'll hear argument next in Case Number 19-26...,19-267,19-267,2019
1318,0.0,"We'll hear argument next in Case 19-631, Will...",19-631,19-631,2019
1319,1.0,We will hear argument first this morning in C...,19-431,19-431,2019
1320,0.0,We will hear argument first this morning in C...,19-465,19-465,2019


**Look at cases labeled as outcome = 2**

In [None]:
# Flag every row whose docket appears more than once (keep=False marks all
# occurrences), then show the cases whose outcome label is 2.
# .assign() builds a new frame instead of writing a column into the existing
# one, which avoids pandas' SettingWithCopyWarning when that frame is a slice
# of another dataframe.
test_multiple_years = test_multiple_years.assign(
    dups=test_multiple_years.duplicated(subset=['docket'], keep=False))
test_multiple_years[test_multiple_years['partyWinning'] == 2]

Unnamed: 0,partyWinning,text,docket,docket_number,year,dups
10,2.0,We'll hear argument first this morning in Num...,00-878,00-878,2001,False
1265,2.0,We'll hear argument first this morning in Cas...,18-280,18-280,2019,False
1279,2.0,"We'll hear argument next in Case 18-1165, the...",18-1165,18-1165,2019,False


**Tables for visualizations**

In [None]:
# Pre-process 2001-2019 as a single (unsplit) dataset without requesting
# balancing; the returned dataframe feeds the summary tables below.
df_for_viz = preprocess_data(
    start_year=2001, end_year=2019,
    label_var=LABEL_VAR, text_var=TEXT_VAR,
    split=False, balance_outcomes=False,
)

NOW GETTING YEAR 2001
year already scraped, loading from json from scraped_data folder
year 2001 has length 84
NOW GETTING YEAR 2002
year already scraped, loading from json from scraped_data folder
year 2002 has length 85
NOW GETTING YEAR 2003
year already scraped, loading from json from scraped_data folder
year 2003 has length 81
NOW GETTING YEAR 2004
year already scraped, loading from json from scraped_data folder
year 2004 has length 80
NOW GETTING YEAR 2005
year already scraped, loading from json from scraped_data folder
year 2005 has length 89
NOW GETTING YEAR 2006
year already scraped, loading from json from scraped_data folder
year 2006 has length 77
NOW GETTING YEAR 2007
year already scraped, loading from json from scraped_data folder
year 2007 has length 75
NOW GETTING YEAR 2008
year already scraped, loading from json from scraped_data folder
year 2008 has length 83
NOW GETTING YEAR 2009
year already scraped, loading from json from scraped_data folder
year 2009 has length 87
N

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


count of majority party
873
1
873
sample.head()
      partyWinning                                               text
336            0.0   well hear argument first this morning in mart...
436            0.0   well hear argument next in case 065306 bowles...
344            0.0   well hear argument next in clark versus arizo...
490            0.0   well hear argument first this morning in case...
1028           0.0   well hear argument first this morning in case...
1.0    873
0.0    866
2.0      7
Name: partyWinning, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [None]:
# Number of cases for each (year, outcome) combination; every remaining column
# shows the count of its non-null values within the group.
df_for_viz.groupby(by=['year', 'partyWinning']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,text,docket,docket_number
year,partyWinning,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2001,0.0,24,24,24
2001,1.0,52,52,52
2001,2.0,1,1,1
2002,0.0,26,26,26
2002,1.0,50,50,50
2003,0.0,20,20,20
2003,1.0,54,54,54
2004,0.0,23,23,23
2004,1.0,50,50,50
2005,0.0,25,25,25


In [None]:
# Number of dockets per year, as a one-column table labelled 'Docket Count'
counts_df = (
    df_for_viz.groupby(['year'])[['docket']]
    .count()
    .rename(columns={'docket': 'Docket Count'})
)
counts_df

Unnamed: 0_level_0,Docket Count
year,Unnamed: 1_level_1
2001,77
2002,76
2003,74
2004,73
2005,75
2006,71
2007,70
2008,77
2009,75
2010,77


In [None]:
# Number of dockets per outcome value, as a one-column table labelled 'Docket Count'
outcome_count_df = (
    df_for_viz.groupby(['partyWinning'])[['docket']]
    .count()
    .rename(columns={'docket': 'Docket Count'})
)
outcome_count_df

Unnamed: 0_level_0,Docket Count
partyWinning,Unnamed: 1_level_1
0.0,446
1.0,873
2.0,3
