In [1]:
import json
import os
import shlex
import traceback
from datetime import date

import pandas as pd
import requests

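# Rate limiting is currently disabled. To re-enable it, import `sleep_and_retry`
# and `limits` (neither is imported in this notebook) and uncomment both decorators.
# @sleep_and_retry
# @limits(calls=10, period=10)  # as written: 10 calls per period of 10, not a strict 1 call per second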
def get_http_json(url, timeout=30):
    """Fetch ``url`` with an HTTP GET and return the decoded JSON body.

    Args:
        url: Address to request.
        timeout: Seconds handed to ``requests.get`` as its ``timeout``
            argument. Without a timeout ``requests`` can wait indefinitely
            on a stalled connection, which would hang the whole download
            loop on a single bad request.

    Returns:
        Whatever ``response.json()`` decodes the response body to.

    Raises:
        Any exception raised by ``requests.get`` (including a timeout) or by
        ``response.json()`` when the body cannot be decoded.
    """
    print(f"Getting {url}")
    response = requests.get(url, timeout=timeout)
    # The HTTP status code is intentionally not checked here: callers inspect
    # the decoded payload themselves, and raising on an error status would turn
    # a best-effort crawl into a hard failure.
    return response.json()

def get_case(term, docket):
    """Fetch one case record and every oral-argument payload it links to.

    The case record is requested from
    ``https://api.oyez.org/cases/{term}/{docket}``. Each entry of its
    "oral_argument_audio" list is then followed through its "href".

    Returns:
        A pair ``(docket_data, transcripts)``. ``transcripts`` is an empty
        list when the record has no "oral_argument_audio" key or the value
        under that key is empty/falsy.
    """
    docket_data = get_http_json(f"https://api.oyez.org/cases/{term}/{docket}")

    # A missing key and an empty value are handled the same way: there is
    # nothing to follow, so report it and hand back the bare case record.
    if "oral_argument_audio" not in docket_data or not docket_data["oral_argument_audio"]:
        print(f"No oral arguments for docket {docket}")
        return (docket_data, [])

    # One request per linked oral argument, in the order the record lists them.
    transcripts = [
        get_http_json(audio_link["href"])
        for audio_link in docket_data["oral_argument_audio"]
    ]
    return docket_data, transcripts

def getAudio(transcripts):
    """Collect the audio links referenced by a list of oral-argument payloads.

    Args:
        transcripts: List of payload dicts; each one is read for its
            'media_file' entry, which is expected to be a list of dicts
            carrying an 'href'.

    Returns:
        A two-item list ``[num_files, audio_list]``: ``num_files`` is
        ``len(transcripts)`` and ``audio_list`` holds every 'href' found, in
        order. Payloads contribute nothing when 'media_file' is missing,
        ``None``, empty, or starts with ``None``.
    """
    audio_list = []
    for t in transcripts:
        media_dicts = t.get('media_file')
        # Previously an empty list raised IndexError and a None/missing value
        # raised TypeError/KeyError on the [0] lookup; treat all of them like
        # the already-handled "first entry is None" case.
        if not media_dicts or media_dicts[0] is None:
            continue
        # There should normally be a single entry, but the API wraps it in a
        # list, so walk all of them and skip any stray None.
        for media_dict in media_dicts:
            if media_dict is not None:
                audio_list.append(media_dict['href'])
    return [len(transcripts), audio_list]

#gets transcript along with metadata
def _speaker_name(turn):
    """Return the turn's speaker name, or '<UNK>' when it cannot be looked up."""
    try:
        return turn['speaker']['name']
    except (KeyError, TypeError):
        # Missing key, or 'speaker' is None / not a mapping.
        return '<UNK>'


def _speaker_roles(turn):
    """Return the list of role types for the turn's speaker, or ['Other']."""
    try:
        roles = turn['speaker']['roles']
        if not isinstance(roles, list):
            return ['Other']  # Other is most likely Lawyer
        return [role['type'] for role in roles]
    except (KeyError, TypeError):
        # Any unreadable speaker/role entry collapses the whole turn to 'Other'.
        return ['Other']


def getTranscript(transcripts):
    """Flatten oral-argument payloads into four parallel per-turn lists.

    Every payload is walked as ``payload['transcript']['sections']`` ->
    ``section['turns']``; each turn appends one element to each output list.

    Args:
        transcripts: List of payload dicts with the nesting described above.

    Returns:
        A tuple ``(transcript_list, speaker_list, speaker_type_list, time_list)``:
          * transcript_list - per turn, the list of ``text`` values of its
            text blocks;
          * speaker_list - per turn, the speaker name or '<UNK>';
          * speaker_type_list - per turn, a list of role types or ['Other'];
          * time_list - per turn, a list of ``(start, stop)`` pairs, one per
            text block.

    Raises:
        KeyError / TypeError: if the section/turn/text-block structure itself
            is missing. Only speaker lookups are tolerant; unlike the earlier
            bare ``except:`` they no longer swallow unrelated exceptions such
            as KeyboardInterrupt.
    """
    transcript_list = []
    speaker_list = []
    speaker_type_list = []
    time_list = []

    for t in transcripts:
        for section in t['transcript']['sections']:
            for turn in section['turns']:
                speaker_list.append(_speaker_name(turn))
                speaker_type_list.append(_speaker_roles(turn))

                # Text and timing come from the same blocks, kept index-aligned.
                blocks = turn['text_blocks']
                transcript_list.append([block['text'] for block in blocks])
                time_list.append([(block['start'], block['stop']) for block in blocks])

    return transcript_list, speaker_list, speaker_type_list, time_list


In [4]:
# Read the case index from case_summaries.json in the current working directory.
with open(os.path.join(os.getcwd(), 'case_summaries.json')) as f:
    data = json.load(f)

# Only the term and docket number are needed to build the per-case API URLs.
case_summaries = pd.DataFrame(data)[['term', 'docket_number']]

# Keep terms from '2013' up to (but excluding) '2019'. The bounds are string
# literals, so this is a lexicographic comparison on the 'term' column
# (assumes terms are stored as 4-digit year strings - TODO confirm).
case_summaries_filtered = case_summaries.loc[
    lambda frame: (frame['term'] >= '2013') & (frame['term'] < '2019')
]

In [5]:
# Display the (rows, columns) shape of the filtered case list.
case_summaries_filtered.shape

(440, 2)

In [6]:
# Map each docket number to the list of oral-argument payloads fetched for it.
# NOTE(review): entries are keyed by docket number alone, so two rows sharing a
# docket_number would overwrite each other. The recorded outputs (440 rows vs.
# 56 + 7 + 375 keys) suggest this may be happening - TODO confirm.
data = {}

for term, docket_number in zip(
    case_summaries_filtered['term'], case_summaries_filtered['docket_number']
):
    # The case record itself (docket_data) is fetched but not stored.
    docket_data, transcripts = get_case(term, docket_number)
    data[docket_number] = transcripts

Getting https://api.oyez.org/cases/2013/12-1038
Getting https://api.oyez.org/case_media/oral_argument_audio/22425
Getting https://api.oyez.org/cases/2013/12-682
Getting https://api.oyez.org/case_media/oral_argument_audio/22178
Getting https://api.oyez.org/cases/2013/12-515
Getting https://api.oyez.org/case_media/oral_argument_audio/21683
Getting https://api.oyez.org/cases/2013/12-1036
Getting https://api.oyez.org/case_media/oral_argument_audio/21916
Getting https://api.oyez.org/cases/2013/12-138
Getting https://api.oyez.org/case_media/oral_argument_audio/22866
Getting https://api.oyez.org/cases/2013/13-354
Getting https://api.oyez.org/case_media/oral_argument_audio/23268
Getting https://api.oyez.org/cases/2013/12-1217
No oral arguments for docket 12-1217
Getting https://api.oyez.org/cases/2013/11-965
Getting https://api.oyez.org/case_media/oral_argument_audio/21578
Getting https://api.oyez.org/cases/2013/12-9490
Getting https://api.oyez.org/case_media/oral_argument_audio/22239
Getting 

Getting https://api.oyez.org/cases/2014/13-1211
Getting https://api.oyez.org/case_media/oral_argument_audio/23257
Getting https://api.oyez.org/cases/2014/13-1314
Getting https://api.oyez.org/case_media/oral_argument_audio/23270
Getting https://api.oyez.org/cases/2014/13-1318
No oral arguments for docket 13-1318
Getting https://api.oyez.org/cases/2014/13-1333
Getting https://api.oyez.org/case_media/oral_argument_audio/23313
Getting https://api.oyez.org/cases/2014/13-1352
Getting https://api.oyez.org/case_media/oral_argument_audio/23279
Getting https://api.oyez.org/cases/2014/13-1371
Getting https://api.oyez.org/case_media/oral_argument_audio/23285
Getting https://api.oyez.org/cases/2014/13-1402
Getting https://api.oyez.org/case_media/oral_argument_audio/23215
Getting https://api.oyez.org/cases/2014/13-1412
Getting https://api.oyez.org/case_media/oral_argument_audio/23735
Getting https://api.oyez.org/cases/2014/13-1428
Getting https://api.oyez.org/case_media/oral_argument_audio/23203
Get

Getting https://api.oyez.org/cases/2013/12-5196
Getting https://api.oyez.org/case_media/oral_argument_audio/23696
Getting https://api.oyez.org/cases/2013/12-761
Getting https://api.oyez.org/case_media/oral_argument_audio/23517
Getting https://api.oyez.org/cases/2013/12-786
Getting https://api.oyez.org/case_media/oral_argument_audio/23538
Getting https://api.oyez.org/cases/2013/12-842
Getting https://api.oyez.org/case_media/oral_argument_audio/23520
Getting https://api.oyez.org/cases/2013/12-9012
Getting https://api.oyez.org/case_media/oral_argument_audio/23628
Getting https://api.oyez.org/cases/2013/13-132
Getting https://api.oyez.org/case_media/oral_argument_audio/23523
Getting https://api.oyez.org/case_media/oral_argument_audio/23588
Getting https://api.oyez.org/cases/2013/13-193
Getting https://api.oyez.org/case_media/oral_argument_audio/23536
Getting https://api.oyez.org/cases/2013/13-298
Getting https://api.oyez.org/case_media/oral_argument_audio/23529
Getting https://api.oyez.org

Getting https://api.oyez.org/cases/2015/15-375
Getting https://api.oyez.org/case_media/oral_argument_audio/24137
Getting https://api.oyez.org/cases/2015/15-458
Getting https://api.oyez.org/case_media/oral_argument_audio/24138
Getting https://api.oyez.org/cases/2015/15-674
Getting https://api.oyez.org/case_media/oral_argument_audio/24134
Getting https://api.oyez.org/cases/2016/15-457
Getting https://api.oyez.org/case_media/oral_argument_audio/24536
Getting https://api.oyez.org/cases/2015/15-415
Getting https://api.oyez.org/case_media/oral_argument_audio/24131
Getting https://api.oyez.org/cases/2015/15-446
Getting https://api.oyez.org/case_media/oral_argument_audio/24136
Getting https://api.oyez.org/cases/2015/15-474
Getting https://api.oyez.org/case_media/oral_argument_audio/24139
Getting https://api.oyez.org/cases/2016/15-577
Getting https://api.oyez.org/case_media/oral_argument_audio/24273
Getting https://api.oyez.org/cases/2015/15-6092
Getting https://api.oyez.org/case_media/oral_arg

Getting https://api.oyez.org/cases/2016/16-309
Getting https://api.oyez.org/case_media/oral_argument_audio/24284
Getting https://api.oyez.org/cases/2017/16-285
Getting https://api.oyez.org/case_media/oral_argument_audio/24571
Getting https://api.oyez.org/cases/2016/16-5294
Getting https://api.oyez.org/case_media/oral_argument_audio/24281
Getting https://api.oyez.org/cases/2016/16-405
Getting https://api.oyez.org/case_media/oral_argument_audio/24280
Getting https://api.oyez.org/cases/2016/16-6219
Getting https://api.oyez.org/case_media/oral_argument_audio/24283
Getting https://api.oyez.org/cases/2017/15-1458
No oral arguments for docket 15-1458
Getting https://api.oyez.org/cases/2016/16-399
Getting https://api.oyez.org/case_media/oral_argument_audio/24278
Getting https://api.oyez.org/cases/2016/16-349
Getting https://api.oyez.org/case_media/oral_argument_audio/24275
Getting https://api.oyez.org/cases/2016/16-373
Getting https://api.oyez.org/case_media/oral_argument_audio/24279
Getting h

Getting https://api.oyez.org/cases/2017/17-530
Getting https://api.oyez.org/case_media/oral_argument_audio/24611
Getting https://api.oyez.org/cases/2018/17-7505
Getting https://api.oyez.org/case_media/oral_argument_audio/24616
Getting https://api.oyez.org/cases/2018/18-281
Getting https://api.oyez.org/case_media/oral_argument_audio/24808
Getting https://api.oyez.org/cases/2018/17-9572
Getting https://api.oyez.org/case_media/oral_argument_audio/24811
Getting https://api.oyez.org/cases/2018/17-6086
Getting https://api.oyez.org/case_media/oral_argument_audio/24617
Getting https://api.oyez.org/cases/2018/18-481
Getting https://api.oyez.org/case_media/oral_argument_audio/24832
Getting https://api.oyez.org/cases/2018/17-773
Getting https://api.oyez.org/case_media/oral_argument_audio/24639
Getting https://api.oyez.org/cases/2018/17-1184
Getting https://api.oyez.org/case_media/oral_argument_audio/24783
Getting https://api.oyez.org/cases/2018/18-15
Getting https://api.oyez.org/case_media/oral_a

In [7]:
# docket -> single audio link, for dockets with a transcript and exactly one file
audio_data = {}

dock1 = []  # dockets with no payloads, or whose first payload has no dict 'transcript'
dock2 = []  # dockets that do not resolve to exactly one usable audio link
for docket, transcript in data.items():
    # `transcript` is the list of payloads stored for this docket.
    if not transcript or not isinstance(transcript[0]['transcript'], dict):
        dock1.append(docket)
        continue

    # Call getAudio once and reuse the result instead of recomputing it.
    num_files, audio_links = getAudio(transcript)
    if num_files == 1 and audio_links:
        audio_data[docket] = audio_links[0]  # s3 link
    else:
        # Also covers one payload without any link, which previously raised
        # IndexError on the [0] lookup.
        dock2.append(docket)


mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3
mp3


In [8]:
# Number of dockets skipped because no usable transcript payload was found.
len(dock1)

56

In [9]:
# Number of dockets set aside because getAudio did not report exactly one file.
len(dock2)

7

In [10]:
# Number of dockets that ended up with a single audio link.
len(audio_data)

375

## Saving

In [11]:
# Write a bash script (with SLURM #SBATCH headers) that downloads every audio
# link with curl, one command per docket, saving each as <docket>.mp3.
L = ["#!/bin/bash \n",
     "#SBATCH --nodes=1 \n",
     "#SBATCH --ntasks-per-node=1 \n",
     "#SBATCH --cpus-per-task=1 \n",
     "#SBATCH --time=5:00:00 \n",
     "#SBATCH --mem=2GB \n",
     "#SBATCH --job-name=get_oyez_mp3s \n",
     "\n"]

# `with` guarantees the file is closed even if a write fails; newline="\n"
# keeps Unix line endings regardless of the platform running the notebook,
# since the script is meant to be executed by bash.
with open("mp3_curl_cmds.sh", "w", newline="\n") as file1:
    file1.writelines(L)

    for docket, s3_link in audio_data.items():
        # The link and file name come from a remote API; quote them so that
        # characters such as '&', '?' or spaces cannot break or alter the
        # generated shell command. Plain URLs and docket numbers are left as-is.
        source = shlex.quote(str(s3_link))
        target = shlex.quote(f"{docket}.mp3")
        file1.write(f'curl -L {source} -o {target} \n')

print("mp3_curl_cmds.sh created.")

mp3_curl_cmds.sh created.


In [12]:
# docket -> (texts, speakers, speaker role types, timings) as returned by getTranscript
mp3_meta_data = {}

# audio_data only contains dockets that passed the earlier filtering step:
#   1. the first payload carries a dict under 'transcript'
#   2. getAudio reported exactly one file for the docket
for docket in audio_data:
    print(docket)
    transcript_list, speaker_list, speaker_type_list, time_list = getTranscript(data[docket])
    mp3_meta_data[docket] = (transcript_list, speaker_list, speaker_type_list, time_list)

with open('oyez_metadata.json', 'w+') as f:
    # No indent is given, so the whole document is written on a single line;
    # pass indent=4 to json.dump for a pretty-printed file instead.
    json.dump(mp3_meta_data, f)

print("oyez_metadata.json file created.")

12-1038
12-682
12-515
12-1036
12-138
13-354
11-965
12-9490
12-7822
12-574
12-1408
12-1493
12-751
12-609
13-316
12-1168
12-815
12-1184
12-873
11-681
12-1315
12-315
13-115
12-414
12-1128
12-929
12-872
12-794
12-79
12-1281
12-1200
12-1117
12-562
12-99
12-464
12-696
12-536
12-1163
12-1371
12-1173
12-895
13-317
12-1146
12-3
12-1182
12-820
12-8561
12-7515
13-483
12-10882
12-462
12-417
12-729
12-930
12-158
12-1226
12-1497
13-1010
13-1019
13-1032
13-1034
13-1041
13-1074
13-1080
13-1174
13-1175
13-1211
13-1314
13-1333
13-1352
13-1371
13-1402
13-1428
13-1487
13-1499
13-271
13-352
13-433
13-435
13-485
13-502
13-517
13-550
13-553
13-604
13-628
13-6827
13-684
13-719
13-7451
13-854
13-894
13-895
13-9026
13-935
13-975
13-983
13-9972
14-103
14-114
14-144
14-15
14-185
14-6368
14-7955
14-86
14-7505
14-280
14-462
14-723
13-1339
14-613
14-857
14-940
12-5196
12-761
12-786
12-842
12-9012
13-193
13-298
13-339
13-369
13-461
14-419
14-990
13-1496
14-844
14-840
14-981
14-8349
14-181
14-915
14-232
14-916
14-8358