In [139]:
####################################
# Author: James Labadorf
# Github: jlabadorf
# Email: jslabad+github@gmail.com
# License: CC-BY-2.0 (https://creativecommons.org/licenses/by/2.0/)
# Datasource: Free Law Project, Bulk Opinions (https://www.courtlistener.com/api/bulk-data/opinions/scotus.tar.gz)
# Description: This code takes the data downloaded from the datasources and combines them into a single database table.
##################################


import pandas as pd
import os 
import json
from bs4 import BeautifulSoup

def get_text(html):
    """Extract the plain text from an HTML document.

    Sometimes the plain text of an opinion is not available but the HTML is.
    This parses the HTML and concatenates every text node in document order.

    Parameters
    ----------
    html : str or None
        HTML markup to parse.

    Returns
    -------
    str or None
        The concatenated text nodes, or None when `html` is None or the
        markup could not be parsed.
    """
    if html is None:
        return None
    try:
        soup = BeautifulSoup(html, 'html.parser')
        # `string=True` selects every text node; it is the current name of
        # the deprecated `text=True` argument.
        return "".join(soup.find_all(string=True))
    except Exception:
        # Best effort: a record with unparseable HTML simply gets no text.
        # `Exception` (not a bare except) lets Ctrl-C still interrupt the run.
        return None


import requests as r

def get_author(uri, timeout=10):
    """Resolve an author URI into human-readable information.

    The author field of an opinion is returned as a URI. This function
    requests that URI and reads the `slug` and `id` fields of the JSON reply.

    Parameters
    ----------
    uri : str or None
        URI of the author resource.
    timeout : float, optional
        Seconds to wait for the server before giving up, so that a stalled
        connection cannot hang the notebook indefinitely.

    Returns
    -------
    tuple
        `(slug, person_id)` on success, `(None, None)` when `uri` is empty
        or the lookup fails for any reason.
    """
    if not uri:
        # Nothing to look up; avoid issuing a request that can only fail.
        return (None, None)
    try:
        response = r.get(uri, timeout=timeout)
        response.raise_for_status()  # treat HTTP error replies as failures
        auth = json.loads(response.text)
        return (auth["slug"], auth["id"])
    except Exception:
        # Best effort: network errors, bad JSON and missing keys all map to
        # "author unknown" rather than aborting the whole import.
        return (None, None)


#First, I need to get the file paths for all the cases

###################### Get Files #######################
### Code block source: https://pythonguides.com/python-get-all-files-in-directory/
# Directory, inside the current working directory, that is searched for case files.
# os.path.join inserts the separator of the running OS; a hard-coded "\\"
# only works on Windows and yields an empty file list everywhere else.
path = os.path.join(os.getcwd(), "all_cases")

# Full path of every file found anywhere below `path`.
list_of_files = [
    os.path.join(root, file)
    for root, dirs, files in os.walk(path)
    for file in files
]
#########################################################

#Secondly, I need to convert the jsons to a format I can utilize.

data = []           # one row (list of 15 values) per successfully parsed opinion
skipped_files = []  # (path, reason) for every file that could not be converted

############## Open the JSONs ###########################
for file in list_of_files:
    try:
        # Read as UTF-8 explicitly instead of relying on the platform default.
        with open(file, encoding="utf-8") as opinion:
            o_dict = json.load(opinion)
    except (OSError, ValueError) as err:
        # ValueError covers both malformed JSON and undecodable bytes.
        skipped_files.append((file, repr(err)))
        continue

#########################################################
#################### Extract data  ######################
    try:
        author_uri = o_dict["author"]
        # NOTE(review): the URI presumably ends in "/<person id>/" - confirm.
        # Taking the last path segment works for ids of any length, whereas a
        # fixed [-5:-1] slice is only right for 4-digit ids.
        if isinstance(author_uri, str):
            author_id = author_uri.rstrip("/").rsplit("/", 1)[-1]
        else:
            author_id = None

        # The get_author() lookup is disabled so as not to overwhelm the
        # Free Law API. The column is kept (empty) so the row still has the
        # 15 values the export cell expects.
        author = None

        html = o_dict["html"]
        plain_text = o_dict["plain_text"]
        if plain_text == '':
            # Fall back to the text extracted from the HTML version.
            plain_text = get_text(html)

        row = [
            o_dict["id"],
            o_dict["resource_uri"],
            author_id,
            author_uri,
            author,
            o_dict["joined_by"],
            o_dict["per_curiam"],
            o_dict["type"],
            o_dict["sha1"],
            o_dict["page_count"],
            o_dict["download_url"],
            o_dict["date_created"],
            o_dict["date_modified"],
            plain_text,
            html,
        ]
    except (KeyError, TypeError) as err:
        # The file parsed but does not have the expected opinion layout.
        skipped_files.append((file, repr(err)))
        continue

    data.append(row)

# Report skipped files instead of dropping them silently.
if skipped_files:
    print("Skipped %d of %d files; see skipped_files for details."
          % (len(skipped_files), len(list_of_files)))

###########################################################



In [138]:
# Lastly, we need to package the data and send it to the database

import sqlite3

# Column names, in the same order as the values of each row in `data`.
columns = ["id","resource_uri","author_id","author_uri","author","joined_by","per_curiam","type","sha1","page_count","download_url","date_created","date_modified","plain_text","html_text"]
df = pd.DataFrame(data, columns=columns)

# Store every value as its string form before writing to SQLite.
# DataFrame.map is the pandas >= 2.1 name of the deprecated applymap;
# fall back to applymap on older installations.
if hasattr(pd.DataFrame, "map"):
    df = df.map(str)
else:
    df = df.applymap(str)

### Code block source: https://www.geeksforgeeks.org/python-sqlite-creating-a-new-database/

# filename to form database
file = "Sqlite3.db"

try:
    conn = sqlite3.connect(file)
except sqlite3.Error:
    print("Database Sqlite3.db not formed.")
    # Re-raise: without a connection the export below cannot run, and
    # continuing would only fail later with an unrelated NameError.
    raise
print("Database Sqlite3.db formed.")
#### end code block #####

try:
    # Replaces any existing "opinion_table"; the DataFrame index is written too.
    df.to_sql(name="opinion_table", con=conn, if_exists="replace")
finally:
    # Always release the database file handle, even if the export fails.
    conn.close()

Database Sqlite3.db formed.
