## Joining Script
This script brings together data from different sources into one file
Set up the the working directory as FILE_PATH, all subsequent paths are relative to this directory.

In [0]:
import pandas as pd
import numpy as np
import os
import re

import settings
FILE_PATH = settings.FILE_PATH

### Import Congressional directory

In [0]:
directory = pd.read_csv(FILE_PATH + "Directory/All MoCs 105-114.csv",  delimiter=";")

### Generate Constituency Table
First, import state names

In [0]:
state_names = pd.read_csv(FILE_PATH + "help files/state names.csv", delimiter=";")
state_and_district = pd.DataFrame({"district": directory['district'], "state.abbreviation": directory['state.abbreviation']}).drop_duplicates()

constituency = pd.merge(state_names[["state.abbreviation", "state.name"]], state_and_district, left_on = ["state.abbreviation"], right_on = ["state.abbreviation"], how = "inner" )

territory_values = ["AS", "DC", "GU", "MP", "VI"]
territories = np.isin(constituency['state.abbreviation'], territory_values)
constituency['district'] = np.where(territories == True, "TERRITORY", constituency['district'])
constituency['district'] = np.where(constituency['district'] == 'nan',"STATE", constituency['district'])

constituency = constituency.sort_values("state.abbreviation")
constituency['constituency.id'] = range(1,len(constituency) + 1)
constituency.shape

(514, 4)

##### Merge Constituency with Directory

In [0]:
territory_values = ["AS", "DC", "GU", "MP", "VI"]
territories = np.isin(directory['state.abbreviation'], territory_values)
directory['district'] = np.where(territories == True, "TERRITORY", directory['district'])
directory['district'] = np.where(directory['district'] == 'nan',"STATE", directory['district'])

constituency["helper"] = constituency['state.abbreviation'].map(str) + "-" + constituency['district']
directory["helper"] = directory['state.abbreviation'].map(str) + "-" + directory['district']

directory = pd.merge(directory, constituency[['constituency.id', 'helper']], on=["helper"], right_index=True)
del directory["helper"]
directory.shape

(5311, 15)

### Generate zip codes table

In [0]:
def split_zip_codes(df):
    rows = []
    for index, row in df.iterrows():
        zips = row["zip"].split(",")
        for zp in zips:
            if len(zp) < 1 or zp == 'nan': continue
            rows.append([row["congress"], row['state.abbreviation'], row["district"], zp ])
    return pd.DataFrame(rows, columns = ['congress','state.abbreviation',"district", "zip"])   

In [0]:
zip_codes = pd.DataFrame(
    {
    "congress": directory['congress'],
    "state.abbreviation": directory['state.abbreviation'],
    "district": directory['district'],
    "zip": directory["zip"].astype(str)
    }
)

zip_codes.duplicated()
zip_codes = zip_codes.drop_duplicates()

territory_values = ["AS", "DC", "GU", "MP", "VI"]
territories = np.isin(zip_codes['state.abbreviation'], territory_values)
zip_codes['district'] = np.where(territories == True, "TERRITORY", zip_codes['district'])
zip_codes['district'] = np.where(zip_codes['district'] == 'nan',"STATE", zip_codes['district'])

#split up zip column
zip_codes = split_zip_codes(zip_codes)
zip_codes.shape

zip_codes.replace('nan', np.nan, inplace=True)
zip_codes.dropna(inplace=True)

#merge with constituency 
zip_codes['helper'] = zip_codes['state.abbreviation'].map(str) + "-" + zip_codes['district']
zip_codes = pd.merge(zip_codes, constituency[['constituency.id', 'helper']], on=["helper"], right_index=True)
zip_codes = zip_codes[["congress", "zip","constituency.id"]]
zip_codes.drop_duplicates()
print(zip_codes.shape)
zip_codes[0:5]    


(433264, 3)


Unnamed: 0,congress,zip,constituency.id
0,105,801,485
1,105,802,485
2,105,803,485
3,105,804,485
4,105,820,485


### Constituency Characteristics
This table consists of the population density per district/state

In [0]:
path = "external data sets/ACS - MY Congressional District/Congressional District size and population.csv"
constituency_characteristics = pd.read_csv(FILE_PATH + path,  delimiter=";")

constituency_characteristics["helper"] = constituency_characteristics['state.abbreviation'].map(str) + "-" + constituency_characteristics['district'].apply(lambda x: str(float(x)) if x != "TERRITORY" else x )
constituency_characteristics = pd.merge(constituency_characteristics, constituency[['constituency.id', 'helper']], on=["helper"], right_index=True)


#quantiles
all_congresses = np.unique(constituency_characteristics['congress'])
constituency_characteristics["density.quintile"] = np.nan
rows = []
for congress in all_congresses:
    subset = constituency_characteristics.loc[constituency_characteristics['congress'] == congress]
    quintiles = subset[['congress', 'population.per.sqm']].quantile([.2,.4,.6,.8])
    quintiles = quintiles["population.per.sqm"].tolist()

    constituency_characteristics.loc[(constituency_characteristics['congress'] == congress) & (constituency_characteristics['population.per.sqm'] <= quintiles[0]), 'density.quintile'] = 1
    constituency_characteristics.loc[(constituency_characteristics['congress'] == congress) & (constituency_characteristics['population.per.sqm'] > quintiles[0]) & (constituency_characteristics['population.per.sqm'] <= quintiles[1]), 'density.quintile'] = 2
    constituency_characteristics.loc[(constituency_characteristics['congress'] == congress) & (constituency_characteristics['population.per.sqm'] > quintiles[1]) & (constituency_characteristics['population.per.sqm'] <= quintiles[2]), 'density.quintile'] = 3
    constituency_characteristics.loc[(constituency_characteristics['congress'] == congress) & (constituency_characteristics['population.per.sqm'] > quintiles[2]) & (constituency_characteristics['population.per.sqm'] <= quintiles[3]), 'density.quintile'] = 4
    constituency_characteristics.loc[(constituency_characteristics['congress'] == congress) & (constituency_characteristics['population.per.sqm'] > quintiles[3]), 'density.quintile'] = 5

constituency_characteristics = constituency_characteristics[["constituency.id","congress","census.district.id","land.sqm","population.total","population.per.sqm","density.quintile"]]
print(constituency_characteristics.shape)
constituency_characteristics.head()

(1744, 7)


Unnamed: 0,constituency.id,congress,census.district.id,land.sqm,population.total,population.per.sqm,density.quintile
0,1,111,200,570640.95,713985,1.251198,1.0
1,1,112,200,570838.98,722718,1.266063,1.0
2,1,113,200,570640.95,735132,1.288257,1.0
3,1,114,200,570600.852,738432,1.294131,1.0
4,8,111,105,4466.522,719906,161.178205,2.0


### All hearings

##### Bag of names

In [0]:
bag_of_names = np.unique(directory["name.fnf"].str.lower().tolist()).tolist()
new_bag_of_names = []
for name in bag_of_names:
    new_bag_of_names.extend(name.split(" "))
bag_of_names = list(set(new_bag_of_names))
print(len(bag_of_names))

1474


#### Import json files and add them to the hearings dataframe

In [0]:
path = FILE_PATH + 'hearings & MODS/json'
files_ = []
for i, filename in enumerate(os.listdir(path)):
    if filename.endswith('.json') and not "_MODS" in filename:
        files_.append([filename, np.nan, np.nan])
all_hearings = pd.DataFrame(files_, columns = ["file.name", "members", "text"]).sort_values(by=['file.name']).reset_index()
all_hearings.index.names = ["file.id"]
del all_hearings['index']
print(all_hearings.shape)
print(all_hearings.head())

(1, 3)
                                                 file.name  members  text
file.id                                                                  
0        115th Congress (2017 - 2018)_Senate Hearings_C...      NaN   NaN


#### Initialize tables

In [0]:
hearing        = pd.DataFrame(columns = ["hearing.id", "committee.id", "subcommittee.id","hearing.title", "is.appropriation", "is.nomination", "date", "url", "file", "extent"])
committee      = pd.DataFrame(columns = ["committee.id", "subcommitee.id", "type", "committee.name", "subcommittee.name", "chamber", "congress.session", "committee.session", "help.id"])
related        = pd.DataFrame(columns = ["hearing.id", "related.item"])
bill           = pd.DataFrame(columns = ["hearing.id", "bill.type", "bill.number", "bill.congress"])
law            = pd.DataFrame(columns = ["hearing.id", "law.number", "law.congress"])
us_code        = pd.DataFrame(columns = ["hearing.id", "code"])
person         = pd.DataFrame(columns = ["person.id", "full.name", "first.name", "middle.name", "surname","honorific","gpo.id", "bio.guide.id"]).set_index("person.id")
attendance     = pd.DataFrame(columns = ["hearing.id", "person.id", "role"])
congressmember = pd.DataFrame(columns = ["person.id", "party", "chamber"])
witness        = pd.DataFrame(columns = ["person.id", "full.title", "job.title", "organization"])  
speech         = pd.DataFrame(columns = ["speech.id", "previous.speech.id", "subsequent.speech.id", "hearing.id", "statement.type", "conversation", "text"])  
speaker        = pd.DataFrame(columns = ["speech.id", "person.id", "surname"])

#### Loop through all the hearing files

In [0]:
person_id, speech_id, hearing_id = 0,0,0

for i, row in all_hearings.iterrows():

    filename = row['file.name']
    path =  FILE_PATH + 'hearings & MODS/json/' + filename
    path_to_mod = path.replace(".json", "_MODS.json")
    mods_raw = pd.read_json(path_to_mod)

    
    #COMMITTEE
    congcommitte = mods_raw['congcommittee'] if 'congcommittee' in mods_raw else None
    c_help_id = congcommitte['attrs'][0]['authorityid'] + "-" + congcommitte['attrs'][0]['congress']   
    committee_sub_name = None
    
    if len(congcommitte['values']) == 1:
        if  'subcommittee' in mods_raw and (len(mods_raw['subcommittee']['values']) == 1):
            committee_name, committee_sub_name, _ = congcommitte['values'][0].split("\n\n")
            committee_sub_name = committee_sub_name.strip().replace("\n","" )
            committee_name = committee_name.strip().replace("\n","" )
        else:
            committee_name = congcommitte['values'][0].strip().replace("\n","")     
    else: 
        committee_name = (congcommitte['values'][0]).strip().replace("\n","" )

    if not c_help_id in committee["help.id"].values:
        committee = committee.append(pd.DataFrame({ 
            "committee.id" : 1 if i == 0 else committee['committee.id'].max() + 1,
            "subcommitee.id": 1 if i == 0 else committee['subcommitee.id'].max() + 1,
            "type": congcommitte['attrs'][0]['type'], 
            "committee.name": committee_name, 
            "subcommittee.name": committee_sub_name,
            "chamber": mods_raw['chamber']['values'],
            "congress.session": congcommitte['attrs'][0]['congress'] ,
            "committee.session":mods_raw['session']['values'], 
            "help.id": c_help_id
        }), ignore_index=True)

        
    #HEARING
    hearing_id += 1 
    hearing = hearing.append(pd.DataFrame({ 
        "hearing.id": hearing_id,
        "committee.id": int(committee.loc[committee['help.id'] == c_help_id]["committee.id"]),
        "subcommittee.id": int(committee.loc[committee['help.id'] == c_help_id]["subcommitee.id"]),
        "hearing.title": mods_raw['title']['values'][0],
        "date": mods_raw['helddate']['values'][0],
        "is.appropriation": mods_raw['isappropriation']['values'],
        "is.nomination": mods_raw['isnomination']['values'],
        "file": filename,
        "extent": mods_raw['extent']['values'] if ('extent' in mods_raw) else None,
        "url": mods_raw['url']['values'][0],
    }), ignore_index=True)
    

    #RELATED ITEM
    if "relateditem" in mods_raw and len(mods_raw["relateditem"]["values"][0]) > 0:
        clean_related_item = [x.replace("\n", "") for x in mods_raw["relateditem"]["values"] if x != "\n\n"]
        related = related.append(pd.DataFrame({ 
            "hearing.id": hearing_id,
            "related.item": clean_related_item,
        }), ignore_index=True)
      
    
    #BILLS
    if 'bill' in mods_raw and len(mods_raw["bill"]["attrs"]) > 0:
        for j in range(len(mods_raw["bill"]["attrs"])):
            bill = bill.append(pd.DataFrame({ 
                "hearing.id": hearing_id,
                "bill.type": mods_raw["bill"]["attrs"][j]["type"],
                "bill.number": mods_raw["bill"]["attrs"][j]["number"],
                "bill.congress": [mods_raw["bill"]["attrs"][j]["congress"]],
            }), ignore_index=True)   
     
    
    #LAW
    if 'law' in mods_raw and len(mods_raw["law"]["attrs"]) > 0:
        for j in range(len(mods_raw["law"]["attrs"])):
            law = law.append(pd.DataFrame({ 
                "hearing.id": hearing_id,
                "law.number": mods_raw["law"]["attrs"][j]["number"],
                "law.congress": [mods_raw["law"]["attrs"][j]["congress"]],
            }), ignore_index=True)  
            

    #US CODE
    if "partnumber" in mods_raw and len(mods_raw["partnumber"]) > 0:
        us_code = us_code.append(pd.DataFrame({
            "hearing.id": hearing_id,
            "code": mods_raw["partnumber"]['values']
        }), ignore_index=True) 

    #CONGRESSMEMBER, PERSON, ATTENDANCE 
    
    if ('congmember' in mods_raw and len(mods_raw['congmember']) > 0):
        congmember = mods_raw['congmember']
        all_hearings.loc[(all_hearings["file.name"] == filename), "members"] = "GOT 'EM"
        
        persons = congmember['values']
        for p in range(len(persons)):
            person_id += 1
            person_lp = {"person.id": int(person_id)}
            congress_member_lp = {}
            attendance_lp = {}
            
            with_of = congmember['values'][p].replace("\n", "$")
            non_of = re.sub("Judge |Dr. |Mr. |Mrs. |Ms. |Honorable |Hon. |, MD|, Jr.| Jr.", "", with_of).split("of")[0]
            non_of = re.sub("\\$" , "", non_of)
            non_of = re.sub(",\\s*,",",", non_of)
            non_of = re.sub("^,+|,+$","", non_of)
            non_of = non_of.strip().lower()
            
            if (" Jr." in with_of): person_lp['honorific'] = "Jr."
            if ("Hon.|Honorable" in with_of): person_lp['honorific'] = "Hon."
            if ("Judge" in with_of): person_lp['honorific'] = "Judge"
            if (", MD" in with_of): person_lp['honorific'] = "MD"
            if ("Dr. " in with_of): person_lp['honorific'] = "Dr."
                
            person_lp["full.name"] = non_of
            person_lp["first.name"] = non_of.split(" ")[0]
            person_lp['surname'] = non_of.split(" ")[len(non_of.split(" ")) - 1]
            if len(non_of.split(" ")) > 2:
                person_lp["middle.name"] = non_of.replace(person_lp["first.name"], "").replace(person_lp["surname"], "").strip()

            #Directory match
            directory_match = directory.loc[
                (directory["name"].str.lower() == person_lp["full.name"].lower()) & 
                (directory["congress"].astype(int) == int(congcommitte['attrs'][0]['congress']))
            ]
            
            if directory_match.shape[0]:
                person_lp["bio.guide.id"] = directory_match["bio.guide.id"]
                person_lp["gpo.id"] = directory_match["gpo.id"]
                if not person_lp["gpo.id"]:
                    if len(congmember['attrs'][0]['authorityid']) > 0:
                        person_lp["gpo.id"] = congmember['attrs'][p]['authorityid']
                        
                congress_member_lp["party"] = directory_match["party"]
                congress_member_lp["chamber"] = directory_match["chamber"]
                congress_member_lp["constituency.id"] = directory_match["constituency.id"]
                    
            else:
                directory_match = directory.loc[
                    (directory["name.fnf"].str.lower() == person_lp["full.name"].lower()) & 
                    (directory["congress"].astype(int) == int(congcommitte['attrs'][0]['congress']))
                ]

                if directory_match.shape[0]:
                    person_lp["bio.guide.id"] = directory_match["bio.guide.id"]
                    person_lp["gpo.id"] = directory_match["gpo.id"]
                    if not person_lp["gpo.id"]:
                        if len(congmember['attrs'][p]['authorityid']) > 0:
                            person_lp["gpo.id"] = congmember['attrs'][0]['authorityid']
                        
                    congress_member_lp["party"] = directory_match["party"]
                    congress_member_lp["chamber"] = directory_match["chamber"]
                    congress_member_lp["constituency.id"] = directory_match["constituency.id"]
                else:
                    if "authorityid" in congmember['attrs'][p] and  len(congmember['attrs'][p]['authorityid']) > 0:
                        person_lp["gpo.id"] = congmember['attrs'][p]['authorityid']
                    if "chamber" in congmember['attrs'][p] and len(congmember['attrs'][p]['chamber']) > 0:
                        congress_member_lp["chamber"] =  congmember['attrs'][p]['chamber']
                    if "party" in congmember['attrs'][p] and len(congmember['attrs'][p]['party']) > 0:
                        congress_member_lp["party"] =  congmember['attrs'][p]['party']
                
            if len(congmember['attrs'][p]["role"]) > 0:
                attendance_lp["person.id"] = person_id
                attendance_lp["hearing.id"] = hearing_id
                attendance_lp["role"] = congmember["attrs"][p]["role"]
            congress_member_lp["person.id"] = person_id
            
            #PERSON, ATTENDANCE, CONGRESSMEMBER
            #Line 417-455 in the R code will be dealt with here 
            person = person.append(pd.DataFrame(person_lp, index=[0]), ignore_index=True)
            attendance = attendance.append(pd.DataFrame(attendance_lp, index=[0]), ignore_index=True)
            congressmember = congressmember.append(pd.DataFrame(congress_member_lp, index=[0]), ignore_index=True)

    else:
        all_hearings.loc[(all_hearings["file.name"] == filename), "members"] = "NONE"

    #WITNESS
    if "witness" in mods_raw and len(mods_raw["witness"]["values"]) > 0:
        print("There are witnesses")
        witness_lp = {}
        person_lp = {}
        persons = mods_raw["witness"]["values"]
        style = None

        for p in range(len(persons)):
            style = 1 if int(mods_raw['congcommittee']["attrs"][0]["congress"]) <= 107 else 3
            witness_lp["full.title"] = persons[p]
            non_hon = re.sub("Judge |Dr. |Mr. |Mrs. |Ms. |Honorable |Hon. |, MD|, Jr.| Jr.", "",witness_lp["full.title"] )
            non_hon = re.sub(",\\s*,",",", non_hon)
            non_hon = re.sub("^,+|,+$","", non_hon)
            
            if (" Jr." in witness_lp["full.title"]): person_lp['honorific'] = "Jr."
            if ("Hon.|Honorable" in witness_lp["full.title"]): person_lp['honorific'] = "Hon."
            if ("Judge" in witness_lp["full.title"]): person_lp['honorific'] = "Judge"
            if (", MD" in witness_lp["full.title"]): person_lp['honorific'] = "MD"
            if ("Dr. " in witness_lp["full.title"]): person_lp['honorific'] = "Dr."
                
            print(non_hon)
            style = spaces_1 = spaces_2 = names_1 = names_2 = None
            spaces_1 = len(non_hon.split(", ")[0]) - len(re.sub(",\\s*,",",", non_hon.split(", ")[0]))
            q = "".join(non_hon.split(", ")[0].split(" ")).lower()
            r = q #TODO figure this part out
            names_1 = len(q) - len(r)

            if len(non_hon.split(", ")[0]) > 1:
                spaces_2 = len(non_hon.split(", ")[1]) - len(re.sub(",\\s*,",",", non_hon.split(", ")[1]))
                names_2 = None
            else:
                style = 3
            
            if spaces_1 == 0 and spaces_2 < 2: style = 1
            if spaces_1 > 2 and  spaces_2 <= 2: style = 2
            if "social security administration" in non_hon.lower(): style = 2
            if spaces_1 in [1,2] and spaces_2 < 2:style = 3
            
            if style == 1:
                person_lp['surname'] = non_hon.split(", ")[0]
                person_lp["first.name"] = non_hon.split(", ")[1]
                
                if len(person_lp["first.name"].split(" ")) > 1:
                    person_lp["middle.name"] =  person_lp["first.name"].replace(person_lp["first.name"].split(" ")[0], "").strip()
                    person_lp["first.name"] = person_lp["first.name"].split(" ")[0]
                    person_lp["full.name"] = " ".join([person_lp["first.name"], person_lp["middle.name"], person_lp["surname"]])
                else:
                    person_lp["full.name"] = " ".join([person_lp["first.name"], person_lp["surname"]])
            
                if len(non_hon.split(", ")) == 3:
                    witness_lp["organization"] = non_hon.split(", ")[2]
                    
                if len(non_hon.split(", ")) == 4:
                    witness_lp["job.title"] = non_hon.split(", ")[2]
                    witness_lp["organization"] = non_hon.split(", ")[3]

                if len(witness_lp["full.title"].split(", ")) > 4:
                    witness_lp["job.title"] = witness_lp["full.title"].split(", ")[2]
                    l = len(non_hon.split(", "))
                    org="xyz123"
                    for spot in range(l):
                        org = org + ", " + non_hon.split(", ")[spot]
                    org = org.replace("xyz123, ", "")
                    witness_lp["organization"] = org

                print(person_lp)
                print("_" * 10)
                print(witness_lp)
                break;
            ## Unpacking style 2: organization, full name, title or honorific*, function*
            if style == 2:
                witness_lp["organization"] = non_hon.split(", ")[0]
                person_lp["full.name"] =  person_lp["full.name"].split(" ")[0]
                person_lp['first.name'] = non_hon.split(", ")[0]
                person_lp["surname"] = person_lp['first.name'].split(" ")[0] #Buggy
                
                if len(person_lp["ful;.name"].split(" ") > 2):
                    person_lp["middle.name"] =  person_lp["full.name"].replace(person_lp["first.name"].split(" ")[0], "").strip()
                    person_lp["middle.name"] = person_lp["middle.name"].replace(person_lp["surname"], "").strip()
                
                if len(non_hon.split(", ")) == 3:
                    witness_lp["organization"] = non_hon.split(", ")[2]
                    
                if len(non_hon.split(", ")) > 3:
                    l = len(non_hon.split(", "))
                    org="xyz123"
                    for spot in range(l):
                        org = org + ", " + witness_lp["full.title"].split(", ")[spot]
                    org = org.replace("xyz123, ", "")
                    witness["job.title"] = org
            
            #Unpacking style 3: fullname, function*, organization*, city*, country or territory*
            if style == 3:
                person_lp["full.name"] = non_hon.split(", ")[0]
                person_lp["first.name"] = person_lp["full.name"] #buggy
                person_lp["surname"] = person_lp["full.name"]
                
                if len(person_lp["full.name"].split(" ")) > 2:
                    person_lp["middle.name"] = person_lp["full.name"].replace(person_lp["first.name"], "")
                    person_lp["middle.name"] = person_lp["middle.name"].replace(person_lp["surname"], "").strip()
                    
                if len(non_hon.split(", ")) == 2:
                    witness_lp["organization"] = non_hon.split(", ")[1]
                
                if len(non_hon.split(", ")) == 3:
                    witness_lp["job.title"] = non_hon.split(", ")[1]
                    witness_lp["organization"] = non_hon.split(", ")[2]
                
                if len(non_hon.split(", ")) > 3:
                    witness_lp["job.title"] = non_hon.split(", ")[2]
                    l = len(non_hon.split(", "))
                    org="xyz123"
                    for spot in range(l):
                        org += org + ", " + non_hon.split(", ")[spot]
                    org = org.replace("xyz123, ", "")
                    witness_lp["organization"] = org
        
        person_lp["full.name"] = person_lp["full.name"].lower()
        person_lp["first.name"] = person_lp["first.name"].lower()
        person_lp["middle.name"] = person_lp["middle.name"].lower()
        person_lp["surname"] = person_lp["surname"].lower()
        person_lp["honorific"] = person_lp["honorific"].lower()
        
        witness_lp["organization"] = witness_lp["organization"].lower()
        witness_lp["job.title"] = witness_lp["job.title"].lower()
        witness_lp["full.title"] = witness_lp["full.title"].lower()
        
        attendance_lp = {}
        #check if person exists in Persons table
        person_match = person.loc[(person["full.name"] == person_lp["full.name"])]
        if person_match.shape[0]:
            witness_lp["person.id"] = person_match['id']
            attendance["person.id"] = person_match['id']
            attendance["hearing.id"] = hearing_id
            attendance = attendance.append(pd.DataFrame(attendance_lp, index=[0]), ignore_index=True)
        else:
            person_lp["person_id"] = person_id + 1
            person = person.append(pd.DataFrame(person_lp, index=[0]), ignore_index=True)
        witness = witness.append(pd.DataFrame(witness_lp, index=[0]), ignore_index=True)
            
            
    #SPEECHES, SPEAKERS
    speech_counter = 0
    hearing_raw = pd.read_json(path)
    if len(hearing_raw) > 0:
        all_hearings.loc[(all_hearings["file.name"] == filename), "text"] = "GOT SOME"
        speech_id += 1
        
        for con in range(len(hearing_raw)):
            for sp in range(len(hearing_raw[con])):
                if not hearing_raw[con][sp]: continue
                speech_lp = {}
                speech_lp["speech.id"] = speech_id
                speech_lp["previous.speech.id"] = speech_id - 1
                speech_lp["hearing.id"] = hearing_id
                speech_lp["statement.type"] = np.nan
                speech_lp["conversation"] = con
                speech_lp["text"] = hearing_raw[con][sp]["speech"]
                speech = speech.append(pd.DataFrame(speech_lp, index=[0]), ignore_index=True)
        
                attendance_lp = attendance.loc[(attendance["hearing.id"] == hearing_id)]
                attendance_lp = pd.merge(attendance_lp, person[['person.id', 'surname']], on=["person.id"], right_index=True)

                pids = np.unique(attendance_lp.loc[(attendance_lp["surname"].str.lower() == hearing_raw[con][sp]["surname"])]["person.id"]).tolist()
                pids = ";".join([str(x) for x in pids])
                if not pids: pids = -999
                if pids == ";": pids = -222
                    
                speaker_lp = {}
                speaker_lp["speech.id"] = speech_id
                speaker_lp["surname"] = hearing_raw[con][sp]["surname"]
                speaker_lp["person.id"] = pids
                speaker = speaker.append(pd.DataFrame(speaker_lp, index=[0]), ignore_index=True)
    else:
        all_hearings.loc[(all_hearings["file.name"] == filename), "text"] = "EMPTY"
    if i == 100: break

There are witnesses
Amparo, Alex, Assistant Administrator for Recovery, Office of Response and Recovery, Federal Emergency Management Agency, U.S. Department of Homeland Security
{'surname': 'Amparo', 'first.name': 'Alex', 'full.name': 'Alex Amparo'}
__________
{'full.title': 'Amparo, Alex, Assistant Administrator for Recovery, Office of Response and Recovery, Federal Emergency Management Agency, U.S. Department of Homeland Security', 'job.title': 'Assistant Administrator for Recovery', 'organization': 'Amparo, Alex, Assistant Administrator for Recovery, Office of Response and Recovery, Federal Emergency Management Agency, U.S. Department of Homeland Security'}


#### Storing the data

In [0]:
person.to_csv(FILE_PATH + "database/csv/person/person - the only file.csv")
committee.to_csv(FILE_PATH + "database/csv/committee/committee - the only file.csv")
constituency.to_csv(FILE_PATH + "database/csv/constituency/constituency - the only file.csv")
constituency_characteristics.to_csv(FILE_PATH + "database/csv/constituency_characteristics/constituency_characteristics - the only file.csv")
zip_codes.to_csv(FILE_PATH + "database/csv/zip_codes/zip_codes - the only file.csv")
congressmember.to_csv(FILE_PATH + "database/csv/congressmember/congressmember - the only file.csv")
witness.to_csv(FILE_PATH + "database/csv/witness/witness - the only file.csv")
speech.to_csv(FILE_PATH + "database/csv/speech/speech - the only file.csv")
speaker.to_csv(FILE_PATH + "database/csv/speaker/speaker - the only file.csv")
attendance.to_csv(FILE_PATH + "database/csv/attendance/attendance - the only file.csv")
bill.to_csv(FILE_PATH + "database/csv/bill/bill - the only file.csv")
law.to_csv(FILE_PATH + "database/csv/law/law - the only file.csv")
related.to_csv(FILE_PATH + "database/csv/related/related - the only file.csv")
us_code.to_csv(FILE_PATH + "database/csv/us_code/us_code - the only file.csv")