# Full script for parsing the MODS files

The first cell below defines the parser class and its helper methods, which the later cells use.


## the parser

In [None]:
import re
import json
import os
import csv

"""
The class to extract all the useful fields from the XML file.
Method "extractData" gets the info from the XML file.
Method "prettyPrint" prints it.
Method "saveData" saves to a file.
"""
class extractData:
    
    def __init__(self):
        self.data = {}
        self.preamble = True
        self.congCommittee = False
        self.congMember = False
        self.location = False

        self.openingTags = ["<type>","<chamber>","<session>","<heldDate>","<searchTitle>","<witness>"]
        self.closingTags = ["</type>","</chamber>","</session>","</heldDate>","</searchTitle>","</witness>"]
        self.labels = ["hearingType","chamber","session","hearingDate","title","witnesses"]
        self.prefixes = {"Mr.", "Ms.", "Mrs.", "Dr.", "Prof."}
        self.congMemberWords = ["authorityId=", "chamber=", "congress=", "party=", "role=", "state="]
        self.translate = {"authorityId=":"id", "chamber=":"chamber", "congress=": "memberSession", "party=":"party","role=":"role","state=":"stateAbbr"}    

    
# 2 useful functions to return integer or string.
  
    def makeInt(self,word):
        s = ""
        for letter in word:
              if letter in "0123456789": 
                    s += letter
        return int(s)    

    def onlyLetters(self,word):
        s = ""
        for letter in word:      
            if letter.lower() in "abcdefghijklmnopqrstuvwxyz":       
                s += letter
        return s

    def findIndex(self,word,lst):
        indices = [e for e,x in enumerate(lst) if word in x]
        idx = -1
        if len(indices) > 0: 
            idx = indices[0]
    
        return idx    

    def lettersNumbers(self,text):
        alp = "abcdefghijklmnopqrstuvwxyz0123456789"    
        s = ""
        for letter in text:
            if letter.lower() in alp:
                s += letter
                
        return s    
    
    def extract(self,fn):
        f = open(fn)
        line = f.readline().strip()  
        while len(line) > 0:
            if "<congCommittee" in line:
                self.preamble = False
                self.congCommittee = True
            
            if "</congCommittee>" in line:
                self.congCommittee = False
    
            if "<congMember" in line:
                self.congMember = True
  
            if "<location>" in line:
                self.location = True

            if "</location>" in line:
                self.location = False
            
            if self.location:
                if '<url access="object in context"' in line:                
                    command = "(%s)(.*)(%s)"%('<url access="object in context" displayLabel="Content Detail">','</url>')      
                    check = re.match(command, line)    
                    if check:
                        self.data["url"] = check.group(2)
                                    
            if self.congMember:  
                if "<congMember" in line:
                    tempCM = {}                                            
                    A = line.split()
                    for word in self.congMemberWords:
                        if word in line:
                            idx = self.findIndex(word,A)
                            if idx > -1:                                
                                tempCM[self.translate[word]] = self.onlyLetters(A[idx].split('=')[1])
                                if word in ["authorityId=","congress="]:
                                    tempCM[self.translate[word]] = self.makeInt(A[idx].split('=')[1])

                
                else:
                    command = "(%s)(.*)(%s)"%('<name type="parsed">','</name>')      
                    check = re.match(command, line)    
                    if check:
                        temp = check.group(2).split("of")
                        tempCM["state"] = temp[-1].strip()
                        tempCM["fullName"] = temp[0].strip()

                    command = "(%s)(.*)(%s)"%('<name type="authority-fnf">','</name>')      
                    check = re.match(command, line)    
                    if check:
                        tempCM["firstName"] = self.onlyLetters(check.group(2).split()[0])

                    command = "(%s)(.*)(%s)"%('<name type="authority-lnf">','</name>')      
                    check = re.match(command, line)    
                    if check:
                        tempCM["lastName"] = self.onlyLetters(check.group(2).split()[0])
                
                if "</congMember>" in line:
                    if "congressMembers" not in self.data: self.data["congressMembers"] = [tempCM]
                    else: self.data["congressMembers"].append(tempCM)  
        
                    tempCM = {}  
                    congMember = False


            if self.congCommittee:
                
                
                if "<congCommittee authorityId=" in line:
                    A = line.split()
                    idx = self.findIndex("authorityId",A)
                    if idx > -1:
                      self.data["committeeID"] = self.lettersNumbers(A[idx].split('=')[1])
                    
                    # yummy
                    idx = self.findIndex("congress",A)
                    if idx > -1:
                      self.data["hearingSessionID"] = self.makeInt(A[idx].split('=')[1])


                command = "(%s)(.*)(%s)"%('<name type="authority-standard">','</name>')
                check = re.match(command, line)    
                if check:
                    result = check.group(2)
                    self.data["committeeName"] = result
                    self.data["committeeType"] = result.split()[0]


            if self.preamble: 
                for openingTag,closingTag,label in zip(self.openingTags,self.closingTags,self.labels):    
                    command = "(%s)(.*)(%s)"%(openingTag,closingTag)    
                    check = re.match(command, line)    
                    if check:
                        result = check.group(2)
                        if label == "session": result = int(result)
                        if label in "witnesses":
                            temp = {}
                            A = result.split(',')
                            if len(A) > 0:
                              B = A[0].split()
                              if len(B) > 0:
                                if B[0] in self.prefixes: temp["honorific"] = B[0]
                                if len(B) > 1:  
                                  temp["firstName"] = B[1]
                                  temp["lastName"] = B[-1]

                            if len(A) > 1: temp["title"] = A[1]
                            if len(A) > 2: temp["organization"] = A[2]

                            if label not in self.data: self.data[label] = [temp]
                            else: self.data[label].append(temp)  

                        else: 
                            self.data[label] = result
           
            line = f.readline().strip()          
    
        f.close()
        return self.data


    def prettyPrint(self,theData):
        
        if "url" in theData:
            print("Url: %s"%theData["url"])
            
        if "committeeID" in theData:
            print("Committee ID: %s"% theData["committeeID"])

        if "hearingSessionID" in theData:
            print("Congress Session per hearing: %s"%theData["hearingSessionID"])

        if "hearingType" in theData:
            print("Hearing type: %s"%theData["hearingType"])

        if "chamber" in theData:    
            print("Chamber: %s"%theData["chamber"])

        if "session" in theData:    
            print("Session: %d"%theData["session"])

        if "hearingDate" in theData:    
            print("Hearing Date: %s"%theData["hearingDate"])

        if "title" in theData:    
            print("Title: %s"%theData["title"])

        print("\n"[:-1])

        if "witnesses" in theData:
            print("Witnesses:")
            print("\n"[:-1])
            for witness in theData["witnesses"]:
                if "honorific" in witness:
                    print("  Honorific: %s"%witness["honorific"])
                if "firstName" in witness:  
                    print("  First name: %s"%witness["firstName"])
                if "lastName" in witness:  
                    print("  Last name: %s"%witness["lastName"]  )
                if "title" in witness:  
                    print("  Title: %s"%witness["title"]    )
                if "organization" in witness:  
                    print("  Organization: %s"%witness["organization"]    )
                print("\n"[:-1])

        if "committeeType" in theData:
            print("Committee type: %s"%theData["committeeType"])
        
        if "committeeName" in theData:  
            print("Committee name: %s"%theData["committeeName"])

        print("\n"[:-1])

        if "congressMembers" in theData:
            print("Congress Members:")
            print("\n"[:-1])

            for congMember in theData["congressMembers"]:
                if "id" in congMember:
                    print("  ID: %d"%congMember["id"])
                if "memberSession" in congMember:
                    print("  Congress session per member: %d"%congMember["memberSession"])
                if "chamber" in congMember:  
                    print("  Chamber: %s"%congMember["chamber"])
                if "party" in congMember:  
                    print("  Party: %s"%congMember["party"]  )
                if "role" in congMember:  
                    print("  Role: %s"%congMember["role"]  )
                if "stateAbbr" in congMember:  
                    print("  State Abbr.: %s"%congMember["stateAbbr"])
                if "state" in congMember:  
                    print("  State: %s"%congMember["state"]  )
                if "fullName" in congMember:
                    print("  Full name: %s"%congMember["fullName"])
                if "firstName" in congMember:  
                    print("  First name: %s"%congMember["firstName"])
                if "lastName" in congMember:  
                    print("  Last name: %s"%congMember["lastName"])
                print("\n"[:-1])


    def saveData(self,outfn,data):
        f = open(outfn,"w")
        f.write(json.dumps(data))
        f.close()
       
    def saveTsv(self,outfn,data):    
        f = open(outfn, "w")
        keys = ['url', 'hearingSessionID', 'title',  'committeeType', 'committeeName', 'committeeID', 'hearingDate', 'chamber', 'session',  'hearingType']      

        cols = ""
        for x in keys: cols += x + '\t'
        cols = cols[:-2]
        f.write(cols)
        f.write("\n")

        for d in data:
            for k in keys:
                if k in d: val = d[k]
                else: val = "N/A"  
                f.write('%s\t'%str(val))

            f.write("\n")  

        f.close()


## Loading and storing the data

In [None]:
# Set this folder to your working directory
os.chdir("your working directory")

# Folder where input files are stored.
inPath = "hearings"

# Extract data for all files
batch = True

# Name of the single file (inside inPath) to process when batch is False.
# Defined here so that switching batch off no longer fails with a NameError.
fileName = "your file name.xml"

if batch:
    # Sorted so that the processing order does not depend on the file system.
    files = sorted(each for each in os.listdir(inPath) if each.endswith('.xml'))
else:
    files = [fileName]

print("Number of files found:")
print(len(files))
  

In [None]:
# Get the data.
# Each file name in `files` is joined onto `inPath` and parsed with a fresh
# extractData instance; the resulting dicts are collected in `data`, in the
# same order as `files`.
data = []
for fileName in files:  
    fn = os.path.join(inPath,fileName)
    data.append(extractData().extract(fn))

In [None]:
"""
Save the data.
"""
# Folder where output files will be stored.
outPath = "json"

for f,d in zip(files,data):  
    outfn = os.path.join(outPath,f)
    outfn = outfn.split('.')[0] + ".json"
    print("Saving %s"%outfn)
    extractData().saveData(outfn,d)