In [36]:
import csv
import json
import base64
import io
import re
import logging
import sys
from pypdf import *
from bs4 import BeautifulSoup
from pdfreader import PDFDocument, SimplePDFViewer


from urllib.request import urlopen

import os

# LegiScan API key used when building request URLs. The LEGISCAN_API_KEY
# environment variable takes precedence; the literal is only a fallback so
# existing runs keep working.
# NOTE(review): this key is stored in the notebook itself -- rotate it and
# remove the fallback once the environment variable is set everywhere.
key = os.environ.get("LEGISCAN_API_KEY", "55228d9d4f216ad5972abd1be3c4c2e3")

In [37]:
def get_bill_ids(filePath):
    """Read the first column of a CSV file.

    Args:
        filePath: Path to the CSV file.

    Returns:
        list[str]: The first cell of every non-empty row, in file order.
        The header row is not treated specially, so its first cell
        (e.g. ``'bill_id'``) is returned as element 0.
    """
    # newline='' is what the csv module expects from the file object, so
    # quoted fields containing line breaks are parsed correctly.
    with open(filePath, 'r', newline='') as f:
        # csv.reader yields [] for blank lines; skip them instead of
        # failing on row[0].
        return [row[0] for row in csv.reader(f) if row]

In [38]:
# First-column values of each state's bills.csv export, one list per state.
VA_bill_ids, PA_bill_ids, MN_bill_ids, CA_bill_ids = (
    get_bill_ids(csv_path)
    for csv_path in (
        "VA/2024-2024_Regular_Session/csv/bills.csv",
        "PA/2023-2024_Regular_Session/csv/bills.csv",
        "MN/2023-2024_93rd_Legislature/csv/bills.csv",
        "CA/2023-2024_Regular_Session/csv/bills.csv",
    )
)

In [39]:
# Preview the first ten entries; element 0 is the CSV header cell, not a bill id.
VA_bill_ids[:10]

['bill_id',
 '1785700',
 '1785447',
 '1785578',
 '1785663',
 '1785638',
 '1785434',
 '1785734',
 '1785841',
 '1785996']

In [40]:
def get_text_from_bill_id(bill_id, api_key=None):
    """Fetch the base64-encoded text of a bill from the LegiScan API.

    Two requests are made: ``getBill`` to list the bill's documents, then
    ``getBillText`` for the first document in that list.

    Args:
        bill_id: LegiScan bill id.
        api_key: LegiScan API key; defaults to the module-level ``key``.

    Returns:
        None if either request reports ``"status": "ERROR"`` (e.g. the bill
        id is not found), an empty string if the bill has no texts,
        otherwise the base64-encoded document as returned by the API.
    """
    if api_key is None:
        api_key = key

    def _fetch_json(url):
        # Close the HTTP response deterministically instead of leaking it.
        with urlopen(url) as response:
            return json.loads(response.read().decode())

    bill_info = _fetch_json(
        f"https://api.legiscan.com/?key={api_key}&op=getBill&id={bill_id}"
    )
    if bill_info["status"] == "ERROR":
        return None

    texts = bill_info["bill"]["texts"]
    if not texts:
        return ""

    doc_id = texts[0]["doc_id"]
    text_info = _fetch_json(
        f"https://api.legiscan.com/?key={api_key}&op=getBillText&id={doc_id}"
    )
    # The second request can fail independently of the first; without this
    # check the lookup below would raise KeyError on an error payload.
    if text_info.get("status") == "ERROR":
        return None
    return text_info["text"]["doc"]
    
def clean_text(text):
    """Flatten whitespace, then delete number runs and '- N -' sequences."""
    # Applied strictly in this order; each step works on the previous result.
    substitutions = (
        (r'\n', ' '),                 # newlines become spaces
        (r'\s+', ' '),                # whitespace runs collapse to one space
        (r'\b(\d+\s+)+\d+\b', ''),    # two or more space-separated integers
        (r'-\s*\d+\s*-', ''),         # sequences like '- 2 -'
    )
    for regex, replacement in substitutions:
        text = re.sub(regex, replacement, text)
    return text

In [41]:
# Spot-check the entry at index 1 (index 0 is presumably the header cell, as in the VA list).
PA_bill_ids[1]

'1724272'

In [42]:
# HTML text to plain text https://stackoverflow.com/questions/328356/extracting-text-from-html-file-using-python
# Scratch run of the conversion on one VA bill; `data` is reused by later cells.
data = get_text_from_bill_id(VA_bill_ids[2])
decoded = io.BytesIO(base64.b64decode(data))
soup = BeautifulSoup(decoded, 'html.parser')
# Remove <script> and <style> elements so their contents are not part of the text.
for script in soup(["script", "style"]):
    script.extract()

text = soup.get_text()
# Strip each line, split it on double spaces, and keep the non-empty pieces one per line.
lines = (line.strip() for line in text.splitlines())
chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
text = '\n'.join(chunk for chunk in chunks if chunk)
type(text)
# with open("HTML2TextTest.txt", "w") as f:
#     f.write(text)
 
def decodeHTMLTextToPlainText(data):
    """Turn a base64-encoded HTML document into cleaned plain text."""
    html_stream = io.BytesIO(base64.b64decode(data))
    parsed = BeautifulSoup(html_stream, 'html.parser')
    # Drop <script>/<style> elements so their contents never reach the text.
    for element in parsed(["script", "style"]):
        element.extract()

    pieces = []
    for raw_line in parsed.get_text().splitlines():
        # Within a stripped line, double spaces act as phrase separators.
        for phrase in raw_line.strip().split("  "):
            phrase = phrase.strip()
            if phrase:
                pieces.append(phrase)

    return clean_text('\n'.join(pieces))

# Convert the bill fetched above and save the result for manual inspection.
text = decodeHTMLTextToPlainText(data)
with open("HTML2TextTest.txt", "w") as f:
    f.write(text) 

In [43]:
#From the other guy
def html_decoder(base64_decoded):
    """Extract plain text from the HTML markup of a bill.

    Args:
        base64_decoded: HTML markup that has already been base64-decoded
            (this notebook passes the bytes returned by base64.b64decode).

    Returns:
        str: The document text, stripped, with blank or whitespace-only
        lines between text lines collapsed to a single newline.
    """
    # Name the parser explicitly, matching the other BeautifulSoup calls in
    # this notebook; otherwise BeautifulSoup guesses from whatever parsers
    # are installed, so output can differ between environments.
    bs = BeautifulSoup(base64_decoded, 'html.parser')

    for p in bs.find_all('p'):
        # Only paragraphs exposing a single string are trimmed in place.
        if p.string:
            p.string.replace_with(p.string.strip())

    # strip white space
    bill_text = re.sub(r'\n\s*\n', r'\n', bs.get_text().strip())

    return bill_text

# Run the alternative decoder on the same bill so its output can be compared with HTML2TextTest.txt.
testData = base64.b64decode(data)
bill_text = html_decoder(testData)
bill_text = clean_text(bill_text)
with open("HTML2TextTestNew.txt", "w") as f:
    f.write(bill_text)

In [44]:
# Fetch one MN bill and open it with pypdf's PdfReader; `data` is reused further down in this cell.
data = get_text_from_bill_id(MN_bill_ids[1])
decoded = base64.b64decode(data)
pdfBuffer = io.BytesIO(decoded)
reader = PdfReader(pdfBuffer)
# Page count of the fetched document.
print(len(reader.pages))
# with open("PAPDF2Text.txt", "w") as f:
#     for i in range(len(reader.pages)):
#         page = reader.pages[i]
#         f.write(page.extract_text())

def decodePDFTextToPlainText(data):
    """Extract cleaned text from a base64-encoded PDF document."""
    pdf = PdfReader(io.BytesIO(base64.b64decode(data)))
    # Page texts are concatenated back to back, in page order.
    combined = "".join(
        pdf.pages[index].extract_text() for index in range(len(pdf.pages))
    )
    return clean_text(combined)




# Convert the MN bill fetched above and save the result for manual inspection.
text = decodePDFTextToPlainText(data)
with open("MNPDF2Text.txt", "w") as f:
    f.write(text)

2


In [45]:
#From the other guy
def _extract_strings_per_page(p, viewer):
    """Render page ``p`` in ``viewer`` and return its strings joined by single spaces."""
    viewer.navigate(p)
    viewer.render()
    return " ".join(viewer.canvas.strings)

def pdf_decoder(base64_decoded):
    """Extract the text of a PDF bill with pdfreader.

    Args:
        base64_decoded: PDF content that has already been base64-decoded
            (this notebook passes the bytes returned by base64.b64decode).

    Returns:
        str: The strings of every page joined by spaces, with number
        sequences removed; '' if the document cannot be opened.
    """
    try:
        pdf_doc = PDFDocument(base64_decoded)
    except Exception as exc:
        # Catch Exception rather than everything, so KeyboardInterrupt and
        # SystemExit still propagate out of a long-running loop.
        logging.error('encountered error {}'.format(type(exc)))
        logging.warning('Returning empty string for bill content')

        return ''

    # Count the pages without building a throwaway list.
    all_pages = sum(1 for _ in pdf_doc.pages())

    viewer = SimplePDFViewer(base64_decoded)
    content = []

    # Pages are addressed starting from 1.
    for page in range(1, all_pages + 1):
        print(page)
        content.append(_extract_strings_per_page(page, viewer))

    bill_text = " ".join(content)

    # Remove sequences of numbers
    return remove_number_sequences(bill_text)

def remove_number_sequences(text):
    """Delete every run of two or more whitespace-separated integers from ``text``."""
    number_run = re.compile(r'\b(\d+\s+)+\d+\b')
    return number_run.sub('', text)

# Run the pdfreader-based decoder on one PA bill and save the result for manual inspection.
data = get_text_from_bill_id(PA_bill_ids[1])
decoded = base64.b64decode(data)
bill_text= pdf_decoder(decoded)

with open("PAPDF2TextTestMethod.txt", "w") as f:
    f.write(bill_text)

1
2


In [46]:
#Create a CSV file with the bill_id and the encoded text
# Each row is written and flushed as soon as it is fetched, so an interrupted
# run keeps everything downloaded so far instead of losing the whole batch.
bill_texts = [["state", "bill_id", "text"]]
count = 0
# newline="" is what the csv module expects for the output file; without it
# platforms that translate "\n" end up with blank lines between rows.
with open("bill_texts.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(bill_texts[0])
    for bill_id in VA_bill_ids: #skip the first line since its the header
        if bill_id == "bill_id":
            continue
        billText = get_text_from_bill_id(bill_id)
        count += 1
        if count % 100 == 0:
            print(count)
        # If empty string then text was unavailable on LegiScan
        # (csv.writer also writes None as an empty field).
        row = ["VA", bill_id, billText]
        bill_texts.append(row)
        writer.writerow(row)
        f.flush()

    # for bill_id in PA_bill_ids: #skip the first line since its the header
    #     if bill_id == "bill_id":
    #         continue
    #     billText = get_text_from_bill_id(bill_id)
    #     bill_texts.append(["PA", bill_id, billText]) #If empty string then text was unavailable on LegiScan
        


KeyboardInterrupt: 

In [69]:
#Final methods to move to CRC
import csv
import string
import json
import base64
import io
import re
import logging
import sys
from pypdf import *
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Tokenizer data for word_tokenize, plus the stopwords corpus that
# stopwords.words("english") reads; without the latter a fresh environment
# raises LookupError.
nltk.download('punkt_tab')
nltk.download('stopwords')

def get_bill_ids(filePath):
    """Read the first column of a CSV file.

    Args:
        filePath: Path to the CSV file.

    Returns:
        list[str]: The first cell of every non-empty row, in file order.
        The header row is not treated specially, so its first cell
        (e.g. ``'bill_id'``) is returned as element 0.
    """
    # newline='' is what the csv module expects from the file object, so
    # quoted fields containing line breaks are parsed correctly.
    with open(filePath, 'r', newline='') as f:
        # csv.reader yields [] for blank lines; skip them instead of
        # failing on row[0].
        return [row[0] for row in csv.reader(f) if row]

def get_text_from_bill_id(bill_id, api_key=None):
    """Fetch the base64-encoded text of a bill from the LegiScan API.

    Two requests are made: ``getBill`` to list the bill's documents, then
    ``getBillText`` for the first document in that list.

    Args:
        bill_id: LegiScan bill id.
        api_key: LegiScan API key; defaults to the module-level ``key``.

    Returns:
        None if either request reports ``"status": "ERROR"`` (e.g. the bill
        id is not found), an empty string if the bill has no texts,
        otherwise the base64-encoded document as returned by the API.
    """
    # Imported here because this cell's import list does not bring in
    # urlopen; the function then works when the cell is moved on its own.
    from urllib.request import urlopen

    if api_key is None:
        api_key = key

    def _fetch_json(url):
        # Close the HTTP response deterministically instead of leaking it.
        with urlopen(url) as response:
            return json.loads(response.read().decode())

    bill_info = _fetch_json(
        f"https://api.legiscan.com/?key={api_key}&op=getBill&id={bill_id}"
    )
    if bill_info["status"] == "ERROR":
        return None

    texts = bill_info["bill"]["texts"]
    if not texts:
        return ""

    doc_id = texts[0]["doc_id"]
    text_info = _fetch_json(
        f"https://api.legiscan.com/?key={api_key}&op=getBillText&id={doc_id}"
    )
    # The second request can fail independently of the first; without this
    # check the lookup below would raise KeyError on an error payload.
    if text_info.get("status") == "ERROR":
        return None
    return text_info["text"]["doc"]
    
def clean_text(text):
    """
    Clean the text by removing sequences of numbers and '- N -' sequences,
    then collapsing every run of whitespace (newlines included) to one space.

    Args:
        text: The string to clean.

    Returns:
        str: The cleaned text. Leading/trailing whitespace is collapsed to a
        single space, not stripped.
    """
    # Runs of two or more whitespace-separated integers.
    pattern = r'\b(\d+\s+)+\d+\b'
    text = re.sub(pattern, '', text)
    # Regular expression to match sequences like '- 2 -'
    dash_number_pattern = r'-\s*\d+\s*-'
    text = re.sub(dash_number_pattern, '', text)
    # Collapse whitespace last: the removals above leave gaps behind that
    # would otherwise survive as double spaces.
    text = re.sub(r'\s+', ' ', text)
    return text

def html_decode(data):
    """
    Decode a base64 representation of a html type bill into string

    Args:
        data: Base64-encoded HTML, or "" / None when no text is available.

    Returns:
        str: The visible text passed through clean_text; "" for empty or
        missing input.
    """
    # HTML text to plain text https://stackoverflow.com/questions/328356/extracting-text-from-html-file-using-python
    # get_text_from_bill_id signals "no text" with "" and "not found" with
    # None; base64.b64decode(None) would raise TypeError.
    if not data:
        return ""

    decoded = io.BytesIO(base64.b64decode(data))
    soup = BeautifulSoup(decoded, 'html.parser')
    # Remove <script>/<style> elements so their contents are not extracted.
    for script in soup(["script", "style"]):
        script.extract()

    text = soup.get_text()
    # Strip each line, split it on double spaces, keep the non-empty pieces.
    lines = (line.strip() for line in text.splitlines())
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    text = '\n'.join(chunk for chunk in chunks if chunk)
    return clean_text(text)

def pdf_decode(data):
    """
    Decode a base64 representation of a pdf type bill into string

    Args:
        data: Base64-encoded PDF, or "" / None when no text is available.

    Returns:
        str: The text of all pages passed through clean_text; "" for empty
        or missing input.
    """
    # get_text_from_bill_id signals "no text" with "" and "not found" with
    # None; an empty byte stream is not a readable PDF, so return early.
    if not data:
        return ""

    reader = PdfReader(io.BytesIO(base64.b64decode(data)))
    # Join pages with a newline so the last word of one page does not fuse
    # with the first word of the next; clean_text turns it into a space.
    text = "\n".join(page.extract_text() for page in reader.pages)
    return clean_text(text)

def preprocess_text(text):
    """
    Preprocess the text by removing punctuation, converting to lowercase, tokenizing, and removing stopwords

    Args:
        text: The string to preprocess.

    Returns:
        str: The remaining tokens joined by single spaces, with tokens that
        start with '§' removed, passed through clean_text once more.
    """
    # Remove punctuation (string.punctuation covers ASCII punctuation only)
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)
    # Convert to lowercase
    text = text.lower()
    # Tokenize the text
    words = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words("english"))
    filtered_words = [word for word in words if word not in stop_words]
    # Join words back into a single string
    cleaned_text = " ".join(filtered_words)

    # Remove tokens such as '§123'. The pattern also consumes the whitespace
    # around the token, so substitute a space rather than '' -- otherwise the
    # neighbouring words are glued together ("foo §1 bar" -> "foobar").
    weird_pattern = r'\s*§+[\w\d]+\s*'
    cleaned_text = re.sub(weird_pattern, ' ', cleaned_text)
    # Remove extra white spaces, including a space left at either end by the
    # substitution above.
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()

    cleaned_text = clean_text(cleaned_text) #one last clean

    return cleaned_text

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\chess\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Sample texts showing how the preprocessed output is formatted:

MN and PA use pdf_decode
VA and CA use html_decode

Sample Texts:
CAhtml2TextPreprocessTest.txt
VAhtml2TextPreprocessTest.txt
MNpdf2TextPreprocessTest.txt
PApdf2TextPreprocessTest.txt


In [70]:
# Reload the bill ids with the functions defined in this section.
VA_bill_ids = get_bill_ids("VA/2024-2024_Regular_Session/csv/bills.csv")
PA_bill_ids = get_bill_ids("PA/2023-2024_Regular_Session/csv/bills.csv")
MN_bill_ids = get_bill_ids("MN/2023-2024_93rd_Legislature/csv/bills.csv")
CA_bill_ids = get_bill_ids("CA/2023-2024_Regular_Session/csv/bills.csv")

data = get_text_from_bill_id(MN_bill_ids[2])
text = pdf_decode(data)
# Keep the result under its own name: assigning it to `preprocess_text`
# replaces the function with a string, so re-running this cell fails with
# "'str' object is not callable".
preprocessed_text = preprocess_text(text)
# Write as UTF-8 explicitly so characters outside the platform's default
# encoding do not raise UnicodeEncodeError.
with open("MNpdf2TextPreprocessTest.txt", "w", encoding="utf-8") as f:
    f.write(preprocessed_text)