In [215]:
import glob
from PyPDF2 import PdfReader
import pandas as pd
import numpy as np
import docx
from striprtf.striprtf import rtf_to_text
import sys
import os
from bs4 import BeautifulSoup
import comtypes.client #throwing error
import shutil
import win32com.client
import pytesseract
from PIL import Image
from docx.api import Document
import time
import regex as re
max_rec = 0x100000
sys.setrecursionlimit(max_rec)
pytesseract.pytesseract.tesseract_cmd = "C:\\Users\\HaoLuo\\AppData\\Local\\Programs\\Tesseract-OCR\\tesseract.exe"
st = time.time()

In [224]:
def scrape_html(soup):
  '''retrieve relevant content from b4s'''

  #init empty dict
  html_dict = dict()

  #put all 'b' tags into dictionary to capture meta data
  for tag in soup.find_all(name='b'):
    html_dict[tag.contents[0].split(':')[0]] = tag.next_sibling.string

  #comment is two siblings behind heading
    if soup.find(name='h2',string="General Comment") is not None:
        html_dict["Comment"] = soup.find(name='h2',string="General Comment").next_sibling.next_sibling.text
    elif soup.find(name='h2',string="Redacted Comment") is not None:
        html_dict["Comment"] = soup.find(name='h2',string="Redacted Comment").next_sibling.next_sibling.text
        
  #We only care if attachment exists. Attachment must be grabbed using the comment name, not the attachment name
  if soup.find(name='h2',string="Attachments"):
    html_dict["has_attachments"] = True
  else:
    html_dict["has_attachments"] = False

  return html_dict

def scrape_pdf(file_name):
  ''' scrape pdf '''

  reader = PdfReader(file_name)
  text = ""
  for page in reader.pages:
      text += page.extract_text() + "\n"

  return text

def scrape_rtf(filename):
    with open(filename) as infile:
        content = infile.read()
        text = rtf_to_text(content)
    return text

def scrape_txt(filename):
    with open(filename) as infile:
        text = infile.read()
    return text

def scrape_docx(file_name):
    doc = docx.Document(file_name)
 
    #pull in pararaphs
    fullText = []
    for para in doc.paragraphs:
        fullText.append(para.text)
 
    joinedText = '\n'.join(fullText)
 
    if len(doc.tables)>0:
      #pull in tables afterwards
      fullTable = ""
      for table in doc.tables:
          for row in table.rows:
              row_data = [cell.text for cell in row.cells]
              fullTable += ", ".join(row_data) + "\n"
      return (joinedText + '\n ' + fullTable, True)
    else:
      return (joinedText, False)

def scrape_xlsx(file_name):
  #pulls in a dictionary object of all tabs
  all_tabs_dict = pd.read_excel(file_name, na_values='', sheet_name=None)
 
  fullText = ""
  for df in list(all_tabs_dict.values()):
    for i in range(df.shape[0]):
        row_data = [str(df.iloc[i,j]) for j in range(df.shape[1])]
        fullText += ", ".join(row_data) + "\n"
 
  #remove multiple commas resulting from blank cells      
  fullText = re.sub(r',{2,}', ',', fullText)
  return fullText

def scrape_png(file_name):
    text = str(pytesseract.image_to_string(Image.open(file_name)))
    return text

In [225]:
def get_dataframe():    
    path ='C:/Users/HaoLuo/ACF-2023-0011/ACF-2023-0011' # set your own path
    files = os.listdir(path)
    os.chdir(path)
    set([x.split('.')[-1] for x in files])

    files_html = [x for x in files if x.endswith('.html') and x.startswith("ACF-")]
    html_dict_list=[]
    for file_path in files_html:
      #load html
      with open(path+"/"+file_path, 'r') as file:
        html_string=file.read()
        soup = BeautifulSoup(html_string, 'html.parser')
        print(file_path)
      #clean if public submission
      if soup.find_all(name='h1')[0].contents == ["PUBLIC SUBMISSION"]:
        html_dict = scrape_html(soup)

        #pull file paths for matching attachments
        attachment_paths = [x for x in files if html_dict['Document'] in x and not x.endswith(".html")]
        attachment_found = False
        html_dict["attachment_count"]=len(attachment_paths) #new line

        for attachment_path in attachment_paths:
            html_dict["attachment_name"] = attachment_path
            if attachment_path.endswith('.pdf'):
                html_dict["attachment_text"] = scrape_pdf(path+"/"+attachment_path)
                html_dict_list.append(html_dict.copy())
                attachment_found = True
            elif attachment_path.endswith('.docx'):
                html_dict["attachment_text"], html_dict["attachment_has_tables"] = scrape_docx(path+"/"+attachment_path) # new line
                html_dict["attachment_error"] = (len("attachment_text")==0)
                html_dict_list.append(html_dict.copy())
                attachment_found = True
            elif attachment_path.endswith('.rtf'):
                html_dict["attachment_text"] = scrape_rtf(path+"/"+attachment_path)
                html_dict_list.append(html_dict.copy())
                attachment_found = True
            elif attachment_path.endswith('.txt'):
                html_dict["attachment_text"] = scrape_txt(path+"/"+attachment_path)
                html_dict_list.append(html_dict.copy())
                attachment_found = True
            elif attachment_path.endswith('.png'):
                html_dict["attachment_text"] = scrape_png(path+"/"+attachment_path)
                html_dict_list.append(html_dict.copy())
                attachment_found = True
            elif attachment_path.endswith('.xlsx'):
                html_dict["attachment_text"] = scrape_xlsx(path+"/"+attachment_path)
                html_dict_list.append(html_dict.copy())
                attachment_found = True

        if attachment_found == False:
          #add row without attachment text
          html_dict_list.append(html_dict)

    df=pd.DataFrame(html_dict_list)
    
    for col_name in ["Comment","attachment_text"]:
      col_name_new = col_name+"_clean"

      #text cleaning
      df[col_name_new] = df[col_name].replace(to_replace=float('nan'),value='')

      df[col_name_new] = df[col_name_new].replace(to_replace='\xa0',value='')
      df[col_name_new] = df[col_name_new].str.replace(pat='§§',repl='bill number ')
      df[col_name_new] = df[col_name_new].str.replace(pat='§',repl='bill number ')
      df[col_name_new] = df[col_name_new].str.replace(pat='\u2009',repl='')

      df[col_name_new] = df[col_name_new].apply(lambda x: x.strip())
      df[col_name_new] = df[col_name_new].apply(lambda x: re.sub(r' +', ' ', x))
      df[col_name_new] = df[col_name_new].apply(lambda x: re.sub(r'\n +', '\n', x))
      df[col_name_new] = df[col_name_new].apply(lambda x: re.sub(r'\n+', '\n', x))
    return df

In [226]:
dff = get_dataframe()
congress = pd.read_excel("2024_jan_congress_members.xlsx")
congress_names = congress.Name.to_list()
 
dff["head_start_commenter"] = dff.Organization.str.contains(pat=r"Head Start.*?Association",case=False)
dff["congress_commenter"] = dff["Name"].apply(lambda x: x in congress_names)
dff.head()

ACF-2023-0011-0002.html
ACF-2023-0011-0003.html
ACF-2023-0011-0004.html
ACF-2023-0011-0005.html
ACF-2023-0011-0006.html
ACF-2023-0011-0007.html
ACF-2023-0011-0008.html
ACF-2023-0011-0009.html
ACF-2023-0011-0010.html
ACF-2023-0011-0011.html
ACF-2023-0011-0012.html
ACF-2023-0011-0013.html
ACF-2023-0011-0014.html
ACF-2023-0011-0015.html
ACF-2023-0011-0016.html
ACF-2023-0011-0017.html
ACF-2023-0011-0018.html
ACF-2023-0011-0019.html
ACF-2023-0011-0020.html
ACF-2023-0011-0021.html
ACF-2023-0011-0022.html
ACF-2023-0011-0023.html
ACF-2023-0011-0024.html
ACF-2023-0011-0025.html
ACF-2023-0011-0026.html
ACF-2023-0011-0027.html
ACF-2023-0011-0028.html
ACF-2023-0011-0029.html
ACF-2023-0011-0030.html
ACF-2023-0011-0031.html
ACF-2023-0011-0032.html
ACF-2023-0011-0033.html
ACF-2023-0011-0034.html
ACF-2023-0011-0035.html
ACF-2023-0011-0036.html
ACF-2023-0011-0037.html
ACF-2023-0011-0038.html
ACF-2023-0011-0039.html
ACF-2023-0011-0040.html
ACF-2023-0011-0041.html
ACF-2023-0011-0042.html
ACF-2023-0011-00

ACF-2023-0011-0344.html
ACF-2023-0011-0345.html
ACF-2023-0011-0346.html
ACF-2023-0011-0347.html
ACF-2023-0011-0348.html
ACF-2023-0011-0349.html
ACF-2023-0011-0350.html
ACF-2023-0011-0351.html
ACF-2023-0011-0352.html
ACF-2023-0011-0353.html
ACF-2023-0011-0354.html
ACF-2023-0011-0355.html
ACF-2023-0011-0356.html
ACF-2023-0011-0357.html
ACF-2023-0011-0358.html
ACF-2023-0011-0359.html
ACF-2023-0011-0360.html
ACF-2023-0011-0361.html
ACF-2023-0011-0362.html
ACF-2023-0011-0363.html
ACF-2023-0011-0364.html
ACF-2023-0011-0365.html
ACF-2023-0011-0366.html
ACF-2023-0011-0367.html
ACF-2023-0011-0368.html
ACF-2023-0011-0369.html
ACF-2023-0011-0370.html
ACF-2023-0011-0371.html
ACF-2023-0011-0372.html
ACF-2023-0011-0373.html
ACF-2023-0011-0374.html
ACF-2023-0011-0375.html
ACF-2023-0011-0376.html
ACF-2023-0011-0377.html
ACF-2023-0011-0378.html
ACF-2023-0011-0379.html
ACF-2023-0011-0380.html
ACF-2023-0011-0381.html
ACF-2023-0011-0382.html
ACF-2023-0011-0383.html
ACF-2023-0011-0384.html
ACF-2023-0011-03

ACF-2023-0011-0687.html
ACF-2023-0011-0688.html
ACF-2023-0011-0689.html
ACF-2023-0011-0690.html
ACF-2023-0011-0691.html
ACF-2023-0011-0692.html
ACF-2023-0011-0693.html
ACF-2023-0011-0694.html
ACF-2023-0011-0695.html
ACF-2023-0011-0696.html
ACF-2023-0011-0697.html
ACF-2023-0011-0698.html
ACF-2023-0011-0699.html
ACF-2023-0011-0700.html
ACF-2023-0011-0701.html
ACF-2023-0011-0702.html
ACF-2023-0011-0703.html
ACF-2023-0011-0704.html
ACF-2023-0011-0705.html
ACF-2023-0011-0706.html
ACF-2023-0011-0707.html
ACF-2023-0011-0708.html
ACF-2023-0011-0709.html
ACF-2023-0011-0710.html
ACF-2023-0011-0711.html
ACF-2023-0011-0712.html
ACF-2023-0011-0713.html
ACF-2023-0011-0714.html
ACF-2023-0011-0715.html
ACF-2023-0011-0716.html
ACF-2023-0011-0717.html
ACF-2023-0011-0718.html
ACF-2023-0011-0719.html
ACF-2023-0011-0720.html
ACF-2023-0011-0721.html
ACF-2023-0011-0722.html
ACF-2023-0011-0723.html
ACF-2023-0011-0724.html
ACF-2023-0011-0725.html
ACF-2023-0011-0726.html
ACF-2023-0011-0727.html
ACF-2023-0011-07

ACF-2023-0011-DRAFT-1014.html
ACF-2023-0011-DRAFT-1015.html
ACF-2023-0011-DRAFT-1016.html
ACF-2023-0011-DRAFT-1017.html
ACF-2023-0011-DRAFT-1018.html
ACF-2023-0011-DRAFT-1019.html
ACF-2023-0011-DRAFT-1020.html
ACF-2023-0011-DRAFT-1021.html
ACF-2023-0011-DRAFT-1022.html
ACF-2023-0011-DRAFT-1023.html
ACF-2023-0011-DRAFT-1024.html
ACF-2023-0011-DRAFT-1025.html
ACF-2023-0011-DRAFT-1026.html
ACF-2023-0011-DRAFT-1027.html
ACF-2023-0011-DRAFT-1028.html
ACF-2023-0011-DRAFT-1029.html
ACF-2023-0011-DRAFT-1030.html
ACF-2023-0011-DRAFT-1031.html
ACF-2023-0011-DRAFT-1032.html
ACF-2023-0011-DRAFT-1033.html
ACF-2023-0011-DRAFT-1034.html
ACF-2023-0011-DRAFT-1035.html
ACF-2023-0011-DRAFT-1036.html
ACF-2023-0011-DRAFT-1037.html
ACF-2023-0011-DRAFT-1038.html
ACF-2023-0011-DRAFT-1039.html
ACF-2023-0011-DRAFT-1040.html
ACF-2023-0011-DRAFT-1041.html
ACF-2023-0011-DRAFT-1042.html
ACF-2023-0011-DRAFT-1043.html
ACF-2023-0011-DRAFT-1044.html
ACF-2023-0011-DRAFT-1045.html
ACF-2023-0011-DRAFT-1046.html
ACF-2023-0

ACF-2023-0011-DRAFT-1290.html
ACF-2023-0011-DRAFT-1291.html
ACF-2023-0011-DRAFT-1292.html
ACF-2023-0011-DRAFT-1293.html
ACF-2023-0011-DRAFT-1294.html
ACF-2023-0011-DRAFT-1295.html
ACF-2023-0011-DRAFT-1296.html
ACF-2023-0011-DRAFT-1297.html
ACF-2023-0011-DRAFT-1298.html
ACF-2023-0011-DRAFT-1299.html
ACF-2023-0011-DRAFT-1300.html
ACF-2023-0011-DRAFT-1301.html
ACF-2023-0011-DRAFT-1302.html
ACF-2023-0011-DRAFT-1303.html
ACF-2023-0011-DRAFT-1304.html
ACF-2023-0011-DRAFT-1305.html
ACF-2023-0011-DRAFT-1306.html
ACF-2023-0011-DRAFT-1307.html
ACF-2023-0011-DRAFT-1308.html


Unnamed: 0,As of,Comment,Received,Status,Posted,Tracking No.,Comments Due,Submission Type,Docket,Comment On,...,Phone,Organization,attachment_name,attachment_text,attachment_has_tables,attachment_error,Comment_clean,attachment_text_clean,head_start_commenter,congress_commenter
0,"January 20, 2024",While I am in support of increased wages and b...,"November 21, 2023",Posted,"November 28, 2023",lp8-i19p-nqbk,"January 19, 2024",API,ACF-2023-0011,ACF-2023-0011-0001,...,,,,,,,While I am in support of increased wages and b...,,,False
1,"January 20, 2024",-First while I commend the need for HS staff t...,"November 22, 2023",Posted,"November 28, 2023",lpa-9kok-o7f7,"January 19, 2024",Web,ACF-2023-0011,ACF-2023-0011-0001,...,,,,,,,-First while I commend the need for HS staff t...,,,False
2,"January 20, 2024",Head Start teachers should be salaried. We wor...,"November 23, 2023",Posted,"November 28, 2023",lpb-j5uy-g8uv,"January 19, 2024",API,ACF-2023-0011,ACF-2023-0011-0001,...,,,,,,,Head Start teachers should be salaried. We wor...,,,False
3,"January 20, 2024",We want to get PAID!!!! We work hard for the l...,"November 21, 2023",Posted,"November 28, 2023",lp8-s4af-cz42,"January 19, 2024",Web,ACF-2023-0011,ACF-2023-0011-0001,...,8475325635.0,,,,,,We want to get PAID!!!! We work hard for the l...,,,False
4,"January 20, 2024",Are you going to address increased funding to ...,"November 22, 2023",Posted,"November 28, 2023",lp9-wsu0-lx3k,"January 19, 2024",API,ACF-2023-0011,ACF-2023-0011-0001,...,,,,,,,Are you going to address increased funding to ...,,,False


In [227]:
dff.loc[~dff['Government Agency'].isna(), 'Government Agency'] = dff.loc[~dff['Government Agency'].isna(), 'Government Agency'].apply(lambda x: x[1:])

In [228]:
dff.loc[dff['Government Agency'].isin(["Senate HELP Committee, Chair Sanders",'U.S. House Committee on Education and the Workforce','Office of Congressman Adam B. Schiff']),'congress_commenter'] = True

In [229]:
dff[(dff['head_start_commenter']==1) | (dff['congress_commenter']==1)].to_excel('important_commenter.xlsx')

In [96]:
# single out importanta commenter and put html and attachment into a folder
import pandas as pd
import shutil
import os
import glob
dff1 = dff[(dff['head_start_commenter']==1) | (dff['congress_commenter']==1)]
file_source = 'C:\\Users\\HaoLuo\\ACF-2023-0011\\ACF-2023-0011'
file_destination = 'C:\\Users\\HaoLuo\\ACF-2023-0011\\important_commenter\\'
shutil.copy2('important_commenter.xlsx', file_destination)

os.chdir(file_source)
directory = glob.glob('*')
htm = [x+'.html' for x in dff1['Document']]

for file in directory:
    if file in dff1['attachment_name'].tolist():
        shutil.copy2(file_source+'\\'+file, file_destination)
    if file in htm:
        shutil.copy2(file_source+'\\'+file, file_destination)    

In [230]:
dff['Document']=np.where((dff['attachment_name'].apply(lambda x: str(x).split('.')[0])!='nan'),
dff['attachment_name'].apply(lambda x: str(x).split('.')[0]),
dff['Document'])

In [231]:
dff.to_pickle('2023_scrape.pkl')
print('Total Time consumed: ', time.time()-st)

Total Time consumed:  78.34600853919983
