In [1]:
# !pip3 install html5lib

In [2]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlencode
import urllib.parse
from copy import copy

# Scrape index


In [3]:
# Define some functions to help scrape the index

def get_osha_url(query, start_date_str, end_date_str, results_per_page=1000):
    """Build the OSHA establishment-search URL for one query.

    Parameters
    ----------
    query : str
        Establishment name to search for (e.g. ``'Dollar General'``).
    start_date_str, end_date_str : str
        Inclusive date range, in any format ``pd.to_datetime`` accepts.
    results_per_page : int, optional
        Value sent as ``p_show``. Defaults to 1000 so that a typical query
        fits on a single page and no pagination is needed.

    Returns
    -------
    str
        The full search URL, including the encoded query string.
    """
    start_date = pd.to_datetime(start_date_str)
    end_date = pd.to_datetime(end_date_str)

    base_url = "https://www.osha.gov/ords/imis/establishment.search"
    params = {
        # NOTE(review): urlencode percent-encodes this '+' as %2B, so the
        # words are joined by a literal plus sign rather than a space. The
        # recorded runs returned results with this form, so it is kept as-is.
        'establishment': '+'.join(query.split()),
        'state': "all",
        'officetype': "all",
        'office': "all",
        'sitezip': 100000,
        'startmonth': start_date.month,
        'startday': start_date.day,
        'startyear': start_date.year,
        'endmonth': end_date.month,
        'endday': end_date.day,
        'endyear': end_date.year,
        'p_case': "all",
        'p_sort': 12,
        'p_desc': "DESC",
        'p_direction': "Prev",
        'p_show': results_per_page,
        'p_violations_exist': "yes"
    }

    return base_url + '?' + urlencode(params)

def get_index_table(osha_url):
    """Download the search-results page and return its third HTML table.

    NOTE(review): index 2 is presumably the results listing on the current
    page layout - confirm if the site changes.
    """
    page_tables = pd.read_html(osha_url)
    return page_tables[2]
    

In [4]:
# One (establishment name, start date, end date) triple per search to run.
queries = [
    (establishment_name, '2012-11-18', '2022-11-18')
    for establishment_name in ('Dollar General', 'Dollar Tree')
]

In [5]:
# Scrape one index table per query, then stack them into a single frame.
index_tables = []
for query in queries:

    # unpack the query tuple and build the search URL from its three parts
    establishment, start, end = query
    osha_url = get_osha_url(establishment, start, end)
    print(osha_url)

    # fetch the index table, drop the junk column, and tag every row with
    # the query parameters that produced it
    index_df_for_this_query = (
        get_index_table(osha_url)
        .drop(columns='Unnamed: 0')
        .assign(query=establishment, query_start=start, query_end=end)
    )

    # IMPORTANT: 👀 check these counts - a count equal to the page size
    # would mean results were cut off and pagination is needed
    print(f"found {len(index_df_for_this_query)} results for {query}\n")

    index_tables.append(index_df_for_this_query)

# combine the per-query tables and save them
activities_df = pd.concat(index_tables)
activities_df.to_csv('activities_df.csv', index=False)
activities_df

https://www.osha.gov/ords/imis/establishment.search?establishment=Dollar%2BGeneral&state=all&officetype=all&office=all&sitezip=100000&startmonth=11&startday=18&startyear=2012&endmonth=11&endday=18&endyear=2022&p_case=all&p_sort=12&p_desc=DESC&p_direction=Prev&p_show=1000&p_violations_exist=yes
found 136 results for ('Dollar General', '2012-11-18', '2022-11-18')

https://www.osha.gov/ords/imis/establishment.search?establishment=Dollar%2BTree&state=all&officetype=all&office=all&sitezip=100000&startmonth=11&startday=18&startyear=2012&endmonth=11&endday=18&endyear=2022&p_case=all&p_sort=12&p_desc=DESC&p_direction=Prev&p_show=1000&p_violations_exist=yes
found 377 results for ('Dollar Tree', '2012-11-18', '2022-11-18')



Unnamed: 0,#,Activity,Opened,RID,St,Type,Sc,SIC,NAICS,Vio,Establishment Name,query,query_start,query_end
0,1,1.617010e+06,08/25/2022,552652,MI,Complaint,Partial,,452319,1,Dollar General #17771,Dollar General,2012-11-18,2022-11-18
1,2,1.609906e+06,07/25/2022,454716,TN,Complaint,Partial,,452319,2,Dollar General Corporation,Dollar General,2012-11-18,2022-11-18
2,3,1.607451e+06,07/13/2022,418100,GA,Complaint,Partial,,445110,1,Dollar General Atlanta Fresh Distribution Center,Dollar General,2012-11-18,2022-11-18
3,4,1.599901e+06,06/03/2022,418600,FL,Complaint,Partial,,452319,3,"Dollar General Corporation/ Dolgencorp, Llc",Dollar General,2012-11-18,2022-11-18
4,5,1.598544e+06,05/24/2022,418200,GA,Complaint,Partial,,452319,7,"Dollar General Corporation/ Dolgencorp, Llc",Dollar General,2012-11-18,2022-11-18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
372,373,3.166403e+08,06/18/2013,1054113,OR,Planned,Complete,5331.0,452990,2,Dollar Tree Stores Inc,Dollar Tree,2012-11-18,2022-11-18
373,374,3.162124e+08,06/13/2013,950633,CA,Complaint,Partial,5331.0,452990,3,The Dollar Tree Store #3621,Dollar Tree,2012-11-18,2022-11-18
374,375,3.168030e+08,05/28/2013,1055340,WA,Referral,Partial,5399.0,452990,1,Dollar Tree Stores # 2264,Dollar Tree,2012-11-18,2022-11-18
375,376,3.166980e+08,04/16/2013,950624,CA,Complaint,Partial,5331.0,452990,2,"Dollar Tree Stores, Inc Dba Dollar Tree #4403",Dollar Tree,2012-11-18,2022-11-18


# Scrape individual activity pages

In [8]:
# Helper functions for scraping an individual activity (inspection detail) page
def get_related_activity_table(url):
    """Return the first table on the page whose text matches "Related Activity".

    ``pd.read_html`` raises ``ValueError`` when no table matches.
    """
    return pd.read_html(url, match="Related Activity")[0]

def get_violation_summary_table(url):
    """Return the first table on the page whose text matches "Violation Summary".

    ``pd.read_html`` raises ``ValueError`` when no table matches.
    """
    return pd.read_html(url, match="Violation Summary")[0]

def get_violation_items_table(url):
    """Return the first table on the page whose text matches "Violation Items".

    ``pd.read_html`` raises ``ValueError`` when no table matches.
    """
    return pd.read_html(url, match="Violation Items")[0]

def get_investigated_inspection_table(url):
    """Return the first table on the page whose text matches "Investigated Inspection".

    ``pd.read_html`` raises ``ValueError`` when no table matches.
    """
    return pd.read_html(url, match="Investigated Inspection")[0]

def extract_key_value(tag):
    """Split a "Label: value" tag into a ``(key, value)`` pair.

    The text is split on the FIRST colon only, so values that themselves
    contain colons (times, URLs, ...) are returned in full rather than
    truncated at their first colon.

    Parameters
    ----------
    tag : object
        Must expose ``.text`` and ``.find(name)`` (e.g. a BeautifulSoup tag).

    Returns
    -------
    tuple of (str, str)
        The whitespace-stripped key and value.

    Raises
    ------
    AssertionError
        If the text has no colon or the tag contains no ``<strong>`` element.
    """
    text = tag.text
    assert ':' in text, "tag text has no ':' separator"
    assert tag.find('strong'), "tag has no <strong> label element"

    key, _, value = text.partition(':')
    return key.strip(), value.strip()

def get_main_container(inspection_url, timeout=30):
    """Fetch an inspection page and return its main content element.

    Parameters
    ----------
    inspection_url : str
        URL of the inspection-detail page.
    timeout : float, optional
        Seconds passed to ``requests.get`` as its timeout. Without one, a
        stalled connection would block the scraping loop indefinitely.

    Returns
    -------
    The element with ``id="maincontain"`` (header/footer excluded), or
    ``None`` if the page has no such element.

    Raises
    ------
    requests.exceptions.RequestException
        On connection errors or when the timeout expires.
    """
    # get HTML from inspection page
    response = requests.get(inspection_url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')

    # keep only the main container (ignore footer/header/etc...)
    return soup.find(id="maincontain")

def has_investigation_summary(html_main_container):
    """Return True if any <h4> heading reads exactly "Investigation Summary".

    Heading text is compared after stripping surrounding whitespace.
    """
    return any(
        heading.text.strip() == "Investigation Summary"
        for heading in html_main_container.find_all('h4')
    )

def get_details(html_main_container):
    """Scrape the key/value inspection details from the main container.

    NOTE: this mutates ``html_main_container`` - every element that follows
    an "Investigation Summary" heading is emptied in place so that section
    is not picked up here (it is scraped by a separate function).

    Returns
    -------
    dict
        Case status plus one entry per "Label: value" found in the
        ``span4`` divs. If the case status cannot be scraped, the entry
        ``'Case Status': 'ERROR'`` is stored instead.

    Raises
    ------
    AssertionError
        Propagated from ``extract_key_value`` if a ``span4`` entry is not
        in "Label: value" form.
    """
    # Empty everything after the "Investigation Summary" heading. The text is
    # stripped before comparing, matching has_investigation_summary().
    for h4 in html_main_container.find_all('h4'):
        if h4.text.strip() == "Investigation Summary":
            for e in h4.find_all_next():
                e.clear()

    details = {}

    # Case status: the page is expected to carry exactly two identical
    # "well" divs; anything else is treated as a scrape failure.
    html_wells = html_main_container.find_all("div", class_="well")
    try:
        if len(html_wells) != 2 or html_wells[0] != html_wells[1]:
            raise ValueError("expected exactly two identical 'well' divs")
        key, value = extract_key_value(html_wells[0])
    except Exception:
        # Best effort: record the failure and carry on with the other fields.
        # (Exception rather than a bare except, so Ctrl-C still works.)
        key = 'Case Status'
        value = 'ERROR'
        print("ERROR - couldn't scrape case status")
    details[key] = value

    # Remaining details: each span4 div is either a single "Label: value"
    # itself or a group of <p> tags that each hold one.
    html_spans = html_main_container.find_all("div", class_="span4")
    for span in html_spans:
        html_p_tags = span.find_all('p')
        if len(html_p_tags) == 0:
            key, value = extract_key_value(span)
            details[key] = value
        else:
            for p_tag in html_p_tags:
                key, value = extract_key_value(p_tag)
                details[key] = value

    return details
    
def get_investigation_summary_details(html_main_container):
    """Scrape the "Investigation Summary" section of an inspection page.

    The container is copied, the siblings that precede the "Investigation
    Summary" heading are removed from the copy, and the copy is scraped.
    The caller's container is left untouched. (Previously the pruning was
    done on a throwaway copy while the original was scraped, so it had no
    effect on the result.)

    Returns
    -------
    dict
        "Label: value" pairs found in ``span4`` divs and ``<p>`` tags, plus
        ``investigation_summary_notes_<n>`` entries holding the text of
        tags that are not in that form.
    """
    # Prune a copy so the caller's tree is not modified.
    container = copy(html_main_container)
    for h4 in container.find_all('h4'):
        if h4.text.strip() == "Investigation Summary":
            # Delete everything BEFORE the heading (at the heading's level).
            for e in h4.find_previous_siblings():
                e.decompose()

    spans = container.find_all('div', class_='span4')
    p_tags = container.find_all('p')
    spans_and_ptags = spans + p_tags

    details = {}
    notes_columns = 0
    for tag in spans_and_ptags:
        if ':' in tag.text and tag.find('strong'):
            key, value = extract_key_value(tag)
        else:
            # Free text without a bold label: store it as a numbered note.
            notes_columns += 1
            key = f"investigation_summary_notes_{notes_columns}"
            value = tag.text
        details[key] = value

    return details


In [9]:
# Accumulators: one entry per scraped activity page.
violation_summary_tables = []
violation_items_tables = []
related_activity_tables = []
details_dictionaries = []
investigated_inspections_table = []

for index, row in activities_df.reset_index(drop=True).iterrows():
    activity_code = row['Activity']
    # NOTE(review): 'Activity' was parsed as a float, so ids render like
    # 1617010.015 or 317940203.0 in the URL - confirm the ".0" form is what
    # the server expects for the older nine-digit ids.
    url = f"https://www.osha.gov/ords/imis/establishment.inspection_detail?id={activity_code}"
    print(f"{index} of {len(activities_df)} - scraping {url}")

    violation_summary = get_violation_summary_table(url)
    violation_summary['activity_code'] = activity_code
    violation_summary_tables.append(violation_summary)

    violation_items = get_violation_items_table(url)
    violation_items['activity_code'] = activity_code
    violation_items_tables.append(violation_items)

    # Not every page has a related-activity table; treat it as optional.
    # Catch Exception (not a bare except) so the loop can still be
    # interrupted from the keyboard.
    try:
        related_activity = get_related_activity_table(url)
        related_activity['activity_code'] = activity_code
        related_activity_tables.append(related_activity)
    except Exception:
        print("ERROR - related activity table wasn't scraped")

    html = get_main_container(url)
    details = {}

    # Order matters: get_details() empties the investigation-summary section
    # of `html` in place, so the summary must be scraped first.
    if has_investigation_summary(html):
        print("Has investigation summary")
        investigation_summary_details = get_investigation_summary_details(html)
        details.update(investigation_summary_details)

        investigated_inspections = get_investigated_inspection_table(url)
        investigated_inspections['activity_code'] = activity_code
        investigated_inspections_table.append(investigated_inspections)

    details.update(get_details(html))
    details['activity_code'] = activity_code
    details_dictionaries.append(details)


0 of 513 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1617010.015
1 of 513 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1609906.015
2 of 513 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1607451.015
3 of 513 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1599901.015
4 of 513 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1598544.015
5 of 513 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1596914.015
6 of 513 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1597065.015
7 of 513 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1593249.015
8 of 513 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1592570.015
9 of 513 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1592900.015
10 of 513 - scraping

81 of 513 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1226089.015
82 of 513 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1203374.015
83 of 513 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1201133.015
84 of 513 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1200685.015
85 of 513 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1195208.015
86 of 513 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1192554.015
87 of 513 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1184028.015
88 of 513 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1181678.015
89 of 513 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1180314.015
90 of 513 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1175610.015
91 of 513 

21 of 513 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1592254.015
22 of 513 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1592958.015
23 of 513 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1589562.015
24 of 513 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1590282.015
25 of 513 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1587972.015
26 of 513 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1586794.015
27 of 513 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1595108.015
28 of 513 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1580815.015
29 of 513 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1579408.015
30 of 513 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1577434.015
31 of 513 

103 of 513 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1448428.015
104 of 513 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1447228.015
105 of 513 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1447417.015
106 of 513 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1446800.015
107 of 513 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1446051.015
108 of 513 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1445218.015
109 of 513 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1443653.015
110 of 513 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1443300.015
111 of 513 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1443323.015
112 of 513 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1445436.015


185 of 513 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1350112.015
186 of 513 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1346746.015
187 of 513 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1342814.015
188 of 513 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1336248.015
189 of 513 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1334723.015
190 of 513 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1330484.015
191 of 513 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1330070.015
192 of 513 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1330308.015
193 of 513 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1327441.015
194 of 513 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1326715.015


261 of 513 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1206871.015
262 of 513 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1206086.015
263 of 513 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1204188.015
264 of 513 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1202268.015
ERROR - related activity table wasn't scraped
265 of 513 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1199921.015
266 of 513 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1199078.015
267 of 513 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1198899.015
268 of 513 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1195815.015
269 of 513 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1195034.015
ERROR - related activity table wasn't scraped
270 of 5

338 of 513 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=317940203.0
ERROR - couldn't scrape case status
339 of 513 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1006796.015
340 of 513 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1005259.015
341 of 513 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=317459295.0
ERROR - couldn't scrape case status
342 of 513 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=317704070.0
ERROR - couldn't scrape case status
343 of 513 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=317809044.0
ERROR - couldn't scrape case status
344 of 513 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=993458.015
345 of 513 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=992045.015
ERROR - related activity table wasn't scraped
346 of 513 -

In [10]:
def _concat_or_empty(tables):
    """Concatenate a list of DataFrames; return an empty frame for an empty list.

    ``pd.concat`` raises ``ValueError`` on an empty list, which happens when
    no scraped page had the optional table in question.
    """
    return pd.concat(tables) if tables else pd.DataFrame()


# Stack the per-activity tables into one frame per table type.
violation_summary_df = _concat_or_empty(violation_summary_tables)
violation_items_df = _concat_or_empty(violation_items_tables)
related_activity_df = _concat_or_empty(related_activity_tables)
investigated_inspections_df = _concat_or_empty(investigated_inspections_table)
details_df = pd.DataFrame(details_dictionaries)

In [11]:
# Write each combined frame to its own CSV file, without the index column.
for frame, csv_path in (
    (violation_summary_df, 'violation_summary_df.csv'),
    (violation_items_df, 'violation_items_df.csv'),
    (related_activity_df, 'related_activity_df.csv'),
    (investigated_inspections_df, 'investigated_inspections_df.csv'),
    (details_df, 'details_df.csv'),
):
    frame.to_csv(csv_path, index=False)

In [13]:
# Run the shell command `open .` on the current working directory (where the CSVs were written).
# NOTE(review): `open` looks macOS-specific - confirm before running on another platform.
!open .