In [55]:
# use the import keyword to import pandas, requests, and bs4 modules
import pandas as pd
import requests
from bs4 import BeautifulSoup
import numpy as np

In [4]:
# assign the NY WARN notice url to a variable
# (keep the trailing slash: the scraping loop builds each detail-page url by
# appending the link's href directly onto this string)
url = "https://labor.ny.gov/app/warn/"

In [5]:
# HTTP headers sent with every request made in this notebook: the accepted
# content encoding plus a desktop-browser user-agent string
# NOTE(review): presumably the browser user-agent is there so the site serves
# the scraper like a regular visitor — confirm
headers = {
    'accept-encoding': 'deflate',
    'user-agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/81.0.4044.129 Safari/537.36'
    ),
}

In [6]:
# make a get request to the url using the requests library and assign the response to a variable called 'response'
# requests waits indefinitely by default, so pass a timeout (in seconds) to
# fail with an exception instead of hanging the kernel on a stalled connection
response = requests.get(url, headers=headers, timeout=30)

In [7]:
# display the HTTP status code of the response to confirm that the request
# worked (the recorded output below is 200)
response.status_code

200

In [8]:
# check the type of the response body before parsing it
# (the recorded output below shows it is a str)
type(response.text)

str

In [9]:
# parse the response text using Beautiful Soup's html parser and assign output to a variable called 'soup'
soup = BeautifulSoup(response.text, 'html.parser')

In [10]:
# scrape the first table on the page and assign it to a variable called 'table'
# NOTE(review): none of the cells shown in this notebook read 'table' afterwards
# (rows are collected from 'soup') — confirm whether it is still needed
table = soup.find("table")

In [11]:
# grab all rows and assign to a variable called 'rows'
# note: this searches the whole page ('soup'), so it collects every <tr> in
# the document, not only the rows of the first table
rows = soup.find_all("tr")

In [12]:
# display the number of rows collected
# NOTE(review): this equals the number of WARN notices only if every <tr> on
# the page is a notice row (no header rows, no other tables) — confirm
len(rows)

1285

In [82]:
# collect one record per WARN notice in 'results'; 'event_numbers' remembers the
# event numbers already seen so that a repeated notice is stored only once
results = []
event_numbers = set()

# keys of every record, in output-column order. each record starts with all of
# them set to None, so a detail page that omits a field yields an empty value
# instead of silently re-using the value parsed for the previous company
FIELD_NAMES = [
    'notice_date',
    'event_number',
    'reason',
    'company',
    'address',
    'county',
    'phone',
    'business_type',
    'affected',
    'total_employees',
    'layoff_date',
    'dislocation',
    'union',
    'classification',
]

# dash placeholders that appear in place of a number in the count fields;
# they are stored as 0
MISSING_COUNT_MARKERS = ['------', '-----', '----']

# seconds to wait for each detail page; requests has no default timeout, so
# without this a single stalled connection would hang the whole scrape
REQUEST_TIMEOUT = 30

# loop through the rows using a for loop. each row here is a company
for row in rows:
    # grab the anchor tag (the link tag) in the row. find() returns None when
    # the row has no link, so skip such rows instead of crashing on them
    anchor = row.find("a")
    if anchor is None or not anchor.get('href'):
        continue

    # concatenate the root url from above with the href attribute
    company_url = f"{url}{anchor['href']}"

    # make a get request to the company url and parse the response text
    company_response = requests.get(company_url, headers=headers, timeout=REQUEST_TIMEOUT)
    company_soup = BeautifulSoup(company_response.text, 'html.parser')

    # grab the first table on the page; report and skip pages without one
    company_table = company_soup.find("table")
    if company_table is None:
        print('no table found')
        print(company_url)
        continue

    # start every record from a clean slate (see FIELD_NAMES above)
    result = dict.fromkeys(FIELD_NAMES)
    skip = False

    # loop through all of the p tags; each one is matched against the field
    # labels below and the text after the label is cleaned up and stored
    # NOTE(review): most fields keep only the text between the first and second
    # ':' of the paragraph, so a value that itself contains ':' is cut short —
    # confirm that is acceptable for this data
    for p in company_table.find_all("p"):
        # join nested text nodes with newlines and drop non-breaking spaces
        text = p.get_text('\n').replace('\xa0', '')

        if 'Date of Notice:' in text:
            split_notice_date = text.split(":")
            if len(split_notice_date) == 3:
                # two colons in the paragraph: the date follows the second one
                notice_date = split_notice_date[2].split()[0]
            else:
                notice_date = split_notice_date[1].split()[0].replace(',', '').replace(';', '')
            result['notice_date'] = notice_date
            print(notice_date)
        elif 'Event Number:' in text:
            event_number = text.split(":")[1].strip()
            if event_number in event_numbers:
                # already seen this event number: report it and drop the page
                print('repeated control number')
                print(company_url)
                skip = True
                break
            event_numbers.add(event_number)
            result['event_number'] = event_number
        elif 'Reason Stated for Filing:' in text:
            result['reason'] = text.split(":")[1].strip()
        elif 'Company:' in text:
            # line 0 is the label, line 1 the company name, the rest the address
            split_company = [x.strip() for x in text.split('\n')]
            result['company'] = split_company[1]
            result['address'] = ''.join(split_company[2:])
        elif 'County:' in text:
            # keep only the part before the first '|' and append ' County'
            county_name = text.split(":")[1].strip().split("|")[0].strip()
            result['county'] = f'{county_name} County'
        elif 'Phone:' in text:
            result['phone'] = text.split(":")[1].strip()
        elif 'Business Type:' in text:
            result['business_type'] = text.split(":")[1].strip().replace('Restaurants', 'Restaurant')
        elif 'Number Affected:' in text:
            # first token only, without thousands separators or a stray '('
            affected = text.split(":")[1].strip().split(" ")[0].strip().split('\n')[0].strip().replace(',', '').replace('(', '')
            if affected in MISSING_COUNT_MARKERS:
                affected = 0
            result['affected'] = affected
        elif 'Total Employees:' in text:
            total_employees = text.split(":")[1].strip().split(" ")[0].strip().replace(',', '')
            if total_employees in MISSING_COUNT_MARKERS:
                total_employees = 0
            result['total_employees'] = total_employees
        elif 'Layoff Date:' in text:
            # first space-delimited token after the label
            result['layoff_date'] = text.split(":")[1].strip().split(" ")[0].strip()
        elif 'Reason for Dislocation:' in text:
            result['dislocation'] = text.split(":")[1].strip()
        elif 'Union:' in text:
            result['union'] = text.split(":")[1].strip()
        elif 'Classification:' in text:
            result['classification'] = text.split(":")[1].strip()

    # append the record to results unless the page was a repeated notice
    if not skip:
        results.append(result)

4/27/2020
4/27/2020
4/27/2020
4/27/2020
4/27/2020
4/27/2020
4/27/2020
4/27/2020
5/1/2020
4/30/2020
4/28/2020
4/28/2020
4/28/2020
4/28/2020
4/28/2020
4/6/2020
4/29/2020
4/28/2020
4/27/2020
4/6/2020
4/22/2020
5/1/2020
4/30/2020
4/6/2020
4/30/2020
4/6/2020
4/30/2020
4/23/2020
4/29/2020
4/15/2020
4/29/2020
4/24/2020
4/6/2020
4/29/2020
4/27/2020
4/29/2020
4/28/2020
4/28/2020
4/28/2020
4/28/2020
4/6/2020
4/29/2020
3/19/2020
4/20/2020
4/30/2020
4/30/2020
4/29/2020
4/17/2020
4/21/2020
4/27/2020
4/27/2020
3/18/2020
4/27/2020
4/27/2020
4/24/2020
4/22/2020
3/27/2020
3/27/2020
4/16/2020
4/16/2020
4/16/2020
4/24/2020
4/26/2020
4/23/2020
4/24/2020
4/20/2020
4/20/2020
3/29/2020
3/30/2020
3/30/2020
3/30/2020
3/30/2020
3/30/2020
4/21/2020
4/22/2020
4/15/2020
4/15/2020
4/22/2020
4/22/2020
4/22/2020
4/22/2020
4/22/2020
4/22/2020
4/22/2020
4/22/2020
4/22/2020
4/23/2020
4/22/2020
4/17/2020
4/17/2020
4/20/2020
4/21/2020
4/21/2020
4/24/2020
4/21/2020
4/20/2020
3/27/2020
4/21/2020
4/23/2020
4/13/2020
4/20/202

In [83]:
# wrap results in a dataframe (one row per scraped record, one column per
# record key) and display its (rows, columns) shape as a sanity check
df = pd.DataFrame(results)
df.shape

(1213, 14)

In [59]:
# inspect the distinct 'affected' values to spot parsing problems
# (in the recorded output below they are strings as scraped, plus a nan)
df['affected'].unique()

array(['174', '260', '239', '289', '154', '447', '393', '696', '274',
       '97', '93', '18', '169', '4', '52', '14', '53', '192', '38', '8',
       '21', '317', '39', '29', '196', '22', '36', '140', '113', '218',
       '131', '142', '35', '149', '359', '132', '209', '814', '222',
       '146', '1004', '689', '507', '44', '81', '64', '65', '54', '32',
       '178', '6', '107', '28', '70', '68', '25', '85', '99', '124', '24',
       '120', '84', '17', '60', '157', '75', nan, '91', '58', '15', '47',
       '49', '41', '210', '34', '74', '40', '16', '61', '62', '48', '456',
       '133', '2', '116', '119', '27', '23', '678', '459', '19', '66',
       '46', '33', '165', '429', '95', '104', '20', '397', '55', '115',
       '357', '67', '76', '110', '103', '3', '12', '26', '89', '1', '204',
       '7', '13', '185', '63', '545', '90', '280', '57', '9', '72', '5',
       '42', '87', '128', '101', '167', '77', '159', '247', '717', '251',
       '69', '10', '30', '50', '126', '78', '83', '86',

In [84]:
# output dataframe to a csv, without writing the dataframe index as a column
# (the path is relative to the notebook's working directory)
df.to_csv('../data/warn.csv', index=False)