In [1]:
# use the import keyword to import pandas, requests, and bs4 modules
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# assign the NY WARN notice url to a variable
url = "https://labor.ny.gov/app/warn/"

In [3]:
# define headers
headers = {'accept-encoding': 'deflate', 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36'}

In [4]:
# make a get request to the url using the requests library and assign the response to a variable called 'response'
response = requests.get(url, headers=headers)

In [5]:
# print out status code of response to confirm that your request worked
response.status_code

200

In [36]:
# parse the response text using Beautiful Soup's html parser and assign output to a variable called 'soup'
# response.text
type(response.text)

str

In [6]:
# scrape the first table on the page and assign it to a variable called 'table'
soup = BeautifulSoup(response.text, 'html.parser')

In [7]:
# get the dropdown menu
select = soup.find("select", id="warnYr")
select

<select id="warnYr" name="warnYr" onchange="pageRefresh()">
<option selected="" value="2020">2020</option>
<option value="2019">2019</option>
<option value="2018">2018</option>
<option value="2017">2017</option>
<option value="2016">2016</option>
<option value="2015">2015</option>
<option value="2014">2014</option>
<option value="2013">2013</option>
<option value="2012">2012</option>
</select>

In [9]:
years = select.find_all("option")
years

[<option selected="" value="2020">2020</option>,
 <option value="2019">2019</option>,
 <option value="2018">2018</option>,
 <option value="2017">2017</option>,
 <option value="2016">2016</option>,
 <option value="2015">2015</option>,
 <option value="2014">2014</option>,
 <option value="2013">2013</option>,
 <option value="2012">2012</option>]

In [10]:
years = [option.text for option in select.find_all("option")]
years

['2020', '2019', '2018', '2017', '2016', '2015', '2014', '2013', '2012']

In [74]:
# make an array called 'results'
results = []
control_numbers = set()
#loop through all years
for year in years:
    # form url for a specific year, eg https://labor.ny.gov/app/warn/default.asp?warnYr=2019
    year_url = f'{url}/default.asp?warnYr={year}'
    print(year_url)
    # make get request to url
    response = requests.get(year_url, headers=headers)
    # scrape the first table on the page and assign it to a variable called 'table'
    soup = BeautifulSoup(response.text, 'html.parser')
    table = soup.find("table")
    
    # get all the rows in the table — this is how many WARN notices there were in a specific year
    rows = table.find_all("tr")
    print(len(rows))
    
    # grab all rows from the table and assign to a variable called 'rows'
    # loop through the rows using a for loop. each row here is a company
    for row in rows:
        # grab the anchor tag (the link tag) in the row and then grab the href attribute from the tag
        a = row.find("a")['href']

        # concatenate the root url from above with this href attribute and assign to a variable called 'company_url'
        company_url = f'{url}{a}'
        print(company_url)

        # make a get request to the company url assign the response to a variable called 'company_response'
        company_response = requests.get(company_url, headers=headers)

        # parse the response text and assign output to a variable called 'company_soup'
        company_soup = BeautifulSoup(company_response.text, 'html.parser')

        # grab the first table on the page
        company_table = company_soup.find("table")

        

        # loop through all of the p tags
        paragraphs = company_table.find_all("p")
        paragraphs_iter = iter(paragraphs)
        skip = False
        for p in paragraphs_iter:
            # grab all of the values we want
            text = p.get_text('\n').replace('\xa0', '')
            if 'Date of Notice:' in text:
                notice_date = text.split(":")[1].strip().split('\n')[0].strip().replace(',', '').replace(';', '')
                #print(notice_date)
            elif 'Control Number:' in text:
                control_number = text.split(":")[1].strip()
                #print(control_number)
                if control_number in control_numbers:
                    print('repeated control number')
                    skip = True
                    break
                else:
                    control_numbers.add(control_number)
            elif 'Reason Stated for Filing:' in text:
                reason = text.split(":")[1].strip()
                #print(reason)
            elif 'Company:' in text:
                split_company = [x.strip() for x in text.split('\n')]
                if len(split_company) == 1:
                    #print("getting to 1")
                    next_text = next(paragraphs_iter)
                    #text = p.get_text('\n').replace('\xa0', '')
                    #print(next_text)
                    
                else: 
                    company = split_company[1]
                    address = ' '.join(split_company[2:])
#                 print(company)
#                 print(address)
#                 print('------------------------------------')
            elif 'County:' in text:
                county = f'{text.split(":")[1].strip().split("|")[0].strip()} County'
                #print(county)
            elif 'Phone:' in text:
                phone = text.split(":")[1].strip()
                #print(phone)
            elif 'Business Type:' in text:
                business_type = text.split(":")[1].strip().replace('Restaurants', 'Restaurant')
                #print(business_type)
            elif 'Number Affected:' in text:
                if '-----' in text:
                    affected = ''
                else:
                    affected = text.split(":")[1].strip().split(" ")[0].strip().split('\n')[0].strip()
                #print(affected)
            elif 'Total Employees:' in text:
                if '-----' in text:
                    total_employees = ''
                else:
                    total_employees = text.split(":")[1].strip().split(" ")[0].strip().replace(',', '')
                    #print(total_employees)
            elif 'Layoff Date:' in text:
                #print(text)
                layoff_date = text.split(":")[1].strip().split(" ")[0].strip().split(" ")[0].strip()
                #print(layoff_date)
            elif ('Reason for Dislocation:' in text):
                dislocation = text.split(":")[1].strip()
                #print(dislocation)
            elif ('Union:' in text):
                union = text.split(":")[1].strip()
                #print(union)
            elif ('Classification:' in text):
                classification = text.split(":")[1].strip()
                #print(classification)

        # store values in a result object
        if not skip:
            result = {
                'notice_date': notice_date,
                'reason': reason,
                'company': company,
                'address': address,
                'county': county,
                'phone': phone,
                'business_type': business_type,
                'affected': affected,
                'total_employees': total_employees,
                'layoff_date': layoff_date,
                'dislocation': dislocation,
                'union': union,
                'classification': classification
             }
            # append result object to results
            results.append(result)
        #break
print(len(results))

https://labor.ny.gov/app/warn//default.asp?warnYr=2015
434
https://labor.ny.gov/app/warn/details.asp?id=5398
https://labor.ny.gov/app/warn/details.asp?id=5393
https://labor.ny.gov/app/warn/details.asp?id=5394
https://labor.ny.gov/app/warn/details.asp?id=5396
https://labor.ny.gov/app/warn/details.asp?id=5397
https://labor.ny.gov/app/warn/details.asp?id=5391
https://labor.ny.gov/app/warn/details.asp?id=5390
https://labor.ny.gov/app/warn/details.asp?id=5389
https://labor.ny.gov/app/warn/details.asp?id=5385
https://labor.ny.gov/app/warn/details.asp?id=5387
https://labor.ny.gov/app/warn/details.asp?id=5382
https://labor.ny.gov/app/warn/details.asp?id=5383
https://labor.ny.gov/app/warn/details.asp?id=5384
https://labor.ny.gov/app/warn/details.asp?id=5381
https://labor.ny.gov/app/warn/details.asp?id=5378
https://labor.ny.gov/app/warn/details.asp?id=5379
https://labor.ny.gov/app/warn/details.asp?id=5380
https://labor.ny.gov/app/warn/details.asp?id=5375
https://labor.ny.gov/app/warn/details.asp

In [None]:
# wrap results in a dataframe
df = pd.DataFrame(results)

In [103]:
pd.options.display.max_rows = 1237
df['layoff_date'].unique()

array(['3/19/2020', '5/29/2020', '6/1/2020', '3/18/2020', '3/30/2020',
       '3/12/2020', '3/27/2020', '4/24/2020', '4/16/2020', '3/20/2020',
       '4/3/2020', '6/30/2020', '4/23/2020', '3/16/2020', '3/17/2020',
       '4/4/2020', '4/8/2020', '4/22/2020', '3/24/2020', '4/13/2020',
       '4/10/2020', 'Furloughs', '3/22/2020', '3/23/2020', 'Separation',
       '3/29/2020', '3/25/2020', 'Separations', '32', 'Layoffs',
       '7/30/2020', '4/25/2020', '4/1/2020', '7/19/2020', '6/19/2020',
       '7/3/2020', '4/20/2020', '3/31/2020', '4/14/2020', '3/28/2020',
       '72', 'The', '51', '86', '88', '37', 'Nine', '4/6/2020',
       '4/30/2020', '(21)', '(27)', 'March', '30', '18', '14', 'Seven',
       '4/2/2020', '3/21/2020', '356', '34', '330', '69', '52', '87',
       '81', '-----', '4/7/2020', '48', '3/15/2020', '3/26/2020',
       '3/10/2020', '4/12/2020', '362', '5/1/2020', '3/13/2020',
       '4/5/2020', '3/19/2020,', '7/12/2020', '4/11/2020', '6/26/2020',
       '4/1/2020,', '7/1/20

In [106]:
# output dataframe to a csv
df.to_csv('../data/warn.csv', index=False)