In [1]:
import re

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm_notebook
from tqdm.auto import tqdm

## NAICS Scraping

In [2]:
# Download the page that lists the NAICS codes.
url = 'https://classcodes.com/naics-code-list/'
# requests applies no timeout by default; set one so a stalled server cannot
# hang the notebook indefinitely.
r = requests.get(url, timeout=30)

# Stop here on an HTTP error status instead of parsing an error page.
r.raise_for_status()

html = r.text

In [3]:
# Parse the downloaded HTML and collect every <p> element; the cells below
# read the text of these paragraphs.
soup = BeautifulSoup(html, 'html.parser')
p = soup.find_all('p')

The NAICS codes and titles are contained in the text of the `<p>` elements:

In [4]:
# Show the text of paragraphs 17-26 to see how each entry is formatted.
[paragraph.get_text() for paragraph in p[17:27]]

['1) NAICS 111110 Soybean Farming',
 '2) NAICS 111120 Oilseed (except Soybean) Farming',
 '3) NAICS 111130 Dry Pea and Bean Farming',
 '4) NAICS 111140 Wheat Farming',
 '5) NAICS 111150 Corn Farming',
 '6) NAICS 111160 Rice Farming',
 '7) NAICS 111191 Oilseed and Grain Combination Farming',
 '8) NAICS 111199 All Other Grain Farming',
 '9) NAICS 111211 Potato Farming',
 '10) NAICS 111219 Other Vegetable (except Potato) and Melon Farming']

We can use regular expressions to extract the contents, and store them in naic_df:

In [5]:
# Entries look like "1) NAICS 111110 Soybean Farming": the word after "NAICS"
# is the six-digit code and everything after it is the title.
code_re = re.compile(r'NAICS \d{6} .*')

# Collect one record per matching paragraph and build the frame once at the
# end. Growing a DataFrame row by row is quadratic, and DataFrame.append was
# removed in pandas 2.0.
naic_records = []
for item in tqdm(p):
    match = code_re.search(item.get_text())
    if match is None:
        # This paragraph carries no NAICS entry. Skip it explicitly rather
        # than hiding every possible error behind a bare `except`.
        continue
    _, naic_code, *title_words = match.group().split()
    # Re-join on single spaces and use a plain apostrophe in place of the
    # typographic one.
    naic_title = ' '.join(title_words).replace('’', '\'')
    naic_records.append({'NAIC Code': int(naic_code), 'NAIC Title': naic_title})

# Passing `columns` keeps the expected layout even when nothing matched.
naic_df = pd.DataFrame(naic_records, columns=['NAIC Code', 'NAIC Title'])

HBox(children=(IntProgress(value=0, max=1117), HTML(value='')))




In [13]:
# Preview the first rows of the scraped NAICS table.
naic_df.head()

Unnamed: 0,NAIC Code,NAIC Title
0,111110,Soybean Farming
1,111120,Oilseed (except Soybean) Farming
2,111130,Dry Pea and Bean Farming
3,111140,Wheat Farming
4,111150,Corn Farming


Now we can compare the data scraped from the web with the file downloaded directly:

In [6]:
# Load the directly downloaded reference file and give it the same column
# labels as the scraped frame so the two can be compared cell by cell.
naic_dl_12 = pd.read_csv('dataset/6-digit_2012_Codes.csv')
naic_dl_12.columns = naic_df.columns

# Clean the titles the same way as the scraped ones: trim surrounding
# whitespace and replace the typographic apostrophe with a plain one.
naic_dl_12.iloc[:, 1] = [
    title.strip().replace('’', '\'') for title in naic_dl_12.iloc[:, 1]
]

After this simple clean-up, the scraped data turn out to be identical to the data in `6-digit_2012_Codes.csv`:

In [7]:
# Element-wise comparison of the scraped table with the reference file;
# the result is True only if every pair of cells matches.
np.all(naic_df.values==naic_dl_12.values)

True

In [8]:
# Persist the scraped NAICS table to disk as a pickle file.
naic_df.to_pickle('dataset/naic.pkl')

## SOC Scraping

In [9]:
# Download the BLS page that lists the SOC occupation structure.
url = 'https://www.bls.gov/oes/current/oes_stru.htm'
# requests applies no timeout by default; set one so a stalled server cannot
# hang the notebook indefinitely.
r = requests.get(url, timeout=30)

# Stop here on an HTTP error status instead of parsing an error page.
r.raise_for_status()

html = r.text

In [10]:
# Parse the downloaded HTML and collect every <li> element; the extraction
# loop below reads the text of these list items.
soup = BeautifulSoup(html, 'html.parser')
li = soup.find_all('li')

The SOC codes and titles are contained in the text of the `<li>` elements.

Again, we can use regular expressions to extract the contents, and store them in soc_df:

In [11]:
# A SOC entry is a code of the form "NN-NNNN" followed by its title, i.e. the
# run of non-digit characters after the code.
code_re = re.compile(r'\d{2}-\d{4}\D+')

# Map each code to the first title seen for it. A dict gives constant-time
# duplicate checks and keeps insertion order, replacing the original scan of
# the whole code column for every candidate.
soc_titles = {}
for a in tqdm(li):
    for item in code_re.findall(a.get_text()):
        soc_code, *title_words = item.split()
        # setdefault keeps the first occurrence and ignores later duplicates.
        soc_titles.setdefault(soc_code, ' '.join(title_words))

# Build the frame in one step: DataFrame.append was removed in pandas 2.0 and
# appending row by row is quadratic. Passing `columns` keeps the expected
# layout even when nothing matched.
soc_df = pd.DataFrame(list(soc_titles.items()), columns=['SOC Code', 'SOC Title'])

HBox(children=(IntProgress(value=0, max=1668), HTML(value='')))




In [15]:
# Preview the first rows of the scraped SOC table.
soc_df.head()

Unnamed: 0,SOC Code,SOC Title
0,00-0000,All Occupations
1,11-0000,Management Occupations
2,13-0000,Business and Financial Operations Occupations
3,15-0000,Computer and Mathematical Occupations
4,17-0000,Architecture and Engineering Occupations


In [12]:
# Persist the scraped SOC table to disk as a pickle file.
soc_df.to_pickle('dataset/soc.pkl')