In [33]:
from bs4 import BeautifulSoup, SoupStrainer
from urllib.request import urlopen, urlretrieve
import re, json, os, string
import urllib
from urllib.error import URLError, HTTPError
from socket import timeout
from collections import OrderedDict
import pandas as pd

In [152]:
def save_data(data):
    directory = os.getcwd()
    files = os.listdir(directory)
    files = [i.replace(".json", "") for i in files if ".json" in files]
    title = data["title"].replace("/", "")
    if title in files:
        pass
    else:
        with open(os.path.join(directory, title+".json"), "w") as file:
            json.dump(data, file)
            print(title+" is saved")

In [136]:
#for scraping sections in table form with 2 columns

table2columns = ["Knowledge", "Skills", "Abilities", 
                 "WorkActivities", "Education", "Interests",
                 "WorkStyles", "WorkValues"
                ]

def makingTables(element, content_area):
    area = content_area.find("div", {"id":"wrapper_"+element})
    if area != None:
        element_list = []
        ths = area.findAll("th")
        rows = area.findAll("tr")
        for row in rows[1:]:
            tds = row.findAll("td")
            x = {ths[0].get_text(): tds[0].get_text().replace("\xa0", ""), 
                 ths[1].get_text(): tds[1].get_text().split(" — ")[0].replace("\xa0", ""),
                 "description" : tds[1].get_text().split("— ")[-1].replace("\n", "")
                }
            element_list.append(x)
        return(element_list)
    else:
        pass

In [137]:
#for sections in list forms with "-" joining the element and description

list2columns = ["TechnologySkills", "ToolsUsed"]

def makinglist(element, content_area):
    area = content_area.find("div", {"id":"wrapper_"+element})
    if area != None:
        element_list = []
        lis = area.findAll("li")
        for item in lis:
            sk = item.find("b").get_text()
            string = re.sub(r"\((.*?)\)", "", item.get_text())
            string = string.replace(sk, "").replace("—", "").replace("\n", "")
            string = string.split(";")
            string = [i.strip(" ") for i in string]
            x = {element: sk,
                  "details": string}
            element_list.append(x)
        return(element_list)
    else:
        pass

In [145]:
#wrapper functions to catch errors and retry scraping each JD
def scrape_page_wrapper(page):
    counter = 0
    tries = 5
    while counter < tries:
        try: 
            scrape_page(page)
            break
        except (URLError,
                HTTPError,
                timeout
               ) as e:
            print("****** ERROR on ", page)
            print("******* RETRY #", str(tries))
            tries -= 1

In [155]:
#scraping each JD
def scrape_page(page):
    page = page.replace("summary", "details")
    print(page)
    req = urllib.request.Request(page, headers={ 'User-Agent': 'Mozilla/5.0' })
    html = urllib.request.urlopen(req).read()
    soup=BeautifulSoup(html, 'html.parser')
    
    #opening the job page
    data = OrderedDict()
    #job title
    title = soup.find("span", {"class":"titleb"}).get_text()
    
    files = os.listdir(os.getcwd())
    files = [i.replace(".json", "") for i in files if i.endswith(".json")]
    if title not in files:
        pass
    else:
        return None
    
    content_area = soup.find("div", {"id":"content"})
    
    #tasks
    tasks = []
    tasks_area = soup.select("div.section_Tasks > div > table > tr")
    if len(tasks_area) > 0:
        for row in tasks_area[1:]:
            tds = row.findAll("td")
            x = {"task": tds[2].get_text().replace("\n", ""), 
                 "category": tds[1].get_text(),
                 "importance": tds[0].get_text()}
            tasks.append(x)
    else:
        return None
    
    #work context
    context = []
    context_area = soup.select("div.section_WorkContext > div > table > tr")
    if len(context_area) > 0:
        context_headers = context_area[0].findAll("th")
        context_headers = [i.get_text() for i in context_headers]
        for row in context_area[1:]:
            tds = row.findAll("td")
            subtable = tds[1].find("table")
            breakdown = []
            if subtable != None:
                subtable_rows = subtable.findAll("tr")
                for r in subtable_rows[1:]:
                    sub_tds = r.findAll("td")
                    percentage = sub_tds[0].find("b").get_text()
                    percentage_detail = sub_tds[1].get_text()
                    breakdown.append({"percentage":percentage+"%",
                                  "details":percentage_detail.replace("\xa0", "")})
            else:
                pass

            x = {context_headers[0]: tds[0].find("b").get_text(), 
                         "description": tds[0].get_text().split("— ")[-1].replace("\n", ""),
                         context_headers[1]: breakdown
                }
            context.append(x)
    else:
        pass
    
    detailedtask = soup.find("div", {"id":"wrapper_DetailedWorkActivities"})
    detailedtask_li = detailedtask.findAll("li")
    detailedtask_text = [i.get_text() for i in detailedtask_li]
    
    data["title"] = title
    #data["moc"] = moc
    #data["military_type"] = military

    for elm in table2columns:
        data[elm] = makingTables(elm, content_area)
    for elm in list2columns:
        data[elm] = makinglist(elm, content_area)

    if len(tasks) > 0:
        data["tasks"] = tasks
    if len(context) > 0:
        data["work_context"] = context
    data["detailed_work_activities"] = detailedtask_text
    
    save_data(data)

In [140]:
#from the main page with all jobs from all industries are listed
#it is in a table
main_page = "https://www.onetonline.org/find/industry?i=0&g=Go"
req = urllib.request.Request(main_page, headers={ 'User-Agent': 'Mozilla/5.0' })
html = urllib.request.urlopen(req).read()
soup=BeautifulSoup(html, 'html.parser')

rows = soup.select("td.report2ed > a ")

In [141]:
#getting links of each job
#we keep only the ones with starting with https
hrefs = [row["href"] for row in rows if row["href"].startswith("http")]

In [164]:
#if the link contains sub category job, then we click those links
for href in hrefs[638:]:
    page = href
    req = urllib.request.Request(page, headers={ 'User-Agent': 'Mozilla/5.0' })
    html = urllib.request.urlopen(req).read()
    soup=BeautifulSoup(html, 'html.parser')
    sub_cate = soup.find("div", {"class":"exclist"})
    if sub_cate != None:
        sub_occupation = soup.select("div.excitem > a")
        sub_jobs = [i["href"] for i in sub_occupation if i["href"].startswith("http")]
        for s in sub_jobs:
            scrape_page_wrapper(s)
    else:
        scrape_page_wrapper(href)            

https://www.onetonline.org/link/details/29-1066.00
https://www.onetonline.org/link/details/19-3039.01
https://www.onetonline.org/link/details/25-1066.00
25-1066.00 - Psychology Teachers, Postsecondary is saved
https://www.onetonline.org/link/details/27-3012.00
27-3012.00 - Public Address System and Other Announcers is saved
https://www.onetonline.org/link/details/11-2031.00
11-2031.00 - Public Relations and Fundraising Managers is saved
https://www.onetonline.org/link/details/27-3031.00
27-3031.00 - Public Relations Specialists is saved
https://www.onetonline.org/link/details/53-7072.00
53-7072.00 - Pump Operators, Except Wellhead Pumpers is saved
https://www.onetonline.org/link/details/13-1023.00
13-1023.00 - Purchasing Agents, Except Wholesale, Retail, and Farm Products is saved
https://www.onetonline.org/link/details/11-3061.00
11-3061.00 - Purchasing Managers is saved
https://www.onetonline.org/link/details/29-1124.00
29-1124.00 - Radiation Therapists is saved
https://www.onetonlin

11-9151.00 - Social and Community Service Managers is saved
https://www.onetonline.org/link/details/21-1093.00
21-1093.00 - Social and Human Service Assistants is saved
https://www.onetonline.org/link/details/19-4061.00
19-4061.00 - Social Science Research Assistants is saved
https://www.onetonline.org/link/details/25-1069.00
https://www.onetonline.org/link/details/19-3099.01
19-3099.01 - Transportation Planners is saved
https://www.onetonline.org/link/details/25-1113.00
25-1113.00 - Social Work Teachers, Postsecondary is saved
https://www.onetonline.org/link/details/21-1029.00
https://www.onetonline.org/link/details/19-3041.00
19-3041.00 - Sociologists is saved
https://www.onetonline.org/link/details/25-1067.00
25-1067.00 - Sociology Teachers, Postsecondary is saved
https://www.onetonline.org/link/details/15-1132.00
15-1132.00 - Software Developers, Applications is saved
https://www.onetonline.org/link/details/15-1133.00
15-1133.00 - Software Developers, Systems Software is saved
http

https://www.onetonline.org/link/details/41-3041.00
https://www.onetonline.org/link/details/39-7012.00
39-7012.00 - Travel Guides is saved
https://www.onetonline.org/link/details/37-3013.00
37-3013.00 - Tree Trimmers and Pruners is saved
https://www.onetonline.org/link/details/27-2023.00
27-2023.00 - Umpires, Referees, and Other Sports Officials is saved
https://www.onetonline.org/link/details/51-6093.00
51-6093.00 - Upholsterers is saved
https://www.onetonline.org/link/details/19-3051.00
19-3051.00 - Urban and Regional Planners is saved
https://www.onetonline.org/link/details/39-3031.00
39-3031.00 - Ushers, Lobby Attendants, and Ticket Takers is saved
https://www.onetonline.org/link/details/29-1131.00
29-1131.00 - Veterinarians is saved
https://www.onetonline.org/link/details/31-9096.00
31-9096.00 - Veterinary Assistants and Laboratory Animal Caretakers is saved
https://www.onetonline.org/link/details/29-2056.00
29-2056.00 - Veterinary Technologists and Technicians is saved
https://www

In [2]:
## scrapping from the MOS
militaries = ["F", #air force
              "A", #army
              "C", #coast guard
              "M", #marine corp
              "N"] #navy

militaries_dic = {"F" : "Air_Force",
                  "A" : "Army",
                  "C" : "Coast_Guard",
                  "M" : "Marine_Corp",
                  "N" : "Navy"
                }

search_terms = [i for i in range(1, 10)]+list(string.ascii_lowercase)

In [4]:
total_data = []
for m in militaries:
    for s in search_terms:
        mos_link = "https://www.onetonline.org/crosswalk/MOC?b={}&s={}&g=Go".format(m,s)
        print("\n", mos_link)
        req = urllib.request.Request(mos_link, headers={ 'User-Agent': 'Mozilla/5.0' })
        html = urllib.request.urlopen(req).read()
        soup = BeautifulSoup(html, 'html.parser')
        
        print("finding table")
        table = soup.find("table", {"class":"occ"})
        if table == None:
            print("****",mos_link, " no search result")
            pass
        else:
            rows = soup.select("table.occ > tr")

            for row in rows:
                moc = row.find("td", {"class":"occcodebold"}).get_text()
                moc_desc = row.find("td", {"class":"occtitlebold"}).get_text()
                subtable = row.findAll("a")
                hrefs = [tr["href"] for tr in subtable if tr["href"].startswith("http")]
                links = [i for i in hrefs if i.startswith("http")]

                data = OrderedDict()
                data["moc"] = moc
                data["military_kind"] = militaries_dic[m]
                data["moc_desc"] = moc_desc.split("\n")[0]
                data["jobs"] = links

                total_data.append(data)

with open(str(moc_onet)+".json", "w") as file:
    json.dump(total_data, file)


 https://www.onetonline.org/crosswalk/MOC?b=F&s=1&g=Go
finding table

 https://www.onetonline.org/crosswalk/MOC?b=F&s=2&g=Go
finding table

 https://www.onetonline.org/crosswalk/MOC?b=F&s=3&g=Go
finding table

 https://www.onetonline.org/crosswalk/MOC?b=F&s=4&g=Go
finding table

 https://www.onetonline.org/crosswalk/MOC?b=F&s=5&g=Go
finding table

 https://www.onetonline.org/crosswalk/MOC?b=F&s=6&g=Go
finding table

 https://www.onetonline.org/crosswalk/MOC?b=F&s=7&g=Go
finding table

 https://www.onetonline.org/crosswalk/MOC?b=F&s=8&g=Go
finding table

 https://www.onetonline.org/crosswalk/MOC?b=F&s=9&g=Go
finding table

 https://www.onetonline.org/crosswalk/MOC?b=F&s=a&g=Go
finding table

 https://www.onetonline.org/crosswalk/MOC?b=F&s=b&g=Go
finding table

 https://www.onetonline.org/crosswalk/MOC?b=F&s=c&g=Go
finding table

 https://www.onetonline.org/crosswalk/MOC?b=F&s=d&g=Go
finding table
**** https://www.onetonline.org/crosswalk/MOC?b=F&s=d&g=Go  no search result

 https://www

finding table

 https://www.onetonline.org/crosswalk/MOC?b=C&s=b&g=Go
finding table

 https://www.onetonline.org/crosswalk/MOC?b=C&s=c&g=Go
finding table

 https://www.onetonline.org/crosswalk/MOC?b=C&s=d&g=Go
finding table

 https://www.onetonline.org/crosswalk/MOC?b=C&s=e&g=Go
finding table

 https://www.onetonline.org/crosswalk/MOC?b=C&s=f&g=Go
finding table

 https://www.onetonline.org/crosswalk/MOC?b=C&s=g&g=Go
finding table

 https://www.onetonline.org/crosswalk/MOC?b=C&s=h&g=Go
finding table

 https://www.onetonline.org/crosswalk/MOC?b=C&s=i&g=Go
finding table

 https://www.onetonline.org/crosswalk/MOC?b=C&s=j&g=Go
finding table
**** https://www.onetonline.org/crosswalk/MOC?b=C&s=j&g=Go  no search result

 https://www.onetonline.org/crosswalk/MOC?b=C&s=k&g=Go
finding table
**** https://www.onetonline.org/crosswalk/MOC?b=C&s=k&g=Go  no search result

 https://www.onetonline.org/crosswalk/MOC?b=C&s=l&g=Go
finding table
**** https://www.onetonline.org/crosswalk/MOC?b=C&s=l&g=Go  no

finding table

 https://www.onetonline.org/crosswalk/MOC?b=N&s=y&g=Go
finding table

 https://www.onetonline.org/crosswalk/MOC?b=N&s=z&g=Go
finding table
**** https://www.onetonline.org/crosswalk/MOC?b=N&s=z&g=Go  no search result


NameError: name 'moc_onet' is not defined

In [7]:
jobs = []
for total in total_data:
    for job in total["jobs"]:
        jobs.append(job)

jobs = list(set(jobs))      
    
    

In [30]:
new_list = []
for job in jobs:
    print(job)
    temp = OrderedDict()
    temp["job"] = job
    temp["moc"] = []
    for total in total_data:
        if job in total["jobs"]:
            temp["moc"] = temp["moc"]+[[total["moc"], 
                                       total["military_kind"], 
                                       total["moc_desc"]]]
        else:
            pass
    new_list.append(temp)

https://www.onetonline.org/link/summary/11-3071.01
https://www.onetonline.org/link/summary/13-1081.02
https://www.onetonline.org/link/summary/29-2011.02
https://www.onetonline.org/link/summary/51-9061.00
https://www.onetonline.org/link/summary/53-7199.00
https://www.onetonline.org/link/summary/53-4012.00
https://www.onetonline.org/link/summary/29-1065.00
https://www.onetonline.org/link/summary/53-6061.00
https://www.onetonline.org/link/summary/47-2132.00
https://www.onetonline.org/link/summary/15-1199.09
https://www.onetonline.org/link/summary/51-8099.04
https://www.onetonline.org/link/summary/49-9044.00
https://www.onetonline.org/link/summary/13-1151.00
https://www.onetonline.org/link/summary/53-6051.07
https://www.onetonline.org/link/summary/17-3023.01
https://www.onetonline.org/link/summary/43-3021.02
https://www.onetonline.org/link/summary/11-3051.00
https://www.onetonline.org/link/summary/29-1141.00
https://www.onetonline.org/link/summary/21-1093.00
https://www.onetonline.org/link

https://www.onetonline.org/link/summary/53-5031.00
https://www.onetonline.org/link/summary/29-1011.00
https://www.onetonline.org/link/summary/47-2073.00
https://www.onetonline.org/link/summary/25-9099.00
https://www.onetonline.org/link/summary/29-1141.01
https://www.onetonline.org/link/summary/15-2091.00
https://www.onetonline.org/link/summary/47-2031.01
https://www.onetonline.org/link/summary/43-4031.02
https://www.onetonline.org/link/summary/15-1199.04
https://www.onetonline.org/link/summary/51-9193.00
https://www.onetonline.org/link/summary/25-9041.00
https://www.onetonline.org/link/summary/17-2072.00
https://www.onetonline.org/link/summary/19-4091.00
https://www.onetonline.org/link/summary/31-2012.00
https://www.onetonline.org/link/summary/51-4193.00
https://www.onetonline.org/link/summary/13-1199.05
https://www.onetonline.org/link/summary/47-4061.00
https://www.onetonline.org/link/summary/51-2023.00
https://www.onetonline.org/link/summary/37-1012.00
https://www.onetonline.org/link

https://www.onetonline.org/link/summary/33-3051.03
https://www.onetonline.org/link/summary/27-2042.00
https://www.onetonline.org/link/summary/51-4011.00
https://www.onetonline.org/link/summary/11-9033.00
https://www.onetonline.org/link/summary/43-4199.00
https://www.onetonline.org/link/summary/49-9031.00
https://www.onetonline.org/link/summary/21-2011.00
https://www.onetonline.org/link/summary/53-3032.00
https://www.onetonline.org/link/summary/27-2012.04
https://www.onetonline.org/link/summary/29-2099.00
https://www.onetonline.org/link/summary/29-2099.05
https://www.onetonline.org/link/summary/45-3011.00
https://www.onetonline.org/link/summary/43-4051.00
https://www.onetonline.org/link/summary/51-9198.00
https://www.onetonline.org/link/summary/27-3091.00
https://www.onetonline.org/link/summary/51-9011.00
https://www.onetonline.org/link/summary/11-2011.00
https://www.onetonline.org/link/summary/11-3021.00
https://www.onetonline.org/link/summary/35-2021.00
https://www.onetonline.org/link

https://www.onetonline.org/link/summary/35-2013.00
https://www.onetonline.org/link/summary/35-2014.00
https://www.onetonline.org/link/summary/25-1051.00
https://www.onetonline.org/link/summary/19-3094.00
https://www.onetonline.org/link/summary/53-3021.00
https://www.onetonline.org/link/summary/29-9011.00
https://www.onetonline.org/link/summary/11-9041.00
https://www.onetonline.org/link/summary/27-4013.00
https://www.onetonline.org/link/summary/53-5011.00
https://www.onetonline.org/link/summary/21-2099.00
https://www.onetonline.org/link/summary/11-9081.00
https://www.onetonline.org/link/summary/51-6011.00
https://www.onetonline.org/link/summary/45-2021.00
https://www.onetonline.org/link/summary/17-2199.08
https://www.onetonline.org/link/summary/53-5022.00
https://www.onetonline.org/link/summary/39-9041.00
https://www.onetonline.org/link/summary/51-2021.00
https://www.onetonline.org/link/summary/17-3029.08
https://www.onetonline.org/link/summary/43-4051.03
https://www.onetonline.org/link

https://www.onetonline.org/link/summary/23-1022.00
https://www.onetonline.org/link/summary/19-3022.00
https://www.onetonline.org/link/summary/17-3031.01
https://www.onetonline.org/link/summary/33-9031.00
https://www.onetonline.org/link/summary/25-2022.00
https://www.onetonline.org/link/summary/49-9011.00
https://www.onetonline.org/link/summary/11-9151.00
https://www.onetonline.org/link/summary/51-8031.00
https://www.onetonline.org/link/summary/49-9043.00
https://www.onetonline.org/link/summary/49-2092.00
https://www.onetonline.org/link/summary/29-2057.00
https://www.onetonline.org/link/summary/15-1111.00
https://www.onetonline.org/link/summary/45-1011.07
https://www.onetonline.org/link/summary/51-5111.00
https://www.onetonline.org/link/summary/21-1091.00
https://www.onetonline.org/link/summary/53-3011.00
https://www.onetonline.org/link/summary/33-3021.03
https://www.onetonline.org/link/summary/51-9031.00
https://www.onetonline.org/link/summary/29-2041.00
https://www.onetonline.org/link

In [40]:
moc = []
military_kind = []
moc_desc = []
jobs = []

for total in new_list:
    moc.append(total["moc"])
    jobs.append(total["job"])
    
df = pd.DataFrame({"moc":moc, "jobs":jobs})



In [41]:
df.head(10)

Unnamed: 0,moc,jobs
0,"[[13M1M, Air_Force, Airfield Operations, RPA (...",https://www.onetonline.org/link/summary/11-307...
1,"[[20C0, Air_Force, Logistics Commander (Air Fo...",https://www.onetonline.org/link/summary/13-108...
2,"[[4T011, Air_Force, Medical Laboratory Helper ...",https://www.onetonline.org/link/summary/29-201...
3,"[[1A011, Air_Force, In-Flight Refueling Specia...",https://www.onetonline.org/link/summary/51-906...
4,"[[57H, Army, Cargo Specialist (Army - Enlisted...",https://www.onetonline.org/link/summary/53-719...
5,"[[88U, Army, Railway Specialist (USAR only) (A...",https://www.onetonline.org/link/summary/53-401...
6,"[[44F1A, Air_Force, Family Physician, Sports M...",https://www.onetonline.org/link/summary/29-106...
7,"[[1A611, Air_Force, Flight Attendant Helper (A...",https://www.onetonline.org/link/summary/53-606...
8,"[[MK, Coast_Guard, Machinery Technician (Coast...",https://www.onetonline.org/link/summary/47-213...
9,"[[2R011, Air_Force, Maintenance Management Ana...",https://www.onetonline.org/link/summary/15-119...


In [51]:
df2 = pd.DataFrame(df.moc.tolist(), index=df.jobs).stack().reset_index(level=1, drop=True)

In [68]:
df3 = pd.DataFrame(df2)
df3["moc"] = df3.iloc[:, 0].apply(lambda x: x[0])
df3["military_kind"] = df3.iloc[:, 0].apply(lambda x: x[1])
df3["moc_desc"] = df3.iloc[:, 0].apply(lambda x: x[2])

In [78]:
df3["job"] = df3.index

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [80]:
df3 = df3[["job", "moc", "moc_desc", "military_kind", 0]]

In [105]:
onet = os.listdir(os.path.join(os.getcwd(), "ONET"))

In [109]:
onet2 = ["-".join(i.split("-")[0:2]).strip() for i in onet]

In [119]:
jobcodes = [i[-1] for i in df3["job"].str.split("/")]

In [134]:
counter = 0
jobcodes = list(set(jobcodes))
for jobcode in jobcodes:
    if jobcode in onet2:
        pass
    else:
        print(jobcode)
        counter+=1
print(counter)

53-1021.01
19-3011.01
29-2011.02
55-3017.00
55-1016.00
25-9031.01
43-5081.00
33-2021.00
43-5011.01
31-9099.00
55-1012.00
51-9199.00
17-2081.01
11-9199.00
45-1011.00
33-3021.00
55-3012.00
55-3014.00
13-2099.00
55-2013.00
29-1122.01
19-3039.00
49-9099.00
43-4051.03
17-2051.01
49-3023.00
51-4121.00
55-3011.00
45-3011.00
17-3029.00
43-4031.00
15-1143.01
29-1125.02
11-9121.02
27-2042.00
17-2199.00
13-1199.00
13-2011.00
25-3099.00
43-4041.00
19-4011.00
55-2012.00
51-8099.00
15-1121.01
55-1011.00
11-3051.04
55-3016.00
15-1199.00
11-1011.00
11-3051.02
17-1022.01
29-1141.04
29-9099.00
29-1141.03
13-1041.00
19-2099.00
11-3051.06
55-3019.00
19-1029.00
15-2041.02
55-3018.00
49-2021.01
19-4051.00
33-3051.00
11-3031.00
55-1019.00
17-2072.01
33-9099.00
13-1081.01
39-9011.01
17-3024.01
55-3013.00
29-1141.01
19-1020.01
11-3051.01
17-2112.01
55-1014.00
17-3027.01
19-3099.00
53-5021.00
55-2011.00
43-3021.00
41-3099.00
17-3031.00
13-1081.02
29-1141.02
27-2041.00
47-1011.03
47-2031.00
19-4099.00
11-3071.00

In [56]:
        mos_link = "https://www.onetonline.org/crosswalk/MOC?b=F&s=1&g=Go"
        print(mos_link)
        #req = urllib.request.Request(mos_link, headers={ 'User-Agent': 'Mozilla/5.0' })
        html = urllib.request.urlopen(mos_link).read()
        soup = BeautifulSoup(html, 'html.parser')

        table = soup.select("table.occ > tr")
        
        if table == None:
            print(mos_link, " no search result")
        else:
            pass
        
        rows = soup.select("table.occ > tr")
        for row in rows[10:20]:
            print("iterating through rows")
            moc = row.find("td", {"class":"occcodebold"}).get_text()
            moc_desc = row.find("td", {"class":"occtitlebold"}).get_text()
            print(len(subtable))
            #hrefs = [tr["href"] for tr in subtable if tr["href"].startswith("http")]
            #links = [i for i in hrefs if i.startswith("http")]
            #print(link)



https://www.onetonline.org/crosswalk/MOC?b=F&s=1&g=Go
iterating through rows
1
iterating through rows
1
iterating through rows
1
iterating through rows
1
iterating through rows
1


In [None]:
len(table.findAll("td", {"class":"occcodebold"}))

In [37]:
len(table.findAll("tr"))

10161

In [60]:
        for row in rows[10:20]:
            print("iterating through rows")
            moc = row.find("td", {"class":"occcodebold"}).get_text()
            moc_desc = row.find("td", {"class":"occtitlebold"}).get_text()
            subtable = row.findAll("a")
            t = [i["href"] for i in subtable if i["href"].startswith("https")]
            print(len(t))

iterating through rows
1
iterating through rows
4
iterating through rows
2
iterating through rows
2
iterating through rows
2
iterating through rows
2
iterating through rows
2
iterating through rows
2
iterating through rows
2
iterating through rows
2
