In [88]:
import os
import re
from datetime import datetime
from urllib.parse import urljoin

import pandas as pd
import path
import requests
from bs4 import BeautifulSoup

In [106]:

def obterDadosBasicos(job_html):
    """Extract the basic fields of one job card from a listing page.

    Args:
        job_html: Parsed element for a single job card. It must support
            ``find(name, class_=...)`` and ``select(name)`` and its children
            must expose ``.text`` (the caller passes each ``div.-job`` element).

    Returns:
        dict with the keys ``title``, ``company``, ``location`` and ``posted``
        (each ``None`` when the matching element is missing) plus
        ``request_date`` (``datetime.now()`` at extraction time).
        ``perks`` -- and ``salary`` when a salary span is present -- are only
        set when the card has a ``-perks`` div; ``tags`` is only set when the
        card has a ``-tags`` div.
    """
    # job title -- a card without an <h2> yields None instead of raising
    heading = job_html.find('h2')
    title_html = heading.find('a', class_='s-link') if heading is not None else None
    job = {'title': _text_or_none(title_html)}

    # job company and location: first and second <span> of the -company div.
    # Missing spans give None rather than an unpacking error.
    company_html = job_html.find('div', class_='-company')
    spans = list(company_html.select('span')) if company_html is not None else []
    job['company'] = spans[0].text if len(spans) > 0 else None
    job['location'] = spans[1].text if len(spans) > 1 else None

    # job posted
    title_div = job_html.find('div', class_='-title')
    posted_html = title_div.find('span', class_='pt2') if title_div is not None else None
    job['posted'] = _text_or_none(posted_html)

    # request date
    job['request_date'] = datetime.now()

    # job salary and perks
    perks_html = job_html.find('div', class_='-perks')
    if perks_html is not None:
        perks = []
        for span in perks_html.select('span'):
            if _has_salary_class(span):
                job['salary'] = span.text.strip()
            else:
                perks.append(span.text.strip())
        job['perks'] = ', '.join(perks)

    # job tags
    tags_html = job_html.find('div', class_='-tags')
    if tags_html is not None:
        job['tags'] = ', '.join(a.text for a in tags_html.select('a'))

    return job


def _text_or_none(node):
    """Return ``node.text``, or ``None`` when the node was not found."""
    return node.text if node is not None else None


def _has_salary_class(span):
    """Tell whether one of the span's CSS classes contains ``-salary``.

    Looks only at the ``class`` attribute, so a perk whose visible text happens
    to contain "-salary" is not mistaken for the salary span.
    """
    classes = span.get('class') or []
    if isinstance(classes, str):
        # Some parsers return the class attribute as one space-separated string.
        classes = classes.split()
    return any('-salary' in css_class for css_class in classes)


def get_detail(details, description):
    """Return the text of the element that follows a labelled ``<span>``.

    Args:
        details: Parsed container to search, or ``None`` when the page had no
            such container.
        description: Exact label text of the span to look for (the callers
            pass labels with a trailing space, e.g. ``'Job type: '``).

    Returns:
        The text of the element right after the label span, or ``None`` when
        the container, the label or the following element is missing.
    """
    # Explicit None checks replace the former bare ``except:``, which also
    # swallowed KeyboardInterrupt and any unrelated programming error.
    if details is None:
        return None

    label = details.find('span', text=description)
    if label is None:
        return None

    value = label.find_next()
    return value.getText() if value is not None else None
            
            
def obterDetalhes(job_html, job):
    """Add the job's link and its detail-page fields to ``job``.

    Reads the job's href from the card, downloads the detail page through
    ``fazer_requisicao`` and fills in ``link``, ``job-type``,
    ``experience-level``, ``role``, ``industry``, ``company-size``,
    ``company-type`` and ``description``.

    Args:
        job_html: Parsed element for a single job card; it must contain an
            ``h2.fs-subheading`` with an ``<a href=...>`` inside.
        job: dict to update in place.

    Returns:
        The same ``job`` dict. Detail fields that cannot be found are ``None``.
    """
    link = job_html.find('h2', class_='fs-subheading').select_one('a')
    job['link'] = link['href']

    # The href already starts with '/', so plain string formatting produced
    # 'https://stackoverflow.com//jobs/...'; urljoin builds the proper URL.
    url = urljoin('https://stackoverflow.com/', job['link'])
    text = fazer_requisicao(url)
    soup = BeautifulSoup(text, 'lxml')

    details = soup.find('div', class_='job-details--about')
    job['job-type'] = get_detail(details, 'Job type: ')
    job['experience-level'] = get_detail(details, 'Experience level: ')
    job['role'] = get_detail(details, 'Role: ')
    job['industry'] = get_detail(details, 'Industry: ')
    job['company-size'] = get_detail(details, 'Company size: ')
    job['company-type'] = get_detail(details, 'Company type: ')

    # The description is the first <div> after the 'Job description' heading;
    # either of the two may be missing on a given page.
    description = soup.find('h2', text='Job description')
    description_body = description.find_next('div') if description is not None else None
    job['description'] = description_body.get_text() if description_body is not None else None

    return job


def fazer_requisicao(url, timeout=30):
    """Return the body of ``url``, served from a file cache when available.

    The cache file lives under ``cache/`` and is named after the URL with
    every ``/`` removed. On a cache miss the page is downloaded, written to
    the cache and returned.

    Args:
        url: Address to fetch.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the whole scrape.

    Returns:
        The response body as text.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    filename_cache = 'cache/%s' % url.replace('/', '')

    if os.path.exists(filename_cache):
        # Explicit encoding keeps reads/writes independent of the OS locale.
        with open(filename_cache, 'r', encoding='utf-8') as f:
            return f.read()

    res = requests.get(url, timeout=timeout)
    res.raise_for_status()

    # The cache directory may not exist yet on a fresh checkout.
    os.makedirs(os.path.dirname(filename_cache), exist_ok=True)
    with open(filename_cache, 'w', encoding='utf-8') as f:
        f.write(res.text)

    return res.text
    


def collect_jobs_stackoverflow(first_page=1, last_page=42):
    """Scrape Stack Overflow job listings, one dict per job.

    Walks the listing pages ``first_page..last_page`` (inclusive). For every
    job card it collects the basic fields and then the detail-page fields.

    Args:
        first_page: Number of the first listing page to fetch.
        last_page: Number of the last listing page to fetch (inclusive).

    Returns:
        List of job dicts collected so far. On the first error the error and
        the page number are printed and the scrape stops; jobs gathered before
        the failure are still returned.
    """
    jobs = []

    for pag in range(first_page, last_page + 1):
        try:
            url = 'https://stackoverflow.com/jobs?med=site-ui&ref=jobs-tab&sort=i&pg=%d' % pag
            text = fazer_requisicao(url)
            soup = BeautifulSoup(text, 'lxml')

            # Jobs list html
            results_html = soup.find('div', class_='listResults')
            if results_html is None:
                # Clearer than the AttributeError on None this used to produce.
                raise ValueError("no 'listResults' container found in the page")
            jobs_html = results_html.find_all('div', class_='-job')

            for job_html in jobs_html:
                job = obterDadosBasicos(job_html)
                job = obterDetalhes(job_html, job)

                jobs.append(job)

        except Exception as e:
            # Best effort: report where the scrape stopped and keep what we have.
            print(e, 'Página: ', pag)
            break

    return jobs

In [116]:
# Scrape the listing pages (and each job's detail page) into a list of job dicts.
jobs = collect_jobs_stackoverflow()

In [109]:
#jobs

In [117]:
# Build a DataFrame from the list of collected job dicts.
df = pd.DataFrame(jobs)

In [118]:
# Show the (rows, columns) shape of the collected data.
df.shape

(1050, 16)

In [119]:
# Preview the first 10 rows.
df.head(10)

Unnamed: 0,company,company-size,company-type,description,experience-level,industry,job-type,link,location,perks,posted,request_date,role,salary,tags,title
0,Etiometry Inc.\n,11-50 people,Private,\nWe are currently seeking a Data Scientist to...,Mid-Level,"Data & Analytics, Healthcare, Medical Software",Full-time,/jobs/199604/data-scientist-etiometry-inc,"\n - \nBoston, MA",Paid relocation,3h ago,2019-01-07 22:51:19.033107,Data Scientist,,"python, medical, data-science",Data Scientist
1,The Real Real\n,1k-5k people,Private,\nThe RealReal is leading the way in authentic...,"Mid-Level, Senior, Lead","eCommerce, Fashion, Retail",Full-time,/jobs/199609/senior-platform-developer-backend...,"\n - \nSan Francisco, CA",,< 1h ago,2019-01-07 22:51:19.064124,Backend Developer,,"ruby-on-rails, elixir",Senior Platform Developer (Backend Developer)
2,Pear Therapeutics\n,51-200 people,Private,"\nAbout Pear Therapeutics \nAt Pear, our missi...","Mid-Level, Senior",Biotechnology,Full-time,/jobs/204842/senior-platform-engineer-pear-the...,"\n - \nSan Francisco, CA",,< 1h ago,2019-01-07 22:51:19.095266,Full Stack Developer,,"node.js, mysql, rest, ecmascript-6, javascript",Senior Platform Engineer
3,Integrated Data Services (IDS)\n,51-200 people,Private,"\nSenior Java Developer\nLos Angeles, CA; Wash...",Senior,"Financial Technology, Government, Software Dev...",Full-time,/jobs/157266/senior-java-developer-integrated-...,"\n - \nLos Angeles, CA",Remote,< 1h ago,2019-01-07 22:51:19.125580,Full Stack Developer,,"java-ee, spring, oracle, weblogic, jboss",Senior Java Developer
4,Integrated Data Services (IDS)\n,51-200 people,Private,"\nSenior Oracle PLSQL Developer\nLos Angeles, ...",Senior,"Financial Technology, Government, Software Dev...",Full-time,/jobs/141988/senior-oracle-plsql-developer-int...,"\n - \nLos Angeles, CA",Remote,< 1h ago,2019-01-07 22:51:19.160225,Database Administrator,,"plsql, oracle, sql, rdbms, database, sysadmin",Senior Oracle PLSQL Developer
5,Integrated Data Services (IDS)\n,51-200 people,Private,"\nSenior UI Developer\nLos Angeles, CA; Washin...",Senior,"Financial Technology, Government, Software Dev...",Full-time,/jobs/157265/senior-u-i-developer-integrated-d...,"\n - \nLos Angeles, CA",Remote,< 1h ago,2019-01-07 22:51:19.193965,"Frontend Developer, Designer",,"javascript, angularjs, html, ui-design, angular",Senior U/I Developer
6,Alegion\n,11-50 people,VC Funded,\nTHE OPPORTUNITY\nThe Applied Machine Learnin...,"Mid-Level, Senior","Artificial Intelligence, Computer Vision, Mach...",Full-time,/jobs/217831/applied-machine-learning-engineer...,"\n - \nAustin, TX",Paid relocation,< 1h ago,2019-01-07 22:51:19.229565,Full Stack Developer,Equity,"machine-learning, computer-vision, python, jav...",Applied Machine Learning Engineer
7,Hays\n,5k-10k people,Public,\nHays Specialist Recruitment is working in pa...,Mid-Level,"digital recruitment, IT Recruitment, Recruiting",Full-time,/jobs/173557/consulting-level-data-scientist-hays,"\n - \nRaleigh, NC",,1h ago,2019-01-07 22:51:19.258602,Data Scientist,,"r, python, spark, sql, git",Consulting Level Data Scientist
8,The Quantium Group\n,501-1k people,Private,\nFor over 15 years Quantium have combined the...,Lead,"Big Data, Data & Analytics, Data Science",Full-time,/jobs/204837/tech-focused-data-scientist-r-d-p...,"\n - \nSydney, Australia",Visa sponsor,2h ago,2019-01-07 22:51:19.290449,Data Scientist,,"scala, hadoop, go, r, apache-spark",Tech focused Data Scientist – R&D Platforms An...
9,Hays plc\n,5k-10k people,Public,\nHays have teamed up with NAB (National Austr...,"Senior, Lead","digital recruitment, IT Recruitment, Recruiting",Full-time,/jobs/201842/react-native-engineer-senior-lead...,"\n - \nSydney, Australia",,2h ago,2019-01-07 22:51:19.322576,Mobile Developer,A$10k - 160k,"react-native, reactjs, mobile, javascript, java","React Native Engineer, Senior / Lead"


In [120]:
# to_csv does not create missing directories, so make sure 'data/' exists
# before writing the collected jobs (without the index column).
os.makedirs('data', exist_ok=True)
df.to_csv('data/Stack_Overflow_Jobs.csv', index=False)