In [2]:
import requests
import requests_cache
import numpy as np
import pandas as pd
import lxml.html as lxl
import nltk
import re
from nltk.corpus import stopwords
from datetime import datetime

In [3]:
# Install a requests_cache cache named 'demo_cache'. Per the requests_cache docs this
# patches `requests` globally, so repeated requests can be answered from the cache
# instead of hitting the site again on every re-run.
requests_cache.install_cache('demo_cache')

In [16]:
def urls_scraping(base_url, search_term, pages, location=''):
    """Collect job-posting URLs from the paginated search results.

    Requests result pages 1..``pages`` of ``base_url`` and gathers the ``href``
    of every link found under a ``div`` with class ``job-title``, prefixed with
    ``https://www.cybercoders.com``.

    Parameters
    ----------
    base_url : str
        Search endpoint to query.
    search_term : str
        Sent as the ``searchterms`` query parameter.
    pages : int
        Number of result pages to request, starting at page 1.
    location : str, optional
        Sent as the ``searchlocation`` query parameter.

    Returns
    -------
    list of str
        Posting URLs in the order they were found. Collection stops at the
        first page whose request fails; URLs gathered before that point are
        still returned.
    """
    urls = []

    # `range` (not `xrange`) keeps the function runnable on Python 2 and 3.
    for page in range(pages):
        try:
            response = requests.get(base_url, params={'searchterms': search_term,
                                                      'searchlocation': location,
                                                      'page': page + 1})
            response.raise_for_status()
        except requests.exceptions.RequestException:
            # Best effort: a network/HTTP failure ends pagination early.
            # Anything else (KeyboardInterrupt, programming errors) propagates
            # instead of being silently swallowed.
            break

        root = lxl.fromstring(response.content)
        urls.extend('https://www.cybercoders.com' + link
                    for link in root.xpath('//div[@class="job-title"]/a/@href'))

    return urls
    
    
def _first_text(root, xpath, default=''):
    """Return the stripped first text node matching ``xpath``, or ``default`` if none match."""
    matches = root.xpath(xpath)
    return matches[0].strip() if matches else default


def _parse_salary(wage_text):
    """Return ``(lower, upper)`` from the ``<digits>k`` figures in ``wage_text``; ``'NA'`` fills gaps."""
    amounts = [int(figure.replace('k', '000')) for figure in re.findall(r'\d+k', wage_text)]
    # Pad so that zero or one quoted figure no longer raises IndexError.
    lower, upper = (amounts + ['NA', 'NA'])[:2]
    return lower, upper


def page_scraping(url):
    """Fetch one job posting and extract its fields.

    Parameters
    ----------
    url : str
        URL of the posting page.

    Returns
    -------
    pandas.Series or None
        Series with keys ``city``, ``state``, ``salary_lower``,
        ``salary_upper``, ``preferred_skill``, ``need_for_position``,
        ``job_id`` and ``post_date``; ``None`` if the request fails.
        Scalar fields that cannot be found on the page are filled with the
        string ``'NA'`` instead of raising, so one unusual page does not abort
        a whole scrape.
    """
    try:
        response = requests.get(url)
        response.raise_for_status()
    except requests.exceptions.RequestException:
        # Best effort: skip postings that cannot be fetched.
        return None

    root = lxl.fromstring(response.content)

    # The job title used to be looked up here but was never returned; the
    # lookup is dropped so a page without a title no longer raises.

    # City and state come from a comma-separated location string.
    location = [part.strip()
                for part in _first_text(root, '//div[@class="location"]/span/text()').split(',')]
    city = location[0] or 'NA'
    state = location[1] if len(location) > 1 else 'NA'

    # Salary range, taken from the "<digits>k" figures in the wage text.
    salary_lower, salary_upper = _parse_salary(
        _first_text(root, '//div[@class="wage"]/span/text()'))

    # Preferred skills, joined into one comma-separated string.
    skill_ls = root.xpath('//div[@class="skills"]/ul[@class="skill-list"]'
                          '/li[@class="skill-item"]/a/span[@class="skill-name"]/text()')
    preferred_skill = ', '.join(skill.strip() for skill in skill_ls)

    # Text of the section with data-section="7".
    need_for_position = ' '.join(
        root.xpath('//div[@class="section-data section-data-title" and @data-section="7"]/text()'))

    # Job id: keep only what follows the last ": " in the label text.
    job_id = re.sub('.*: ', '', _first_text(root, '//div[@class="job-id"]/text()', 'NA'))

    # Post date: the element carries one of two class strings.
    post_date = _first_text(
        root,
        '//div[@class="mobile-hide posted-today posted-text" '
        'or @class="mobile-hide posted posted-text"]/span/text()',
        'NA')
    if post_date == 'Posted Today':
        # NOTE: if the response came from an HTTP cache, "today" is the day of
        # this run, not the day the page was originally fetched.
        post_date = datetime.today().strftime('%m/%d/%Y')
    else:
        post_date = re.sub(r'^Posted ([0-9]{2}/[0-9]{2}/[0-9]{4})', r'\1', post_date)

    return pd.Series({'city': city, 'state': state,
                      'salary_lower': salary_lower, 'salary_upper': salary_upper,
                      'preferred_skill': preferred_skill,
                      'need_for_position': need_for_position,
                      'job_id': job_id, 'post_date': post_date})

def scraping(base_url, search_term, pages, location=''):
    """Scrape search results into a DataFrame with one row per posting.

    Parameters
    ----------
    base_url : str
        Search endpoint, passed to ``urls_scraping``.
    search_term : str
        Search keywords, passed to ``urls_scraping``.
    pages : int
        Number of result pages to walk.
    location : str, optional
        Location filter, passed to ``urls_scraping``.

    Returns
    -------
    pandas.DataFrame
        One row per posting that could be fetched; an empty DataFrame when
        no posting was found or none could be fetched.
    """
    # Forward the caller's location; it used to be overwritten with ''.
    urls = urls_scraping(base_url, search_term, pages, location=location)

    # page_scraping returns None for postings it could not fetch.
    postings = [row for row in (page_scraping(link) for link in urls) if row is not None]
    if not postings:
        # pd.concat raises ValueError when given nothing to concatenate.
        return pd.DataFrame()

    # Each Series becomes a column; transpose so each posting is a row.
    return pd.concat(postings, axis=1).T

In [18]:
# Scrape the first 6 result pages for "data scientist" postings (no location filter).
data_scientist = scraping(base_url='https://www.cybercoders.com/search/',
                          search_term='data scientist',
                          pages=6)

In [25]:
# Write the scraped postings to disk first, then end the cell with the preview:
# a notebook only displays the value of a cell's last expression, so a
# `head()` placed before `to_csv` would be discarded.
data_scientist.to_csv('data scientist.csv',encoding="utf-8")
data_scientist.head()

Just for debugging: the cells below test the post-date XPath against a single posting.

In [9]:
# Fetch and parse one posting so XPath expressions can be tried by hand in the cells below.
response = requests.get('https://www.cybercoders.com/data-scientist-job-267769')
# Raise on an HTTP error status rather than parsing an error page.
response.raise_for_status()
root = lxl.fromstring(response.content)

In [13]:
# Probe only the "posted-today" class variant of the post-date element.
root.xpath('//div[@class="mobile-hide posted-today posted-text"]/span/text()')[0]

'Posted Today'

In [14]:
# Same lookup, but matching either class string (posted-today or posted), then stripped.
a = root.xpath('//div[@class="mobile-hide posted-today posted-text" or @class= "mobile-hide posted posted-text"]/span/text()')[0].strip()

In [15]:
# Show the stripped post-date text extracted above.
a

'Posted Today'

In [23]:
# Preview up to the first 40 scraped postings.
data_scientist.head(40)

Unnamed: 0,city,job_id,need_for_position,post_date,preferred_skill,salary_lower,salary_upper,state
0,Newton,BA-1277535,"- BS (min GPA 3.5) or MS or PhD in science, en...",02/23/2017,"Data Analytics, Informatics, Life Sciences . P...",100000.0,130000.0,MA
1,Sunnyvale,BF1-1327877,- Networking/Security - Experience with big d...,02/23/2017,"Python, C/C++, Networking, Security, Apache Sp...",150000.0,200000.0,CA
2,Mercer Island,RM2-1335019,More Than 3 Years of experience and knowledge ...,02/23/2017,"Big Data, Predictive Modeling, Algorithm Devel...",,,WA
3,Redwood City,AW2-1341356,Requirements: Bachelors in Computer Science or...,02/23/2017,"Machine Learning, Python, R, Mapreduce, Javasc...",140000.0,225000.0,CA
4,Portland,CS9-1346787,Experience and knowledge of: - Machine Learnin...,02/23/2017,"Machine Learning, Data Mining, Python, ETL BI,...",100000.0,120000.0,OR
5,Needham,PD2-1346845,- BS with a focus on life sciences. A degree i...,02/23/2017,"Data Analytics, Life Sciences, Pharmaceuticals...",100000.0,130000.0,MA
6,Seattle,RM2-1346990,More Than 3 Years of experience and knowledge ...,02/23/2017,"Big Data, Predictive Modeling, Algorithm Devel...",,,WA
7,Newport Beach,MM5-1350722,At Least 3 Years of experience and knowledge o...,02/20/2017,"Machine Learning, Python, Linux, Java, Scala, ...",,,CA
8,Newport Beach,MM5-1348187,At Least 3 Years of experience and knowledge o...,02/23/2017,"Machine Learning, Python, Linux, Java, Scala, ...",,,CA
9,Redwood City,BK-1353240,- Expertise with large scale data manipulation...,03/03/2017,"Machine Learning, Graph Analytics, Statistical...",150000.0,200000.0,CA
