In [2]:
# Import the required libraries
import bs4
import urllib2
import pandas as pd
import math
import time
from pandas import DataFrame, Series
import matplotlib
%matplotlib inline

Read the page at the specified URL:

In [3]:
# Base URL 
base_url = 'http://www.naukri.com/machine-learning-jobs-'
source = urllib2.urlopen(base_url).read()

Create a soup object with 'lxml' as the parser. <br>
If the command below gives an error, try installing the following using command line: 
<br><code>sudo apt-get install libxslt1-dev libxml2</code>

In [4]:
soup = bs4.BeautifulSoup(source, "lxml")

Within each page, we now extract the urls that link to the corresponding job description: 

In [5]:
all_links = [link.get('href') for link in soup.findAll('a') if 'job-listings' in  str(link.get('href'))]
print "Sample job description link:",all_links[0]

Sample job description link: https://www.naukri.com/job-listings-Machine-Learning-Developer-Engineer-An-Artificial-Intelligence-enthusiast-from-Harvard-Business-School-Mumbai-2-to-4-years-221116000017?src=jobsearchDesk&sid=14802960493879&xp=1


Next, we scrape the job description for a sample job page in the list obtained above:

In [24]:
jd_url = all_links[2]
jd_source = urllib2.urlopen(jd_url).read()
jd_soup = bs4.BeautifulSoup(jd_source,"lxml")

Extract the individual job level attributes. The key here is to identify the correct tag and extract the text enclosed within those tags.

In [25]:
# Job Location
location = jd_soup.find("div",{"class":"loc"}).getText().strip()
print location

Mumbai


In [26]:
# Job Description
jd_text = jd_soup.find("ul",{"itemprop":"description"}).getText().strip()
print jd_text

We are looking for a machine learning scientist who can use their skills to research, build and implement solutions in the field of natural language processing, automated answers, semantic knowledge extraction from structured data and unstructured text. You should have a deep love for Machine Learning, Natural Language processing and a strong desire to solve challenging problems.  Responsibilities :  - Using NLP and machine learning techniques to create scalable solutions.  - Researching and coming up with novel approaches to solve real world problems.  - Working closely with the engineering teams to drive real-time model implementations and new feature creations.


In [27]:
# Experience Level
experience = jd_soup.find("span",{"itemprop":"experienceRequirements"}).getText().strip()
print experience

3 - 4 yrs


In [28]:
# Role Level Information
labels = ['Salary', 'Industry', 'Functional Area', 'Role Category', 'Design Role']
role_info = [content.getText().split(':')[-1].strip() for content in jd_soup.find("div",{"class":"jDisc mt20"}).contents 
 if len(str(content).replace(' ',''))!=0]

role_info_dict = {label: role_info for label, role_info in zip(labels, role_info)}
print role_info_dict

{'Salary': u'Not Disclosed by Recruiter', 'Functional Area': u'Analytics & Business Intelligence', 'Industry': u'IT-Software  /    Software Services', 'Role Category': u'Analytics & BI', 'Design Role': u'Data Analyst'}


In [29]:
# Skills required
key_skills = '|'.join(jd_soup.find("div",{"class":"ksTags"}).getText().split('  '))[1:]
print key_skills

Machine Learning|Natural Language Processing|NLP|Research|Statistical Models|Big data|Statistical Modeling 


In [30]:
# Education Level
edu_info = [content.getText().split(':') for content in jd_soup.find("div",{"itemprop":"educationRequirements"}).contents 
 if len(str(content).replace(' ',''))!=0]

edu_info_dict = {label.strip(): edu_info.strip() for label, edu_info in edu_info}

# Sometimes the education information for one of the degrees can be missing
edu_labels = ['UG', 'PG', 'Doctorate']
for l in edu_labels:
    if l not in edu_info_dict.keys():
        edu_info_dict[l] = ''
print edu_info_dict

{u'UG': u'Any Graduate - Any Specialization', u'Doctorate': u'Doctorate Not Required', 'PG': ''}


In [32]:
# Company Info
company_name = jd_soup.find("div",{"itemprop":"hiringOrganization"}).contents[1].p.getText()
print company_name

Premium-Jobs


Create a DF to store all the job level information:

In [101]:
naukri_df = pd.DataFrame()
column_names = ['Location', 'Link', 'Job Description', 'Experience','Salary', 'Industry', 'Functional Area', 'Role Category', 
                'Design Role', 'Skills','Company Name', 
                'UG','PG','Doctorate']

In [102]:
from collections import OrderedDict
df_dict = OrderedDict({'Location':location, 'Link':all_links[0],'Job Description':jd_text,'Experience':experience,
                       'Skills':key_skills,'Company Name':company_name})
df_dict.update(role_info_dict)
df_dict.update(edu_info_dict)
df_dict

OrderedDict([('Skills',
              u'Machine Learning|Natural Language Processing|NLP|Research|Statistical Models|Big data|Statistical Modeling '),
             ('Experience', u'3 - 4 yrs'),
             ('Job Description',
              u'We are looking for a machine learning scientist who can use their skills to research, build and implement solutions in the field of natural language processing, automated answers, semantic knowledge extraction from structured data and unstructured text. You should have a deep love for Machine Learning, Natural Language processing and a strong desire to solve challenging problems.  Responsibilities :  - Using NLP and machine learning techniques to create scalable solutions.  - Researching and coming up with novel approaches to solve real world problems.  - Working closely with the engineering teams to drive real-time model implementations and new feature creations.'),
             ('Link',
              'https://www.naukri.com/job-listings-Machine-

In [103]:
naukri_df = naukri_df.append(df_dict,ignore_index=True)
naukri_df

Unnamed: 0,Company Name,Design Role,Doctorate,Experience,Functional Area,Industry,Job Description,Link,Location,PG,Role Category,Salary,Skills,UG
0,Premium-Jobs,Data Analyst,Doctorate Not Required,3 - 4 yrs,Analytics & Business Intelligence,IT-Software / Software Services,We are looking for a machine learning scientis...,https://www.naukri.com/job-listings-Machine-Le...,Mumbai,,Analytics & BI,Not Disclosed by Recruiter,Machine Learning|Natural Language Processing|N...,Any Graduate - Any Specialization


In [104]:
# Reordering the columns to a preferred order as specified
naukri_df = naukri_df.reindex(columns=column_names)
naukri_df

Unnamed: 0,Location,Link,Job Description,Experience,Salary,Industry,Functional Area,Role Category,Design Role,Skills,Company Name,UG,PG,Doctorate
0,Mumbai,https://www.naukri.com/job-listings-Machine-Le...,We are looking for a machine learning scientis...,3 - 4 yrs,Not Disclosed by Recruiter,IT-Software / Software Services,Analytics & Business Intelligence,Analytics & BI,Data Analyst,Machine Learning|Natural Language Processing|N...,Premium-Jobs,Any Graduate - Any Specialization,,Doctorate Not Required


Next we put all of the above bits and pieces into 1 single function to extract info about all ML jobs on the site:

Lets first check the total number of machine learning jobs posted on the site. This information is present within the tag on the top of the page: <code> div class="count" </code>

In [33]:
print soup.find("div", { "class" : "count" }).h1.contents[1].getText()

1-50 of 2890


In [34]:
num_jobs = int(soup.find("div", { "class" : "count" }).h1.contents[1].getText().split(' ')[-1])
print num_jobs

2890


Each page has 50 job postings, which means that the total number of jobs can be calcuated as follows:

In [36]:
num_pages = int(math.ceil(num_jobs/50.0))
print "URL of the last page to be scraped:", base_url + str(num_pages)

URL of the last page to be scraped: http://www.naukri.com/machine-learning-jobs-58


In [None]:
# Together into one function
import bs4
import urllib2
import pandas as pd
import math
import time
from pandas import DataFrame
from collections import OrderedDict
import cPickle

# Base URL 
base_url = 'http://www.naukri.com/machine-learning-jobs-'
source = urllib2.urlopen(base_url).read()

soup = bs4.BeautifulSoup(source, "lxml")
num_jobs = int(soup.find("div", { "class" : "count" }).h1.contents[1].getText().split(' ')[-1])
num_pages = int(math.ceil(num_jobs/50.0))

# Together into one function
labels = ['Salary', 'Industry', 'Functional Area', 'Role Category', 'Design Role']
edu_labels = ['UG', 'PG', 'Doctorate']
naukri_df = pd.DataFrame()
           
for page in range(1,num_pages+1):
    page_url = base_url+str(page)
    source = urllib2.urlopen(page_url).read()
    soup = bs4.BeautifulSoup(source,"lxml")
    all_links = [link.get('href') for link in soup.findAll('a') if 'job-listings' in  str(link.get('href'))]
    for url in all_links:
        jd_source = urllib2.urlopen(url).read()
        jd_soup = bs4.BeautifulSoup(jd_source,"lxml")
        try:
            jd_text = jd_soup.find("ul",{"itemprop":"description"}).getText().strip()
            location = jd_soup.find("div",{"class":"loc"}).getText().strip()
            experience = jd_soup.find("span",{"itemprop":"experienceRequirements"}).getText().strip()
            
            role_info = [content.getText().split(':')[-1].strip() for content in jd_soup.find("div",{"class":"jDisc mt20"}).contents if len(str(content).replace(' ',''))!=0]
            role_info_dict = {label: role_info for label, role_info in zip(labels, role_info)}
            
            key_skills = '|'.join(jd_soup.find("div",{"class":"ksTags"}).getText().split('  '))[1:]

            edu_info = [content.getText().split(':') for content in jd_soup.find("div",{"itemprop":"educationRequirements"}).contents if len(str(content).replace(' ',''))!=0]
            edu_info_dict = {label.strip(): edu_info.strip() for label, edu_info in edu_info}
            for l in edu_labels:
                if l not in edu_info_dict.keys():
                    edu_info_dict[l] = ''

            company_name = jd_soup.find("div",{"itemprop":"hiringOrganization"}).contents[1].p.getText().strip()
        
        except AttributeError:
            continue
        df_dict = OrderedDict({'Location':location, 'Link':url,'Job Description':jd_text,'Experience':experience,'Skills':key_skills,'Company Name':company_name})
        df_dict.update(role_info_dict)
        df_dict.update(edu_info_dict)
        naukri_df = naukri_df.append(df_dict,ignore_index=True)
        time.sleep(1)
    print page
    

In order to work with this data at a later point in time, we could save it as a csv file. A better was to do this is to save the dataframe as a pickle object using cPickle library. By doing this we can store and retrieve the data as a Python object (i.e., Pandas dataframe). 

In [None]:
import cPickle
column_names = ['Location', 'Link', 'Job Description', 'Experience','Salary', 'Industry', 'Functional Area', 'Role Category', 
                'Design Role', 'Skills', 'Company Name',
                'UG','PG','Doctorate']

naukri_df = naukri_df.reindex(columns=column_names)        
with open('naukri_dataframe.pkl', 'wb') as f:
    cPickle.dump(naukri_df, f)            

In [108]:
with open('naukri_dataframe.pkl', 'r') as f:
    naukri_df = cPickle.load(f) 

Ideally we were supposed to have 2890 ML job postings. However, as shown below, we could only retrieve 1627 of them. The reason the remaining pages had to be skipped was that they would have a different HTML structure than the standard one on the website (the one which we had specified to extract the attributes). This is one of the disadvantages of scraping the web by parsing the elements of an HTML tree. Even if any one of the tag that we specified did not exist this would be noted as exception and we skip to the next iteration.

In [117]:
naukri_df.shape

(1627, 14)

In [118]:
naukri_df.head()

Unnamed: 0,Location,Link,Job Description,Experience,Salary,Industry,Functional Area,Role Category,Design Role,Skills,Company Name,UG,PG,Doctorate
0,Mumbai,https://www.naukri.com/job-listings-Machine-Le...,We are looking for a machine learning scientis...,3 - 4 yrs,Not Disclosed by Recruiter,IT-Software / Software Services,Analytics & Business Intelligence,Analytics & BI,Data Analyst,Machine Learning|Natural Language Processing|N...,Premium-Jobs,Any Graduate - Any Specialization,,Doctorate Not Required
1,Bengaluru,https://www.naukri.com/job-listings-Machine-Le...,Research & Development (R&D) Engineer Job des...,2 - 5 yrs,"INR 10,00,000 - 15,00,000 P.A",IT-Software / Software Services,"IT Software - Application Programming , ...",Programming & Design,Software Developer,Algorithms|Machine Learning|Python|Artificial ...,IPsoft Global Services Private Limited,B.Tech/B.E. - Computers,,
2,Bengaluru,https://www.naukri.com/job-listings-Software-E...,Work in cohesion with the R&D team towards bui...,1 - 5 yrs,Not Disclosed by Recruiter,IT-Software / Software Services,"IT Software - DBA , Datawarehousing",Programming & Design,Software Developer,R|D|Machine Learning|NLP|Software Engineering|...,Premium-Jobs,Any Graduate - Any Specialization,,Doctorate Not Required
3,Bengaluru,https://www.naukri.com/job-listings-Scala-Deve...,* I am looking to hire talented young develope...,3 - 4 yrs,Not Disclosed by Recruiter,Internet / Ecommerce,"IT Software - DBA , Datawarehousing",Programming & Design,Software Developer,Java|Hadoop|LISP|SCALA|Programming|Machine Lea...,Pin Click - Startup,B.Tech/B.E. - Any Specialization,,Doctorate Not Required
4,Bengaluru,https://www.naukri.com/job-listings-R-D-Engine...,Responsibilities: \tThe ideal candidate will ...,1 - 6 yrs,"INR 6,50,000 - 12,00,000 P.A",IT-Software / Software Services,"IT Software - Application Programming , ...",Programming & Design,Software Developer,Java|Maven|JUnit|Ant|Tomcat|Spring|Eclipse|JDB...,IPsoft Global Services Private Limited,"B.Tech/B.E. - Any Specialization, Computers, M...",Any Postgraduate,


In [None]:
s