In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [32]:
pd.set_option('display.max_rows', 20)
pd.set_option('display.max_columns', 15)

## Web scrapping of the listed Python jobs in the first 10 pages of <https://www.timesjobs.com>

In [3]:
# Initialize an empty list to store all Python jobs
all_python_jobs = []

# Loop through the first 10 pages
for i in range(1, 11):
    # Update the URL to include the page number
    url = f'https://www.timesjobs.com/candidate/job-search.html?searchType=personalizedSearch&from=submit&searchTextSrc=&searchTextText=&txtKeywords=Python&txtLocation=&sequence={i}&startPage={i}'
    html_text = requests.get(url).text
    soup = BeautifulSoup(html_text, 'html.parser')
    job_list = soup.find('ul', class_='new-joblist')
    jobs = job_list.find_all('li', class_='clearfix job-bx wht-shd-bx')

    for job in jobs:
        company_name = job.find('h3', class_='joblist-comp-name').text.strip()
        key_skills = job.find('span', class_='srp-skills').text.replace('  ', '').strip()
        other_lists = job.find_all('li')  # Get all li tags within the job entry
        work_experience_card = other_lists[0].text.strip()  # Access the first li tag
        work_experience = work_experience_card[11:]
        location_card = other_lists[1].text.strip()  # Access the second li tag
        location = location_card[11:]
        job_description_card = other_lists[2].text.strip()  # Access the third li tag
        job_description = job_description_card[17:]
        all_python_jobs.append([company_name, key_skills, work_experience, job_description, location])

# Convert the list to a DataFrame
df = pd.DataFrame(all_python_jobs, columns=['company_name', 'key_skills', 'work_experience', 'job_description', 'location'])

In [4]:
df

Unnamed: 0,company_name,key_skills,work_experience,job_description,location
0,virtusa consulting services pvt. ltd.,"python development , object oriented programmi...",8 - 11 yrs,## Python - CREQ193122### DescriptionExtensiv...,\nPune
1,virtusa consulting services pvt. ltd.,"python programming , data analysis , data visu...",6 - 9 yrs,# Python - CREQ191001## Description- Develop ...,\nPune
2,virtusa consulting services pvt. ltd.,"python , javascript , sql , restful web servic...",3 - 5 yrs,## Python - CREQ191176**Description:**- Shoul...,
3,virtusa consulting services pvt. ltd.,"data analysis, communication, problem solving,...",5 - 8 yrs,# Python - CREQ193823- **Description** - 5...,\nPune
4,CONNECTING 2 WORK,"rest,python,django,mongodb",0 - 3 yrs,Job Description We are looking for candidates ...,\nCalicut/ Kozhikode
...,...,...,...,...,...
245,Intel Technology India Pvt Ltd,"python programming, linux/windows os, english ...",0 - 1 yrs,### We are looking for skilled intern to join ...,
246,Nutanix,"test automation , python programming , seleniu...",2 - 5 yrs,SDET - Python Automation EngineeringBangalore ...,\nBengaluru / Bangalore
247,IBM India Pvt Ltd,"backend services design , python programming ,...",4 - 8 yrs,# ## IntroductionSoftware Developers at IBM ar...,\nCochin/ Kochi/ Ernakulam
248,krazy mantra hr solutions,"python , django , oops , pl/sql , rest api ,ht...",1 - 9 yrs,Hiring For Python developer:Python developerDe...,"\nChennai, Bengaluru / Bangalore, Hyderabad/..."


## Data Cleaning and Processing

### Convert values in column 'company_name' to Proper case

In [6]:
df['company_name'] = df['company_name'].str.title()
df['company_name']

0      Virtusa Consulting Services Pvt. Ltd.
1      Virtusa Consulting Services Pvt. Ltd.
2      Virtusa Consulting Services Pvt. Ltd.
3      Virtusa Consulting Services Pvt. Ltd.
4                          Connecting 2 Work
                       ...                  
245           Intel Technology India Pvt Ltd
246                                  Nutanix
247                        Ibm India Pvt Ltd
248                Krazy Mantra Hr Solutions
249    Virtusa Consulting Services Pvt. Ltd.
Name: company_name, Length: 250, dtype: object

### Remove special characters from column key_skills and convert to Property case

In [9]:
df['key_skills'] = df['key_skills'].str.replace(r'[#!*]', ' ', regex=True).str.title()
df['key_skills']

0      Python Development , Object Oriented Programmi...
1      Python Programming , Data Analysis , Data Visu...
2      Python , Javascript , Sql , Restful Web Servic...
3      Data Analysis, Communication, Problem Solving,...
4                             Rest,Python,Django,Mongodb
                             ...                        
245    Python Programming, Linux/Windows Os, English ...
246    Test Automation , Python Programming , Seleniu...
247    Backend Services Design , Python Programming ,...
248    Python , Django , Oops , Pl/Sql , Rest Api ,Ht...
249      Python Scala , Data Processing , Hadoop , Py...
Name: key_skills, Length: 250, dtype: object

### Break 'work_experience' into two separate columns representing mininum and maximum experience requirement and convert to float

In [10]:
df['work_experience'].str.split(pat='-', n=1, expand=True)

Unnamed: 0,0,1
0,8,11 yrs
1,6,9 yrs
2,3,5 yrs
3,5,8 yrs
4,0,3 yrs
...,...,...
245,0,1 yrs
246,2,5 yrs
247,4,8 yrs
248,1,9 yrs


In [None]:
# Give names for new columns
df[['min_work_experience (yrs)', 'max_work_experience (yrs)']] = df['work_experience'].str.split(pat='-', n=1, expand=True)

In [15]:
# Remove'yrs' from 'max_work_experience'
df['max_work_experience (yrs)'] = df['max_work_experience (yrs)'].str.replace(' yrs', '')

df['max_work_experience (yrs)'] = df['max_work_experience (yrs)'].astype(float)
df['min_work_experience (yrs)'] = df['min_work_experience (yrs)'].astype(float)

In [16]:
df

Unnamed: 0,company_name,key_skills,work_experience,job_description,location,min_work_experience (yrs),max_work_experience (yrs)
0,Virtusa Consulting Services Pvt. Ltd.,"Python Development , Object Oriented Programmi...",8 - 11 yrs,## Python - CREQ193122### DescriptionExtensiv...,\nPune,8.0,11.0
1,Virtusa Consulting Services Pvt. Ltd.,"Python Programming , Data Analysis , Data Visu...",6 - 9 yrs,# Python - CREQ191001## Description- Develop ...,\nPune,6.0,9.0
2,Virtusa Consulting Services Pvt. Ltd.,"Python , Javascript , Sql , Restful Web Servic...",3 - 5 yrs,## Python - CREQ191176**Description:**- Shoul...,,3.0,5.0
3,Virtusa Consulting Services Pvt. Ltd.,"Data Analysis, Communication, Problem Solving,...",5 - 8 yrs,# Python - CREQ193823- **Description** - 5...,\nPune,5.0,8.0
4,Connecting 2 Work,"Rest,Python,Django,Mongodb",0 - 3 yrs,Job Description We are looking for candidates ...,\nCalicut/ Kozhikode,0.0,3.0
...,...,...,...,...,...,...,...
245,Intel Technology India Pvt Ltd,"Python Programming, Linux/Windows Os, English ...",0 - 1 yrs,### We are looking for skilled intern to join ...,,0.0,1.0
246,Nutanix,"Test Automation , Python Programming , Seleniu...",2 - 5 yrs,SDET - Python Automation EngineeringBangalore ...,\nBengaluru / Bangalore,2.0,5.0
247,Ibm India Pvt Ltd,"Backend Services Design , Python Programming ,...",4 - 8 yrs,# ## IntroductionSoftware Developers at IBM ar...,\nCochin/ Kochi/ Ernakulam,4.0,8.0
248,Krazy Mantra Hr Solutions,"Python , Django , Oops , Pl/Sql , Rest Api ,Ht...",1 - 9 yrs,Hiring For Python developer:Python developerDe...,"\nChennai, Bengaluru / Bangalore, Hyderabad/...",1.0,9.0


### Remove special characters from column 'job_description'

In [22]:
df['job_description'] = df['job_description'].str.replace(r'[#!*]', '', regex=True).str.replace(r'\s+', ' ', regex=True).str.strip()

df['job_description']

0      Python - CREQ193122 DescriptionExtensive pract...
1      Python - CREQ191001 Description- Develop , imp...
2      Python - CREQ191176Description:- Should have a...
3      Python - CREQ193823- Description - 5 years rel...
4      Job Description We are looking for candidates ...
                             ...                        
245    We are looking for skilled intern to join to I...
246    SDET - Python Automation EngineeringBangalore ...
247    IntroductionSoftware Developers at IBM are the...
248    Hiring For Python developer:Python developerDe...
249    Big Data Python Engineer - CREQ191701Descripti...
Name: job_description, Length: 250, dtype: object

### Remove '\n' from column 'location'. Break 'location' into multiple columns by ',' and replace '/' with 'or'

In [24]:
df['location'] = df['location'].str.replace('\n', ' ').str.replace('/', ' or ')

df['location']

0                                                   Pune
1                                                   Pune
2                                                       
3                                                   Pune
4                                  Calicut or  Kozhikode
                             ...                        
245                                                     
246                             Bengaluru  or  Bangalore
247                       Cochin or  Kochi or  Ernakulam
248     Chennai,  Bengaluru  or  Bangalore,  Hyderaba...
249                                                     
Name: location, Length: 250, dtype: object

In [25]:
# Split 'location' by ','
location_expanded = df['location'].str.split(', ', expand=True)

# Concatenate the original DataFrame with the expanded locations
df = pd.concat([df, location_expanded], axis=1)

# Rename the new location columns
num_location_cols = location_expanded.shape[1]
location_columns = [f'location_{i+1}' for i in range(num_location_cols)]
df.columns = list(df.columns[:-num_location_cols]) + location_columns

df

Unnamed: 0,company_name,key_skills,work_experience,job_description,location,min_work_experience (yrs),max_work_experience (yrs),location_1,location_2,location_3,location_4,location_5
0,Virtusa Consulting Services Pvt. Ltd.,"Python Development , Object Oriented Programmi...",8 - 11 yrs,Python - CREQ193122 DescriptionExtensive pract...,Pune,8.0,11.0,Pune,,,,
1,Virtusa Consulting Services Pvt. Ltd.,"Python Programming , Data Analysis , Data Visu...",6 - 9 yrs,"Python - CREQ191001 Description- Develop , imp...",Pune,6.0,9.0,Pune,,,,
2,Virtusa Consulting Services Pvt. Ltd.,"Python , Javascript , Sql , Restful Web Servic...",3 - 5 yrs,Python - CREQ191176Description:- Should have a...,,3.0,5.0,,,,,
3,Virtusa Consulting Services Pvt. Ltd.,"Data Analysis, Communication, Problem Solving,...",5 - 8 yrs,Python - CREQ193823- Description - 5 years rel...,Pune,5.0,8.0,Pune,,,,
4,Connecting 2 Work,"Rest,Python,Django,Mongodb",0 - 3 yrs,Job Description We are looking for candidates ...,Calicut or Kozhikode,0.0,3.0,Calicut or Kozhikode,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
245,Intel Technology India Pvt Ltd,"Python Programming, Linux/Windows Os, English ...",0 - 1 yrs,We are looking for skilled intern to join to I...,,0.0,1.0,,,,,
246,Nutanix,"Test Automation , Python Programming , Seleniu...",2 - 5 yrs,SDET - Python Automation EngineeringBangalore ...,Bengaluru or Bangalore,2.0,5.0,Bengaluru or Bangalore,,,,
247,Ibm India Pvt Ltd,"Backend Services Design , Python Programming ,...",4 - 8 yrs,IntroductionSoftware Developers at IBM are the...,Cochin or Kochi or Ernakulam,4.0,8.0,Cochin or Kochi or Ernakulam,,,,
248,Krazy Mantra Hr Solutions,"Python , Django , Oops , Pl/Sql , Rest Api ,Ht...",1 - 9 yrs,Hiring For Python developer:Python developerDe...,"Chennai, Bengaluru or Bangalore, Hyderaba...",1.0,9.0,Chennai,Bengaluru or Bangalore,Hyderabad or Secunderabad,,


In [33]:
# Remove 'None' values from dataframe

df = df.fillna('')
df

Unnamed: 0,company_name,key_skills,work_experience,job_description,location,min_work_experience (yrs),max_work_experience (yrs),location_1,location_2,location_3,location_4,location_5
0,Virtusa Consulting Services Pvt. Ltd.,"Python Development , Object Oriented Programmi...",8 - 11 yrs,Python - CREQ193122 DescriptionExtensive pract...,Pune,8.0,11.0,Pune,,,,
1,Virtusa Consulting Services Pvt. Ltd.,"Python Programming , Data Analysis , Data Visu...",6 - 9 yrs,"Python - CREQ191001 Description- Develop , imp...",Pune,6.0,9.0,Pune,,,,
2,Virtusa Consulting Services Pvt. Ltd.,"Python , Javascript , Sql , Restful Web Servic...",3 - 5 yrs,Python - CREQ191176Description:- Should have a...,,3.0,5.0,,,,,
3,Virtusa Consulting Services Pvt. Ltd.,"Data Analysis, Communication, Problem Solving,...",5 - 8 yrs,Python - CREQ193823- Description - 5 years rel...,Pune,5.0,8.0,Pune,,,,
4,Connecting 2 Work,"Rest,Python,Django,Mongodb",0 - 3 yrs,Job Description We are looking for candidates ...,Calicut or Kozhikode,0.0,3.0,Calicut or Kozhikode,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
245,Intel Technology India Pvt Ltd,"Python Programming, Linux/Windows Os, English ...",0 - 1 yrs,We are looking for skilled intern to join to I...,,0.0,1.0,,,,,
246,Nutanix,"Test Automation , Python Programming , Seleniu...",2 - 5 yrs,SDET - Python Automation EngineeringBangalore ...,Bengaluru or Bangalore,2.0,5.0,Bengaluru or Bangalore,,,,
247,Ibm India Pvt Ltd,"Backend Services Design , Python Programming ,...",4 - 8 yrs,IntroductionSoftware Developers at IBM are the...,Cochin or Kochi or Ernakulam,4.0,8.0,Cochin or Kochi or Ernakulam,,,,
248,Krazy Mantra Hr Solutions,"Python , Django , Oops , Pl/Sql , Rest Api ,Ht...",1 - 9 yrs,Hiring For Python developer:Python developerDe...,"Chennai, Bengaluru or Bangalore, Hyderaba...",1.0,9.0,Chennai,Bengaluru or Bangalore,Hyderabad or Secunderabad,,


### Remove unrequired columns

In [35]:
df = df.drop(columns = ['work_experience', 'location'])

df

Unnamed: 0,company_name,key_skills,job_description,min_work_experience (yrs),max_work_experience (yrs),location_1,location_2,location_3,location_4,location_5
0,Virtusa Consulting Services Pvt. Ltd.,"Python Development , Object Oriented Programmi...",Python - CREQ193122 DescriptionExtensive pract...,8.0,11.0,Pune,,,,
1,Virtusa Consulting Services Pvt. Ltd.,"Python Programming , Data Analysis , Data Visu...","Python - CREQ191001 Description- Develop , imp...",6.0,9.0,Pune,,,,
2,Virtusa Consulting Services Pvt. Ltd.,"Python , Javascript , Sql , Restful Web Servic...",Python - CREQ191176Description:- Should have a...,3.0,5.0,,,,,
3,Virtusa Consulting Services Pvt. Ltd.,"Data Analysis, Communication, Problem Solving,...",Python - CREQ193823- Description - 5 years rel...,5.0,8.0,Pune,,,,
4,Connecting 2 Work,"Rest,Python,Django,Mongodb",Job Description We are looking for candidates ...,0.0,3.0,Calicut or Kozhikode,,,,
...,...,...,...,...,...,...,...,...,...,...
245,Intel Technology India Pvt Ltd,"Python Programming, Linux/Windows Os, English ...",We are looking for skilled intern to join to I...,0.0,1.0,,,,,
246,Nutanix,"Test Automation , Python Programming , Seleniu...",SDET - Python Automation EngineeringBangalore ...,2.0,5.0,Bengaluru or Bangalore,,,,
247,Ibm India Pvt Ltd,"Backend Services Design , Python Programming ,...",IntroductionSoftware Developers at IBM are the...,4.0,8.0,Cochin or Kochi or Ernakulam,,,,
248,Krazy Mantra Hr Solutions,"Python , Django , Oops , Pl/Sql , Rest Api ,Ht...",Hiring For Python developer:Python developerDe...,1.0,9.0,Chennai,Bengaluru or Bangalore,Hyderabad or Secunderabad,,
