# Import libraries

In [None]:
import csv
from datetime import datetime
import requests
from bs4 import BeautifulSoup

In [None]:
# generates indeed url for specific job title and location
def get_url(position,location):
    """Build the Indeed India search URL for a job title and location.

    Both values are URL-encoded before being placed in the query string, so
    raw text such as ``'data scientist'`` or ``'C++'`` yields a valid URL.
    Pass unencoded text; already-encoded input would be encoded a second time.

    Args:
        position: Job title or keywords to search for.
        location: Place to search in.

    Returns:
        The search URL as a string.
    """
    # Local import so this definition works regardless of the import cell.
    from urllib.parse import quote_plus

    template = 'https://in.indeed.com/jobs?q={}&l={}'
    # str() keeps non-string arguments working, as str.format() did before.
    return template.format(quote_plus(str(position)), quote_plus(str(location)))

In [None]:
# search URL for Python jobs in India
url = get_url(position='python', location='India')

# Extract raw html

In [None]:
# Download the search page. requests applies no timeout by default, so an
# explicit one keeps a stalled connection from hanging the notebook forever.
response = requests.get(url, timeout=30)
# show the response object (its repr includes the HTTP status code)
response

In [None]:
# parse the downloaded HTML with Python's built-in parser
soup = BeautifulSoup(response.text, features='html.parser')

In [None]:
# every <div> carrying the CSS class "jobsearch-SerpJobCard" is one job card
cards = soup.find_all(name='div', attrs={'class': 'jobsearch-SerpJobCard'})

In [None]:
# number of job cards found on the downloaded page
len(cards)

# Prototype the model with a single record

In [None]:
card = cards[0] # first card on the page, used to prototype the field extraction

In [None]:
# the job title is stored in the "title" attribute of the link inside the
# card's <h2> heading
atag = card.find('h2').find('a')
job_title = atag.get('title')


In [None]:
# get job url: the link's href is appended to the site root
# NOTE(review): the search itself goes to in.indeed.com (see get_url) while
# this prefix is www.indeed.com — confirm which host is intended.
job_url = 'https://www.indeed.com'+atag.get('href')


In [None]:
# company name: text of the <span class="company"> element, whitespace trimmed
company_name = card.find('span', class_='company').get_text().strip()
company_name

In [None]:
# location: raw text of the <div class="location"> element (not trimmed)
location = card.find('div', class_='location').get_text()
location

In [None]:
# salary: not every posting lists one. When the element is missing, find()
# returns None and the attribute access raises AttributeError, which is
# caught so the salary falls back to an empty string.
try:
    salary_range = card.find('span', class_='salaryText').get_text().strip()
except AttributeError:
    salary_range = ''
salary_range

In [None]:
# job summary: text of the <div class="summary"> element, whitespace trimmed
job_summary = card.find('div', class_='summary').get_text().strip()
job_summary

In [None]:
# date when the job was posted: text of the <span class="date"> element
post_date = card.find('span', class_='date').get_text().strip()

In [None]:
# this function contains all the above code and returns the tuple for a card
def get_record(card):
    """Extract one job posting from a search-result card.

    Args:
        card: A parsed result card (BeautifulSoup tag) that supports
            ``find`` and the ``h2`` shortcut used below.

    Returns:
        A tuple ``(job_title, company_name, salary_range, location,
        job_summary, post_date, job_url)``. A text field whose element is
        missing from the card is returned as ``''``, so one incomplete card
        no longer aborts the whole scrape.

    Raises:
        AttributeError: If the card has no ``<h2>`` title link.
    """
    def text_of(tag_name, css_class, strip=True):
        # find() returns None when the card has no such element.
        node = card.find(tag_name, css_class)
        if node is None:
            return ''
        return node.text.strip() if strip else node.text

    company_name = text_of('span', 'company')
    job_summary = text_of('div', 'summary')
    post_date = text_of('span', 'date')
    salary_range = text_of('span', 'salaryText')
    # The location text has always been stored untrimmed; keep it that way.
    location = text_of('div', 'location', strip=False)

    # Title and relative link both come from the anchor inside the <h2>.
    atag = card.h2.a
    job_title = atag.get('title')
    job_url = 'https://www.indeed.com'+atag.get('href')

    record = (job_title,company_name,salary_range,location,job_summary,post_date,job_url)
    return record

In [None]:
# looping over all cards
# one record tuple per card found on the page
records = [get_record(card) for card in cards]

In [None]:
# inspect the first scraped record (the tuple built by get_record)
records[0]

# Getting the next page

In [None]:
# Until now we were only on the first page; now scrape every page that appears in the search results.
from urllib.parse import urljoin

records = []
i=0  # NOTE(review): not used anywhere in this cell — confirm nothing later relies on it
url = get_url('python','Baroda')
while True:
    # Explicit timeout: requests would otherwise wait indefinitely on a
    # stalled connection.
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.text,'html.parser')
    cards = soup.find_all('div','jobsearch-SerpJobCard')

    for card in cards:
        record = get_record(card)
        records.append(record)

    # The "Next" pagination link is absent on the last page.
    next_link = soup.find('a',{'aria-label':'Next'})
    next_href = next_link.get('href') if next_link is not None else None
    if not next_href:
        # exit loop if no further page is available
        break
    # Resolve the link against the page it came from, so paging stays on the
    # same host as the search instead of a hard-coded one.
    url = urljoin(response.url, next_href)

In [None]:
# total number of records collected by the paging loop
len(records)

# Saving scraped data

In [None]:
# getting all functions together
# get url of indeed for specific position and location
def get_url(position,location):
    """Build the Indeed India search URL for a job title and location.

    Both values are URL-encoded before being placed in the query string, so
    raw text such as ``'data scientist'`` or ``'C++'`` yields a valid URL.
    Pass unencoded text; already-encoded input would be encoded a second time.

    Args:
        position: Job title or keywords to search for.
        location: Place to search in.

    Returns:
        The search URL as a string.
    """
    # Local import so this definition works regardless of the import cell.
    from urllib.parse import quote_plus

    template = 'https://in.indeed.com/jobs?q={}&l={}'
    # str() keeps non-string arguments working, as str.format() did before.
    return template.format(quote_plus(str(position)), quote_plus(str(location)))

# get single card data scraped
def get_record(card):
    """Scrape data from a single card on the page.

    Args:
        card: A parsed result card (BeautifulSoup tag) that supports
            ``find`` and the ``h2`` shortcut used below.

    Returns:
        A tuple ``(job_title, company_name, salary_range, location,
        job_summary, post_date, job_url)``. A text field whose element is
        missing from the card is returned as ``''``, so one incomplete card
        no longer aborts the whole scrape.

    Raises:
        AttributeError: If the card has no ``<h2>`` title link.
    """
    def text_of(tag_name, css_class, strip=True):
        # find() returns None when the card has no such element.
        node = card.find(tag_name, css_class)
        if node is None:
            return ''
        return node.text.strip() if strip else node.text

    # available in the span tag with class "company"
    company_name = text_of('span', 'company')
    # scraped from the div tag with class "summary"
    job_summary = text_of('div', 'summary')
    # scraped from the span tag with class "date"
    post_date = text_of('span', 'date')
    # span tag with class "salaryText"; many postings omit it
    salary_range = text_of('span', 'salaryText')
    # div tag with class "location"; the text has always been stored untrimmed
    location = text_of('div', 'location', strip=False)

    # the <a> tag inside the card's <h2> carries both title and link
    atag = card.h2.a
    job_title = atag.get('title')
    job_url = 'https://www.indeed.com'+atag.get('href')

    # forming a tuple
    record = (job_title,company_name,salary_range,location,job_summary,post_date,job_url)
    return record

def main(position,location,timeout=30):
    """Scrape all the pages that appear on searching and save data in csv file.

    Starting from the search URL for ``position`` and ``location``, each
    result page is downloaded, every job card on it is converted to a record
    with ``get_record``, and the page's "Next" link is followed until there
    is none. The records are then written to ``results.csv`` in the current
    working directory, overwriting any existing file.

    Args:
        position: Job title or keywords to search for.
        location: Place to search in.
        timeout: Seconds to wait for each HTTP request; requests applies no
            timeout by default and could otherwise block forever.
    """
    # Local import so this definition works regardless of the import cell.
    from urllib.parse import urljoin

    records = []
    # initial url
    url = get_url(position,location)
    # extract data
    while True:
        response = requests.get(url, timeout=timeout)

        # reads whole page
        soup = BeautifulSoup(response.text,'html.parser')
        # get all cards from div tag having class jobsearch-SerpJobCard
        cards = soup.find_all('div','jobsearch-SerpJobCard')

        # extract each card's data
        records.extend(get_record(card) for card in cards)

        # The "Next" pagination link is absent on the last page.
        next_link = soup.find('a',{'aria-label':'Next'})
        next_href = next_link.get('href') if next_link is not None else None
        if not next_href:
            break
        # Resolve the link against the page it came from, so paging stays on
        # the same host as the search instead of a hard-coded one.
        url = urljoin(response.url, next_href)
    # saving data
    with open('results.csv','w',newline='',encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['job_title','company_name','salary_range','location','job_summary','post_date','job_url'])
        writer.writerows(records)
            

In [None]:
# run the scraper for Python jobs in Baroda
main(position='python', location='Baroda')