<a href="https://colab.research.google.com/github/JacopoPassaro95/Python-Projects/blob/main/Scraper~tool_for_internship_offers_with_STRINGS_and_RE_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Scraper~tool for internship offers with strings and regular expressions
As an aspiring Data Scientist, I am interested in monitoring some LinkedIn internship offers, just to get an idea of what I will be applying for over the next few months.
### Let's try to do it in a fun, automated way!



In [None]:
# Environment

import requests
import re
import random as rnd
import time
import pandas as pd

### Understanding the LinkedIn URL for job search
By formatting the URL directly, it is possible to retrieve the offers page without manipulating anything else.

In [None]:
# How many offers are there?
# NOTE(review): `lin_html` is not assigned by any cell visible above; it must
# already hold the HTML text of a LinkedIn search page when this cell runs.

# The class attribute needs its closing quote, otherwise the pattern can only
# match malformed markup.
results_pattern = '<div class="jobs-search-results-list__subtitle"><span>(.*?)</span>'
results_found = re.search(results_pattern, lin_html)

# re.search returns None when nothing matches, so check before reading the
# group. group(1) is the text inside <span>, without the surrounding tags.
results_match = results_found.group(1).strip() if results_found else None
print(results_match if results_match is not None else "Offers counter not found in the page")


### Automatic formatting function for the LinkedIn URL according to the desired queries
This function fills the LinkedIn URL with the desired queries, formatted in the correct way for an internship search.

In [None]:
def URLformat(keyword='', area='', job_type = ''):
  """Build a LinkedIn job-search URL from keywords, location and job type.

  Every argument left empty is asked for interactively with input(); values
  that are supplied are used as given, so the function can also be called
  without any prompt.

  Args:
    keyword: Search keywords, e.g. "data scientist".
    area: Where the offer should be located, e.g. "Milan, Italy".
    job_type: One-letter code placed in the f_JT query parameter
      (F, P, C or I according to the prompt). Lower case is accepted.

  Returns:
    The search URL as a string.
  """
  # Function-scope import so the cell does not depend on the imports cell.
  from urllib.parse import quote

  # Keep asking until the three values are all non-empty (a loop instead of
  # recursion, so repeated mistakes cannot exhaust the call stack).
  while True:
    if not keyword.strip():
      keyword = input("Please insert keywords for LinkedIn search: ")
    if not area.strip():
      area = input("Where would you like to find this offer? ")
    if not job_type.strip():
      job_type = input("Insert job type: F for full time, P for part-time, C for contractor, I for stages/internships ")

    keyword = keyword.strip()
    area = area.strip()
    job_type = job_type.strip().upper()

    if keyword and area and job_type:
      break

    # Error message in case of missing query inputs; only the missing
    # values are asked again on the next iteration.
    print('Error: the words you have inserted are not correct, please try again')

  # Percent-encode every query value: spaces become %20, commas %2C, and
  # characters such as & or # cannot break the query string any more.
  search_url = quote(keyword, safe='')
  area_url = quote(area, safe='')
  type_url = quote(job_type, safe='')

  # The f-string places the encoded keywords, location and job type in the
  # query parameters of the LinkedIn job-search address.
  linkedin_url =f'https://www.linkedin.com/jobs/search?keywords={search_url}&location={area_url}&f_JT={type_url}&trk=public_jobs_jobs-search-bar_search-submit&position=1&pageNum=0'

  return linkedin_url

In [None]:
# Build the search URL; called without arguments, URLformat asks for the
# keywords, location and job type through input().
linkedin_url = URLformat()

In [None]:
# Show the generated search URL.
print(linkedin_url)

### Scraping function for the relevant information in the offer cards
Once it has finished its work, the function returns a dedicated dataset built from the scraped information.

In [None]:
def _parse_search_page(lin_html, max_titles=25):
  """Extract dates, companies, titles and links from a search page's HTML.

  Returns a dict mapping each column name to the list of strings found; the
  lists are not guaranteed to have the same length.
  """
  # a. Datetime: with a single capture group re.findall returns a plain list
  #    of strings, so no further processing is needed.
  dates = re.findall('<time class=".*?" datetime="(.*?)">', lin_html, re.DOTALL)

  # b. Offer title: with two capture groups every match is a tuple and the
  #    title is its second element. Per the original note, a page shows 25
  #    offers, so later <h3> elements are ignored; slicing (instead of
  #    indexing 0..24) also works when fewer headings are present.
  title_matches = re.findall('<h3(.*?)>(.*?)</h3>', lin_html, re.DOTALL)
  titles = [match[1].strip() for match in title_matches[:max_titles]]

  # c. Companies: the <h4> content still holds nested tags, which are removed.
  company_matches = re.findall("<h4(.*?)>(.*?)</h4>", lin_html, re.DOTALL)
  companies = [re.sub("<.*?>", "", match[1]).strip() for match in company_matches]

  # d. Path to offer page: only links on the it.linkedin.com host are
  #    collected, exactly as the pattern is written.
  paths = re.findall('href="(https://it.linkedin.com/jobs/view/.*?)"', lin_html, re.DOTALL)

  return {'Date': dates, 'Company': companies, 'Title': titles, 'Link': paths}


def LINternScrape(url):
  """Scrape the offer cards of a LinkedIn search page into a DataFrame.

  Args:
    url: Address of the search page, e.g. the value returned by URLformat.

  Returns:
    A DataFrame with the columns 'Date', 'Company', 'Title' and 'Link'.
    It is empty when the HTTP status is not 200. When the page yields a
    different number of matches per column, the shorter columns are padded
    with None instead of failing.

  Raises:
    requests.RequestException: If the request itself fails (connection
      error, timeout).
  """
  #1. Get page and check for HTTP correct response
  # A timeout keeps the cell from hanging forever on a stalled connection.
  lin_page = requests.get(url, timeout=30)

  print(lin_page.status_code, lin_page.headers.get('content-type'))
  if lin_page.status_code != 200:  # 200 value indicates a successful response
    print("ERROR: Unsuccessfull HTTP response! Try again")
    # Return an empty table rather than recursing: the former retry called
    # the function without its url argument and always raised TypeError.
    return pd.DataFrame({'Date': [], 'Company': [], 'Title': [], 'Link': []})

  #2. Parse the page content (a string) with regular expressions
  columns = _parse_search_page(lin_page.text)

  #3. Create a data frame of all scraped information
  # pandas refuses lists of different lengths, so pad the shorter ones.
  longest = max(len(values) for values in columns.values())
  padded = {
    name: values + [None] * (longest - len(values))
    for name, values in columns.items()
  }
  Internships = pd.DataFrame(padded)

  return Internships

In [None]:
# Keep the scraped table under the global name `Internships`: Descrape and
# OneDescrape read that name, and no other visible cell assigns it.
Internships = LINternScrape(linkedin_url)
Internships

### Scraping descriptions from each internship offer page with the Descrape function
Once finished, all descriptions will be added to the Internships dataset.

In [None]:
def Descrape():
  """Scrape the description of every offer in the global Internships table.

  Each address in Internships["Link"] is requested in turn and the text of
  the description block is stored with its HTML tags removed. A page that
  cannot be downloaded, or that has no description block, yields None and
  does not stop the run.

  Returns:
    The Internships DataFrame, with the new 'Description' column added.
  """
  description_pattern = '<div class="show-more-less-html__markup (.*?)>(.*?)</div>'

  links = list(Internships["Link"])
  descriptions = []

  for position, link in enumerate(links):

    # A row without a usable link has nothing to download.
    if not isinstance(link, str):
      descriptions.append(None)
      continue

    try:
      offer_page = requests.get(link, timeout=30)
    except requests.RequestException as error:
      print("ERROR: request failed, description left empty:", error)
      offer_page = None

    # re.search() returns the first match object, or None when the pattern
    # is not found, so the result is checked before reading its groups.
    description_match = None
    if offer_page is not None and offer_page.status_code == 200:
      description_match = re.search(description_pattern, offer_page.text, re.DOTALL)

    if description_match is not None:
      # re.sub() removes all the tags and returns only the description text.
      description = re.sub("<.*?>", "", description_match.group(2).strip())
      descriptions.append(description)
      print(offer_page.status_code, offer_page.headers.get('content-type'), "Description correctly scraped!")
    else:
      descriptions.append(None)
      if offer_page is not None:
        print(offer_page.status_code, offer_page.headers.get('content-type'), "Description not found, left empty")

    # IMPORTANT : Avoid too many HTTP requests at the same time
    # Suspend running code for 6 to 8 seconds (5 plus a random 1 to 3)
    # simulating human behaviour; no need to wait after the last link.
    if position < len(links) - 1:
      time.sleep(5+rnd.randint(1,3))

  print("All descriptions have been stored, thank you for waiting !")

  # Add scraped description to Internships dataset
  Internships['Description'] = descriptions

  # Get final Internship dataset result
  return Internships


In [None]:
# Scrape the description of every offer; Descrape returns the Internships
# table with the added 'Description' column.
Descrape()

### In case you are interested in just one or a few descriptions after a first look at the dataset

In [None]:
def OneDescrape(index = None):
  """Scrape the description of a single offer of the global Internships table.

  Args:
    index: Zero-based row position of the offer. When it is None, or not a
      valid position, the value is asked for with input() until it is valid.

  Returns:
    The description text with the HTML tags removed, or None when the table
    is empty or the offer page has no description block.
  """
  total = len(Internships)
  if total == 0:
    print("ERROR! The Internships dataset is empty, there is nothing to scrape")
    return None

  # Valid positions are 0 .. total - 1. A loop replaces the former recursive
  # retry, and non-numeric input is treated like any other invalid value.
  while index is None or not 0 <= index < total:
    if index is not None:
      print(f"ERROR! Please insert a number between 0 and {total - 1} ")
    try:
      index = int(input("Please insert position index of the description you want to scrape: "))
    except ValueError:
      print(f"ERROR! Please insert a number between 0 and {total - 1} ")
      index = None

  # .iloc selects by position, matching what the prompt asks for.
  descr_page = requests.get(Internships["Link"].iloc[index], timeout=30)
  descr_html = descr_page.text

  descr_pat = '<div class="show-more-less-html__markup (.*?)>(.*?)</div>'

  # re.search() returns the first match object, or None when the pattern is
  # not found, so the result is checked before reading its groups.
  descr_match = re.search(descr_pat, descr_html, re.DOTALL)
  if descr_match is None:
    print("ERROR! No description found in the offer page")
    return None

  # re.sub() removes all the tags and returns only the description text.
  description = re.sub("<.*?>", "", descr_match.group(2).strip())

  return description


### Just looking at an interesting offer description

In [None]:
# Scrape one description (OneDescrape asks for the row position through
# input()) and leave it as the last expression of the cell.
description_name = OneDescrape()
description_name