In [None]:
from bs4 import BeautifulSoup
import urllib
import requests
import pandas as pd
from pprint import pprint
from google.colab import drive

In [None]:
# Mount Google Drive at /drive; indeed_search reads and writes its CSV under
# '/drive/My Drive/Colab Notebooks/Internship Scrapper/'.
drive.mount('/drive')

Mounted at /drive


In [None]:
def get_job_information(job_soup):
  """Extract the fields of one job posting from its result card.

  Args:
    job_soup: Parsed element for a single listing. It only needs to offer
      ``find(...)`` returning elements that expose ``.text`` and ``.get(...)``
      (or ``None`` when nothing matches).

  Returns:
    dict with the keys 'Applied', 'Title', 'Company', 'Location', 'Rating',
    'Salary' and 'Link'. 'Applied' is always 'no'. Rating, salary and link
    fall back to 'No reported rating' / 'No reported salary' /
    'No reported link' when the card does not provide them.

  Raises:
    AttributeError: if the title, company or location element is missing.
  """
  # Title, company and location are treated as mandatory: a card without them
  # fails loudly here instead of producing a half-empty row.
  # NOTE(review): the recorded run shows a title reading "newData Scientist
  # Intern", so the title element's text apparently can include a "new" badge
  # -- confirm against the page markup before stripping it.
  title = job_soup.find(class_='jobTitle').text.strip()
  company = job_soup.find(class_='companyName').text.strip()
  location = job_soup.find(class_='companyLocation').text.strip()

  # Optional fields: test for absence explicitly instead of using a bare
  # `except`, which would also hide unrelated errors and KeyboardInterrupt.
  rating = 'No reported rating'
  rating_soup = job_soup.find(class_='ratingNumber')
  if rating_soup is not None:
    # Only the first word of the aria-label is kept as the rating value.
    label_words = (rating_soup.get("aria-label") or "").split()
    if label_words:
      rating = label_words[0]

  salary_soup = job_soup.find(class_='estimated-salary')
  salary = 'No reported salary' if salary_soup is None else salary_soup.text.strip()

  link = 'No reported link'
  link_soup = job_soup.find('a', class_='jcs-JobTitle')
  if link_soup is not None:
    href = link_soup.get("href") or ""
    job_key = link_soup.get('data-jk')
    if href.startswith('/rc/'):
      # '/rc/' hrefs are used as-is; every other href is replaced by a
      # viewjob URL built from the posting's data-jk attribute.
      link = 'www.indeed.com' + href
    elif job_key:
      link = 'www.indeed.com/viewjob?jk=' + job_key

  return {
      'Applied' : 'no',
      'Title' : title,
      'Company' : company,
      'Location' : location,
      'Rating' : rating,
      'Salary' : salary,
      'Link' : link
  }

In [None]:
def get_dates_posted(jobs_soup):
  """Collect the posting age shown on every result as a list of strings.

  Args:
    jobs_soup: Parsed results container; every ``span`` with class ``date``
      inside it is inspected, in the order ``find_all`` returns them.

  Returns:
    One string per date span: the decimal digits found in the span's text,
    with '+' appended when that number equals 30, or '' when the text
    contains no digits at all.
  """
  dates_posted = []
  for date in jobs_soup.find_all('span', class_="date"):
    # Pull the digits out of the whole text rather than slicing fixed-width
    # prefixes/suffixes off first: slicing emptied short texts, and int('')
    # then raised ValueError. isdecimal() keeps only characters int() accepts.
    only_number = "".join(char for char in date.text if char.isdecimal())
    if not only_number:
      # No day count to parse (e.g. a wording without any number).
      dates_posted.append("")
      continue
    # A value of exactly 30 is written as '30+'.
    # NOTE(review): presumably the site caps the displayed age at 30 days -- confirm.
    if int(only_number) == 30:
      only_number += '+'
    dates_posted.append(only_number)
  return dates_posted

In [None]:
def indeed_search(job_title, location, path):
  """Scrape one page of Indeed results and merge them into a CSV on Drive.

  Args:
    job_title: Search query; '' falls back to "Data Scientist Intern".
    location: Search location; '' falls back to "Boston, MA".
    path: File name of the CSV inside
      '/drive/My Drive/Colab Notebooks/Internship Scrapper/'.

  Returns:
    pandas.DataFrame with the jobs found by this search only (columns
    Applied, Title, Company, Location, Rating, Salary, Link, Dates Posted).
    The merged file content is written to disk, not returned.

  Raises:
    RuntimeError: if the downloaded page has no element with id "resultsCol".

  Side effects: performs an HTTP GET, prints progress messages and creates or
  rewrites the CSV file.
  """
  if job_title == "":
    job_title = "Data Scientist Intern"
  if location == "":
    location = "Boston, MA"

  columns = ["Applied", "Title", "Company", "Location", "Rating", "Salary", "Link"]

  url_header = 'https://www.indeed.com/jobs?'
  parameters = urllib.parse.urlencode({
      'q' : job_title,
      'l' : location
  })
  print("Searching for jobs at the following link:", url_header + parameters)
  # Without a timeout requests waits indefinitely on a stalled connection.
  page = requests.get(url=url_header + parameters, timeout=30).content

  page_soup = BeautifulSoup(page, "html.parser")
  jobs_soup = page_soup.find(id="resultsCol")
  if jobs_soup is None:
    # Fail with a readable message instead of an AttributeError on None.
    raise RuntimeError(
        "No element with id 'resultsCol' in the response from "
        + url_header + parameters
    )

  # Build every row first and create the frame once instead of growing it
  # one .loc assignment at a time.
  job_listings = jobs_soup.find_all(class_="resultContent")
  jobs = pd.DataFrame(
      [get_job_information(job) for job in job_listings],
      columns=columns,
  )
  jobs["Dates Posted"] = get_dates_posted(jobs_soup)

  folder_path = '/drive/My Drive/Colab Notebooks/Internship Scrapper/' + path
  try:
    # Read everything back as plain strings: the scraped values are strings,
    # so letting read_csv infer numbers would stop drop_duplicates from ever
    # matching an already-saved row.
    existing = pd.read_csv(folder_path, dtype=str, keep_default_na=False)
  except (FileNotFoundError, pd.errors.EmptyDataError):
    # Only a missing or empty file means "start fresh"; any other failure
    # propagates rather than silently overwriting previously saved rows.
    print("Creating CSV")
    jobs.to_csv(folder_path, index = False)
  else:
    # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
    merged = pd.concat([existing, jobs.astype(str)], ignore_index=True)
    merged.drop_duplicates().to_csv(folder_path, index=False)
    print("Putting " + str(len(jobs)) + " into the file " + path)
  return jobs

In [None]:
# Ask for the search terms; an empty answer makes indeed_search fall back to
# its defaults ("Data Scientist Intern" / "Boston, MA").
job_title = input("What Job? ")
job_location = input("Location? ")
# File name only; indeed_search prepends the Drive folder it saves into.
path = 'Internships.csv'
# Runs the scrape, updates the CSV and returns the rows found by this search.
job_list = indeed_search(job_title, job_location, path)
print(job_list)

What Job? 
Location? 
Searching for jobs at the following link: https://www.indeed.com/jobs?q=Data+Scientist+Intern&l=Boston%2C+MA
Putting 14 into the file Internships.csv
   Applied                                              Title  \
0       no                           newData Scientist Intern   
1       no                         Lead Data Scientist Intern   
2       no                   Machine Learning Intern (Remote)   
3       no                                Data Science Intern   
4       no  Internship - Data Science and Machine Learning...   
5       no  Data Science & Artificial Intelligence (DxS) I...   
6       no         Graduate Student Intern - Machine Learning   
7       no                               AI Engineer (Intern)   
8       no          Summer Intern: Acorn AI Labs Data Science   
9       no    Computational Biology - Machine Learning Intern   
10      no  Research Intern - Machine Learning, Statistics...   
11      no  Internship: Data-Driven Optimization