In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import matplotlib.pyplot as plt

def getSoup(url: str) -> BeautifulSoup:
    
    page = requests.get(url)
    bs = BeautifulSoup(page.content, "html.parser")
    page.close()
    
    return bs

In [2]:
url = "https://github.com/SimplifyJobs/Summer2024-Internships"

soup = getSoup(url)
print(soup.prettify())

<!DOCTYPE html>
<html data-a11y-animated-images="system" data-a11y-link-underlines="true" data-color-mode="auto" data-dark-theme="dark" data-light-theme="light" lang="en">
 <head>
  <meta charset="utf-8"/>
  <link href="https://github.githubassets.com" rel="dns-prefetch"/>
  <link href="https://avatars.githubusercontent.com" rel="dns-prefetch"/>
  <link href="https://github-cloud.s3.amazonaws.com" rel="dns-prefetch"/>
  <link href="https://user-images.githubusercontent.com/" rel="dns-prefetch"/>
  <link crossorigin="" href="https://github.githubassets.com" rel="preconnect"/>
  <link href="https://avatars.githubusercontent.com" rel="preconnect"/>
  <link crossorigin="anonymous" href="https://github.githubassets.com/assets/light-b92e9647318f.css" media="all" rel="stylesheet">
   <link crossorigin="anonymous" href="https://github.githubassets.com/assets/dark-5d486a4ede8e.css" media="all" rel="stylesheet">
    <link crossorigin="anonymous" data-color-theme="dark_dimmed" data-href="https://

In [3]:
table = soup.find('table')

In [4]:
def scrape_table(table): 
    return_list = []
    for row in table.tbody.find_all('tr'):    
        # Find all data for each column
        columns = row.find_all('td')
        
        if(columns != []):
            Company = columns[0].text.strip()
            Role = columns[1].text.strip()
            Location = columns[2].text.strip()
            a_tag = columns[3].find('a')
            Link = a_tag.get('href') if a_tag else None
            DatePosted = columns[4].text.strip()
            return_list.append((Company, Role, Location, Link, DatePosted))
    return return_list

data = scrape_table(table)
df1 = pd.DataFrame(data, columns=['Company', 'Role', 'Location', 'Link', 'DatePosted'])
df1.head()

Unnamed: 0,Company,Role,Location,Link,DatePosted
0,The New York Times,R&D Software Engineer Intern 🛂,NYC,https://boards.greenhouse.io/thenewyorktimes/j...,Nov 18
1,↳,Android Engineer Intern 🛂,NYCHybridRemote,https://boards.greenhouse.io/thenewyorktimes/j...,Nov 18
2,↳,Full-Stack Engineering Intern 🛂,NYCHybridRemote,https://boards.greenhouse.io/thenewyorktimes/j...,Nov 18
3,↳,Newsroom Software Engineering Intern 🛂,NYCHybridRemote,https://boards.greenhouse.io/thenewyorktimes/j...,Nov 18
4,↳,Data Science Intern 🛂,NYCHybridRemote,https://boards.greenhouse.io/thenewyorktimes/j...,Nov 18


In [5]:
for i in range(df1.shape[0]):
    if (df1.iloc[i, df1.columns.get_loc('Company')] == "↳"):
        df1.iloc[i, df1.columns.get_loc('Company')] = df1.iloc[i - 1, df1.columns.get_loc('Company')]
df1['Role'] = df1['Role'].str.replace("🛂", "")
df1['Role'] = df1['Role'].str.replace("🇺🇸", "")
df1['Role'] = df1['Role'].str.replace("Äì", "")
df1 = df1.dropna(subset = ['Link'])
rows_to_drop = df1[df1['Company'] == '↳'].index
df1 = df1.drop(rows_to_drop)
df1['Link'] = df1['Link'].str.replace('?utm_source=Simplify&ref=Simplify', "")
df1['Link'] = df1['Link'].str.replace('&utm_source=Simplify&ref=Simplify', "")

df1.head()

Unnamed: 0,Company,Role,Location,Link,DatePosted
0,The New York Times,R&D Software Engineer Intern,NYC,https://boards.greenhouse.io/thenewyorktimes/j...,Nov 18
1,The New York Times,Android Engineer Intern,NYCHybridRemote,https://boards.greenhouse.io/thenewyorktimes/j...,Nov 18
2,The New York Times,Full-Stack Engineering Intern,NYCHybridRemote,https://boards.greenhouse.io/thenewyorktimes/j...,Nov 18
3,The New York Times,Newsroom Software Engineering Intern,NYCHybridRemote,https://boards.greenhouse.io/thenewyorktimes/j...,Nov 18
4,The New York Times,Data Science Intern,NYCHybridRemote,https://boards.greenhouse.io/thenewyorktimes/j...,Nov 18


In [6]:
df1.to_csv(r'comprehensive_listing.csv', index=False, header=True)

In [7]:
old_df2 = pd.read_csv(r'data_science_listing.csv')

stop_company = old_df2.iloc[0, 0]
stop_role = old_df2.iloc[0, 1]
stop_location = old_df2.iloc[0, 2]
# we skip 'Link' because link might disappear because the job is no longer availale
stop_date = old_df2.iloc[0,4]

old_df2.head()

Unnamed: 0,Company,Role,Location,Link,DatePosted
0,Visa,Associate Data Scientist – Intern - Cybersecurity,"Ashburn, VA",https://jobs.smartrecruiters.com/Visa/74399994...,Nov 09
1,Fizz,Machine Learning Intern - Summer 2024,"Palo Alto, CA",https://jobs.lever.co/fizz/d3647110-e2c2-4422-...,Nov 09
2,Elevance Health,Data Analyst Internship - Summer 2024 - Underg...,"Chicago, IL",https://elevancehealth.wd1.myworkdayjobs.com/e...,Nov 09
3,Databricks,Genai Machine Learning Engineer Intern,SF,https://boards.greenhouse.io/embed/job_app?tok...,Nov 09
4,Zanbato,Data Engineering Intern Spring/Summer 2024,"Mountain View, CA",https://boards.greenhouse.io/zanbato/jobs/5486989,Nov 06


In [14]:
keyword1 = 'Data'
keyword2 = 'Machine'
keyword3 = 'Learning'
keyword4 = 'AI'

df2 = df1[df1['Role'].str.split().apply(lambda words: (keyword1 in words) or (keyword2 in words)
                                               or (keyword3 in words) or (keyword4 in words))]
df2.head()

Unnamed: 0,Company,Role,Location,Link,DatePosted
4,The New York Times,Data Science Intern,NYCHybridRemote,https://boards.greenhouse.io/thenewyorktimes/j...,Nov 18
5,The New York Times,Data Analyst Insights Intern,NYCHybridRemote,https://boards.greenhouse.io/thenewyorktimes/j...,Nov 18
8,Intuitive Surgical,Data Science Intern,"Sunnyvale, CA",https://careers.intuitive.com/en/jobs/74399994...,Nov 18
13,Denali Therapeutics,Data Science - Intern,"San Francisco, CA",https://boards.greenhouse.io/dnli/jobs/5513060...,Nov 18
33,Oracle,UX Data Science - Intern,United States,https://eeho.fa.us2.oraclecloud.com/hcmUI/Cand...,Nov 15


In [15]:
df2.to_csv(r'data_science_listing.csv', index=False, header=True)

In [16]:
df3 = pd.DataFrame(columns=df2.columns)

for index, row in df2.iterrows():
    if ((row[0] == stop_company) and (row[1] == stop_role) 
    and (row[2] == stop_location) and (row[4] == stop_date)):
        break
    df3.loc[index] = row

df3.head()

  if ((row[0] == stop_company) and (row[1] == stop_role)
  if ((row[0] == stop_company) and (row[1] == stop_role)
  if ((row[0] == stop_company) and (row[1] == stop_role)
  if ((row[0] == stop_company) and (row[1] == stop_role)
  if ((row[0] == stop_company) and (row[1] == stop_role)
  if ((row[0] == stop_company) and (row[1] == stop_role)
  if ((row[0] == stop_company) and (row[1] == stop_role)
  and (row[2] == stop_location) and (row[4] == stop_date)):


Unnamed: 0,Company,Role,Location,Link,DatePosted
4,The New York Times,Data Science Intern,NYCHybridRemote,https://boards.greenhouse.io/thenewyorktimes/j...,Nov 18
5,The New York Times,Data Analyst Insights Intern,NYCHybridRemote,https://boards.greenhouse.io/thenewyorktimes/j...,Nov 18
8,Intuitive Surgical,Data Science Intern,"Sunnyvale, CA",https://careers.intuitive.com/en/jobs/74399994...,Nov 18
13,Denali Therapeutics,Data Science - Intern,"San Francisco, CA",https://boards.greenhouse.io/dnli/jobs/5513060...,Nov 18
33,Oracle,UX Data Science - Intern,United States,https://eeho.fa.us2.oraclecloud.com/hcmUI/Cand...,Nov 15


In [17]:
df3.to_csv(r'positions_to_update.csv', index=False, header=True)

In [18]:
df_to_read = df3
with open('README.md', 'a') as file:
    for index, row in df_to_read.iterrows():
        line = f"| {row['Company']} | [{row['Role']}]({row['Link']}) | {row['Location']} | Bachelors, Masters |\n"
        file.write(line)