This notebook prototypes our InternSg scraper before it is modularised into a Python file for easier reference. The aim is for the scraper to collect every active posting on InternSg and record its details.

In [11]:
# import modules 
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests

In [12]:
# Build the URL of every listing page up front so the scraping cell only has to iterate.
range_of_pages = 80 # based on visual inspection

# range() excludes its stop value, so this yields pages 1 .. range_of_pages - 1
list_of_all_pages = [
    f'https://www.internsg.com/jobs/{page_number}/#isg-top'
    for page_number in range(1, range_of_pages)
]

In [13]:
# Collect the link and posting date of every active listing on each listing page.
data = []
for page_url in list_of_all_pages:
    # get request to scrape the listing page; the timeout stops a single stalled
    # connection from hanging the whole run (requests waits indefinitely by default)
    response = requests.get(page_url, timeout=30)
    soup = BeautifulSoup(response.content, 'html.parser')

    # the divs with class ast-col-lg-1 are the source for the posting url and validity
    divs = soup.find_all('div', class_='ast-col-lg-1')

    # go through all possible listings and keep active listings only
    for div in divs:
        a_tag = div.find('a')

        # no link (or a link without href) is treated as an inactive listing: skip it
        if not (a_tag and 'href' in a_tag.attrs):
            continue

        # only listings carrying the date badge inside the link are recorded
        date_span = a_tag.find('span', class_='text-monospace badge badge-success')
        if not date_span:
            continue

        # store url and date information; a separate name is used for the posting
        # link so the listing-page loop variable is never overwritten
        posting_url = a_tag['href']
        data.append({'URL': posting_url, 'Date': date_span.get_text().strip()})

# explicit columns keep 'URL' and 'Date' present even when nothing was scraped,
# so the next cell's title_url_df['URL'] lookup cannot raise KeyError
title_url_df = pd.DataFrame(data, columns=['URL', 'Date'])
    

In [14]:
# identify information that is worth keeping 
col_names = ['Company','Designation','Date Listed','Job Type','Job Period','Profession',
             'Industry','Location Name','Allowance / Remuneration','Company Profile',
             'Job Description']
jobs_info = []

# Visit every posting collected above and pull out one row of fields per posting.
for url in title_url_df['URL']:
    # get request to scrape url; the timeout stops a single stalled connection
    # from hanging the whole run (requests waits indefinitely by default)
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.content, 'html.parser')

    # every field starts as '' so a posting that lacks a field still yields a full row;
    # the name avoids shadowing the builtin `dict` for the rest of the notebook
    job_info = dict.fromkeys(col_names, '')

    for col_name in col_names:
        # Find the bold label div whose text is the column name
        # (`string=` is the current name of bs4's older `text=` argument)
        col_div = soup.find('div', string=col_name, class_='font-weight-bold')
        if not col_div:
            continue

        # The actual data is in the next sibling element of the label div
        next_div = col_div.find_next_sibling()
        if not next_div:
            continue

        # Drop nested <span> elements so their text is left out of the value
        for span in next_div.find_all('span'):
            span.decompose()
        # Extract the remaining text and store it in the dictionary
        job_info[col_name] = next_div.get_text(strip=True)

    # store info inside jobs_info
    jobs_info.append(job_info)

# explicit columns keep the schema stable even when no postings were scraped
internSG_jobs = pd.DataFrame(jobs_info, columns=col_names)

In [15]:
# preview the first rows of the scraped postings table as a quick sanity check
internSG_jobs.head()

Unnamed: 0,Company,Designation,Date Listed,Job Type,Job Period,Profession,Industry,Location Name,Allowance / Remuneration,Company Profile,Job Description
0,Tmr Media Pte Ltd,Digital Marketing Intern,31 May 2024,Entry Level / Junior Executive,"Immediate Start, For At Least 3 Months",Advertising / Media,Creative / Media,"60 Kaki Bukit Place, Singapore","$800 - 1,000 monthly",TMR Media is a Social Media focus media agency...,- Support and implement digital marketing acti...
1,Creative For More,Graphic Design & Illustration Intern,31 May 2024,"Entry Level / Junior Executive, Experienced / ...","From 01 Jun 2024, For At Least 3 Months",Design / Creative,Creative / Media,"Tanjong Katong Road, Singapore","$800 - 1,250 monthly",Graphic Design & Illustration InternCreative F...,What you will get to learnYou will get to unde...
2,I'm In,Creative Intern / Creative Executive,31 May 2024,"Entry Level / Junior Executive, Experienced / ...","Flexible Start, For At Least 6 Months",Design / Creative,Retail / eCommerce,Singapore,"$2,000 - 2,450 monthly",IMINXX.COM- I’M IN is a lingerie brand that de...,Role Description:The position will work in clo...
3,TwentyFour7,Blockchain Developer,31 May 2024,Experienced / Senior Executive,"Immediate Start, For At Least 6 Months",Engineering,Computer and IT,United States,$45 hourly,TwentyFour7 was founded on the principle that ...,We are seeking a skilled and experienced Block...
4,Trilogy Technologies Pte Ltd,Administrative Executive,31 May 2024,"Entry Level / Junior Executive, Experienced / ...","Immediate Start, Permanent",Engineering,Electronics,"Ang Mo Kio Avenue 5, Singapore","$2,500 - 4,500 monthly",Trilogy Technologies Pte Ltd is a Singapore ba...,We are looking for a dynamic and adaptive indi...


In [16]:
# store this as a file inside csv_files
# A forward slash is used because '\_' is an invalid escape sequence in a normal
# string literal and a backslash is only a path separator on Windows; '/' is
# accepted on every platform. The csv_files directory must already exist.
internSG_jobs.to_csv('csv_files/_all_job_postings_.csv')