# Scraping Company Reviews (Glassdoor)

In [1]:
import requests
import html5lib
import lxml
from tqdm import tqdm
import pandas as pd
from bs4 import BeautifulSoup

### Some websites have systems in place that block automated HTTP requests. If a request is rejected, pass the following dictionary as the `headers` argument of `requests.get()`

- Use the following if the request status is 400 or 403

In [2]:
# Extra HTTP headers passed to requests.get(); the User-Agent entry presents
# the client as a desktop Firefox browser.
# NOTE(review): the Access-Control-* entries are normally CORS *response*
# headers -- confirm whether they are actually needed on the request.
headers = {
    "Access-Control-Allow-Origin": "*",
    "Access-Control-Allow-Methods": "GET",
    "Access-Control-Allow-Headers": "Content-Type",
    "Access-Control-Max-Age": "3600",
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0",
}

In [3]:
# First page of the Glassdoor review listing for each company.
urlApple, urlGoogle = (
    'https://www.glassdoor.sg/Reviews/Apple-Reviews-E1138.htm',
    'https://www.glassdoor.sg/Reviews/Google-Reviews-E9079.htm',
)

In [4]:
# Fetch the first Apple review page. A timeout is set so that a stalled
# connection raises instead of blocking the kernel indefinitely.
req = requests.get(urlApple, headers=headers, timeout=30)
# Displaying the response shows its status code (200 means the request succeeded).
req

<Response [200]>

## Scraping reviews from the first page

In [5]:
# Parse the body of the response into a BeautifulSoup tree so that its
# elements can be searched; nothing is displayed by this cell.
# Note: each review page is a separate URL, so scraping several pages
# requires one HTTP request per page.
html = BeautifulSoup(markup=req.content, features='html.parser')

In [19]:
def reviewForAPage(html):
    """Extract the pros and cons text of every review on one parsed page.

    Parameters
    ----------
    html : BeautifulSoup (or any object exposing ``find_all``)
        Parsed review page.

    Returns
    -------
    (pros, cons) : tuple of two lists of str
        ``pros[i]`` and ``cons[i]`` belong to the same review. Reviews that do
        not contain both a pros and a cons paragraph with a ``<span>`` are
        skipped, so the two lists always have the same length.
    """
    # Exact class attribute value of the <p> elements holding the review text.
    # Kept byte-for-byte (including the trailing space) from the selector that
    # produced the outputs shown in this notebook.
    bodyClass = 'mt-0 mb-0 pb v2__EIReviewDetailsV2__bodyColor v2__EIReviewDetailsV2__lineHeightLarge v2__EIReviewDetailsV2__isExpanded '

    pros = []
    cons = []
    for review in html.find_all('div', class_='gdReview'):
        reviewBox = review.find_all('p', class_=bodyClass)

        # The first matching paragraph is the pros, the second the cons.
        # A review missing either one would otherwise raise IndexError and
        # abort the whole scrape.
        if len(reviewBox) < 2:
            continue

        pro_span = reviewBox[0].select_one("span")
        cons_span = reviewBox[1].select_one("span")

        # select_one returns None when the paragraph has no <span>.
        if pro_span is None or cons_span is None:
            continue

        # Append both together so the two lists stay aligned.
        pros.append(pro_span.text)
        cons.append(cons_span.text)
    return pros, cons

In [20]:
# Extract the pros and cons text from the parsed page currently held in `html`
proslst, conslst = reviewForAPage(html)

In [21]:
# Pros text returned by reviewForAPage, one string per review
proslst

['-amazing benefits\r\n-friendly people\r\n-good salary\r\n-flexible hours\r\n-great for students',
 'nice good pay i love it',
 'Good salary and great working culture',
 'Everything is good Environmental friendly',
 'Love the benefits\r\ngreat feedback culture\r\nalot of social events',
 'They have a good policy and do not judge you for any problems you might have in your life that may disrupt your work, they are happy to give you time off to receover even its mentally which is not what every company does.',
 'Great company great boss b',
 'Good Flexi benefits and good on site gym',
 'Nice food, nice environment, very very good benefit',
 'Brand name, discount product, wfh, no phone call']

In [22]:
# Cons text returned by reviewForAPage, one string per review
conslst

['hardly gets weekends off and would have to work late at night',
 'nothing much tbh i really like jt',
 'Nothing in particular i can think of',
 'No staff discount and not much sales',
 'low progression if youre in retail\r\nquite hectic and busy\r\nretail hours can be draining (esp weekends)',
 'Retail workers get paid lower than Corporate but they work really hard, but they do get benefits.\r\n\r\nScheduling can be irregular and for long-term work its not healthy.',
 'No cons all good g',
 'Pay increment low and career prospects needs to improve',
 'Stressful environment, hardly have time to rest',
 'Repetitive work, no career progress, mid schoold students can do type of job unfortunatly, quite depressing, moreover you get WP']

## We will now scrape all available reviews

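On inspection, we can see that Apple has over 24K reviews spread across more than 2,700 pages, so a full scrape would stop at page 2700. The code below only fetches the first 10 pages as a demonstration; raise `maximum` to scrape more.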

In [23]:
from threading import Thread
import httplib2, sys
from queue import Queue

# Number of worker threads used for concurrent page retrieval.
concurrent = 10

# Parsed pages are collected here.
reviewCorpus = []

# Bounded queue holding at most twice as many items as there are workers.
q = Queue(maxsize=concurrent * 2)

# Extra HTTP headers passed to requests.get(); the User-Agent entry presents
# the client as a desktop Firefox browser.
headers = {
    "Access-Control-Allow-Origin": "*",
    "Access-Control-Allow-Methods": "GET",
    "Access-Control-Allow-Headers": "Content-Type",
    "Access-Control-Max-Age": "3600",
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0",
}

# Method 1:
def retrievalMethod1(lastPage=None):
    """Download and parse Apple review pages concurrently.

    The previous version only defined its nested helpers and never invoked
    them, so calling it did nothing. This version actually performs the
    retrieval using a thread pool of ``concurrent`` workers.

    Parameters
    ----------
    lastPage : int, optional
        Pages ``1..lastPage`` are fetched. Defaults to ``concurrent``, the
        page range the original code queued.

    Returns
    -------
    list
        One parsed ``BeautifulSoup`` document per page, in page order. The
        same documents are also appended to the module-level ``reviewCorpus``.

    Raises
    ------
    requests.RequestException
        Propagated from ``requests.get`` (including timeouts).
    """
    # Local import: the module-level variable named `concurrent` does not
    # interfere with importing from the standard-library package of that name.
    from concurrent.futures import ThreadPoolExecutor

    if lastPage is None:
        lastPage = concurrent

    urls = [
        f"https://www.glassdoor.sg/Reviews/Apple-Reviews-E1138_P{urlPage}.htm?filter.iso3Language=eng"
        for urlPage in range(1, lastPage + 1)
    ]

    def pageRetrieval(url):
        # Timeout so one stalled connection cannot hang a worker forever.
        req = requests.get(url, headers=headers, timeout=30)
        return BeautifulSoup(req.content, 'html.parser')

    # The pool is shut down when the `with` block exits, so no worker threads
    # are left running after the call. `map` yields results in input order,
    # which keeps the corpus ordered by page number.
    with ThreadPoolExecutor(max_workers=concurrent) as pool:
        pages = list(tqdm(pool.map(pageRetrieval, urls), total=len(urls)))

    reviewCorpus.extend(pages)
    return pages

In [24]:
maximum = 10          # number of review pages to fetch (pages 1..maximum)
REQUEST_TIMEOUT = 30  # seconds before a stalled request raises instead of hanging
reviewCorpus = []

# `headers` is defined in the earlier cells and reused here unchanged.

# Very inefficient!! Find a way to parallelise this process. Try Twisted
for currPage in tqdm(range(1, maximum + 1)):
    url = f'https://www.glassdoor.sg/Reviews/Apple-Reviews-E1138_P{currPage}.htm?filter.iso3Language=eng'
    # Distinct names are used so the first-page `req` / `html` objects created
    # in the earlier cells are not overwritten by this loop.
    pageReq = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    reviewCorpus.append(BeautifulSoup(pageReq.content, 'html.parser'))

# Flatten the per-page results; pros and cons stay aligned by position.
prosList = []
consList = []

for pageHtml in reviewCorpus:
    pros, cons = reviewForAPage(pageHtml)
    prosList.extend(pros)
    consList.extend(cons)

# Put it in a pandas dataframe (built in a single constructor call)
dfReviews = pd.DataFrame({'Pro Reviews': prosList, 'Con Reviews': consList})
dfReviews

100%|██████████| 10/10 [01:25<00:00,  8.58s/it]


Unnamed: 0,Pro Reviews,Con Reviews
0,-amazing benefits\r\n-friendly people\r\n-good...,hardly gets weekends off and would have to wor...
1,nice good pay i love it,nothing much tbh i really like jt
2,Good salary and great working culture,Nothing in particular i can think of
3,Everything is good Environmental friendly,No staff discount and not much sales
4,Love the benefits\r\ngreat feedback culture\r\...,low progression if youre in retail\r\nquite he...
...,...,...
95,Good benefit and great people from all walks o...,Asia department bosses micro manage. A manager...
96,"Working with very friendly staffs, and very ni...",Nothing I can think off at the moment. None No...
97,Good good love love love,Busy busy people oh people
98,"friendly managers and boss, good working location",average pay. work on weekend
