In [1]:
# WEB SCRAPING
import requests #for scrapping
from bs4 import BeautifulSoup #for scrapping

# GENERIC
import pandas as pd #data transformation
import itertools #combinations
import numpy as np
import tqdm
import matplotlib.pyplot as plt # for plotting
import os
import json

# Asynchronous / multithreaded requests
import multiprocessing
import multiprocessing.pool
import nest_asyncio
nest_asyncio.apply()
import asyncio
import aiohttp

# TEXT WRANGLING
import nltk # tokenize
from nltk import pos_tag, sent_tokenize, wordpunct_tokenize # tokenize
from nltk.corpus import stopwords #stopwords
import string # for punctuation
from collections import Counter # to get frequency of words
from nltk.stem import WordNetLemmatizer # to lemmatize
from nltk.corpus import wordnet

# INITIALIZATION
nltk.download('omw-1.4')
directory = os.path.realpath(os.path.join(os.getcwd(),".."))
os.chdir(directory)
print(directory)


/Users/konstantin/Documents/Projects/McGill/McGill-INSY-669-GroupProject


[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/konstantin/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
# COMMENT PARSING
def parse_comment(soup, company):
    """returns a comment dictionary given a comment soup
    
    INPUTS: beautiful soup object containing a comment
    OUTPUT: dictionary with relevant comment fields
    """
    comment={}

    # review id
    comment['id'] = soup['id']

    # title of the review
    title_text = soup.find("a", class_="reviewLink")
    comment['Title_Review'] = title_text.text

    # stars
    stars_text = soup.find("span", attrs={"class": "ratingNumber mr-xsm"})
    comment['Stars'] = float(stars_text.text)

    # company name
    comment['Company_name'] = company

    # recommend
    elem = soup.find('div', class_="recommends").contents[0].find('svg', class_="SVGInline-svg")
    if 'css-hcqxoa-svg' in elem['class']: comment['Recommend'] = 'positive'
    elif 'css-1kiw93k-svg' in elem['class']: comment['Recommend'] = 'negative'
    elif 'css-10xv9lv-svg' in elem['class']: comment['Recommend'] = 'neutral'
    else: comment['Recommend'] = 'error'

    # ceo approval
    elem = soup.find('div', class_="recommends").contents[1].find('svg', class_="SVGInline-svg")
    if 'css-hcqxoa-svg' in elem['class']: comment['CEO_approval'] = 'positive'
    elif 'css-1kiw93k-svg' in elem['class']: comment['CEO_approval'] = 'negative'
    elif 'css-10xv9lv-svg' in elem['class']: comment['CEO_approval'] = 'neutral'
    else: comment['CEO_approval'] = 'error'

    # business outlook
    elem = soup.find('div', class_="recommends").contents[2].find('svg', class_="SVGInline-svg")
    if 'css-hcqxoa-svg' in elem['class']: comment['Business_outlook'] = 'positive'
    elif 'css-1kiw93k-svg' in elem['class']: comment['Business_outlook'] = 'negative'
    elif 'css-10xv9lv-svg' in elem['class']: comment['Business_outlook'] = 'neutral'
    else: comment['Business_outlook'] = 'error'

    # pros
    pros_text = soup.find("span", attrs={"data-test": "pros"})
    comment['Pros'] = pros_text.text

    # cons
    cons_text = soup.find("span", attrs={"data-test": "cons"}) 
    comment['Cons'] = cons_text.text

    # employee seniority
    seniority_text = soup.find("span", attrs={"class": "pt-xsm pt-md-0 css-1qxtz39 eg4psks0"})
    comment['Employee_seniority'] = seniority_text.text

    # location
    elem = soup.find('span', class_='common__EiReviewDetailsStyle__newUiJobLine')
    location = elem.text.split('-')[1].strip().split(" ")
    loc_true = False
    for word in location:
        if '\xa0in' in word: 
            loc_true = True

    if loc_true: comment['Location'] = " ".join(location[-2:])
    else:  comment['Location'] = None

    # date
    elem = soup.find('span', class_='common__EiReviewDetailsStyle__newUiJobLine')
    comment['Date'] = elem.text.split('-')[0].strip()

    return (comment)

In [3]:
# THIS MODULE DOWNLOADS & PROCESSES THE PAGES
# Main Function
def parse_page(p_n):
    """Pipeline to process a page"""

    # Downloads the Page
    page = requests.get(urls[p_n], headers)

    # Extracts Comments
    soup = BeautifulSoup(page.text, "html.parser")
    page_comments = soup.find_all('li', class_='empReview')
    
    # Cleans Comments
    page_comments_clean = []
    for comment in page_comments:
        page_comments_clean.append(parse_comment(comment, COMPANY))

    print(f"PAGE DONE: {p_n+1}, COMMENTS: {len(page_comments_clean)}")

    page_to_redo = []
    if len(page_comments_clean) == 0: page_to_redo.append(p_n)

    return(page_comments_clean, page_to_redo)


# Prepare Folders
COMPANY = "Amazon"
if not os.path.exists(os.path.join(directory, 'data', COMPANY)): os.mkdir(os.path.join(directory, 'data', COMPANY))

# EXECUTION
PAGES_TO_LOAD = 1000
headers = {'User-Agent': 'Mozilla/5.0'}
urls = ["https://www.glassdoor.ca/Reviews/Amazon-Reviews-E6036_P"+str(i+1)+".htm?filter.iso3Language=eng" for i in range(PAGES_TO_LOAD)]

pages = [p for p in range(PAGES_TO_LOAD)]
comments_clean = []
while len(pages) > 0:
    new_pages = pages.copy()
    pages = []

    with multiprocessing.pool.ThreadPool(multiprocessing.cpu_count()) as P:
        for clean_com, page_to_redo in P.map(parse_page, new_pages):
            comments_clean.extend(clean_com)
            pages.extend(page_to_redo)
    
    print(f"TO REDO {len(pages)}")

PAGE DONE: 129, COMMENTS: 0
PAGE DONE: 1, COMMENTS: 0
PAGE DONE: 225, COMMENTS: 0
PAGE DONE: 65, COMMENTS: 0
PAGE DONE: 161, COMMENTS: 0
PAGE DONE: 97, COMMENTS: 10
PAGE DONE: 2, COMMENTS: 10
PAGE DONE: 193, COMMENTS: 10
PAGE DONE: 130, COMMENTS: 10
PAGE DONE: 194, COMMENTS: 0
PAGE DONE: 131, COMMENTS: 0
PAGE DONE: 195, COMMENTS: 0
PAGE DONE: 132, COMMENTS: 0
PAGE DONE: 33, COMMENTS: 10
PAGE DONE: 162, COMMENTS: 10PAGE DONE: 133, COMMENTS: 0

PAGE DONE: 34, COMMENTS: 0
PAGE DONE: 134, COMMENTS: 0
PAGE DONE: 135, COMMENTS: 0
PAGE DONE: 226, COMMENTS: 10
PAGE DONE: 66, COMMENTS: 10
PAGE DONE: 227, COMMENTS: 0
PAGE DONE: 67, COMMENTS: 0
PAGE DONE: 3, COMMENTS: 10
PAGE DONE: 4, COMMENTS: 0
PAGE DONE: 98, COMMENTS: 10
PAGE DONE: 99, COMMENTS: 0
PAGE DONE: 100, COMMENTS: 0
PAGE DONE: 163, COMMENTS: 10
PAGE DONE: 136, COMMENTS: 10
PAGE DONE: 35, COMMENTS: 10
PAGE DONE: 228, COMMENTS: 10
PAGE DONE: 68, COMMENTS: 10
PAGE DONE: 229, COMMENTS: 0
PAGE DONE: 230, COMMENTS: 0
PAGE DONE: 231, COMMENT

In [5]:
# THIS MODULE SAVES THE CLEAN DATA
df_clean = pd.DataFrame.from_records(comments_clean)
df_clean.to_csv(f'data/{COMPANY}/{COMPANY}_comments.csv', index=False)

In [6]:
df_clean = pd.read_csv(f'data/{COMPANY}/{COMPANY}_comments.csv')
df_clean

Unnamed: 0,id,Title_Review,Stars,Company_name,Recommend,CEO_approval,Business_outlook,Pros,Cons,Employee_seniority,Location,Date
0,empReview_73247758,Good impression in the first months,5.0,Amazon,positive,neutral,neutral,Documentation on Amazon is a super important p...,You need to understand your job and what you n...,"Current Employee, less than 1 year","Toronto, ON","Feb. 2, 2023"
1,empReview_73187609,intern,5.0,Amazon,neutral,neutral,neutral,4 days shifts is nice,long hour shift can make you feel tired,"Former Employee, less than 1 year","Toronto, ON","Jan. 31, 2023"
2,empReview_73188818,good,5.0,Amazon,positive,positive,positive,"great work balance, great environment, locatio...",workload can be heavy sometimes,"Former Employee, more than 1 year","Amazon, SK","Jan. 31, 2023"
3,empReview_73190433,job review,5.0,Amazon,positive,positive,positive,"good benefit, flexible time shift. take care o...","better organization of work, better car parkin...",Former Employee,,"Jan. 31, 2023"
4,empReview_73197210,Growth Opportunity,4.0,Amazon,positive,negative,negative,"Fast paced, Start-Up Culture, Benefits","Compensation, Growth Prospects, Development Op...","Current Employee, more than 3 years","Vancouver, BC","Jan. 31, 2023"
...,...,...,...,...,...,...,...,...,...,...,...,...
9995,empReview_71536795,Great pay and onboarding,5.0,Amazon,positive,positive,positive,Amazon has wonderful search sites where you ca...,The interview process is very long but worth i...,"Current Employee, less than 1 year",,"Dec. 1, 2022"
9996,empReview_71537065,great comp,5.0,Amazon,neutral,neutral,neutral,great company easy to find an area you like,can get unlucky with team.,Current Employee,,"Dec. 1, 2022"
9997,empReview_71539933,"so far,so good!",5.0,Amazon,positive,positive,positive,"great teamwork, great working environment, per...",a little far from home,"Current Employee, less than 1 year","Querétaro, Querétaro","Dec. 1, 2022"
9998,empReview_71882994,Used to be a great company,2.0,Amazon,negative,negative,negative,You will become an excellent problem solver us...,"Cut-throat management and toxic culture, unnec...","Former Employee, more than 1 year","Diego, CA","Dec. 15, 2022"
