In [1]:
# WEB SCRAPING
import requests #for scrapping
from bs4 import BeautifulSoup #for scrapping

# GENERIC
import pandas as pd #data transformation
import itertools #combinations
import numpy as np
import tqdm
import matplotlib.pyplot as plt # for plotting
import os
import json

# Asynchronous / multithreaded requests
import multiprocessing
import multiprocessing.pool

# TEXT WRANGLING
import nltk # tokenize
from nltk import pos_tag, sent_tokenize, wordpunct_tokenize # tokenize
from nltk.corpus import stopwords #stopwords
import string # for punctuation
from collections import Counter # to get frequency of words
from nltk.stem import WordNetLemmatizer # to lemmatize
from nltk.corpus import wordnet

# INITIALIZATION
nltk.download('omw-1.4')
# Resolve the project root (parent of the starting working directory) only once
# per kernel: recomputing it from os.getcwd() after the chdir below would climb
# one more directory level every time this cell is re-run.
if 'directory' not in globals():
    directory = os.path.realpath(os.path.join(os.getcwd(), ".."))
os.chdir(directory)
print(directory)


F:\Documents\Projects\McGill\McGill-INSY-669-GroupProject


[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Konstantin\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
# COMMENT PARSING
def parse_comment(soup, company):
    """Return a dictionary of review fields extracted from one review element.

    INPUTS:
        soup: beautiful soup object containing a single comment
        company: company name, stored unchanged under 'Company_name'
    OUTPUT: dictionary with the keys id, Title_Review, Stars, Company_name,
        Recommend, CEO_approval, Business_outlook, Pros, Cons,
        Employee_seniority, Location and Date

    The element lookups are not guarded: if an expected element is absent
    from the review, the attribute access on the missing result raises.
    """
    comment = {}

    # review id
    comment['id'] = soup['id']

    # title of the review
    comment['Title_Review'] = soup.find("a", class_="reviewLink").text

    # stars
    comment['Stars'] = float(soup.find("span", attrs={"class": "ratingNumber mr-xsm"}).text)

    # company name
    comment['Company_name'] = company

    # recommend, ceo approval, business outlook: read from children 0, 1 and 2
    # of the "recommends" container, each holding one svg icon
    rating_cells = soup.find('div', class_="recommends").contents
    for position, field in enumerate(('Recommend', 'CEO_approval', 'Business_outlook')):
        icon = rating_cells[position].find('svg', class_="SVGInline-svg")
        comment[field] = _icon_sentiment(icon['class'])

    # pros
    comment['Pros'] = soup.find("span", attrs={"data-test": "pros"}).text

    # cons
    comment['Cons'] = soup.find("span", attrs={"data-test": "cons"}).text

    # employee seniority
    comment['Employee_seniority'] = soup.find(
        "span", attrs={"class": "pt-xsm pt-md-0 css-1qxtz39 eg4psks0"}
    ).text

    # location and date share one element: the date comes before the first '-'
    job_line = soup.find('span', class_='common__EiReviewDetailsStyle__newUiJobLine').text
    comment['Location'] = _job_line_location(job_line)
    comment['Date'] = job_line.split('-')[0].strip()

    return comment


# Icon css classes checked in priority order, with the label each one maps to.
_ICON_SENTIMENTS = (
    ('css-hcqxoa-svg', 'positive'),
    ('css-1kiw93k-svg', 'negative'),
    ('css-10xv9lv-svg', 'neutral'),
)


def _icon_sentiment(icon_classes):
    """Map the css classes of a rating icon to a label ('error' if unknown)."""
    for css_class, sentiment in _ICON_SENTIMENTS:
        if css_class in icon_classes:
            return sentiment
    return 'error'


def _job_line_location(job_line):
    """Return the text after the non-breaking-space + 'in' marker, or None."""
    # Only the first '-' separates the date from the rest; splitting on every
    # '-' would cut hyphenated job titles or city names.
    _, _, after_date = job_line.partition('-')
    # Keep everything after the marker so multi-word cities are not truncated.
    _, marker, location = after_date.rpartition('\xa0in')
    location = location.strip()
    if marker and location:
        return location
    return None

In [3]:
# THIS MODULE DOWNLOADS & PROCESSES THE PAGES
   
# Metadata 
# Company label attached to every comment, and how many listing pages to request
COMPANY = "Amazon"
PAGES_TO_LOAD = 10000
# Zero-based page indices; urls[i] is the address of listing page i + 1
pages = list(range(PAGES_TO_LOAD))
urls = [
    f"https://www.glassdoor.ca/Reviews/Amazon-Reviews-E6036_P{page_number}.htm?filter.iso3Language=eng"
    for page_number in range(1, PAGES_TO_LOAD + 1)
]


# Main Function
def parse_page(p_n, timeout=30):
    """Pipeline to process a page: download it and parse every review on it.

    INPUTS:
        p_n: zero-based index into the module-level ``urls`` list
        timeout: seconds handed to ``requests.get`` as its timeout
    OUTPUT: list of comment dictionaries produced by ``parse_comment``; empty
        when the page contains no ``li.empReview`` elements
    """

    # Downloads the Page (the timeout keeps one stalled connection from
    # hanging the whole scrape)
    page = requests.get(urls[p_n], timeout=timeout)

    # A failed response would otherwise look exactly like an empty page
    if not page.ok:
        print(f"PAGE WARNING: {p_n+1}, HTTP STATUS: {page.status_code}")

    # Extracts Comments
    soup = BeautifulSoup(page.text, "lxml")
    page_comments = soup.find_all('li', class_='empReview')

    # Cleans Comments
    page_comments_clean = [parse_comment(comment, COMPANY) for comment in page_comments]

    print(f"PAGE DONE: {p_n+1}, COMMENTS: {len(page_comments_clean)}")
    return page_comments_clean


# Prepare Folders
# makedirs also creates a missing 'data' parent folder, and exist_ok makes the
# call safe to re-run when the company folder is already there
os.makedirs(os.path.join(directory, 'data', COMPANY), exist_ok=True)


# Execute Function
# Accumulate the parsed comments of every page, one page at a time, so that
# whatever was collected so far stays in comments_clean if a page raises
comments_clean = []
for p in pages:
    comments_clean += parse_page(p)


# with multiprocessing.pool.ThreadPool(multiprocessing.cpu_count()) as P:
#     for clean_com in P.map(parse_page, pages):
#         comments_clean.extend(clean_com)

PAGE DONE: 1, COMMENTS: 10
PAGE DONE: 2, COMMENTS: 0
PAGE DONE: 3, COMMENTS: 10
PAGE DONE: 4, COMMENTS: 0
PAGE DONE: 5, COMMENTS: 10
PAGE DONE: 6, COMMENTS: 0
PAGE DONE: 7, COMMENTS: 0
PAGE DONE: 8, COMMENTS: 10
PAGE DONE: 9, COMMENTS: 10
PAGE DONE: 10, COMMENTS: 0
PAGE DONE: 11, COMMENTS: 0
PAGE DONE: 12, COMMENTS: 0
PAGE DONE: 13, COMMENTS: 10
PAGE DONE: 14, COMMENTS: 0
PAGE DONE: 15, COMMENTS: 0
PAGE DONE: 16, COMMENTS: 10
PAGE DONE: 17, COMMENTS: 0
PAGE DONE: 18, COMMENTS: 0
PAGE DONE: 19, COMMENTS: 10


In [None]:
# THIS MODULE SAVES THE CLEAN DATA
# One row per comment dictionary; the row index is not written to the file
comments_csv_path = f'data/{COMPANY}/{COMPANY}_comments.csv'
df_clean = pd.DataFrame.from_records(comments_clean)
df_clean.to_csv(comments_csv_path, index=False)

In [None]:
# Reload the saved comments from disk and display them
saved_comments_path = f'data/{COMPANY}/{COMPANY}_comments.csv'
df_clean = pd.read_csv(saved_comments_path)
df_clean

Unnamed: 0,id,Title_Review,Stars,Company_name,Recommend,CEO_approval,Business_outlook,Pros,Cons,Employee_seniority,Location,Date
0,empReview_72295158,Good,5.0,Amazon,positive,positive,positive,Good opportunities and easy to transfer teams,Low performers are managed out,"Current Employee, more than 3 years","Toronto, ON","Jan. 3, 2023"
1,empReview_73254217,I was on a great team before I got laid off,5.0,Amazon,neutral,neutral,neutral,* Work life balance * Compensation * Team was ...,* Huge infrastructure is daunting to learn as ...,"Former Employee, less than 1 year","Vancouver, BC","Feb. 2, 2023"
2,empReview_73190433,job review,5.0,Amazon,positive,positive,positive,"good benefit, flexible time shift. take care o...","better organization of work, better car parkin...",Former Employee,,"Jan. 31, 2023"
3,empReview_73218068,Good Company b,5.0,Amazon,neutral,neutral,neutral,It is a good company,Over time could be hard to get,Current Employee,,"Feb. 1, 2023"
4,empReview_73197210,Growth Opportunity,4.0,Amazon,positive,negative,negative,"Fast paced, Start-Up Culture, Benefits","Compensation, Growth Prospects, Development Op...","Current Employee, more than 3 years","Vancouver, BC","Jan. 31, 2023"
...,...,...,...,...,...,...,...,...,...,...,...,...
425,empReview_64524154,Good work,3.0,Amazon,neutral,neutral,neutral,I am new so i have nothing to say yet,Soreness at starting and long hour work,Current Employee,"Hamilton, ON","May 22, 2022"
426,empReview_63815226,Very positive,5.0,Amazon,positive,positive,positive,"Great energy team, leadership is reliable","No, I think this is great","Former Employee, more than 1 year","Vancouver, BC","May 8, 2022"
427,empReview_64574946,Political,2.0,Amazon,neutral,negative,positive,$$$ is the biggest reason to join,Politics is the biggest reason to stay away,Current Employee,,"May 23, 2022"
428,empReview_63817981,Srhh,4.0,Amazon,neutral,neutral,neutral,Great good management fair work Awesome place,Long hours tough work management alright,"Former Employee, less than 1 year","Scarborough, ON","May 8, 2022"
