In [1]:
# WEB SCRAPING
import requests #for scraping
from bs4 import BeautifulSoup #for scraping

# GENERIC
import pandas as pd #data transformation
import itertools #combinations
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt # for plotting
import os
import pickle
import nest_asyncio
nest_asyncio.apply()
import asyncio
import aiohttp

# TEXT WRANGLING
import nltk # tokenize
from nltk import pos_tag, sent_tokenize, wordpunct_tokenize # tokenize
from nltk.corpus import stopwords #stopwords
import string # for punctuation
from collections import Counter # to get frequency of words
from nltk.stem import WordNetLemmatizer # to lemmatize
from nltk.corpus import wordnet

# INITIALIZATION
nltk.download('omw-1.4')
# Resolve the project root (parent of the starting working directory) only once
# per kernel session. Recomputing it from os.getcwd() on every run would climb
# one directory higher each time this cell is re-executed, because the cell
# itself changes the working directory.
if 'directory' not in globals():
    directory = os.path.realpath(os.path.join(os.getcwd(),".."))
os.chdir(directory)
print(directory)


F:\Documents\Projects\McGill\McGill-INSY-669-GroupProject


[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Konstantin\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
# COMMENT PARSING
# CSS class found on a rating icon's <svg> -> sentiment label.
# Checked in insertion order; the first class present on the icon wins.
_SENTIMENT_BY_SVG_CLASS = {
    'css-hcqxoa-svg': 'positive',
    'css-1kiw93k-svg': 'negative',
    'css-10xv9lv-svg': 'neutral',
}


def _rating_sentiment(soup, position):
    """Return the sentiment label of the icon at `position` in the 'recommends' row.

    INPUTS: comment soup; index into the `contents` of the 'recommends' div
    OUTPUT: 'positive', 'negative', 'neutral', or 'error' when the icon carries
            none of the known CSS classes
    """
    icon = soup.find('div', class_="recommends").contents[position].find('svg', class_="SVGInline-svg")
    icon_classes = icon['class']
    for css_class, label in _SENTIMENT_BY_SVG_CLASS.items():
        if css_class in icon_classes:
            return label
    return 'error'


def _split_job_line(job_line):
    """Split the review's job line into a (date, location) pair.

    The line is assumed to look like '<date> - <job title>\\xa0in <location>'
    (inferred from the markers this parser looks for -- confirm against live pages).

    INPUTS: text of the job-line element
    OUTPUT: (date string, location string or None when no '\\xa0in' marker is present)
    """
    # Split on the FIRST hyphen only, so hyphens inside the job title or the
    # location do not truncate what follows; a line without any hyphen simply
    # yields no location instead of raising.
    date_part, _, remainder = job_line.partition('-')
    # Keep everything after the '\xa0in' marker rather than a fixed number of
    # trailing words, so multi-word and single-word locations both survive intact.
    _, marker, place = remainder.partition('\xa0in')
    location = place.strip() if marker else None
    return date_part.strip(), (location or None)


def parse_comment(soup, company):
    """returns a comment dictionary given a comment soup

    INPUTS: beautiful soup object containing a comment; company name to record
    OUTPUT: dictionary with relevant comment fields (id, Title_Review, Stars,
            Company_name, Recommend, CEO_approval, Business_outlook, Pros, Cons,
            Employee_seniority, Location, Date)

    Any required element missing from the soup still raises (AttributeError /
    TypeError / KeyError), as a missing field most likely means the page markup
    has changed.
    """
    comment={}

    # review id
    comment['id'] = soup['id']

    # title of the review
    comment['Title_Review'] = soup.find("a", class_="reviewLink").text

    # stars
    comment['Stars'] = float(soup.find("span", attrs={"class": "ratingNumber mr-xsm"}).text)

    # company name
    comment['Company_name'] = company

    # recommend / ceo approval / business outlook: the three icons of the
    # 'recommends' row, in that order
    comment['Recommend'] = _rating_sentiment(soup, 0)
    comment['CEO_approval'] = _rating_sentiment(soup, 1)
    comment['Business_outlook'] = _rating_sentiment(soup, 2)

    # pros
    comment['Pros'] = soup.find("span", attrs={"data-test": "pros"}).text

    # cons
    comment['Cons'] = soup.find("span", attrs={"data-test": "cons"}).text

    # employee seniority
    comment['Employee_seniority'] = soup.find("span", attrs={"class": "pt-xsm pt-md-0 css-1qxtz39 eg4psks0"}).text

    # location and date both come from the same job-line element
    job_line = soup.find('span', class_='common__EiReviewDetailsStyle__newUiJobLine').text
    date, location = _split_job_line(job_line)
    comment['Location'] = location
    comment['Date'] = date

    return comment

In [6]:
# THIS MODULE DOWNLOADS / READS DATA FOR A SPECIFIC COMPANY
COMPANY = "Amazon"
# One review-listing URL per page index 0..4; the query string requests
# filter.iso3Language=eng.
urls = [
    f"https://www.glassdoor.ca/Reviews/Amazon-Reviews-E6036_P{page}.htm?filter.iso3Language=eng"
    for page in range(5)
]
# Accumulator that download_comments() appends the scraped review elements to.
comments = list()

async def get_page_comments(session, url):
    """Download one review-listing page and return its review elements.

    INPUTS: an open aiohttp client session; the page URL to fetch
    OUTPUT: the <li class="empReview"> elements found on the page, or an empty
            list when the server answers with a non-200 status
    """
    async with session.get(url) as response:
        if response.status != 200:
            # An error page would otherwise be parsed like any other page,
            # yield zero reviews and still be logged as "READ", hiding the
            # failure. Report it and keep the rest of the download going.
            print("FAILED: ", url, response.status)
            return []
        response_text = await response.text()
        soup = BeautifulSoup(response_text, "html.parser")
        page_comments = soup.find_all('li', class_='empReview')
        print("READ: ", url)
        return page_comments

async def download_comments():
    """Fetch every page in the module-level `urls` concurrently.

    All review elements returned by get_page_comments() are appended to the
    module-level `comments` list, page by page in the order of `urls`.
    Returns nothing.
    """
    async with aiohttp.ClientSession() as http:
        # Schedule one fetch per URL up front so they run concurrently.
        pending = [
            asyncio.ensure_future(get_page_comments(http, page_url))
            for page_url in urls
        ]
        for batch in await asyncio.gather(*pending):
            comments.extend(batch)

# Drive the download coroutine to completion; it fills the module-level
# `comments` list. NOTE(review): nest_asyncio.apply() in the imports cell is
# presumably what lets asyncio.run() work inside the notebook's own event loop.
asyncio.run(download_comments())
# Number of review elements collected across all pages.
print(len(comments))

STARTED:  https://www.glassdoor.ca/Reviews/Amazon-Reviews-E6036_P0.htm?filter.iso3Language=eng
STARTED:  https://www.glassdoor.ca/Reviews/Amazon-Reviews-E6036_P1.htm?filter.iso3Language=eng
STARTED:  https://www.glassdoor.ca/Reviews/Amazon-Reviews-E6036_P2.htm?filter.iso3Language=eng
STARTED:  https://www.glassdoor.ca/Reviews/Amazon-Reviews-E6036_P3.htm?filter.iso3Language=eng
STARTED:  https://www.glassdoor.ca/Reviews/Amazon-Reviews-E6036_P4.htm?filter.iso3Language=eng
STARTED:  https://www.glassdoor.ca/Reviews/Amazon-Reviews-E6036_P5.htm?filter.iso3Language=eng
STARTED:  https://www.glassdoor.ca/Reviews/Amazon-Reviews-E6036_P6.htm?filter.iso3Language=eng
STARTED:  https://www.glassdoor.ca/Reviews/Amazon-Reviews-E6036_P7.htm?filter.iso3Language=eng
STARTED:  https://www.glassdoor.ca/Reviews/Amazon-Reviews-E6036_P8.htm?filter.iso3Language=eng
STARTED:  https://www.glassdoor.ca/Reviews/Amazon-Reviews-E6036_P9.htm?filter.iso3Language=eng
STARTED:  https://www.glassdoor.ca/Reviews/Amazon-

NameError: name 'comments' is not defined

In [None]:
# Re-run the download from an empty list. download_comments() takes no
# arguments (it iterates the module-level `urls` itself) and is a coroutine, so
# it has to be driven by asyncio.run() rather than called once per URL.
comments = []
asyncio.run(download_comments())
print(len(comments))

READ:  https://www.glassdoor.ca/Reviews/Amazon-Reviews-E6036_P0.htm?filter.iso3Language=eng
READ:  https://www.glassdoor.ca/Reviews/Amazon-Reviews-E6036_P1.htm?filter.iso3Language=eng
READ:  https://www.glassdoor.ca/Reviews/Amazon-Reviews-E6036_P2.htm?filter.iso3Language=eng
READ:  https://www.glassdoor.ca/Reviews/Amazon-Reviews-E6036_P3.htm?filter.iso3Language=eng
READ:  https://www.glassdoor.ca/Reviews/Amazon-Reviews-E6036_P4.htm?filter.iso3Language=eng
READ:  https://www.glassdoor.ca/Reviews/Amazon-Reviews-E6036_P5.htm?filter.iso3Language=eng
READ:  https://www.glassdoor.ca/Reviews/Amazon-Reviews-E6036_P6.htm?filter.iso3Language=eng
READ:  https://www.glassdoor.ca/Reviews/Amazon-Reviews-E6036_P7.htm?filter.iso3Language=eng
READ:  https://www.glassdoor.ca/Reviews/Amazon-Reviews-E6036_P8.htm?filter.iso3Language=eng
READ:  https://www.glassdoor.ca/Reviews/Amazon-Reviews-E6036_P9.htm?filter.iso3Language=eng
READ:  https://www.glassdoor.ca/Reviews/Amazon-Reviews-E6036_P10.htm?filter.iso3

KeyboardInterrupt: 

In [None]:
# THIS MODULE CLEANS THE COMMENT DATA
# One flat dictionary per scraped review element.
comments_clean = [parse_comment(comment, COMPANY) for comment in comments]

df_clean = pd.DataFrame.from_records(comments_clean)
# to_csv does not create missing parent directories, so make sure the output
# folder exists (relative to the working directory set in the first cell).
os.makedirs('data', exist_ok=True)
df_clean.to_csv(f'data/{COMPANY}_comments.csv', index=False)