# INSY669 - Text Analytics
## Course Project - Amazon Glassdoor Reviews

### Import libraries

In [1]:
import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None
import string
import re
import os

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

import gensim.corpora as corpora
import gensim, logging, warnings
import matplotlib.colors as mcolors

import pprint
from tqdm import tqdm

import seaborn as sns
import matplotlib.pyplot as plt

# Use the parent of the launch directory as the project root and make it the
# working directory, so relative paths such as 'data/...' resolve from there.
# NOTE(review): re-running this cell climbs one more level each time.
parent_of_cwd = os.path.join(os.getcwd(), os.pardir)
directory = os.path.realpath(parent_of_cwd)
os.chdir(directory)
print(directory)

/Users/konstantin/Documents/Projects/McGill/McGill-INSY-669-GroupProject


### Import data

In [13]:
# Raw Amazon review export, read from <directory>/data/Amazon.
amzn_csv_path = os.path.join(directory, 'data', 'Amazon', 'Amazon_comments.csv')
amzn = pd.read_csv(amzn_csv_path)

In [14]:
# Metadata columns that need no text preprocessing.
ignore_cols = ['id', 'Stars', 'Company_name', 'Recommend', 'CEO_approval',
               'Employee_seniority', 'Business_outlook', 'Location', 'Date']

# .copy() makes amzn_ign an independent frame, so the column assignments made
# on it later are not writes on a slice of amzn.
amzn_ign = amzn[ignore_cols].copy()
# Every remaining column goes through the text-processing steps.
amzn_pr = amzn.drop(columns=ignore_cols)
amzn_pr

Unnamed: 0,Title_Review,Pros,Cons
0,Good impression in the first months,Documentation on Amazon is a super important p...,You need to understand your job and what you n...
1,intern,4 days shifts is nice,long hour shift can make you feel tired
2,good,"great work balance, great environment, locatio...",workload can be heavy sometimes
3,job review,"good benefit, flexible time shift. take care o...","better organization of work, better car parkin..."
4,Growth Opportunity,"Fast paced, Start-Up Culture, Benefits","Compensation, Growth Prospects, Development Op..."
...,...,...,...
9995,Great pay and onboarding,Amazon has wonderful search sites where you ca...,The interview process is very long but worth i...
9996,great comp,great company easy to find an area you like,can get unlucky with team.
9997,"so far,so good!","great teamwork, great working environment, per...",a little far from home
9998,Used to be a great company,You will become an excellent problem solver us...,"Cut-throat management and toxic culture, unnec..."


### Lowercase the Strings

In [15]:
# Keep only the object-dtype columns; txt.columns is what the text-cleaning
# loops iterate over.
txt = amzn_pr.select_dtypes(include=[object])

In [16]:
# Lowercase every text column with the vectorised string accessor instead of a
# row-by-row loop that indexed amzn_pr[i][j] by label (slow, and only valid for
# a default 0..n-1 index).
# astype(str) guarantees every cell is a str before the tokenizer sees it.
# NOTE(review): it also turns missing values into the literal string "nan" —
# confirm that is acceptable for the token lists built later.
for col in txt.columns:
    amzn_pr[col] = amzn_pr[col].astype(str).str.lower()

amzn_pr

Unnamed: 0,Title_Review,Pros,Cons
0,good impression in the first months,documentation on amazon is a super important p...,you need to understand your job and what you n...
1,intern,4 days shifts is nice,long hour shift can make you feel tired
2,good,"great work balance, great environment, locatio...",workload can be heavy sometimes
3,job review,"good benefit, flexible time shift. take care o...","better organization of work, better car parkin..."
4,growth opportunity,"fast paced, start-up culture, benefits","compensation, growth prospects, development op..."
...,...,...,...
9995,great pay and onboarding,amazon has wonderful search sites where you ca...,the interview process is very long but worth i...
9996,great comp,great company easy to find an area you like,can get unlucky with team.
9997,"so far,so good!","great teamwork, great working environment, per...",a little far from home
9998,used to be a great company,you will become an excellent problem solver us...,"cut-throat management and toxic culture, unnec..."


### Remove Stopwords and Punctuation

In [17]:
# English stop words from NLTK, plus the set of punctuation tokens to discard.
stop_words = set(stopwords.words('english'))
punctuation_marks = set(string.punctuation)
# Extra tokens not covered by string.punctuation: typographic quotes/dashes and
# the two-character tokens word_tokenize emits for double quotes
# (`` for an opening quote, '' for a closing one). The closing-quote token was
# previously missing, so it survived the filter.
punctuation_marks.update(['“', '”', '—', '’', '`', '``', '\'', '\""', "''"])

# define a function to remove stop words and punctuation marks
def remove_stop_words_and_punctuation(text):
    """Tokenize ``text`` and re-join the tokens that survive filtering.

    A token is dropped when its lowercase form is in ``stop_words`` or when the
    token itself is in ``punctuation_marks``. The remaining tokens are joined
    with single spaces and returned as one string.
    """
    kept_tokens = []
    for token in word_tokenize(text):
        if token.lower() in stop_words or token in punctuation_marks:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

# Run the stop-word / punctuation filter over every text column.
for text_col in txt.columns:
    amzn_pr[text_col] = amzn_pr[text_col].map(remove_stop_words_and_punctuation)

amzn_pr

Unnamed: 0,Title_Review,Pros,Cons
0,good impression first months,documentation amazon super important point pro...,need understand job need improve better depend...
1,intern,4 days shifts nice,long hour shift make feel tired
2,good,great work balance great environment location far,workload heavy sometimes
3,job review,good benefit flexible time shift take care emp...,better organization work better car parking ar...
4,growth opportunity,fast paced start-up culture benefits,compensation growth prospects development oppo...
...,...,...,...
9995,great pay onboarding,amazon wonderful search sites find anything lo...,interview process long worth end
9996,great comp,great company easy find area like,get unlucky team
9997,far good,great teamwork great working environment perso...,little far home
9998,used great company,become excellent problem solver using data con...,cut-throat management toxic culture unnecessar...


### Tokenizing String

In [18]:
# Split each cleaned string into a list of word tokens, one column at a time.
for text_col in txt.columns:
    amzn_pr[text_col] = amzn_pr[text_col].map(nltk.word_tokenize)

amzn_pr

Unnamed: 0,Title_Review,Pros,Cons
0,"[good, impression, first, months]","[documentation, amazon, super, important, poin...","[need, understand, job, need, improve, better,..."
1,[intern],"[4, days, shifts, nice]","[long, hour, shift, make, feel, tired]"
2,[good],"[great, work, balance, great, environment, loc...","[workload, heavy, sometimes]"
3,"[job, review]","[good, benefit, flexible, time, shift, take, c...","[better, organization, work, better, car, park..."
4,"[growth, opportunity]","[fast, paced, start-up, culture, benefits]","[compensation, growth, prospects, development,..."
...,...,...,...
9995,"[great, pay, onboarding]","[amazon, wonderful, search, sites, find, anyth...","[interview, process, long, worth, end]"
9996,"[great, comp]","[great, company, easy, find, area, like]","[get, unlucky, team]"
9997,"[far, good]","[great, teamwork, great, working, environment,...","[little, far, home]"
9998,"[used, great, company]","[become, excellent, problem, solver, using, da...","[cut-throat, management, toxic, culture, unnec..."


### Part of Speech Tagging

In [19]:
# Tag every token list with parts of speech; each cell becomes a list of
# (token, tag) pairs such as ('good', 'JJ').
for text_col in txt.columns:
    amzn_pr[text_col] = amzn_pr[text_col].map(nltk.pos_tag)

amzn_pr

Unnamed: 0,Title_Review,Pros,Cons
0,"[(good, JJ), (impression, NN), (first, RB), (m...","[(documentation, NN), (amazon, VBZ), (super, J...","[(need, NN), (understand, VBP), (job, NN), (ne..."
1,"[(intern, NN)]","[(4, CD), (days, NNS), (shifts, JJ), (nice, JJ)]","[(long, RB), (hour, NN), (shift, NN), (make, V..."
2,"[(good, JJ)]","[(great, JJ), (work, NN), (balance, NN), (grea...","[(workload, NN), (heavy, NN), (sometimes, RB)]"
3,"[(job, NN), (review, NN)]","[(good, JJ), (benefit, NN), (flexible, JJ), (t...","[(better, JJR), (organization, NN), (work, NN)..."
4,"[(growth, NN), (opportunity, NN)]","[(fast, RB), (paced, JJ), (start-up, JJ), (cul...","[(compensation, NN), (growth, NN), (prospects,..."
...,...,...,...
9995,"[(great, JJ), (pay, NN), (onboarding, VBG)]","[(amazon, RB), (wonderful, JJ), (search, NN), ...","[(interview, NN), (process, NN), (long, RB), (..."
9996,"[(great, JJ), (comp, NN)]","[(great, JJ), (company, NN), (easy, JJ), (find...","[(get, VB), (unlucky, JJ), (team, NN)]"
9997,"[(far, RB), (good, JJ)]","[(great, JJ), (teamwork, NN), (great, JJ), (wo...","[(little, JJ), (far, RB), (home, NN)]"
9998,"[(used, VBN), (great, JJ), (company, NN)]","[(become, VB), (excellent, JJ), (problem, NN),...","[(cut-throat, JJ), (management, NN), (toxic, J..."


### Lemmatization

In [20]:
# Single WordNet lemmatizer instance, used by lemmatize_text.
lemmatizer = WordNetLemmatizer()
def wordnet_pos(tag):
    """Translate a POS tag into the matching WordNet POS constant.

    Only the tag's first letter is inspected: N -> noun, V -> verb,
    R -> adverb, J -> adjective. Any other tag maps to ``wordnet.ADJ_SAT``.
    """
    pos_by_initial = {
        'N': wordnet.NOUN,
        'V': wordnet.VERB,
        'R': wordnet.ADV,
        'J': wordnet.ADJ,
    }
    return pos_by_initial.get(tag[:1], wordnet.ADJ_SAT)

def lemmatize_text(pos_tags):
    """Return the lowercased lemma of every token in ``pos_tags``.

    ``pos_tags`` is an iterable of (token, tag) pairs. Each tag is converted
    with ``wordnet_pos`` before the token is passed to ``lemmatizer``.
    """
    lemmas = []
    for token, pos_tag in pos_tags:
        lemma = lemmatizer.lemmatize(token, wordnet_pos(pos_tag))
        lemmas.append(lemma.lower())
    return lemmas

# Lemmatize every (token, tag) list, one column at a time.
# Series.apply replaces the manual row loop that indexed amzn_pr[i][j] by
# label, which was slow and only valid for a default 0..n-1 index.
for col in txt.columns:
    amzn_pr[col] = amzn_pr[col].apply(lemmatize_text)

amzn_pr

Unnamed: 0,Title_Review,Pros,Cons
0,"[good, impression, first, month]","[documentation, amazon, super, important, poin...","[need, understand, job, need, improve, good, d..."
1,[intern],"[4, day, shifts, nice]","[long, hour, shift, make, feel, tire]"
2,[good],"[great, work, balance, great, environment, loc...","[workload, heavy, sometimes]"
3,"[job, review]","[good, benefit, flexible, time, shift, take, c...","[good, organization, work, well, car, parking,..."
4,"[growth, opportunity]","[fast, paced, start-up, culture, benefit]","[compensation, growth, prospect, development, ..."
...,...,...,...
9995,"[great, pay, onboarding]","[amazon, wonderful, search, site, find, anythi...","[interview, process, long, worth, end]"
9996,"[great, comp]","[great, company, easy, find, area, like]","[get, unlucky, team]"
9997,"[far, good]","[great, teamwork, great, work, environment, pe...","[little, far, home]"
9998,"[use, great, company]","[become, excellent, problem, solver, use, data...","[cut-throat, management, toxic, culture, unnec..."


### Clean Employee_seniority

In [21]:
# Employee_seniority is split on commas: the first piece becomes Employee_type
# and the second Employee_sen, both lowercased and trimmed.
seniority_parts = amzn_ign.Employee_seniority.str.split(',')
amzn_ign['Employee_type'] = seniority_parts.str[0].str.lower().str.strip()
amzn_ign['Employee_sen'] = seniority_parts.str[1].str.lower().str.strip()

# Short labels for the tenure wording; values not listed here (including
# missing ones) pass through unchanged.
tenure_labels = {
    'less than 1 year': '<1 yrs',
    'more than 1 year': '1-3 yrs',
    'more than 3 years': '3-5 yrs',
    'more than 5 years': '5-8 yrs',
    'more than 8 years': '8-10 yrs',
    'more than 10 years': '>10 yrs',
}
amzn_ign['Employee_sen'] = amzn_ign['Employee_sen'].map(
    lambda tenure: tenure_labels.get(tenure, tenure)
)

# The raw column is no longer needed once both parts are extracted.
amzn_ign = amzn_ign.drop(columns=['Employee_seniority'])

amzn_ign

Unnamed: 0,id,Stars,Company_name,Recommend,CEO_approval,Business_outlook,Location,Date,Employee_type,Employee_sen
0,empReview_73247758,5.0,Amazon,positive,neutral,neutral,"Toronto, ON","Feb. 2, 2023",current employee,<1 yrs
1,empReview_73187609,5.0,Amazon,neutral,neutral,neutral,"Toronto, ON","Jan. 31, 2023",former employee,<1 yrs
2,empReview_73188818,5.0,Amazon,positive,positive,positive,"Amazon, SK","Jan. 31, 2023",former employee,1-3 yrs
3,empReview_73190433,5.0,Amazon,positive,positive,positive,,"Jan. 31, 2023",former employee,
4,empReview_73197210,4.0,Amazon,positive,negative,negative,"Vancouver, BC","Jan. 31, 2023",current employee,3-5 yrs
...,...,...,...,...,...,...,...,...,...,...
9995,empReview_71536795,5.0,Amazon,positive,positive,positive,,"Dec. 1, 2022",current employee,<1 yrs
9996,empReview_71537065,5.0,Amazon,neutral,neutral,neutral,,"Dec. 1, 2022",current employee,
9997,empReview_71539933,5.0,Amazon,positive,positive,positive,"Querétaro, Querétaro","Dec. 1, 2022",current employee,<1 yrs
9998,empReview_71882994,2.0,Amazon,negative,negative,negative,"Diego, CA","Dec. 15, 2022",former employee,1-3 yrs


### Clean date

In [22]:
# Parse the raw date strings (e.g. "Feb. 2, 2023") into datetime values and
# store them in the metadata frame; assignment aligns on the shared row index.
parsed_dates = pd.to_datetime(amzn['Date'])
amzn_ign['Date'] = parsed_dates

amzn_ign

Unnamed: 0,id,Stars,Company_name,Recommend,CEO_approval,Business_outlook,Location,Date,Employee_type,Employee_sen
0,empReview_73247758,5.0,Amazon,positive,neutral,neutral,"Toronto, ON",2023-02-02,current employee,<1 yrs
1,empReview_73187609,5.0,Amazon,neutral,neutral,neutral,"Toronto, ON",2023-01-31,former employee,<1 yrs
2,empReview_73188818,5.0,Amazon,positive,positive,positive,"Amazon, SK",2023-01-31,former employee,1-3 yrs
3,empReview_73190433,5.0,Amazon,positive,positive,positive,,2023-01-31,former employee,
4,empReview_73197210,4.0,Amazon,positive,negative,negative,"Vancouver, BC",2023-01-31,current employee,3-5 yrs
...,...,...,...,...,...,...,...,...,...,...
9995,empReview_71536795,5.0,Amazon,positive,positive,positive,,2022-12-01,current employee,<1 yrs
9996,empReview_71537065,5.0,Amazon,neutral,neutral,neutral,,2022-12-01,current employee,
9997,empReview_71539933,5.0,Amazon,positive,positive,positive,"Querétaro, Querétaro",2022-12-01,current employee,<1 yrs
9998,empReview_71882994,2.0,Amazon,negative,negative,negative,"Diego, CA",2022-12-15,former employee,1-3 yrs


In [23]:
# Put the processed text columns and the cleaned metadata side by side,
# aligned on the row index.
# NOTE(review): this rebinds `amzn`, so the raw frame is gone after this cell.
amzn = pd.concat((amzn_pr, amzn_ign), axis='columns')
amzn

Unnamed: 0,Title_Review,Pros,Cons,id,Stars,Company_name,Recommend,CEO_approval,Business_outlook,Location,Date,Employee_type,Employee_sen
0,"[good, impression, first, month]","[documentation, amazon, super, important, poin...","[need, understand, job, need, improve, good, d...",empReview_73247758,5.0,Amazon,positive,neutral,neutral,"Toronto, ON",2023-02-02,current employee,<1 yrs
1,[intern],"[4, day, shifts, nice]","[long, hour, shift, make, feel, tire]",empReview_73187609,5.0,Amazon,neutral,neutral,neutral,"Toronto, ON",2023-01-31,former employee,<1 yrs
2,[good],"[great, work, balance, great, environment, loc...","[workload, heavy, sometimes]",empReview_73188818,5.0,Amazon,positive,positive,positive,"Amazon, SK",2023-01-31,former employee,1-3 yrs
3,"[job, review]","[good, benefit, flexible, time, shift, take, c...","[good, organization, work, well, car, parking,...",empReview_73190433,5.0,Amazon,positive,positive,positive,,2023-01-31,former employee,
4,"[growth, opportunity]","[fast, paced, start-up, culture, benefit]","[compensation, growth, prospect, development, ...",empReview_73197210,4.0,Amazon,positive,negative,negative,"Vancouver, BC",2023-01-31,current employee,3-5 yrs
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,"[great, pay, onboarding]","[amazon, wonderful, search, site, find, anythi...","[interview, process, long, worth, end]",empReview_71536795,5.0,Amazon,positive,positive,positive,,2022-12-01,current employee,<1 yrs
9996,"[great, comp]","[great, company, easy, find, area, like]","[get, unlucky, team]",empReview_71537065,5.0,Amazon,neutral,neutral,neutral,,2022-12-01,current employee,
9997,"[far, good]","[great, teamwork, great, work, environment, pe...","[little, far, home]",empReview_71539933,5.0,Amazon,positive,positive,positive,"Querétaro, Querétaro",2022-12-01,current employee,<1 yrs
9998,"[use, great, company]","[become, excellent, problem, solver, use, data...","[cut-throat, management, toxic, culture, unnec...",empReview_71882994,2.0,Amazon,negative,negative,negative,"Diego, CA",2022-12-15,former employee,1-3 yrs


In [24]:
# Write the processed reviews next to the raw file. The path is built from
# `directory`, like the read_csv call, instead of relying on the current
# working directory; the f-string prefix had no placeholders and is dropped.
# NOTE: list-valued columns are written as their string repr; parse them back
# (e.g. with ast.literal_eval) after re-loading the CSV.
amzn.to_csv(os.path.join(directory, 'data', 'Amazon', 'Amazon_comments_processed.csv'), index=False)