In [1]:
# GENERIC
import pandas as pd
pd.options.mode.chained_assignment = None
import numpy as np
import os
import re
import tqdm

# VISUALIZATION
import plotly.express as px

# SENTIMENT ANALYSIS
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# INITIALIZATION
# Move the working directory up one level (presumably from the notebook folder
# to the project root -- the data paths below are built from `directory`).
# The guard makes this cell safe to re-run in the same kernel: without it,
# every re-run would climb one more level and silently break all later paths.
if 'directory' not in globals():
    directory = os.path.realpath(os.path.join(os.getcwd(), ".."))
    os.chdir(directory)
print(directory)

F:\Documents\Projects\McGill\McGill-INSY-669-GroupProject


In [2]:
# Both review files sit in the same folder under `directory`:
#   amzn     -- the already processed comments
#   amzn_drt -- the unprocessed comments, cleaned separately for sentiment scoring
amazon_dir = os.path.join(directory, 'data', 'Amazon')

amzn = pd.read_csv(os.path.join(amazon_dir, 'Amazon_comments_processed.csv'))
amzn_drt = pd.read_csv(os.path.join(amazon_dir, 'Amazon_comments.csv'))

### For sentiment analysis, the data has to be processed somewhat differently

In [3]:
def clean_text(text):
    """Normalise one raw review string before it is scored for sentiment.

    Steps, in order: drop a leading ``RT`` marker, ``$``-prefixed tokens, URLs
    and ``@mentions``; turn line breaks into spaces; expand common English
    contractions; lower-case and strip the result.

    Parameters
    ----------
    text : object
        Raw review text. Non-string values are converted with ``str``.
        ``None`` and float ``NaN`` are treated as "no text".

    Returns
    -------
    str
        The cleaned, lower-cased text; ``""`` when the input is missing.
    """
    # A missing review must not turn into the literal word "none" / "nan".
    # (`text != text` is only true for NaN.)
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text)

    text = re.sub(r'^RT[\s]+', '', text)
    # `\w*` may match nothing, so this also removes every bare "$".
    text = re.sub(r'\$\w*', '', text)
    text = re.sub(r'https?://[^\s\n\r]+', '', text)
    # URLs written without the colon ("https//...").
    text = re.sub(r'https//[A-Za-z0-9./]+', '', text)
    # Use a space, not an empty string: otherwise the last word of one line and
    # the first word of the next are glued into a single unknown token.
    text = text.replace("\n", " ")
    text = re.sub(r'@[\w]+', '', text)

    # Expand contractions so negations and auxiliaries appear as plain words.
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)

    return text.lower().strip()


# Raw free-text columns of the unprocessed reviews, as plain Python lists.
pros_list = amzn_drt['Pros'].tolist()
cons_list = amzn_drt['Cons'].tolist()

# Clean every entry; tqdm wraps the input so a progress bar is shown per column.
pros_clean = list(map(clean_text, tqdm.tqdm(pros_list)))
cons_clean = list(map(clean_text, tqdm.tqdm(cons_list)))

100%|██████████| 10000/10000 [00:00<00:00, 78668.91it/s]
100%|██████████| 10000/10000 [00:00<00:00, 87640.05it/s]


### Sentiment Analysis
Estimate the compound sentiment score of the pros and the cons of each review, then average the two.

In [4]:
analyzer = SentimentIntensityAnalyzer()


def _score_texts(texts, compound_name):
    """Score every text with `analyzer` and return one row of scores per text.

    The frame holds whatever keys `polarity_scores` returns, with the
    `compound` column renamed to `compound_name`.
    """
    scores = pd.DataFrame([analyzer.polarity_scores(entry) for entry in texts])
    return scores.rename(columns={'compound': compound_name})


pros_sent = _score_texts(pros_clean, 'pros_comp')
cons_sent = _score_texts(cons_clean, 'cons_comp')

# Side-by-side scores for the two texts of each review, plus their plain mean.
sent_df = pd.concat([pros_sent, cons_sent], axis=1)
sent_df['avg_comp'] = sent_df['pros_comp'].add(sent_df['cons_comp']).div(2)

### Sentiment Analysis
Assign a sentiment label (positive / neutral / negative) to each compound score, using ±0.1 as the cut-off.

In [5]:
# Turn each compound score into a label: above 0.1 is 'positive', below -0.1 is
# 'negative', and everything else (including a missing score) is 'neutral'.
for comp_col, label_col in (('avg_comp', 'avg_sent'),
                            ('pros_comp', 'pros_sent'),
                            ('cons_comp', 'cons_sent')):
    comp = sent_df[comp_col]
    sent_df[label_col] = np.where(
        comp > 0.1, 'positive',
        np.where(comp < -0.1, 'negative', 'neutral'),
    )

# Keep only the columns that get attached to the review table.
sent_df = sent_df[['avg_sent','avg_comp', 'pros_sent', 'cons_sent']]

In [6]:
# Attach the sentiment columns to the processed reviews.
# Any sentiment columns already present are dropped first, so re-running this
# cell does not append a second, duplicate set of columns.
# NOTE(review): concat(axis=1) aligns on the row index, so this assumes the
# processed file and the raw file hold the same reviews in the same order --
# confirm, otherwise scores are attached to the wrong rows.
amzn = pd.concat(
    [amzn.drop(columns=sent_df.columns, errors='ignore'), sent_df],
    axis=1,
)

# Write next to the input files, built from `directory` like the reads above,
# instead of relying on the current working directory.
amzn.to_csv(
    os.path.join(directory, 'data', 'Amazon', 'Amazon_comments_sentiment.csv'),
    index=False,
)
amzn

Unnamed: 0,Title_Review,Pros,Cons,id,Stars,Company_name,Recommend,CEO_approval,Business_outlook,Location,Date,Employee_type,Employee_sen,avg_sent,avg_comp
0,"['good', 'impression', 'first', 'month']","['documentation', 'amazon', 'super', 'importan...","['need', 'understand', 'job', 'need', 'improve...",empReview_73247758,5.0,Amazon,positive,neutral,neutral,"Toronto, ON",2023-02-02,current employee,<1 yrs,positive,0.42675
1,['intern'],"['4', 'day', 'shifts', 'nice']","['long', 'hour', 'shift', 'make', 'feel', 'tire']",empReview_73187609,5.0,Amazon,neutral,neutral,neutral,"Toronto, ON",2023-01-31,former employee,<1 yrs,neutral,-0.00945
2,['good'],"['great', 'work', 'balance', 'great', 'environ...","['workload', 'heavy', 'sometimes']",empReview_73188818,5.0,Amazon,positive,positive,positive,"Amazon, SK",2023-01-31,former employee,1-3 yrs,positive,0.42405
3,"['job', 'review']","['good', 'benefit', 'flexible', 'time', 'shift...","['good', 'organization', 'work', 'well', 'car'...",empReview_73190433,5.0,Amazon,positive,positive,positive,,2023-01-31,former employee,,positive,0.78765
4,"['growth', 'opportunity']","['fast', 'paced', 'start-up', 'culture', 'bene...","['compensation', 'growth', 'prospect', 'develo...",empReview_73197210,4.0,Amazon,positive,negative,negative,"Vancouver, BC",2023-01-31,current employee,3-5 yrs,positive,0.57340
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,"['great', 'pay', 'onboarding']","['amazon', 'wonderful', 'search', 'site', 'fin...","['interview', 'process', 'long', 'worth', 'end']",empReview_71536795,5.0,Amazon,positive,positive,positive,,2022-12-01,current employee,<1 yrs,positive,0.68275
9996,"['great', 'comp']","['great', 'company', 'easy', 'find', 'area', '...","['get', 'unlucky', 'team']",empReview_71537065,5.0,Amazon,neutral,neutral,neutral,,2022-12-01,current employee,,positive,0.42955
9997,"['far', 'good']","['great', 'teamwork', 'great', 'work', 'enviro...","['little', 'far', 'home']",empReview_71539933,5.0,Amazon,positive,positive,positive,"Querétaro, Querétaro",2022-12-01,current employee,<1 yrs,positive,0.42405
9998,"['use', 'great', 'company']","['become', 'excellent', 'problem', 'solver', '...","['cut-throat', 'management', 'toxic', 'culture...",empReview_71882994,2.0,Amazon,negative,negative,negative,"Diego, CA",2022-12-15,former employee,1-3 yrs,positive,0.11465


### Visualize Sentiments

In [7]:
# Share of each average-sentiment label among FORMER employees, one bar group
# per `Employee_sen` bucket. The title tells this figure apart from the
# otherwise identical one for current employees.
# NOTE(review): the labelling step only produces 'positive', 'neutral' and
# 'negative'; the "very ..." entries in the ordering look unused -- confirm.
fig = px.histogram(amzn.query('Employee_type == "former employee"'), 
                    x='avg_sent', color='Employee_sen', opacity=0.5,
                    barmode="group", histnorm='probability density', 
                    title='Average review sentiment by Employee_sen - former employees',
                    category_orders={"avg_sent": ["very positive", "positive", "neutral", 
                                                "negative", 'very negative'],
                                     "Employee_sen": ['<1 yrs', '1-3 yrs', '3-5 yrs', 
                                                      '5-8 yrs', '8-10 yrs', '>10 yrs']})
fig.show()

In [8]:
# Share of each average-sentiment label among CURRENT employees, one bar group
# per `Employee_sen` bucket. The title tells this figure apart from the
# otherwise identical one for former employees.
# NOTE(review): the labelling step only produces 'positive', 'neutral' and
# 'negative'; the "very ..." entries in the ordering look unused -- confirm.
fig = px.histogram(amzn.query('Employee_type == "current employee"'), 
                    x='avg_sent', color='Employee_sen', opacity=0.5,
                    barmode="group", histnorm='probability density', 
                    title='Average review sentiment by Employee_sen - current employees',
                    category_orders={"avg_sent": ["very positive", "positive", "neutral", 
                                                "negative", 'very negative'],
                                     "Employee_sen": ['<1 yrs', '1-3 yrs', '3-5 yrs', 
                                                      '5-8 yrs', '8-10 yrs', '>10 yrs']})
fig.show()