## Import libraries

In [1]:
import datetime
from datetime import datetime

import pandas as pd

import gensim
from gensim.corpora import Dictionary
from gensim.models.phrases import Phrases, ENGLISH_CONNECTOR_WORDS

import spacy 

import pickle
import os
import sys
sys.path.append(os.path.join(os.environ['PWD'],'scripts'))

from scrap import get_comments

## Collect comments for January 2022

In [2]:
# Cache file for the raw January 2022 reviews. Scraping only happens when this file is missing.
# NOTE(review): the path is rooted at the PWD environment variable rather than the kernel's
# working directory — presumably the project root the notebook server was started from; confirm.
data_path = os.path.join(os.environ['PWD'],'data/january_comments.pkl')

if not os.path.isfile(data_path):
    # Date bounds handed to the project-local scraper.
    start_date = datetime.strptime('2022-01-01',"%Y-%m-%d")
    end_date = datetime.strptime('2022-02-01',"%Y-%m-%d")

    data = get_comments(start_date, end_date)

    # Make sure the target folder exists so the dump cannot fail after the scrape has already run.
    os.makedirs(os.path.dirname(data_path), exist_ok=True)
    # `with` guarantees the handle is closed even if pickling raises.
    with open(data_path, "wb") as cache_file:
        pickle.dump(data, cache_file)
    print('data pickled successfully!')
else:
    # NOTE: pickle.load can execute arbitrary code; only load cache files written by this notebook.
    with open(data_path, 'rb') as cache_file:
        data = pickle.load(cache_file)
    print('data unpickled successfully!')

data unpickled successfully!


In [3]:
# Number of items loaded; the preprocessing loop below treats each item as one review
len(data)

453

## Preprocess textual data for analysis

In [4]:
# Separate comments from ratings.
# Each review is indexed as review[0] (positive side) and review[1] (negative side);
# on each side, element [0] is the free text and element [1] is an iterable of rating categories.
pos_text, neg_text = [], []
pos_cat, neg_cat = {}, {}

for review in data:
    # Handle the positive side first, then the negative side, with the same steps for both.
    for side, texts, tallies in ((0, pos_text, pos_cat), (1, neg_text, neg_cat)):
        part = review[side]
        texts.append(part[0])
        # Count how many reviews tag each category on this side.
        for cat in part[1]:
            tallies[cat] = tallies.get(cat, 0) + 1

## Rating scores

In [5]:
# Turn the positive category tallies into a Series indexed by category and named 'pos'
pos = pd.Series(pos_cat, name = 'pos')

In [6]:
# Turn the negative category tallies into a Series indexed by category and named 'neg'
neg = pd.Series(neg_cat, name = 'neg')

In [7]:
# Align both tallies on the category index. The outer join keeps categories that appear on only
# one side (their missing count becomes NaN). Rows are then ordered by ascending 'pos' count.
ratings = pd.merge(pos,neg, right_index = True, left_index = True, how='outer').sort_values('pos')

In [8]:
# Display the combined positive/negative category counts
ratings

Unnamed: 0,pos,neg
Career Growth,288,83
Work Satisfaction,297,80
Job Security,324,63
Skill Development,325,66
Work-Life Balance,326,62
Salary & Benefits,351,50
Company Culture,382,35


## Comments

### Positive comments

In [9]:
# Explore textual data: show the first 40 raw positive comments before any preprocessing
pos_text[:40]

['Everything work culture, seniors they are very supportive and superb is the word to describe this company.',
 'Work Life balance and career growth ',
 'Good company for skill development',
 'Great company, terrible process',
 'Work life balance and work culture is something which I love the most.',
 "Amazon is a good company to work.But ML data associate role is not much intresting to work for. Very boaring job..no growth no skills. Don't apply for it.",
 'Work culture ',
 '01',
 'Work Culture',
 'Work is amazing. Volume of work is so much that you can have 4x experience when compared to other places. You can not sit and time pass without adding value.',
 'Good work environment and culture. Teammates are supportive and the management is transparent. Everyone is happy to connect with you and share their experiences and help you in your career.',
 'Everything is great about this company! ',
 'Iam very intrested to work in it coustmer service. ',
 'Management',
 'I can handling cashier'

In [10]:
# Load the spaCy pipeline 'en_core_web_sm'; the preprocessing cells below call it on every comment
nlp = spacy.load('en_core_web_sm')
# Show the (name, component) pairs of the loaded pipeline as the cell output
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x7fd0156c5e80>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x7fd0156c5ca0>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x7fd0159999e0>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x7fd0156c0c00>),
 ('lemmatizer',
  <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x7fd0156c3640>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x7fd015999970>)]

In [11]:
# Disable NER component: the preprocessing cells below only read each token's text, lemma and
# the is_stop / is_punct / like_num flags, so entity recognition is not used.
nlp.disable_pipe('ner')
# Show the remaining active components as the cell output
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x7fd0156c5e80>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x7fd0156c5ca0>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x7fd0159999e0>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x7fd0156c0c00>),
 ('lemmatizer',
  <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x7fd0156c3640>)]

In [12]:
# Add some stop words to improve analysis results.
# `|=` extends the default stop-word set, so these words are dropped by the `is_stop` filter below.
# NOTE(review): this must run before any comment is processed — tokens already seen by the
# vocabulary may keep their earlier is_stop flag; confirm against the spaCy version in use.
nlp.Defaults.stop_words |= {'amazon','work','job','good'}

In [13]:
pos_texts = []          # one list of cleaned, lower-cased lemmas per kept comment
pos_texts_comment = []  # the parsed spaCy Doc of each kept comment, in the same order as pos_texts

for raw_comment in pos_text:
    # Keep the raw string and the parsed Doc under separate names instead of rebinding the loop variable.
    doc = nlp(raw_comment)
    tokens = []
    for token in doc:
        # Exclude special characters, stopwords, punctuations and numbers
        squeezed = token.text.replace(" ", "")
        # Drop empty / "\r"-led tokens as before, plus any other pure-whitespace token
        # ("\n", "\t", ...) that previously slipped through as a vocabulary entry.
        if not squeezed or squeezed[0] == "\r" or token.is_space:
            continue
        if token.is_stop or token.is_punct or token.like_num:
            continue
        tokens.append(token.lemma_.lower().replace(" ", ""))

    # Comments with no usable token left are skipped in both lists so they stay aligned.
    if tokens:
        pos_texts.append(tokens)
        pos_texts_comment.append(doc)

In [14]:
# Explore processed data: show the token lists of the first 40 kept positive comments
pos_texts[:40]

[['culture', 'senior', 'supportive', 'superb', 'word', 'describe', 'company'],
 ['life', 'balance', 'career', 'growth'],
 ['company', 'skill', 'development'],
 ['great', 'company', 'terrible', 'process'],
 ['life', 'balance', 'culture', 'love'],
 ['company',
  'ml',
  'data',
  'associate',
  'role',
  'intreste',
  'boaring',
  'growth',
  'skill',
  'apply'],
 ['culture'],
 ['culture'],
 ['amazing',
  'volume',
  '4x',
  'experience',
  'compare',
  'place',
  'sit',
  'time',
  'pass',
  'add',
  'value'],
 ['environment',
  'culture',
  'teammate',
  'supportive',
  'management',
  'transparent',
  'happy',
  'connect',
  'share',
  'experience',
  'help',
  'career'],
 ['great', 'company'],
 ['iam', 'intreste', 'coustmer', 'service'],
 ['management'],
 ['handle', 'cashier'],
 ['love',
  'co',
  'operative',
  'environment',
  'yes',
  'co',
  'operative',
  'constructive',
  'feel',
  'lucky',
  'start',
  'carrier',
  'yes',
  'look',
  'forward',
  'kind',
  'overall',
  'love',

In [15]:
# Create bigrams, dictionary and corpus
# Phrase model trained on the positive token lists.
pos_bigram = Phrases(
    pos_texts,
    min_count=1,
    threshold=2,
    connector_words=ENGLISH_CONNECTOR_WORDS,
)
# NOTE: pos_texts is overwritten with its phrase-merged version here, so re-running this cell
# without re-running the tokenisation cell feeds already-merged tokens back into Phrases.
pos_texts = [pos_bigram[tokens] for tokens in pos_texts]
# Token <-> id mapping, then one bag-of-words vector per comment.
pos_dictionary = Dictionary(pos_texts)
pos_corpus = list(map(pos_dictionary.doc2bow, pos_texts))

### Negative comments

In [16]:
neg_texts = []          # one list of cleaned, lower-cased lemmas per kept comment
neg_texts_comment = []  # the parsed spaCy Doc of each kept comment, in the same order as neg_texts

for raw_comment in neg_text:
    # Keep the raw string and the parsed Doc under separate names instead of rebinding the loop variable.
    doc = nlp(raw_comment)
    tokens = []
    for token in doc:
        # Exclude special characters, stopwords, punctuations and numbers
        squeezed = token.text.replace(" ", "")
        # Drop empty / "\r"-led tokens as before, plus any other pure-whitespace token
        # ("\n", "\t", ...) that previously slipped through as a vocabulary entry.
        if not squeezed or squeezed[0] == "\r" or token.is_space:
            continue
        if token.is_stop or token.is_punct or token.like_num:
            continue
        tokens.append(token.lemma_.lower().replace(" ", ""))

    # Comments with no usable token left are skipped in both lists so they stay aligned.
    if tokens:
        neg_texts.append(tokens)
        neg_texts_comment.append(doc)

In [17]:
# Create bigrams, dictionary and corpus
# Phrase model trained on the negative token lists.
neg_bigram = Phrases(
    neg_texts,
    min_count=1,
    threshold=2,
    connector_words=ENGLISH_CONNECTOR_WORDS,
)
# NOTE: neg_texts is overwritten with its phrase-merged version here, so re-running this cell
# without re-running the tokenisation cell feeds already-merged tokens back into Phrases.
neg_texts = [neg_bigram[tokens] for tokens in neg_texts]
# Token <-> id mapping, then one bag-of-words vector per comment.
neg_dictionary = Dictionary(neg_texts)
neg_corpus = list(map(neg_dictionary.doc2bow, neg_texts))

### Pickle everything

In [18]:
# Cache file for everything computed above (ratings table plus positive/negative text artefacts).
data_path = os.path.join(os.environ['PWD'],'data/january_comments_processed.pkl')

if not os.path.isfile(data_path):
    # NOTE: `data` held the raw reviews until now and is rebound to the results dict here, so the
    # earlier cells that iterate over `data` need the loading cell re-run before they are re-run.
    data = {
        'ratings': ratings,

        'pos_texts': pos_texts,
        'pos_dictionary': pos_dictionary,
        'pos_corpus': pos_corpus,
        'pos_texts_comment': pos_texts_comment,

        'neg_texts': neg_texts,
        'neg_dictionary': neg_dictionary,
        'neg_corpus': neg_corpus,
        'neg_texts_comment': neg_texts_comment,
    }

    # Make sure the target folder exists so the dump cannot fail after all the processing is done.
    os.makedirs(os.path.dirname(data_path), exist_ok=True)
    # `with` guarantees the handle is closed even if pickling raises.
    with open(data_path, "wb") as cache_file:
        pickle.dump(data, cache_file)
    print('data pickled successfully!')
else:
    # An existing cache is loaded as-is: only `data` is rebound, the variables computed above are
    # neither saved nor replaced. Delete the file to persist freshly computed results.
    # NOTE: pickle.load can execute arbitrary code; only load cache files written by this notebook.
    with open(data_path, 'rb') as cache_file:
        data = pickle.load(cache_file)
    print('data unpickled successfully!')

data pickled successfully!
