## Import libraries

In [1]:
import datetime
from datetime import datetime

import numpy as np
import pandas as pd

import gensim
from gensim.models import HdpModel
from gensim.models.coherencemodel import CoherenceModel

import spacy 
from spacy import displacy

import matplotlib.pyplot as plt

import pickle
import os
import sys
sys.path.append(os.path.join(os.environ['PWD'],'scripts'))

from scrap import get_comments
from preprocess_text import preprocess_raw_comments

## Collect comments for January 2022

In [2]:
# Cache file for the raw comments, resolved relative to the directory in $PWD.
data_path = os.path.join(os.environ['PWD'],'data/january_comments.pkl')

if not os.path.isfile(data_path):
    # No cache yet: fetch the comments for the date window and store them.
    # NOTE(review): whether end_date is inclusive depends on get_comments -- confirm.
    start_date = datetime.strptime('2022-01-01',"%Y-%m-%d")
    end_date = datetime.strptime('2022-02-01',"%Y-%m-%d")

    raw_comments = get_comments(start_date, end_date)

    # `with` closes the handle even if pickling raises.
    with open(data_path, "wb") as cache_file:
        pickle.dump(raw_comments, cache_file)
    print('data pickled successfully!')
else:
    # Only load caches written by this notebook: unpickling an untrusted file
    # can execute arbitrary code.
    with open(data_path, 'rb') as cache_file:
        raw_comments = pickle.load(cache_file)
    print('data unpickled successfully!')

data unpickled successfully!


In [3]:
# Number of raw comments loaded (or fetched) by the previous cell.
len(raw_comments)

453

## Preprocess textual data for analysis

In [4]:
# Cache file for the preprocessed comments, resolved relative to the directory in $PWD.
data_path = os.path.join(os.environ['PWD'],'data/january_comments_processed.pkl')

if not os.path.isfile(data_path):
    # No cache yet: preprocess the raw comments and store the result.
    preprocessed_text = preprocess_raw_comments(raw_comments)

    # `with` closes the handle even if pickling raises.
    with open(data_path, "wb") as cache_file:
        pickle.dump(preprocessed_text, cache_file)
    print('data pickled successfully!')
else:
    # Only load caches written by this notebook: unpickling an untrusted file
    # can execute arbitrary code.
    with open(data_path, 'rb') as cache_file:
        preprocessed_text = pickle.load(cache_file)
    print('data unpickled successfully!')

data unpickled successfully!


## Analyse rating scores

In [5]:
# Display the 'ratings' entry of the preprocessed data (positive/negative counts per category).
preprocessed_text['ratings']

Unnamed: 0,pos,neg
Career Growth,288,83
Work Satisfaction,297,80
Job Security,324,63
Skill Development,325,66
Work-Life Balance,326,62
Salary & Benefits,351,50
Company Culture,382,35


- Positive and negative rating counts are strongly negatively correlated across categories.
- Employees are most satisfied with company culture and with salary and benefits.
- Employees are least satisfied with career growth and work satisfaction, although positive ratings still outnumber negative ones in every category.

## Analyse comments

### Positive comments

In [6]:
# Cache file for the topic model fitted on the positive comments.
data_path = os.path.join(os.environ['PWD'],'data/pos_comments_HdpModel.pkl')

if not os.path.isfile(data_path):
    # No cache yet: fit an HDP topic model on the positive-comment corpus.
    # random_state seeds the model's random number generator so that a refit
    # (after deleting the cache) is repeatable.
    hdp_model = HdpModel(corpus=preprocessed_text['pos_corpus'],
                         id2word=preprocessed_text['pos_dictionary'],
                         chunksize=10,
                         random_state=0)

    # Score the fitted topics with the 'c_v' coherence measure.
    coh_model = CoherenceModel(model=hdp_model, texts=preprocessed_text['pos_texts'],
                               dictionary=preprocessed_text['pos_dictionary'],
                               coherence='c_v')
    coherence = coh_model.get_coherence()

    # Bundle the model, its coherence model and the score into one cached object.
    pos_topic_model = {
        'coherence': coherence,
        'coh_model': coh_model,
        'hdp_model': hdp_model,
    }

    # `with` closes the handle even if pickling raises.
    with open(data_path, "wb") as cache_file:
        pickle.dump(pos_topic_model, cache_file)
    print('data pickled successfully!')
else:
    # Only load caches written by this notebook: unpickling an untrusted file
    # can execute arbitrary code.
    with open(data_path, 'rb') as cache_file:
        pos_topic_model = pickle.load(cache_file)
    print('data unpickled successfully!')

data unpickled successfully!


In [7]:
# Summarise the positive-comment model: rows of the topic-term matrix and its coherence score.
pos_topic_count = pos_topic_model['hdp_model'].get_topics().shape[0]
pos_coherence_score = pos_topic_model['coherence']
print('Number of topics: {}, coherence score: {}.'.format(pos_topic_count, pos_coherence_score))

Number of topics: 150, coherence score: 0.7395163734824113.


Let's choose the 5 most significant topics.

In [8]:
# Vocabulary used to label the topic-term weights of the positive model.
dictionary = preprocessed_text['pos_dictionary']

# Work on the cached positive-comment model; optimal_ordering() reorders its
# topics in place (presumably most significant first -- confirm in gensim docs).
hdp_model = pos_topic_model['hdp_model']
hdp_model.optimal_ordering()

# How many of the leading topics to inspect below.
n_topics = 5

In [9]:
# Build one column per topic listing the vocabulary ordered by descending weight.
# get_topics() is fetched once instead of once per topic: the matrix does not
# change between iterations.
topic_term_matrix = hdp_model.get_topics()
# NOTE(review): assumes dictionary.values() iterates in the same order as the
# columns of get_topics() (i.e. by token id) -- confirm.
vocabulary = list(dictionary.values())

data_frame = pd.DataFrame({
    'Topic'+str(i): pd.Series(index=vocabulary, data=topic_term_matrix[i])
                      .sort_values(ascending=False)
                      .index.values
    for i in range(n_topics)
})

In [10]:
# Show the 20 highest-weighted words of each selected topic.
data_frame.head(20)

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4
0,initial,urgent,festival,come,team
1,believe,vedika,try,company,culture
2,role,leave,huge_cafeteria,honest,expect
3,process,situation,extra,great_company,theek
4,literally,holiday,atmosphere,customer_service,organization
5,compare,people,voucher,icqa,bad
6,manipulate,customer,minute,fair,resolve
7,skill,excellent,bad,role,feel_like
8,past,nice_place,money,obey,extraction
9,woman,drastically,marketing,knowledge,environment


It is quite hard to determine the topics from the given words.

In [11]:
# Flatten the per-comment topic assignments into one long table with a row per
# (topic, probability, comment position) triple.
# The rows are collected in a list and the frame is built once: appending with
# .loc[len(data)] copies the frame on every row and turns the integer columns
# into floats.
doc_topic_records = [
    (topic_id, prob, doc_index)
    for doc_index, doc_topics in enumerate(hdp_model[preprocessed_text['pos_corpus']])
    for topic_id, prob in doc_topics
]
data = pd.DataFrame(doc_topic_records, columns=['Topic','prob','index'])

In [12]:
# Group rows by topic and, within each topic, put the most probable comments first.
data = data.sort_values(
    by=['Topic', 'prob'],
    ascending=[True, False],
)

In [13]:
# For every selected topic, print up to five comments from the top of the
# (already sorted) topic table.
for topic in range(n_topics):
    print('Topic {}, most representative comments:'.format(topic))
    belongs_to_topic = data.Topic == topic
    top_comment_ids = data.loc[belongs_to_topic, 'index'].head(5)
    for comment in top_comment_ids:
        comment_text = preprocessed_text['pos_texts_comment'][int(comment)].text.strip()
        print('{}: {}'.format(comment, comment_text))
    print('\n')

Topic 0, most representative comments:
112.0: Literally nothing to like about this role.This role doesn't make any sense.you will not even get reasons for what and why are you doing your job. Unnecessarily Extremely high pressure environment.0% job security.worst work culture.They don't give leave.I was COVID positive and was having 103 F fever.They didn't give me leave untill I gave them my reports.They manipulate you by saying shrinkage target.Even if someone is dead in your family.They will give this shrinkage nonsense.Growth in this process is like mission impossible.They lure you by their sweet talk in initial days but after some time they show their true colours.This process is very much infamous in Amazon itself.Other processes don't even consider this process.If you think that you will take internal transfer after sometime,then it is not possible.No one even call you back.I literally wasted my whole year over here.Guys please do not apply for this role.Its a trap.you will think

It is also hard to determine the topics from the given examples.

### Negative comments

In [14]:
# Cache file for the topic model fitted on the negative comments.
data_path = os.path.join(os.environ['PWD'],'data/neg_comments_HdpModel.pkl')

if not os.path.isfile(data_path):
    # No cache yet: fit an HDP topic model on the negative-comment corpus.
    # random_state seeds the model's random number generator so that a refit
    # (after deleting the cache) is repeatable.
    hdp_model = HdpModel(corpus=preprocessed_text['neg_corpus'],
                         id2word=preprocessed_text['neg_dictionary'],
                         chunksize=10,
                         random_state=0)

    # Score the fitted topics with the 'c_v' coherence measure.
    coh_model = CoherenceModel(model=hdp_model, texts=preprocessed_text['neg_texts'],
                               dictionary=preprocessed_text['neg_dictionary'],
                               coherence='c_v')
    coherence = coh_model.get_coherence()

    # Bundle the model, its coherence model and the score into one cached object.
    neg_topic_model = {
        'coherence': coherence,
        'coh_model': coh_model,
        'hdp_model': hdp_model,
    }

    # `with` closes the handle even if pickling raises.
    with open(data_path, "wb") as cache_file:
        pickle.dump(neg_topic_model, cache_file)
    print('data pickled successfully!')
else:
    # Only load caches written by this notebook: unpickling an untrusted file
    # can execute arbitrary code.
    with open(data_path, 'rb') as cache_file:
        neg_topic_model = pickle.load(cache_file)
    print('data unpickled successfully!')

data unpickled successfully!


In [15]:
# Summarise the negative-comment model: rows of the topic-term matrix and its coherence score.
neg_topic_count = neg_topic_model['hdp_model'].get_topics().shape[0]
neg_coherence_score = neg_topic_model['coherence']
print('Number of topics: {}, coherence score: {}.'.format(neg_topic_count, neg_coherence_score))

Number of topics: 150, coherence score: 0.7725791113454047.


In [16]:
# How many of the leading topics to inspect.
n_topics = 5

# Work on the cached negative-comment model; optimal_ordering() reorders its
# topics in place (presumably most significant first -- confirm in gensim docs).
hdp_model = neg_topic_model['hdp_model']
hdp_model.optimal_ordering()
dictionary = preprocessed_text['neg_dictionary']

# get_topics() is fetched once instead of once per topic: the matrix does not
# change between iterations.
topic_term_matrix = hdp_model.get_topics()
# NOTE(review): assumes dictionary.values() iterates in the same order as the
# columns of get_topics() (i.e. by token id) -- confirm.
vocabulary = list(dictionary.values())

# One column per topic listing the vocabulary ordered by descending weight.
data_frame = pd.DataFrame({
    'Topic'+str(i): pd.Series(index=vocabulary, data=topic_term_matrix[i])
                      .sort_values(ascending=False)
                      .index.values
    for i in range(n_topics)
})

# Show the 20 highest-weighted words of each selected topic.
data_frame.iloc[:20]

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4
0,one.you,happen,quarter,calculation,degrade
1,indians,reason,need,performance,ball
2,change,state,totali,requirement,thwre
3,want,excellent,condition,contractual_employee,growth_opportunity
4,schedule,follow,let,select,letter
5,new,si,noida,volatile,proud
6,apply,utilise,pstam,escalation,frustration
7,like,finally,growth_slow,know,extreme
8,number,jeff,beautiful,department,few
9,opportunity,shift,india,comany,fault


It is quite hard to determine the topics from the given words.

In [17]:
# Flatten the per-comment topic assignments into one long table with a row per
# (topic, probability, comment position) triple.
# The rows are collected in a list and the frame is built once: appending with
# .loc[len(data)] copies the frame on every row and turns the integer columns
# into floats (which is why comment ids used to print as e.g. "41.0").
doc_topic_records = [
    (topic_id, prob, doc_index)
    for doc_index, doc_topics in enumerate(hdp_model[preprocessed_text['neg_corpus']])
    for topic_id, prob in doc_topics
]
data = pd.DataFrame(doc_topic_records, columns=['Topic','prob','index'])

# Group rows by topic and, within each topic, put the most probable comments first.
data = data.sort_values(by=['Topic','prob'],ascending=[True,False])

# For every selected topic, print up to five comments from the top of the table.
for topic in range(n_topics):
    print('Topic {}, most representative comments:'.format(topic))
    comments = data.loc[data.Topic==topic, 'index'].head(5)
    for comment in comments:
        print('{}: {}'.format(comment, preprocessed_text['neg_texts_comment'][int(comment)].text.strip()))
    print('\n')

Topic 0, most representative comments:
41.0: When you join they will treat you well for 2-3 months but after that, they will show their true colours. Enormous work pressure and you will be forced to achieve their customer satisfaction goals which are way beyond achievable. Jeff wants Amazon to be the world's most customer eccentric company but don't want to pay for it. Compensation is way below the market standards. There is no chance for growth even some of the managers agreed on. They will change your shift and schedule whenever they want and they always had the same answer for this "Business requirement".  They always say we will listen to issues and they do listen after making changes which certainly makes their employee's life hell. I think Amazon is just exploiting Indians and taking advantage of cheap labour
80.0: Internal employees are taken leverage in all terms. Moving to a new team within org is too difficult and also hiring manager pay less for internal transfer than extern

It is also hard to determine the topics from the given examples. Some topics have only 2 examples assigned to them.