# Lecture 11. Text analysis

The code below has been adapted from the following:
    https://www.kaggle.com/code/adepvenugopal/sentiment-analysis-of-glassdoor-review/notebook
    
The data file can be downloaded here:
    https://www.kaggle.com/datasets/dhirajnimbalkar/topicmodellinghoneywellglassdoorreviews
    
        

In [26]:
import pandas as pd
import numpy as np #calculations
import matplotlib.pyplot as plt
import seaborn as sns


from textblob import TextBlob #package that let us create work and create text data


In [27]:
data = pd.read_csv('glassdoortest1.csv',encoding = 'unicode_escape')

In [28]:
display(data.head())

print(data.shape)

Unnamed: 0.1,Unnamed: 0,date,title,pros,cons
0,1,13-Apr-18,Good Company to Work For,"Great Pay, Flexible Hours, Unlimited Vacation.","Health Care, 401K, nothing else really"
1,2,16-Apr-18,First Impressions,Great staff and very helpful. Fair compensatio...,As a relatively new employee I have not experi...
2,3,12-Apr-18,Sr. Engineering Technologist,"Great benefits, working condition and people!",Must make a 30 mile commute.
3,4,11-Apr-18,Environmental Specialist,Very supportive environment for learning new t...,Corporate is slow in taking decisions
4,5,12-Apr-18,Sales,"Work Life Balance, Slightly above average pay,...","High health care cost, high employee cost of c..."


(2000, 5)


In [29]:
from wordcloud import WordCloud

def wc(data,bgcolor,title):
    """Draw a word cloud built from a collection of text snippets.

    Parameters
    ----------
    data : iterable of str
        Text fragments; they are joined with single spaces before the cloud
        is generated. Entries that are not strings are skipped.
    bgcolor : str
        Background colour handed to ``WordCloud``.
    title : str
        Title drawn above the figure.
    """
    # Skip non-string entries (e.g. NaN), which would make str.join raise TypeError.
    text = ' '.join(snippet for snippet in data if isinstance(snippet, str))

    # Named `cloud` rather than `wc` so the local does not shadow this function.
    cloud = WordCloud(background_color = bgcolor, max_words = 2000, random_state=42, max_font_size = 50)
    cloud.generate(text)

    plt.figure(figsize = (50,50))
    plt.imshow(cloud)
    plt.title(title)  # previously the `title` argument was accepted but never shown
    plt.axis('off')

ModuleNotFoundError: No module named 'wordcloud'

In [None]:
# NOTE: `comm` is a second name for `data`, not a copy, so the columns added
# below also appear on `data`.
comm = data

polarity = []      # one TextBlob polarity score per row of 'pros'
subjectivity = []  # one TextBlob subjectivity score per row of 'pros'

for review in comm['pros'].values:
    if isinstance(review, str):
        sentiment = TextBlob(review).sentiment
        polarity.append(sentiment.polarity)
        subjectivity.append(sentiment.subjectivity)
    else:
        # Non-text cells (e.g. NaN) cannot be analysed; score them as 0.
        # This replaces a bare `except:` that hid every other error as well,
        # so real failures now surface instead of turning into zeros.
        polarity.append(0)
        subjectivity.append(0)

comm['polarity'] = polarity
comm['subjectivity'] = subjectivity

In [None]:
comm.head()

In [None]:
comm[['title','pros','polarity','subjectivity']][comm.polarity<-0.25].head(10)

In [None]:
comm[['title','pros','polarity','subjectivity']][comm.polarity>0.8].head(10)

In [None]:
wc(comm['pros'][comm.polarity>0.8],'black','Common Words' )

In [None]:
wc(comm['pros'][comm.polarity<-0.4],'black','Common Words' )

In [None]:
comm.polarity.hist(bins=50)

In [None]:
comm.subjectivity.hist(bins=50)

In [None]:
# Collapse the continuous polarity score into three classes:
# -1 (negative), 0 (neutral) and 1 (positive).
# A single .loc[row_mask, column] assignment writes to `comm` itself; the
# chained form comm['polarity'][mask] = v writes through an intermediate
# object, raises SettingWithCopyWarning and has no effect under Copy-on-Write.
comm.loc[comm.polarity == 0, 'polarity'] = 0
comm.loc[comm.polarity > 0, 'polarity'] = 1
comm.loc[comm.polarity < 0, 'polarity'] = -1

In [None]:
comm.polarity.value_counts().plot.bar()
comm.polarity.value_counts()

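Let's now look at more advanced ways to analyze text data: cleaning and lemmatizing the reviews, scoring their sentiment, and counting the most frequent n-grams.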

The code was adapted from here: 
    https://www.kaggle.com/code/rohan74/reviews-glassdoor

In [30]:
import re


import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import wordnet

#Scikit-Learn
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

#Spell Correction
from autocorrect import Speller

#Tokenization
import wordninja


#plotly
import plotly.express as px
import plotly.graph_objs as go
import plotly.offline as py
py.init_notebook_mode(connected=True)

# contractions is a library for converting words like "I'm" to "I am"
import contractions

# Necessary Libraries to find similarity
import math
from collections import Counter

# initializing spell checker 
spell = Speller(lang='en')

ModuleNotFoundError: No module named 'autocorrect'

In [None]:
# Custom English stop-word list: tokens found here are dropped by
# text_preprocess before spell correction and lemmatization.
# The list contains no negation words ('no', 'not', 'nor'), so those tokens
# are kept in the cleaned text.
# NOTE(review): presumably negations were left out on purpose so that sentiment
# scoring still sees them — confirm.
stopwordslist= ['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'into',
 'through',
 'during',
 'before',
 'after',
 'to',
 'from',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each',
 'few',
 'more',
 'most',
 'other',
 'some',
 'such',
 'only',
 'own',
 'same',
 'so',
 'than',
 'too',
 'very',
 's',
 't',
 'can',
 'will',
 'just',
 'should',
 "should've",
 'now',
 'ma']

In [None]:
# `df` is another name for `data` (no copy is made), so lower-casing the column
# labels here also renames the columns seen through `data` and `comm`, which
# are bound to the same DataFrame object.
df = data
df.columns = df.columns.str.lower()

In [None]:
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')

In [None]:
lemmatizer = WordNetLemmatizer()
# Helper that tags a single word with its part of speech and maps the tag to
# the matching WordNet constant (adjective, noun, verb or adverb) so that
# lemmatization picks the right base form.
def get_wordnet_pos(word):
    """Return the WordNet part-of-speech constant for a single word.

    The word is tagged on its own with ``nltk.pos_tag``; the upper-cased first
    letter of that tag selects adjective (J), noun (N), verb (V) or adverb (R).
    Any other tag falls back to noun.
    """
    tagged = nltk.pos_tag([word])
    penn_tag = tagged[0][1]
    first_letter = penn_tag[0].upper()

    penn_to_wordnet = {
        "J": wordnet.ADJ,
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "R": wordnet.ADV,
    }
    if first_letter in penn_to_wordnet:
        return penn_to_wordnet[first_letter]
    return wordnet.NOUN

# preprocessing of text
# Characters replaced by a space before tokenising: punctuation, digits,
# newline/tab and the capital letter 'X'.
# NOTE(review): the 'X' looks accidental (it blanks the letter in any word
# containing a capital X) — confirm before removing it.
_STRIP_CHARS = '!#$%&@?,.:;+-*/=<>"\'()[\\]X{|}~\n\t1234567890'
_STRIP_TABLE = str.maketrans(_STRIP_CHARS, ' ' * len(_STRIP_CHARS))


def text_preprocess(text):
    """Clean one review string for downstream sentiment and n-gram analysis.

    Steps, in order: expand contractions, replace the characters in
    ``_STRIP_CHARS`` with spaces, lower-case, split into words with
    ``wordninja``, drop stop words and words of two characters or fewer,
    spell-correct the remaining words, and lemmatize each one using its
    WordNet part of speech.

    Parameters
    ----------
    text : str
        Raw review text. Anything that is not a non-empty string (for example
        NaN or None) is treated as an empty review.

    Returns
    -------
    str
        The cleaned words joined by single spaces, or '' for empty or
        non-string input.
    """
    # Non-string cells such as NaN used to crash here because len() of a
    # float raises TypeError; treat them like an empty review instead.
    if not isinstance(text, str) or len(text) == 0:
        return ''

    # contractions is a library for converting words like "I'm" to "I am"
    text = contractions.fix(text)

    # One translate() pass replaces the former per-character replace() loop;
    # every listed character maps to a space, so the result is the same.
    text = text.translate(_STRIP_TABLE).lower()

    # Set membership avoids rescanning the whole stop-word list for each token.
    stopwords = frozenset(stopwordslist)
    word_list = [spell(w) for w in wordninja.split(text)
                 if w not in stopwords and len(w) > 2]

    return ' '.join(lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in word_list)

# Clean the free-text 'pros' and 'cons' columns into 'cleanpros' / 'cleancons'.
for source_col, target_col in (('pros', 'cleanpros'), ('cons', 'cleancons')):
    df[target_col] = df[source_col].apply(text_preprocess)

# Combine both cleaned columns into one space-separated string per review.
df['cleantext'] = df[['cleanpros', 'cleancons']].apply(' '.join, axis=1)

In [None]:
df.head()

In [None]:
def plotwordcountdistribution(df,reviewtype):
    """Plot how many reviews fall into each word-count bucket for one column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text column. It is not modified.
    reviewtype : str
        Name of the text column to measure, e.g. 'cleanpros' or 'cleancons'.
    """
    def comment_len(x):
        # Non-string cells count as zero words.
        return len(x.split()) if isinstance(x, str) else 0

    # Keep the word counts in a local Series. The earlier version wrote a
    # 'review_len' column into `df` and then discarded the result of
    # df.drop(...), so the temporary column leaked into the caller's frame.
    review_len = df[reviewtype].apply(comment_len).rename('review_len')

    length_scale=[0,5,15,25,50,75,100,200,10000]#Change length scale according to your requirement
    # NOTE(review): with pd.cut's default settings the first bucket appears to
    # exclude 0, so zero-word reviews would not be counted — confirm.
    out = pd.cut(review_len,length_scale)
    ax = out.value_counts(sort=False).plot.bar(rot=0, color="b", figsize=(12,8))
    for p in ax.patches:
        # Write each bar's count just above the bar.
        ax.annotate(str(p.get_height()), (p.get_x() * 1.040, p.get_height() * 1.015))
    plt.title('Word Count distribution of {}'.format(reviewtype))
    plt.show()
    
plotwordcountdistribution(df,'cleanpros')
plotwordcountdistribution(df,'cleancons')

In [None]:
nltk.download('vader_lexicon')

In [None]:
# Perform sentiment analysis on cleantext

analyzer=SentimentIntensityAnalyzer()
def polarity_score(text):
    """Return the analyzer's 'compound' sentiment score for `text`.

    Empty input short-circuits to 0 without calling the analyzer.
    """
    if len(text) == 0:
        return 0
    return analyzer.polarity_scores(text)['compound']
df['polarityscore'] =df['cleantext'].apply(lambda text : polarity_score(text))

In [None]:
def sentianamolybarplot(df):
    """Bar chart of how many positive reviews fall in each 0.2-wide score band.

    Reads the 'polarityscore' column of `df`; only rows with a score strictly
    greater than 0 are counted.
    """
    bin_edges = [0.0,0.2,0.4,0.6,0.8,1]
    positive_scores = df.loc[df['polarityscore'] > 0, 'polarityscore']
    band_counts = pd.cut(positive_scores, bin_edges).value_counts(sort=False)

    axes = band_counts.plot.bar(rot=0, color="b", figsize=(12,8))
    for bar in axes.patches:
        # Label each bar with its count, placed slightly above the bar top.
        label_xy = (bar.get_x() * 1.040, bar.get_height() * 1.015)
        axes.annotate(str(bar.get_height()), label_xy)
    plt.show()
sentianamolybarplot(df)

In [None]:
def top_n_trigram(corpus, n=None):
    """Count every word trigram in `corpus` and rank the trigrams by frequency.

    Parameters
    ----------
    corpus : iterable of str
        Documents to scan; passed to ``CountVectorizer`` as-is.
    n : int, optional
        Keep only the `n` most frequent trigrams. The default ``None``
        returns the full ranking, as before.

    Returns
    -------
    list of (str, count)
        (trigram, total occurrences) pairs, most frequent first.
    """
    vec = CountVectorizer(ngram_range=(3, 3)).fit(corpus)
    bag_of_words = vec.transform(corpus)
    # Column-wise totals: one count per vocabulary entry, indexed as [0, idx].
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq if n is None else words_freq[:n]

In [None]:
def top_n_bigram(corpus, n=None):
    """Count every word bigram in `corpus` and rank the bigrams by frequency.

    Parameters
    ----------
    corpus : iterable of str
        Documents to scan; passed to ``CountVectorizer`` as-is.
    n : int, optional
        Keep only the `n` most frequent bigrams. The default ``None``
        returns the full ranking, as before.

    Returns
    -------
    list of (str, count)
        (bigram, total occurrences) pairs, most frequent first.
    """
    vec = CountVectorizer(ngram_range=(2, 2)).fit(corpus)
    bag_of_words = vec.transform(corpus)
    # Column-wise totals: one count per vocabulary entry, indexed as [0, idx].
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq if n is None else words_freq[:n]

In [None]:
# Ranked (trigram, count) lists for the cleaned pros and cons text.
# Despite the "dict_" prefix these are lists, as returned by top_n_trigram;
# the former `= {}` initialisers were dead stores that suggested the wrong type.
dict_trigrams_pros = top_n_trigram(df['cleanpros'])
dict_trigrams_cons = top_n_trigram(df['cleancons'])

In [None]:
def bar_plot_toptrigrams(trigrams, reviewtype = 'Pros'):
    """Show a plotly bar chart of the first 20 (trigram, count) pairs.

    Parameters
    ----------
    trigrams : list of (str, count)
        Pairs to plot; only the first 20 entries are used, in the given order.
    reviewtype : str
        Label inserted at the start of the chart title.
    """
    top_twenty = pd.DataFrame(trigrams[:20], columns = ['word' , 'count'])
    chart_title = '{0} Review Tri-gram count top 20'.format(reviewtype)

    chart = px.bar(top_twenty, x='word', y='count')
    chart.update_layout(title_text= chart_title, template="plotly_white")
    chart.show()

bar_plot_toptrigrams(dict_trigrams_pros)
bar_plot_toptrigrams(dict_trigrams_cons,'Cons') 