In [2]:
# Display the result of every top-level expression in a cell, not only the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Show the current working directory (the CSV file read later is given as a relative path)
import os
os.getcwd()

'C:\\Users\\kalya\\Python'

In [3]:
# Installing necessary libraries
#!pip install vaderSentiment

import numpy as np
import pandas as pd
import string
import nltk
import sklearn

from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.decomposition import LatentDirichletAllocation
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): a blanket 'ignore' also hides deprecation and other library warnings
# raised by later cells - consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [4]:
# Fetch the NLTK resources 'vader_lexicon' and 'stopwords'
# NOTE(review): in the visible cells only the 'stopwords' corpus is read through nltk; the
# sentiment analyser is imported from the vaderSentiment package - confirm 'vader_lexicon' is needed
nltk.download('vader_lexicon')
nltk.download('stopwords')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\kalya\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kalya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

**`Encodings that can be tried when reading the file include 'latin1', 'iso-8859-1', 'cp1252' and 'utf-8'.`**

In [8]:
# Load the raw feedback export using the 'latin' codec
text = pd.read_csv("feedback_from_apr_aug_2018.csv", encoding='latin')

# Free-text feedback column with missing entries removed
text1 = text.loc[:, 'Issue Experienced'].dropna()

# Percentage of missing values in each column
null_counts = text.isnull().sum()
null_counts * 100 / len(text)

Name                        0.000000
Response ID                 0.000000
Date collected              0.000000
VisitorId (Tealeaf)         0.304947
Tealeaf Link                0.508245
Logic: Page collected on    0.225887
Virtual: Device             5.692342
Issue Area                  0.056472
Issue Experienced           0.000000
Sub Reason                  0.000000
Problem statement           0.000000
dtype: float64

**[Sentiment Analysis using VADER](https://github.com/cjhutto/vaderSentiment)**

*`VADER (Valence Aware Dictionary and sEntiment Reasoner) is a lexicon and rule-based sentiment analysis tool that is specifically attuned to sentiments expressed in social media.`*

Recent updates in VADER can handle:

    > typical negations (e.g., "not good")
    > use of contractions as negations (e.g., "wasn't very good")
    > conventional use of punctuation to signal increased sentiment intensity (e.g., "Good!!!")
    > conventional use of word-shape to signal emphasis (e.g., using ALL CAPS for words/phrases)
    > using degree modifiers to alter sentiment intensity (e.g., intensity boosters such as "very" and intensity dampeners such as "kind of")
    > understanding many sentiment-laden slang words (e.g., 'sux')
    > understanding many sentiment-laden slang words as modifiers such as 'uber' or 'friggin' or 'kinda'
    > understanding many sentiment-laden emoticons such as :) and :D
    > translating utf-8 encoded emojis such as 💘 and 💋 and 😁
    > understanding sentiment-laden initialisms and acronyms (for example: 'lol')
    
    
[Refer to this article](https://medium.com/analytics-vidhya/simplifying-social-media-sentiment-analysis-using-vader-in-python-f9e6ec6fc52f)

In [9]:
# Build the VADER analyser and a helper returning the 'compound' polarity score of a text
senti = SentimentIntensityAnalyzer()
text_polar = lambda x: senti.polarity_scores(x)['compound']

# Generating the polarity score on the 'Issue Experienced' variable
text['Polarity_score'] = text['Issue Experienced'].apply(text_polar)
# .copy() makes text_final an independent frame, so adding a column below is not a
# chained assignment on a slice of `text`
text_final = text[['Issue Area','Issue Experienced','Polarity_score']].copy()
# text_final.head()

# Categorizing the polarity.
# np.digitize returns the 1-based bin of each score for the edges below:
#   [-1, -0.8) -> 1, [-0.8, -0.25) -> 2, [-0.25, 0.25) -> 3, [0.25, 0.8) -> 4, >= 0.8 -> 5
sentiment_mapping = {1: "High Negative", 2: "Negative", 3: "Neutral", 4:"Positive", 5:"High Positive"}
map_sentiment = lambda val : np.digitize(val,[-1,-.8,-.25,.25,.8])
text_final['sentiment_category'] = text_final['Polarity_score'].apply(map_sentiment)
# text_final.head()

# Non-null count of every column for the positive, negative and neutral groups
# (.count() counts non-null cells per column; it does not break down by Issue Area)
positives_count = text_final[(text_final.sentiment_category == 4) |(text_final.sentiment_category == 5)].count()
positives_count
negatives_count = text_final[(text_final.sentiment_category == 2) |(text_final.sentiment_category == 1)].count()
negatives_count
neutral_count = text_final[(text_final.sentiment_category == 3)].count()
neutral_count

Issue Area            2118
Issue Experienced     2118
Polarity_score        2118
sentiment_category    2118
dtype: int64

Issue Area            3004
Issue Experienced     3006
Polarity_score        3006
sentiment_category    3006
dtype: int64

Issue Area            3727
Issue Experienced     3730
Polarity_score        3730
sentiment_category    3730
dtype: int64

**Topic Modeling**

In [13]:
# Cleaning the data set required for pre-processing

# Lower-case the feedback and strip every character that is not a-z or a space.
# regex=True is passed explicitly: pandas >= 2.0 treats the pattern as a literal string
# by default, which would leave the text unchanged.
text['Clean_text'] = text1.str.lower().str.replace('[^a-z ]', '', regex=True)
# text['Clean_text'].head()

# English stop-word list from NLTK
stop=stopwords.words('english')
def sw(text, stop_words=None):
    """Remove stop words from a piece of text.

    Parameters
    ----------
    text : str
        Text to filter; it is converted with ``str()`` and split on whitespace.
        ``None`` and NaN are treated as missing and yield an empty string
        instead of the literal tokens "None" / "nan".
    stop_words : collection of str, optional
        Words to drop. When omitted, the notebook-level ``stop`` list is used.

    Returns
    -------
    str
        The remaining words joined by single spaces.
    """
    # Missing feedback: str() would otherwise turn it into a bogus token
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if stop_words is None:
        stop_words = stop
    kept = [word for word in str(text).split() if word not in stop_words]
    # Single-space separator (the previous two-space join produced irregular whitespace)
    return " ".join(kept)

# Run the stop-word filter over every cleaned comment
cleaned_comments = text['Clean_text']
text['text_without_stop'] = cleaned_comments.apply(sw)

# Preview the raw, cleaned and stop-word-free text side by side
preview_columns = ['Issue Experienced', 'Clean_text', 'text_without_stop']
text[preview_columns].head()

Unnamed: 0,Issue Experienced,Clean_text,text_without_stop
0,Not use friendly,not use friendly,use friendly
1,I purchased a jacket T193787.when I got the ja...,i purchased a jacket twhen i got the jacket ho...,purchased jacket twhen got jacket home s...
2,I am looking for briefs for my 15 year old son...,i am looking for briefs for my year old son h...,looking briefs year old son however coul...
3,Please can you re stock item t43/6079 Colour I...,please can you re stock item t colour ivory si...,please stock item colour ivory size cons...
4,"My order has not been delivered , although you...",my order has not been delivered although your...,order delivered although tracking say dis...


In [17]:
# Document-term matrix built with a TF-IDF vectorizer
vec = TfidfVectorizer()
DTM = vec.fit_transform(text['text_without_stop'])

# Fit a 5-component LDA model on the matrix and keep its per-document output
lda = LatentDirichletAllocation(n_components=5, max_iter=10, random_state=1234)
lda_output = lda.fit_transform(DTM)

# Labels: one per LDA component and one per row of the document-term matrix
topic_name = [f'Topic{i}' for i in range(lda.n_components)]
n_docs = DTM.shape[0]
docname = [f'Doc{i}' for i in range(n_docs)]
# lda_output

In [24]:
# Document-topic weights (rounded to 2 decimals for display), one row per document
df_document_topic = pd.DataFrame(np.round(lda_output,2),columns= topic_name, index = docname)
# df_document_topic.head()

# Dominating topic: taken from the unrounded weights so that rounding cannot create
# ties between topics (argmax would then silently pick the first tied column)
dominating_topic = np.argmax(lda_output, axis=1)
df_document_topic['dominating_topic'] = dominating_topic
# df_document_topic.head()
# df_document_topic.groupby('dominating_topic').size()

# Topic-term weights: one row per topic, one column per vocabulary term
df_topic_keywords = pd.DataFrame(lda.components_)
# get_feature_names() was removed in scikit-learn 1.2; prefer get_feature_names_out()
# and fall back to the old method on versions that predate it
if hasattr(vec, 'get_feature_names_out'):
    feature_names = vec.get_feature_names_out()
else:
    feature_names = vec.get_feature_names()
df_topic_keywords.columns = feature_names
df_topic_keywords.index = topic_name
df_topic_keywords

Unnamed: 0,aa,aanddharveygmailcom,aaron,abandon,abandoned,abandoning,abbot,abcbcdaa,abd,abdomen,...,zillions,zim,zinc,zinder,zip,zipped,zips,zombies,zone,zoom
Topic0,0.202024,0.317356,0.204714,0.200025,1.054194,0.78705,0.769231,0.201407,0.32784,0.200011,...,0.212515,0.610059,0.200013,0.200019,0.204422,0.200023,0.20575,0.202227,0.200015,0.200008
Topic1,0.822118,0.200126,0.371319,0.224999,0.200067,0.200034,0.200035,0.200016,0.556301,0.200029,...,0.200037,0.200076,0.200033,0.200054,0.200052,0.200056,0.200033,0.275684,0.200031,0.200019
Topic2,0.200106,0.200239,0.200027,0.20011,0.200096,0.200058,0.200055,0.52169,0.414257,0.200052,...,0.200066,0.200085,0.200059,0.200076,0.200127,0.400315,0.200064,0.200026,0.200057,0.200034
Topic3,0.200101,0.223418,0.200023,0.200102,0.200103,0.200055,0.200055,0.200028,0.870362,0.200057,...,0.492536,0.200081,0.200055,0.200074,0.200043,0.200102,0.201578,0.200026,0.200054,0.200032
Topic4,0.209618,0.283087,0.201626,0.392831,0.689511,0.200659,0.201425,0.200006,0.20749,0.348632,...,0.200016,0.200022,0.36543,0.640357,2.119057,0.21876,0.815739,0.20204,0.553888,0.944761


In [25]:
def show_topic(vectorize=None, model=None, n_words=20):
    """Return the highest-weighted vocabulary terms of every topic.

    Parameters
    ----------
    vectorize : fitted vectorizer, optional
        Object exposing ``get_feature_names_out()`` (or the older
        ``get_feature_names()``). Defaults to the notebook-level ``vec``.
    model : fitted topic model, optional
        Object exposing ``components_`` with one row of term weights per
        topic. Defaults to the notebook-level ``lda``.
    n_words : int, default 20
        Number of top terms to return per topic.

    Returns
    -------
    list of numpy.ndarray
        One array per topic holding its ``n_words`` terms, ordered from the
        highest weight to the lowest.
    """
    # Resolve the defaults at call time so the arguments are actually honoured
    # (the previous body ignored them and always read the globals).
    if vectorize is None:
        vectorize = vec
    if model is None:
        model = lda

    # get_feature_names() was removed in scikit-learn 1.2; prefer the new name
    get_names = getattr(vectorize, 'get_feature_names_out', None) or vectorize.get_feature_names
    keywords = np.asarray(get_names())

    topic_keywords = []
    for topic_weights in model.components_:
        # Negate so that argsort's ascending order yields the largest weights first
        topic_keywords_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(topic_keywords_locs))
    return topic_keywords
 
# Top 20 keywords of each topic (displayed as the list returned by show_topic)
show_topic(vec, lda, 20)

# Top 30 keywords of each topic as a table: one row per topic, one column per keyword rank
df = pd.DataFrame(show_topic(vec, lda, 30), index = topic_name)
df

[array(['sparks', 'offers', 'card', 'email', 'get', 'order', 'unable',
        'cant', 'tried', 'access', 'page', 'use', 'trying', 'account',
        'offer', 'website', 'cannot', 'time', 'address', 'enter'],
       dtype='<U167'),
 array(['order', 'delivery', 'delivered', 'easy', 'day', 'find', 'website',
        'good', 'store', 'flowers', 'site', 'ordered', 'th', 'offer',
        'free', 'use', 'food', 'pictures', 'collect', 'today'],
       dtype='<U167'),
 array(['vegetarian', 'meal', 'deal', 'dine', 'main', 'options', 'option',
        'mains', 'meat', 'fish', 'menu', 'poor', 'difficult', 'frozen',
        'vegan', 'reach', 'eat', 'gluten', 'crashes', 'confusing'],
       dtype='<U167'),
 array(['slow', 'friendly', 'loading', 'user', 'pages', 'rubbish', 'keeps',
        'terrible', 'filters', 'review', 'hopeless', 'website', 'tick',
        'virtual', 'half', 'work', 'unresponsive', 'extremely', 'jumping',
        'cms'], dtype='<U167'),
 array(['stock', 'size', 'items', 'find', 

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
Topic0,sparks,offers,card,email,get,order,unable,cant,tried,access,...,find,site,fitting,survey,bra,password,log,wont,points,see
Topic1,order,delivery,delivered,easy,day,find,website,good,store,flowers,...,arrived,gift,birthday,still,complicated,track,ms,days,useless,next
Topic2,vegetarian,meal,deal,dine,main,options,option,mains,meat,fish,...,vegetarians,dissapointing,sports,two,mattress,navigation,course,dishes,yes,authorisation
Topic3,slow,friendly,loading,user,pages,rubbish,keeps,terrible,filters,review,...,responding,photographs,engine,hats,nightdress,appropriate,helpful,knew,table,bloody
Topic4,stock,size,items,find,filters,available,looking,trousers,sizes,buy,...,time,filter,back,wanted,page,cant,look,get,see,long


In [11]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Analyser instance used by the helper below
analyser = SentimentIntensityAnalyzer()

def print_sentiment_scores(sentence):
    """Print the sentence, left-aligned and dash-padded to 40 characters, followed by its polarity scores."""
    scores = analyser.polarity_scores(sentence)
    report = "{:-<40} {}".format(sentence, str(scores))
    print(report)
    
# Example sentences for trying out the scorer; only sample_Sent2 is scored in this cell
sample_Sent = "I just got a call from my boss - does he realise it's saturday??? :("
sample_Sent1 = "The intent behind the movie was great, but it could have been better"
sample_Sent2 = "I hope I could say that I liked the movie"
print_sentiment_scores(sample_Sent2)

I hope I could say that I lekid the movie {'neg': 0.0, 'neu': 0.756, 'pos': 0.244, 'compound': 0.4404}


In [22]:
# Spelling correction with the autocorrect package
# !pip install autocorrect
from autocorrect import Speller
# English-language corrector; the instance is called directly on a word below
spell = Speller(lang='en')

# Try the corrector on a few misspelled words (results are shown as a tuple)
spell('caaaar'), spell('mussage'), spell('survice'), spell('hte')

('caesar', 'message', 'service', 'the')

In [25]:
# The `spellchecker` module imported here is distributed on PyPI as `pyspellchecker`;
# the unrelated PyPI package named `spellchecker` is what fails with
# "No module named 'indexer'".
# !pip install pyspellchecker
from spellchecker import SpellChecker
spellcheck = SpellChecker()

# Find those words that may be misspelled
misspelled = spellcheck.unknown(["cmputr", "watr", "study", "wrte"])

for word in misspelled:
    # Get the one `most likely` answer.
    # Use the SpellChecker instance created above: `spell` is the autocorrect Speller
    # from the earlier cell and is a different object.
    print(spellcheck.correction(word))

    # Get a list of `likely` options
    print(spellcheck.candidates(word))

ModuleNotFoundError: No module named 'indexer'

In [28]:
# Spelling correction with TextBlob
#! pip install TextBlob
from textblob import TextBlob 
  
# Misspelled sample word
a = "cmputr"
print("original text: "+str(a)) 
  
# Wrap the word in a TextBlob and print the result of its correct() method
b = TextBlob(a) 
print("corrected text: "+str(b.correct()))

original text: cmputr
corrected text: computer
