In [40]:
%matplotlib inline

# generic packages
import sys
import re, numpy as np, pandas as pd
from pprint import pprint

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

import warnings
warnings.filterwarnings("ignore")
pd.options.mode.chained_assignment = None  # default='warn'

# graphing, vis stuff
import matplotlib.pyplot as plt
import pyLDAvis
import pyLDAvis.gensim_models

# gensim for topic modelling
import gensim, spacy, logging, warnings
import gensim.corpora as corpora
from gensim.utils import lemmatize, simple_preprocess
from gensim.models import CoherenceModel
from gensim.models.ldamodel import LdaModel
from gensim.models.coherencemodel import CoherenceModel


from collections import Counter


In [41]:
def prettify(df): # make tables pretty and easier to scan
    cell_hover = {
        'selector': 'td:hover',
        'props': [('background-color', '#ffffb3')]
    }
    
    headers = {
        'selector': 'th:not(.index_name)',
        'props': 'text-align: left; font-size: 1.8rem; font-family: Helvetica;'
    }
    cells = {
        'selector': 'td',
        'props': 'text-align: left; font-size: 1.4rem; padding:10px 30px 10px 10px;'
    }
    return df.style.set_table_styles([cell_hover, headers, cells]).hide_index()

def c_to_df(counter):
        df = (pd.DataFrame.from_dict(counter, orient='index')
              .reset_index()
              .sort_values(0, ascending=False)
              .dropna())
        return(df)

### Step 1: Load Data

In [42]:
raw_csv = pd.read_csv("bitdefender_vpn_customer_responses.csv")
cols_to_keep = ['Device',
                'Browser',
                'OS',
                'How did you first hear about Bitdefender Premium VPN ?',
                'What specifically made you decide you needed Bitdefender Premium VPN in your life? What triggered this decision for you? ',
                'What ONE pain or problem has Bitdefender Premium VPN eliminated or lessened for you the most?',
                'What ONE benefit have you valued most from using Bitdefender Premium VPN so far?',
                'Which, if any, alternatives did you consider before deciding on Bitdefender Premium VPN?',
                'Why did you ultimately choose Bitdefender Premium VPN over other options?',
                'What 3 adjectives would you use to describe Bitdefender Premium VPN?',
                'On a scale of 0 to 10, how happy are you with Bitdefender Premium VPN? ',
                'What is the main reason for your rating? (Please be as specific as possible) ',
                'Do you currently use any OTHER Bitdefender products or plans (e.g. security/antivirus, identity protection, etc.)?',
                'If need be, would it be OK to follow up by email to hear more or help with issues you\'re having?']

raw_csv = raw_csv[cols_to_keep]
for idx, question in enumerate(raw_csv.columns):
    print(idx, question)

0 Device
1 Browser
2 OS
3 How did you first hear about Bitdefender Premium VPN ?
4 What specifically made you decide you needed Bitdefender Premium VPN in your life? What triggered this decision for you? 
5 What ONE pain or problem has Bitdefender Premium VPN eliminated or lessened for you the most?
6 What ONE benefit have you valued most from using Bitdefender Premium VPN so far?
7 Which, if any, alternatives did you consider before deciding on Bitdefender Premium VPN?
8 Why did you ultimately choose Bitdefender Premium VPN over other options?
9 What 3 adjectives would you use to describe Bitdefender Premium VPN?
10 On a scale of 0 to 10, how happy are you with Bitdefender Premium VPN? 
11 What is the main reason for your rating? (Please be as specific as possible) 
12 Do you currently use any OTHER Bitdefender products or plans (e.g. security/antivirus, identity protection, etc.)?
13 If need be, would it be OK to follow up by email to hear more or help with issues you're having?


### Step 2: Set Up Initial Topic Model (LDA)

In [45]:
raw_csv2 = raw_csv[raw_csv.iloc[:,10] < 6]

df = pd.DataFrame(raw_csv2.iloc[:,9]) # choose col of text to analyse

df.columns = ["content"]
df = df.dropna(subset=['content']).reset_index()
output_filename = "top_benefit_lda_analysis.csv"

data = df.content.values.tolist()

data

['Nuisance irritating invasive',
 "Don't know what an adjective is",
 'Certain amount of privacy. Unfortunately many sites now do not respond when VPN is active',
 'Pease see my previous epistle',
 'safe, effective, convinent',
 'Quiet, in background.',
 'Seems to perform well, not good for wifi security cameras or online banking.  Drops 5g signal',
 'Inconsistent',
 'Pardon the expression - shithouse.  Your development team have never been able to help me.',
 'At this time, they would not be favorable.',
 'Supportive help with downloading.\nCourteous help with challenges when phone needed.\nPertinent updates.',
 "Don't really have that much experience with it.",
 'SLOW, SLOW, SLOW.',
 'Slow. Slow. Slow.',
 "Haven't used it yet. What would be helpful is to create short videos showing how to use features of Bitdefender and Password Manager showing how to use it and all options.",
 "The app is fine.  I can't compliment a company who can't provide the honesty and mobile security/safety to

In [46]:
def sent_to_words(sentences):
    for sent in sentences:
        sent = re.sub(r'\S*@\S*\s?', '', sent)  # remove emails
        sent = re.sub(r'\s+', ' ', sent)  # remove newline chars
        sent = re.sub(r"\'", "", sent)  # remove single quotes
        sent = gensim.utils.simple_preprocess(str(sent), deacc=True) 
        yield(sent)
        
data_words = list(sent_to_words(data))

In [47]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use', 'not', 'would', 'say', 'could', '_', 'be', 'know', 'good', 'go', 'get', 'do', 'done', 'try', 'many', 'some', 'nice', 'thank', 'think', 'see', 'rather', 'easy', 'easily', 'lot', 'lack', 'make', 'want', 'seem', 'run', 'need', 'even', 'right', 'line', 'even', 'also', 'may', 'take', 'come'])

# extra words to omit
stop_words.extend(['bitdefender','vpn'])

def process_words(texts, stop_words=stop_words, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Remove Stopwords, Form Bigrams, Trigrams, and Lemmatize"""
    texts = [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]
    texts = [bigram_mod[doc] for doc in texts]
    texts = [trigram_mod[bigram_mod[doc]] for doc in texts]
    texts_out = []
    nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    # remove stopwords once more after lemmatization
    texts_out = [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts_out]    
    return texts_out

#data_ready = process_words(data_words)  # processed Text Data!

In [48]:
from collections import Counter

def process_adj(texts, stop_words=stop_words, allowed_postags=['ADJ']):
    """Remove Stopwords, Form Bigrams, Trigrams, and Lemmatize"""
    texts = [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]
    texts = [bigram_mod[doc] for doc in texts]
    texts = [trigram_mod[bigram_mod[doc]] for doc in texts]
    texts_out = []
    nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    # remove stopwords once more after lemmatization
    texts_out = [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts_out]    
    
    # flatten list of lists into single list & return
    flattened_list = [item for sublist in texts_out for item in sublist]
    return(flattened_list)

adj_list = process_adj(data_words)
c = Counter(adj_list)
adj_df = c_to_df(c).rename(columns={"index":"Adjective", 0: "Mentions"})
adj_df.to_csv("unhappy_cleaned_adjectives.csv")

In [27]:
# Create Dictionary
id2word = corpora.Dictionary(data_ready)

# Create Corpus: Term Document Frequency
corpus = [id2word.doc2bow(text) for text in data_ready]

# create shorthand of running LDA model
def generate_models(n_topic):
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=n_topic, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=10,
                                           passes=10,
                                           alpha='symmetric',
                                           iterations=500,
                                           per_word_topics=True)
    return lda_model

### Step 3: Find Model With Best Coherence & Perplexity Score 

Lower the perplexity better the model.
Higher the topic coherence, the topic is more human interpretable.

In [28]:
topic_range = list(np.arange(3,31,2)) # [3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29]
all_models = [generate_models(n_topic) for n_topic in topic_range]
model_df = pd.DataFrame({'n_topics': topic_range, 
                         'LDA_model': all_models})

In [29]:
def score_coherence(lda_model):
    coherence_model_lda = CoherenceModel(model=lda_model, corpus=corpus, texts=data_ready, dictionary=id2word, coherence='c_v')
    return coherence_model_lda.get_coherence()

def score_perplexity(lda_model):
    return lda_model.log_perplexity(corpus) 


In [30]:
model_df['coherence_score'] = model_df['LDA_model'].apply(lambda x: score_coherence(x))
model_df['perplexity_score'] = model_df['LDA_model'].apply(lambda x: score_perplexity(x))


In [31]:
best_model = model_df.iloc[model_df['coherence_score'].idxmax()]
lda_model = best_model['LDA_model']


### Step 4: Isolate The Dominant "Topic/Theme" Per Survey Response

In [33]:
def format_topics_sentences(ldamodel=None, corpus=corpus, texts=data):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row_list in enumerate(ldamodel[corpus]):
        row = row_list[0] if ldamodel.per_word_topics else row_list            
        # print(row)
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)


df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=data_ready)


# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']


### Step 5: Map Back To Original Texts, Keep Only Strong, Single-Topic Responses

In [39]:
lda_clustered_df = df_dominant_topic.merge(df, left_index=True, right_index=True).drop(['index'], axis=1)
lda_clustered_df = lda_clustered_df.sort_values(["Dominant_Topic",'Topic_Perc_Contrib'],ascending=False)

# remove entries that have Perc_Contribution of < 0.5
lda_clustered_df = lda_clustered_df[lda_clustered_df['Topic_Perc_Contrib'] >= 0.5]
lda_clustered_df.to_csv(output_filename)

In [38]:
# calculate sample coverage - how many responses were included in the final theme summary?
included_docs = len(set(lda_clustered_df['Document_No']))
total_docs = len(df)
coverage = (included_docs / total_docs)
print("% Documents Included: ", coverage)

% Documents Included:  0.2673037938024906
