In [74]:
%matplotlib inline

# generic packages
import sys
import re, numpy as np, pandas as pd
from pprint import pprint

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.precision', 1)

import warnings
warnings.filterwarnings("ignore")

# graphing, vis stuff
import matplotlib.pyplot as plt
import pyLDAvis
import pyLDAvis.gensim_models

# gensim for topic modelling
import gensim, spacy, logging, warnings
import gensim.corpora as corpora
from gensim.utils import lemmatize, simple_preprocess
from gensim.models import CoherenceModel
from gensim.models.ldamodel import LdaModel
from gensim.models.coherencemodel import CoherenceModel

# additional libraries
from collections import Counter

In [215]:
# add custom definitions here

def prettify(df): # make tables pretty and easier to scan
    cell_hover = {
        'selector': 'td:hover',
        'props': [('background-color', '#ffffb3')]
    }
    
    headers = {
        'selector': 'th:not(.index_name)',
        'props': 'text-align: left; font-size: 1.8rem; font-family: Helvetica;'
    }
    cells = {
        'selector': 'td',
        'props': 'text-align: left; font-size: 1.4rem; padding:10px 30px 10px 10px;'
    }
    return df.style.set_table_styles([cell_hover, headers, cells]).hide_index()

def split_cat_from_text(df, colname, separator):
    cat_and_open_df = (df[colname]
                  .str.replace(separator, '%%', regex=False)
                  .str.split('%%', expand=True)
                  .rename(columns={0: colname}))
    cat_df = pd.DataFrame(cat_and_open_df.iloc[:,0])
    open_df = pd.DataFrame(cat_and_open_df.iloc[:,1])
    return cat_and_open_df, cat_df, open_df


def count_category(cat_df, colname):
    col_summary = (pd.DataFrame(cat_df.groupby(colname).size())
                       .rename(columns={0:'Count'})
                       .sort_values("Count", ascending=False)
                       .reset_index())
    
    col_total = col_summary['Count'].sum()
    col_summary["% of Sample"] = col_summary['Count'].apply(lambda x: x/col_total*100)
    
    return col_summary


def c_to_df(counter):
        df = (pd.DataFrame.from_dict(counter, orient='index')
              .reset_index()
              .sort_values(0, ascending=False)
              .dropna())
        return(df)
        
        
def split_checkbox(cat_df, colname, multi_separator):
    category = cat_df[colname].str.split(multi_separator, expand=True)
                
    melted_cat = (pd.melt(category).drop("variable", axis=1)
                .replace(to_replace=[None, "NaN", ""], value=np.nan))
    melted_cat = Counter(melted_cat["value"])
    return melted_cat

In [216]:
q2_colname = 'Which, if any, alternatives did you consider before deciding on Bitdefender Premium VPN?'
q2_separator = '(click here to tyoe) - '
q2_multi_separator = ' | '

# step 1 - separate cat and open text
q2_cat_and_open, q2_cat, q2_open = split_cat_from_text(all_users, q2_colname, q2_separator)

# step 2 - get and save open text as csv
q2_other = q2_cat_and_open[q2_cat_and_open[q2_colname] == "Other "]
q2_other.to_csv("q2_other_texts.csv")

# Run LDA analysis and then clean up in Google Docs w/ VOC Analysis Kit ...

    #q2_other_cleaned = pd.read_csv("q2_other_cleaned.csv") # load human-cleaned version
    #q2_other_count = count_category(q1_other_cleaned, "Category").rename(columns={'Category':'"Other" Theme'})
    #prettify(q2_other_count.head())

# step 3 - split cats and melt
q2_melted_cat = split_checkbox(q2_cat, q2_colname, q2_multi_separator)

q2_melted_cat 
# step 4 - count up & prettify melted cats

Counter({'None': 2053,
         'Nord': 701,
         'Express': 415,
         'Other': 584,
         nan: 47627,
         'Cyberghost': 104,
         '-': 2053,
         'VPN': 1220,
         'Bitdefender': 2053,
         '|': 386,
         'was': 2053,
         'my': 2053,
         'first': 2053,
         'choice': 2053})

In [191]:
# load and filter data for analysis ...

raw_csv = pd.read_csv("bitdefender_vpn_customer_responses.csv")

cols_to_keep = ['How did you first hear about Bitdefender Premium VPN ?',
                'Which, if any, alternatives did you consider before deciding on Bitdefender Premium VPN?',
                'What 3 adjectives would you use to describe Bitdefender Premium VPN?',
                'On a scale of 0 to 10, how happy are you with Bitdefender Premium VPN? ',
                'Do you currently use any OTHER Bitdefender products or plans (e.g. security/antivirus, identity protection, etc.)?',
                'If need be, would it be OK to follow up by email to hear more or help with issues you\'re having?']

all_users = raw_csv[cols_to_keep]


# <u>Question 1: How Users First Heard About Premium VPN</u>

In [192]:
q1_colname = 'How did you first hear about Bitdefender Premium VPN ?'
q1_separator ='(click here to type) -'

q1_cat_and_open, q1_cat, q1_open = split_cat_from_text(all_users, q1_colname, q1_separator)
q1_summary = count_category(q1_cat, q1_colname)

# edit the table copy to make more readable
q1_summary.at[0, q1_colname] = "Received email"
q1_summary.at[2, q1_colname] = "Googled / searched"
q1_summary.at[3, q1_colname] = "Clicked an ad"
q1_summary.at[5, q1_colname] = "Read about it online"

# merge 'read about it on ...' and 'heard suggested in comment on ...' totals
total = q1_summary.iloc[5:7]["Count"].sum() 
q1_summary.at[5, "Count"] = total
q1_summary.drop(index=6, inplace=True)

prettify(q1_summary)

How did you first hear about Bitdefender Premium VPN ?,Count,% of Sample
Received email,1085,26.5
Other,1015,24.8
Googled / searched,622,15.2
Clicked an ad,481,11.8
A friend told me about it,456,11.2
Read about it online,429,9.2


In [217]:
# what do people say with 'other'?
q1_cat_and_open[q1_cat_and_open[q1_colname] == "Other "]
q1_other.to_csv("q1_other_texts.csv")

# Run LDA analysis and then clean up in Google Docs w/ VOC Analysis Kit ...

q1_other_cleaned = pd.read_csv("q1_other_cleaned.csv") # load human-cleaned version
#q1_other_count = count_category(q1_other_cleaned, "Category").rename(columns={'Category':'"Other" Theme'})

prettify(q1_other_cleaned.head())

Other - Open Text,Category
"I went to Consumer Reports, PC Mag, and other major reviews.",consumer reports
Don’t remember. Maybe an on line review,don't remember
Consumer Report.,consumer reports
consumer report,consumer reports
Consumer Reports,consumer reports


# Question 2: Which VPN Competitors Did Customers Consider?

In [None]:

alternatives = (all_users[q2_colname]
                .str.replace(q2_multi_separator, '%%', regex=False)
                .str.replace(q2_separator,'', regex=False)
                .str.split('%%', expand=True)).reset_index()

alternatives.rename(columns = {0: 'Competitor_1', 1: 'Competitor_2', 2: 'Competitor_3', 3: 'Competitor_4'}, inplace=True)

alternatives = pd.wide_to_long(alternatives, stubnames=['Competitor'], i='index', j='choice', sep='_')


In [None]:
alternatives["Competitor"] = (alternatives["Competitor"]
                              .str.replace(r'[^\w\s]+', '')
                              .str.lower()
                              .str.replace('vpn', '', regex=False)
                              .str.strip())

In [None]:
q2_summary = count_category(alternatives, 'Competitor')
prettify(q2_summary.head(10))