In [1]:
%matplotlib inline

In [2]:
# global libraries & presets
import warnings
warnings.filterwarnings('ignore')

import sys, os
import numpy as np
import pandas as pd
import string
import re
from pprint import pprint
from collections import Counter

# Widen pandas' display limits so wide survey frames are not truncated.
# 'display.max_rows' used to be set twice (500, then 1000); only the last
# value ever took effect, so it is now set once to that effective value.
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# Frame the problem and look at the big picture

# Get the data

In [8]:
import scrapy
from bs4 import BeautifulSoup

# Pull in all CSVs from the working directory.
all_files = os.listdir("./")

# os.listdir() returns entries in arbitrary order, so sort to make the
# index -> file assignment reproducible. Sorting on the name without its
# extension places "name.csv" ahead of "name-extra.csv".
csv_files = sorted(
    (f for f in all_files if f.endswith('.csv')),
    key=lambda f: os.path.splitext(f)[0],
)

# Fail with a clear message instead of a bare KeyError on all_dfs[0] below.
if not csv_files:
    raise FileNotFoundError(
        "No .csv files found in working directory: " + os.getcwd()
    )

# Save them all to a dict keyed by position: {i: {'name': ..., 'data': ...}}.
all_dfs = {}
for i, file in enumerate(csv_files):
    all_dfs[i] = {'name': file,
                  'data': pd.read_csv(file)}
    print(f"File {i}: {file}")

# Grab the data you want with all_dfs[i]['data'] ...
input_df = all_dfs[0]['data']


File 0: bitdefender_vpn_customer_responses.csv
File 1: bitdefender_vpn_customer_responses-extra.csv


# Explore the data

In [18]:
import matplotlib.pyplot as plt
import seaborn as sns
# Configure seaborn once. sns.set() is the legacy alias of sns.set_theme() and
# re-applies every default, so calling it after set_theme() silently replaced
# the earlier call. A single call yields the same final state (darkgrid style,
# 10x6 inch figures) without the hidden override.
sns.set_theme(style="darkgrid", rc={'figure.figsize': (10, 6)})

# save long col names to a code table
def create_varcode_table(input_df):
    """Replace long column headers with short positional ``VAR_<i>`` codes.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Frame whose column labels should be shortened. It is not modified.

    Returns
    -------
    colname_map : dict
        Maps each code (``'VAR_0'``, ``'VAR_1'``, ...) to the original column
        label found at that position.
    coded_df : pandas.DataFrame
        Copy of ``input_df`` carrying the codes as column labels, in the same
        column order.
    """
    new_cols = ['VAR_' + str(i) for i in range(len(input_df.columns))]
    colname_map = dict(zip(new_cols, input_df.columns))

    # Rename on a copy: relabelling the caller's frame in place meant that
    # re-running the cell produced a map of VAR_i -> VAR_i and lost the
    # original headers.
    coded_df = input_df.copy()
    coded_df.columns = new_cols

    return colname_map, coded_df

# colnames maps each short VAR_<i> code back to the original column header;
# df is the frame with the short codes as its column labels.
colnames, df = create_varcode_table(input_df)

# TODO: remove unnecessary columns (not implemented yet)

# TODO: assign correct data types (not implemented yet)

# Preview the first rows of the recoded frame.
# NOTE(review): the recorded preview shows free-text answers that appear to
# include fragments of respondents' e-mail addresses (last column); consider
# redacting or clearing this output before sharing the notebook.
df.head()

Unnamed: 0,VAR_0,VAR_1,VAR_2,VAR_3,VAR_4,VAR_5,VAR_6,VAR_7,VAR_8,VAR_9,VAR_10,VAR_11,VAR_12,VAR_13,VAR_14,VAR_15,VAR_16,VAR_17,VAR_18,VAR_19,VAR_20
0,4088,0,2022-01-26 20:42:55,United States,https://surveys.hotjar.com/55a4efda-e0de-46a9-...,tablet,Chrome Mobile WebView 88.0.4324,Android 9,00000000-0000-0000-0000-000000000000,,A friend told me about it,Security and privacy is one of the most import...,"I'm still new to all this, yet to determine",Knowing someone's watching out for me.,None - Bitdefender was my first choice,Because my well informed friend is happy with ...,,,,,
1,4087,0,2022-01-26 19:55:12,United States,https://surveys.hotjar.com/55a4efda-e0de-46a9-...,desktop,Firefox 96.0,Windows 10,00000000-0000-0000-0000-000000000000,,I received an email about it,I had heard about VPNs and how they provide mo...,"I'm not specifically aware of a problem, but I...","Again, I'm not specifically aware of a benefit...",None - Bitdefender was my first choice,I have Bitdefender for my antivirus program,"easy to use, but sometimes obtrusive",6.0,It sometimes interferes with my online experie...,"yes, I also use ... (click here to type) - ant...",Yes (click here to type email address) - jean...
2,4086,0,2022-01-26 17:16:22,United States,https://surveys.hotjar.com/55a4efda-e0de-46a9-...,phone,Chrome Mobile 97.0.4692,Android 10,00000000-0000-0000-0000-000000000000,,I received an email about it,"Too many scams, hackers and other bad stuff ou...",Phishing,Easy to use except when banking,None - Bitdefender was my first choice,Good reviews. Many people tell me virus prote...,"Convenient, automatic, safe",7.0,VPN will never work when doing online banking....,"yes, I also use ... (click here to type) - Sec...",Yes (click here to type email address) - kjmo...
3,4085,0,2022-01-26 12:22:22,United Kingdom,https://surveys.hotjar.com/55a4efda-e0de-46a9-...,desktop,Safari 15.2,Mac OS X 10.15.6,00000000-0000-0000-0000-000000000000,,I saw an ad and clicked it,Just to have an added level of security,None it just makes me feel as if I have done a...,Feeling more secure,None - Bitdefender was my first choice,Because I use Bitdefender antivirus as it mini...,Quick to function\nReliable\nEasy,9.0,See the three adjectives,"yes, I also use ... (click here to type) - Sec...",No
4,4084,0,2022-01-26 11:40:58,Canada,https://surveys.hotjar.com/55a4efda-e0de-46a9-...,desktop,Edge 97.0.1072,Windows 10,00000000-0000-0000-0000-000000000000,,I received an email about it,Online privacy is important to me. I was subs...,Sorry. No problems experienced. Limiting the...,Privacy. It is disturbing how many outfits tr...,Nord VPN | Express VPN,Trust.,"Sorry, again. The VPN services I have used ar...",9.0,I have had no issues with Bitdefender.,"yes, I also use ... (click here to type) - sec...",No


# Prepare the data (transform, feature selection, etc.)

In [10]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler

# Explore & shortlist models


In [14]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score

# gensim for topic modelling
import gensim,logging, warnings
import gensim.corpora as corpora
from gensim.utils import lemmatize, simple_preprocess
from gensim.models import CoherenceModel
from gensim.models.ldamodel import LdaModel
from gensim.models.coherencemodel import CoherenceModel


# Fine-tune the system (parameter tuning)

In [12]:
from sklearn.model_selection import GridSearchCV

# Present solution

# Launch