In [1]:
%matplotlib inline

In [2]:
# global libraries & presets
import warnings
# Blanket filter: hides every warning category from this point on, unless a
# later filter overrides it.
warnings.filterwarnings('ignore')

import sys, os
import numpy as np
import pandas as pd
import string
import re
from pprint import pprint
from collections import Counter

# Display presets. 'display.max_rows' used to be set twice (500, then 1000);
# only the last call took effect, so just the effective value is kept.
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# Frame the problem and look at the big picture

# Get the data

In [3]:
import scrapy
from bs4 import BeautifulSoup

# pull in all csvs in working directory
all_files = os.listdir("./")
csv_files = [name for name in all_files if name.endswith('.csv')]
all_dfs = {}

# save them all to dict (integer key = position in the directory listing)
for i, file in enumerate(csv_files):
    all_dfs[i] = {'name' : file,
                  'data' : pd.read_csv(file)}
    print("File "+ str(i) + ": " + file)

# os.listdir() returns entries in arbitrary order, so the integer keys of
# all_dfs are not stable across machines or runs. Select the working dataset
# by file name instead of by position; other files remain reachable through
# all_dfs[i]['data'].
INPUT_CSV = "bitdefender_vpn_customer_responses.csv"
frames_by_name = {entry['name']: entry['data'] for entry in all_dfs.values()}
if INPUT_CSV not in frames_by_name:
    raise FileNotFoundError(
        INPUT_CSV + " not found in working directory; CSV files seen: " + str(csv_files))
input_df = frames_by_name[INPUT_CSV]


File 0: bitdefender_vpn_customer_responses.csv
File 1: bitdefender_vpn_customer_responses-extra.csv


# Explore the data

In [4]:
import matplotlib.pyplot as plt
import seaborn as sns
# One call instead of set_theme() followed by set(): sns.set is the legacy
# alias of set_theme and re-applies every default (style included), so the
# earlier call was redundant. The resulting theme and figure size are the same.
sns.set_theme(style="darkgrid", rc={'figure.figsize': (10, 6)})

# save long col names to a code table
def create_varcode_table(input_df):
    """Swap long column names for short positional codes.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Frame whose column names should be shortened. It is left untouched.

    Returns
    -------
    colname_map : dict
        Maps each code ``'VAR_<i>'`` to the original name of the column at
        position ``i``.
    coded_df : pandas.DataFrame
        Copy of ``input_df`` whose columns are renamed to those codes.
    """
    codes = ['VAR_' + str(i) for i in range(len(input_df.columns))]
    colname_map = dict(zip(codes, input_df.columns))

    # Rename on a copy: assigning to input_df.columns would overwrite the
    # caller's original names, so calling this again on the same frame would
    # map every code onto itself and lose the code table.
    coded_df = input_df.copy()
    coded_df.columns = codes

    return colname_map, coded_df

# colnames: lookup from 'VAR_<i>' code to the original column name;
# df: the frame addressed through those codes from here on.
colnames, df = create_varcode_table(input_df)


##### 1. Check for empty & zero variance columns and remove them

# count the levels of every variable
def inspect_var_levels(df):
    """Summarise how many levels each column has.

    A column's level count is the number of entries returned by its
    ``value_counts()``.

    Returns a DataFrame with columns ``variable`` and ``levels``, sorted by
    ``levels`` in ascending order.
    """
    names = list(df)
    summary = pd.DataFrame({
        "variable": names,
        "levels": [len(df[name].value_counts()) for name in names],
    })
    return summary.sort_values(by="levels")

# cast selected columns to new dtypes
def set_dtypes(df, dtypes):
    """Cast columns of ``df`` according to ``dtypes``.

    ``dtypes`` maps a column name to the dtype it should be cast to. Each
    listed column is reassigned on ``df`` itself, and that same frame is
    returned.
    """
    for column, target in dtypes.items():
        df[column] = df[column].astype(target)
    return df

# level count per variable
# NOTE(review): a bare expression is only rendered when it is the last
# statement of its cell, so this call and the head() below display nothing
# unless they are run on their own.
inspect_var_levels(df)

# eyeball the first rows
df.head(25)

# vars picked for removal
cols_to_drop = ["VAR_0", "VAR_1", "VAR_4", "VAR_8", "VAR_9"]

# remove vars with < 2 levels
df = df.drop(columns=cols_to_drop)

##### 2. Check for high-null value cols

#df.info()

##### 3. assign correct dtypes
# The datetime target carries an explicit unit: pandas 2.x rejects the
# unit-less 'datetime64' spelling in astype(), while 'datetime64[ns]' is
# accepted by both older and newer versions.
dtypes = {
    'VAR_2': 'datetime64[ns]',
    'VAR_3': 'category',
    'VAR_5': 'category',
    'VAR_6': 'category', 
    'VAR_7': 'category'}

df = set_dtypes(df, dtypes)

# check variation of data

df.describe(include="category")

Unnamed: 0,VAR_3,VAR_5,VAR_6,VAR_7
count,4088,4088,4088,4088
unique,53,3,103,60
top,United States,desktop,Chrome 96.0.4664,Windows 10
freq,2043,2732,966,2115


In [43]:
##### 4. Identify combined-type object variables (aka half open-text, half-category)

def check_for_mixed_type_cols(df):
    """Print the value counts of every object-dtype column of ``df``.

    Meant for eyeballing which text columns mix fixed answer options with
    free text. Returns nothing.
    """
    object_col_names = df.select_dtypes("object").columns
    for name in object_col_names:
        print("Variable: ", name)
        print(df[name].value_counts())
        print(" ")
    
#check_for_mixed_type_cols(df)
    
# columns that combine fixed answer options with free text
mixed_var_names = ["VAR_10", "VAR_14", "VAR_19", "VAR_20"]
mixed_vars = df[mixed_var_names]

# remaining object columns: drop the mixed ones by their column labels
object_vars = df.select_dtypes("object")
text_only_vars = object_vars.drop(columns=mixed_vars.columns)

In [69]:
###### 5. Process mixed type categories (need to make a code table for each)

# for each mixed type var, assign separator string
#check_for_mixed_type_cols(mixed_vars)

# separate & cluster checkbox options first, then save into original df with 'category' dtype assigned
# Raw string: "\|" is not a valid escape in a normal literal (SyntaxWarning on
# Python 3.12+). The pattern passed to split() is byte-for-byte the same.
split_var = mixed_vars["VAR_14"].str.split(r" \| ", expand=False).to_list()

# find the longest list item and assign its items as category labels (will be used in OHE to create a sparse matrix)



Unnamed: 0,VAR_10,VAR_14,VAR_19,VAR_20
0,A friend told me about it,None - Bitdefender was my first choice,,
1,I received an email about it,None - Bitdefender was my first choice,"yes, I also use ... (click here to type) - ant...",Yes (click here to type email address) - jean...
2,I received an email about it,None - Bitdefender was my first choice,"yes, I also use ... (click here to type) - Sec...",Yes (click here to type email address) - kjmo...
3,I saw an ad and clicked it,None - Bitdefender was my first choice,"yes, I also use ... (click here to type) - Sec...",No
4,I received an email about it,Nord VPN | Express VPN,"yes, I also use ... (click here to type) - sec...",No
...,...,...,...,...
4083,Other (click here to type) - Bitdefender's web...,,,
4084,I saw an ad and clicked it,,,
4085,A friend told me about it,None - Bitdefender was my first choice,no,No
4086,I received an email about it,None - Bitdefender was my first choice,"yes, I also use ... (click here to type) - TS",No


# Prepare the data (transform, feature selection, etc.)

In [5]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler

# Explore & shortlist models


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score

# gensim for topic modelling
import gensim,logging, warnings
import gensim.corpora as corpora
from gensim.utils import lemmatize, simple_preprocess
from gensim.models import CoherenceModel
from gensim.models.ldamodel import LdaModel
from gensim.models.coherencemodel import CoherenceModel


# Fine-tune the system (parameter tuning)

In [7]:
from sklearn.model_selection import GridSearchCV

# Present solution

# Launch