In [1]:
#imbalance
#normalize to 0-1
#out: M,N,D
#some data doesn't exist in B,C and CD-CI, also CV,CW exclude those null instances

import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, nan_euclidean_distances

pd.set_option('display.max_rows', 200)

In [2]:
#create DataFrame
velux_dataset_original = pd.read_csv("sample_data_velux.csv", sep=';')

In [3]:
#check data type
velux_dataset_original.dtypes

final_id                                   object
marketing_consent_optin                    object
marketing_consent_optout                   object
marketing_consent_optout_date              object
number_of_submited_forms                    int64
number_of_unique_project_type               int64
number_of_unique_space_type                 int64
if_intent                                   int64
if_project_intent_changed                   int64
if_space_intent_changed                     int64
project_type                               object
space_type                                 object
first_brproject                            object
first_brspace                              object
if_brproject                                int64
if_brspace                                  int64
number_of_clicked_banners                   int64
number_of_clicked_banners_on_velux          int64
number_of_clicked_banners_outside_velux     int64
number_of_sessions                          int64


In [4]:
#inspect data
cols = list(velux_dataset_original)
toDelete = []
toObject = []

for col in cols:
    if len(velux_dataset_original[col].unique()) == 1 or (len(velux_dataset_original[col].unique()) == 2 and np.NaN in velux_dataset_original[col].unique()):
         toDelete.append(col)
    elif col[0:2] == 'if':
        toObject.append(col)    

print(toDelete)

['number_of_submited_forms', 'number_of_unique_project_type', 'if_intent', 'if_project_intent_changed', 'if_space_intent_changed', 'if_abandoned_form']


In [5]:
#clean the data

#drop cols with only one uniqe value
velux_dataset_original.drop(toDelete, axis='columns',inplace=True)

#drop cols with too little data
velux_dataset = velux_dataset_original.drop(['final_id','first_brproject','first_brspace','marketing_consent_optout_date'], axis='columns')

#convert type of data from int64 to Object where apropriate
for col in toObject:
    velux_dataset[col] = velux_dataset[col].astype(str)


In [6]:
#create small dataset by deleting all rows with any NaN values
velux_dataset_small = velux_dataset.dropna()

#create dataset with synthesied missing values
velux_dataset_synthesised= [] #TODO

#create big dataset by deleting cols with too many missing data
velux_dataset_big = velux_dataset.drop(['most_frequent_page_project_type','most_frequent_page_space_type','second_frequent_page_project_type','second_frequent_page_space_type','third_frequent_page_project_type','third_frequent_page_space_type'], axis='columns')

In [7]:
velux_dataset_small

Unnamed: 0,marketing_consent_optin,marketing_consent_optout,number_of_unique_space_type,project_type,space_type,if_brproject,if_brspace,number_of_clicked_banners,number_of_clicked_banners_on_velux,number_of_clicked_banners_outside_velux,...,number_of_thursday_events,number_of_friday_events,number_of_saturday_events,number_of_hour_1_6_events,number_of_hour_7_12_events,number_of_hour_13_18_events,number_of_hour_19_24_events,if_visited_marketing_domain_flag,tag_project_last_visited_page,tag_space_last_visited_page
89,True,False,0,replacement,no_space_intent,0,0,0,0,0,...,0,0,0,0,7,0,0,1,all projects,allspaces
230,True,False,1,newbuild,allspaces,0,0,2,0,0,...,41,26,23,5,37,52,0,1,all projects,allspaces
351,True,False,1,renovation,allspaces,0,0,0,0,0,...,0,4,0,4,0,12,0,1,all projects,allspaces
471,True,False,1,replacement,livingroom,0,0,0,0,0,...,0,0,17,0,25,0,0,1,all projects,allspaces
477,True,False,1,replacement,bathroom,0,0,8,0,8,...,6,9,18,18,6,29,0,1,all projects,allspaces
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2432,True,False,1,newbuild,livingroom,0,0,1,0,0,...,0,0,0,0,9,3,0,1,all projects,allspaces
2461,True,False,1,loftconversion,corridororstaircase,0,0,2,0,0,...,12,0,0,0,0,12,0,1,all projects,allspaces
2462,True,False,1,replacement,bedroom,0,0,1,0,0,...,11,0,0,0,0,11,0,1,all projects,allspaces
2463,True,False,1,loftconversion,corridororstaircase,0,0,4,0,0,...,15,0,0,0,0,0,15,1,all projects,allspaces


In [8]:
#once again clean cols with only one value
toDeleteSmall = []

for col in velux_dataset_small:
    if len(velux_dataset_small[col].unique()) == 1 or (len(velux_dataset_small[col].unique()) == 2 and np.NaN in velux_dataset_small[col].unique()):
        toDeleteSmall.append(col)

velux_dataset_small.drop(toDeleteSmall,axis='columns',inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  velux_dataset_small.drop(toDeleteSmall,axis='columns',inplace=True)


In [9]:
#check for class imbalance (could use Tomek Links, SMOTE, Metric like F1, and Random Forest algorithm, since tree based algos work well with imbalanced data)
for col in velux_dataset_small:
    print(velux_dataset_small[col].value_counts(), '\n')



marketing_consent_optin
True     182
False     31
Name: count, dtype: int64 

marketing_consent_optout
False    206
True       7
Name: count, dtype: int64 

number_of_unique_space_type
1    198
0     15
Name: count, dtype: int64 

project_type
replacement       103
loftconversion     52
upgrading          30
newbuild           20
renovation          8
Name: count, dtype: int64 

space_type
bedroom                68
livingroom             27
bathroom               27
corridororstaircase    27
allspaces              21
no_space_intent        15
kitchen                12
kidsroom                9
office                  7
Name: count, dtype: int64 

if_brproject
0    205
1      8
Name: count, dtype: int64 

if_brspace
0    210
1      3
Name: count, dtype: int64 

number_of_clicked_banners
0     165
2      25
1      11
4       3
8       2
3       2
5       2
6       1
13      1
7       1
Name: count, dtype: int64 

number_of_clicked_banners_outside_velux
0    212
8      1
Name: count, dtyp

In [10]:
#normalize data
velux_dataset_small_normalized = velux_dataset_small.copy()

for col in velux_dataset_small:
    if velux_dataset_small[col].dtype == 'int64':
        x = velux_dataset_small[col].values
        normalized=(x-x.min())/(x.max()-x.min())
        velux_dataset_small_normalized[col] = normalized


In [13]:
#train classifier and check performance

y = velux_dataset_small['project_type']
X = pd.get_dummies(velux_dataset_small.drop('project_type', axis='columns'))
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=56) # make a train-test split to train the model properly
knn = RandomForestClassifier() # create a classifier
model = knn.fit(X_train, y_train) # train the classifier
pred = knn.predict(X_test) # test the classifier
print(confusion_matrix(y_test, pred)) # print out a confusion matrix, classification report and accuracy score to check how did the algorithm do.
print(classification_report(y_test, pred))
print(accuracy_score(y_test, pred))

[[ 2  0  0 11  0]
 [ 1  0  0  3  0]
 [ 1  0  0  4  0]
 [ 6  0  0 24  1]
 [ 1  0  0 10  0]]
                precision    recall  f1-score   support

loftconversion       0.18      0.15      0.17        13
      newbuild       0.00      0.00      0.00         4
    renovation       0.00      0.00      0.00         5
   replacement       0.46      0.77      0.58        31
     upgrading       0.00      0.00      0.00        11

      accuracy                           0.41        64
     macro avg       0.13      0.19      0.15        64
  weighted avg       0.26      0.41      0.31        64

0.40625


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
