# Title: CORRELATION MODEL IN THE ADOPTION OF E-PAYMENT SERVICES

## Load Libraries

In [1]:
import time
import numpy as np
import pandas as pd
import seaborn as sns
import ipywidgets as ipw
import plotly.express as px
import matplotlib.pyplot as plt
import plotly.graph_objects as go

from plotly.subplots import make_subplots

from sklearn import svm, tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, matthews_corrcoef, precision_recall_curve, roc_curve

import scipy.stats as stats
from scipy.stats import pearsonr, spearmanr, kendalltau, pointbiserialr, chi2, chi2_contingency

from xgboost import XGBClassifier

from dtreeviz.trees import *

%matplotlib inline

## Load Custom Made Libraries

In [2]:
from Utilities.CFS import *
from Utilities.reliefF import *
from Utilities.accuracy import *
from Utilities.corr_matrix import *
from Utilities.forward_selection import *
from Utilities.backward_elimination import *
from Utilities.prob_distributed_feature import *

from Visualization.model_graph import *
from Visualization.network_graph import *

from skfeature.utility.mutual_information import *

## Load Dataset

In [3]:
df = pd.read_csv("Dataset/E-payment Cryptocurrency Coin.csv")

## Label Binarizer

In [4]:
def convert_nominal(arr, term_arr):
    tmp_dict = {val:ind for (ind, val) in enumerate(term_arr)}
    return arr.map(lambda x : tmp_dict[x])

## Split Dataset

In [5]:
index_dict = {
    "Age": ["< 25 years", "26 - 40 years", "41 - 55 years", "above 55 years"],
    "Gender": ["Male", "Female"],
    "Marital Status": ["Single", "Married", "Other"],
    "Education Level": ['Primary school', 'Secondary/High school', 'College/university', 'Graduate school', 'Other'],
    "Work Industry": ['Banking / Finance', 'Education', 'Healthcare', 'Manufacturing', 'Retail / Hypermarket', 'Other'],
    "Work Position": ['Junior management', 'Middle management', 'Top management', 'Professional', 'Other']
}

In [6]:
mod_fac_df = pd.DataFrame()

column_arr = ["Age", "Gender", "Marital Status", "Education Level", "Work Industry", "Work Position"]

for (ind, col_name) in enumerate(column_arr):
    mod_fac_df[col_name] = df.iloc[:, 6 + ind]
    
# Replace Values in Work Industry
mod_fac_df = mod_fac_df.replace("Baking / Finance", "Banking / Finance")

# Label Binarize all columns
for col in column_arr:
    mod_fac_df[col] = convert_nominal(mod_fac_df[col], index_dict[col])

In [7]:
utaut_fac_df = pd.DataFrame()

column_arr = df.iloc[:, 28:].columns
column_arr = [col.split(": ")[0] for col in column_arr]

for (ind, col_name) in enumerate(column_arr):
    utaut_fac_df[col_name] = df.iloc[:, 28 + ind]
    utaut_fac_df[col_name] = utaut_fac_df[col_name].map(lambda x : x - 1)
    
# Change Data Type to int
utaut_fac_df = utaut_fac_df.astype(int)

In [8]:
tmp_mod_fac_df = pd.DataFrame()

column_arr = ["Age", "Gender", "Marital Status", "Education Level", "Work Industry", "Work Position"]

for (ind, col_name) in enumerate(column_arr):
    tmp_mod_fac_df[col_name] = df.iloc[:, 6 + ind]
    
tmp_mod_fac_df = tmp_mod_fac_df.replace("Baking / Finance", "Banking / Finance")

all_df_arr = [tmp_mod_fac_df, utaut_fac_df, df.iloc[:, [17, 18]]]

all_df = pd.concat(all_df_arr, axis = 1)
all_df.columns = all_df.columns.tolist()[:-2] + [col.split("[")[1][4:-1] for col in all_df.columns[-2:]]

In [9]:
all_df

Unnamed: 0,Age,Gender,Marital Status,Education Level,Work Industry,Work Position,PE1,PE2,PE3,PE4,...,T1,T2,T3,T4,BI1,BI2,BI3,BI4,Have you ever purchased anything using the E-payment mode?,"In the next six months, do you plan to purchase anything using the E-payment mode?"
0,< 25 years,Female,Single,College/university,Banking / Finance,Other,1,1,1,1,...,1,0,1,2,2,2,2,2,Yes,Yes
1,< 25 years,Female,Single,College/university,Other,Other,0,0,0,0,...,1,1,1,1,1,1,1,1,Yes,Yes
2,41 - 55 years,Female,Single,College/university,Manufacturing,Middle management,3,4,3,4,...,2,2,2,3,2,3,3,3,Yes,Yes
3,< 25 years,Male,Single,College/university,Education,Other,2,2,2,2,...,2,2,2,2,3,3,2,2,No,Yes
4,< 25 years,Female,Single,College/university,Other,Other,2,2,3,3,...,2,3,3,3,3,4,3,3,Yes,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
281,41 - 55 years,Male,Married,Secondary/High school,Other,Other,2,2,2,2,...,2,2,2,2,2,2,2,2,Yes,Yes
282,above 55 years,Male,Married,Graduate school,Education,Top management,2,2,2,2,...,2,2,2,2,2,2,2,2,Yes,Yes
283,above 55 years,Female,Married,College/university,Other,Other,2,2,2,2,...,2,2,2,2,2,2,2,2,Yes,Yes
284,41 - 55 years,Male,Married,Graduate school,Education,Professional,3,3,3,3,...,3,3,3,3,3,3,3,3,Yes,Yes


## Get Unique Variables

In [10]:
def df_unique(arr):
    tmp_keys = []
    for val in arr:
        if type(val) != str:
            continue
        tmp_list = val.split(";")
        for val2 in tmp_list:
            if val2 not in tmp_keys:
                tmp_keys.append(val2)
                
    arr_list = []
    for val in arr:
        if type(val) != str:
            continue
        tmp_dict = {i:0 for i in tmp_keys}
        tmp_list = val.split(";")
        for val2 in tmp_list:
            tmp_dict[val2] += 1
        arr_list.append(tmp_dict)
    
    return pd.DataFrame(arr_list)

## Target Variable

In [11]:
# Allow Researcher to Select Which Unique Value to Filter By

#### DataFrame for Variables to Predict Against

In [12]:
tmp_y_df = pd.DataFrame()

Do you own any of the following? (Own Electronic Payment)

In [13]:
tmp_col = df.iloc[:, 12]
tmp_df = df_unique(tmp_col)

tmp_df.iloc[:, 2] = pd.concat([tmp_df.iloc[:, 2], tmp_df.iloc[:, -1]], axis = 1).max(axis=1)

# Convert tmp_df to Sum
tmp_dict = {col : tmp_df[col].sum() for col in tmp_df.columns[:-1]}

for ind in range(tmp_df.shape[1] - 1):
    tmp_col = tmp_df.iloc[:, ind]
    tmp_y_df[tmp_col.name] = tmp_col

Have you made any electronic payments in the past 12 months? (Type of Electronic Payment)

In [14]:
tmp_col = df.iloc[:, 14]
tmp_df = df_unique(tmp_col)

tmp_df.iloc[:, 2] = pd.concat([tmp_df.iloc[:, 2], tmp_df.iloc[:, -1]], axis = 1).max(axis=1)

for ind in range(tmp_df.shape[1] - 2):
    tmp_col = tmp_df.iloc[:, ind]
    tmp_y_df[tmp_col.name] = tmp_col

In [15]:
tmp_col = df.iloc[:, 14]
tmp_df = df_unique(tmp_col)

tmp_s = tmp_df["No"]

tmp_s = tmp_s.map(lambda x : 1 if x == 0 else 0)

tmp_s = tmp_s.rename("Have you ever used E-Payment Before?")

# Store into tmp_y_df
tmp_y_df[tmp_s.name] = tmp_s

In [16]:
tmp_df = df.iloc[:, [17, 18]]

# Replace Values 
tmp_df = tmp_df.replace("No", 0)
tmp_df = tmp_df.replace("Yes", 1)

# Convert to INT
tmp_df = tmp_df.astype(int)

# Change Columns Name
tmp_df.columns = [col.split("[")[1][4:-1] for col in tmp_df.columns.tolist()]

arr_df = [tmp_y_df, tmp_df]

tmp_y_df = pd.concat(arr_df, axis = 1)

tmp_y_df

Unnamed: 0,Mobile Smartphone,"Bank Cards (Credit, Debit, Pre-paid)",Touch n Go,Internet Services (e.g: Broadband),E-wallet account (E.g: MOL or PayPal),"Internet of Things gadget (e.g: Fitbit – measure steps, etc)",Blockchain / Cryptocurrency Coin,HealthCare Gadget (E.g: Blood pressure measure device etc),Yes (Mobile),"Yes (Bank Cards (i.e. Credit, Debit, Pre-paid))",Yes (Touch n Go),Yes (Internet Services (e.g: Broadband)),Yes (E-wallet account (E.g: MOL or PayPal)),Yes (Blockchain / Cryptocurrency Coin solutions),Have you ever used E-Payment Before?,Have you ever purchased anything using the E-payment mode?,"In the next six months, do you plan to purchase anything using the E-payment mode?"
0,1,1,1,1,1,0,0,0,1,1,1,1,1,0,1,1,1
1,1,1,1,1,0,0,0,0,0,1,0,0,0,0,1,1,1
2,1,1,1,1,1,1,1,0,0,1,1,1,1,0,1,1,1
3,1,1,1,1,0,1,0,0,1,1,1,1,0,0,1,0,1
4,1,1,1,1,1,0,1,0,1,1,1,0,0,0,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
281,1,1,1,1,0,0,0,0,1,1,1,0,0,0,1,1,1
282,1,1,0,1,0,0,0,0,1,1,0,1,0,0,1,1,1
283,1,1,0,1,0,0,0,0,1,1,0,1,0,0,1,1,1
284,1,1,1,1,0,0,1,0,1,1,1,1,1,1,1,1,1


## Select Important Features (CFS Method)

In [23]:
def select_by_CFS_Union(X, Y):
    func_arr = [pearsonr, spearmanr, pointbiserialr]
    
    final_arr = []
    for func in func_arr:
        feature_set = CFS(X, Y, func)
        final_arr += feature_set
        
    return set(final_arr)

Have you ever made payments using Mobile Smartphone Before?

In [18]:
# df_X = utaut_fac_df
# df_Y = tmp_y_df.loc[:, "Yes (Mobile)"]

# select_by_CFS(df_X, df_Y)

{'AX3', 'AX4', 'EE1', 'FC4', 'SE2', 'SE4'}

Have you ever made payments using Bank Cards Before?

In [19]:
# df_X = utaut_fac_df
# df_Y = tmp_y_df.loc[:, "Yes (Bank Cards (i.e. Credit, Debit, Pre-paid))"]

# select_by_CFS(df_X, df_Y)

{'AX3', 'AX4', 'FC3', 'SI1'}

Have you ever used E-Payment Before?

In [20]:
# df_X = utaut_fac_df
# df_Y = tmp_y_df.loc[:, "Have you ever used E-Payment Before?"]

# select_by_CFS(df_X, df_Y)

{'AX4', 'PE2'}

Have you ever purchased anything using the E-payment mode?

### Feature Overlap

In [None]:
func_arr = [pearsonr, spearmanr, pointbiserialr]

feature_dict = {}

for func in func_arr:
    feature_set = CFS(X, Y, func)
    for feat in feature_set:
        if feat in feature_dict:
            feature_dict[feat] += 1
        else:
            feature_dict[feat] = 0

### Feature Not Overlap

### Feature Union

In [21]:
df_X = utaut_fac_df
df_Y = tmp_y_df.loc[:, "Have you ever purchased anything using the E-payment mode?"]

select_by_CFS(df_X, df_Y)

{'AX2', 'AX4', 'BI1', 'FC4', 'SE4', 'T4'}

In the next six months, do you plan to purchase anything using the E-payment mode?

In [22]:
# df_X = utaut_fac_df
# df_Y = tmp_y_df.loc[:, "In the next six months, do you plan to purchase anything using the E-payment mode?"]

# select_by_CFS(df_X, df_Y)

{'AX2', 'BI1', 'FC4', 'T4'}