# Title: CORRELATION MODEL IN THE ADOPTION OF E-PAYMENT SERVICES

## Load Libraries

In [1]:
import time
import numpy as np
import pandas as pd
import seaborn as sns
import ipywidgets as ipw
import plotly.express as px
import matplotlib.pyplot as plt
import plotly.graph_objects as go

from plotly.subplots import make_subplots

from sklearn import svm, tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, matthews_corrcoef, precision_recall_curve, roc_curve

import scipy.stats as stats
from scipy.stats import pearsonr, spearmanr, kendalltau, pointbiserialr, chi2, chi2_contingency

from xgboost import XGBClassifier

from dtreeviz.trees import *

%matplotlib inline

## Load Custom Made Libraries

In [2]:
from Utilities.CFS import *
from Utilities.reliefF import *
from Utilities.accuracy import *
from Utilities.corr_matrix import *
from Utilities.forward_selection import *
from Utilities.backward_elimination import *
from Utilities.prob_distributed_feature import *

from Visualization.model_graph import *
from Visualization.network_graph import *

from skfeature.utility.mutual_information import *

## Load Dataset

In [3]:
# Load the raw dataset; the cells below select their columns from `df` by position (iloc).
df = pd.read_csv("Dataset/E-payment Cryptocurrency Coin.csv")

## Label Encoder (Nominal Values to Integer Codes)

In [4]:
def convert_nominal(arr, term_arr):
    """Encode every value of ``arr`` as its position in ``term_arr``.

    Parameters
    ----------
    arr : pandas.Series
        Values to encode (any object exposing ``.map`` works).
    term_arr : iterable
        Category labels; a label's position in this iterable is its code.

    Returns
    -------
    The result of ``arr.map``: same shape as ``arr``, holding integer codes.

    Raises
    ------
    KeyError
        If ``arr`` contains a value that is not listed in ``term_arr``.
    """
    code_of = {}
    for position, label in enumerate(term_arr):
        code_of[label] = position

    def encode(value):
        # Plain dict lookup on purpose: unknown labels must fail loudly.
        return code_of[value]

    return arr.map(encode)

## Split Dataset

In [5]:
# Category labels for each demographic column, listed in encoding order:
# convert_nominal() turns a label into its position in the list
# (e.g. "< 25 years" -> 0, "above 55 years" -> 3).
index_dict = {
    "Age": ["< 25 years", "26 - 40 years", "41 - 55 years", "above 55 years"],
    "Gender": ["Male", "Female"],
    "Marital Status": ["Single", "Married", "Other"],
    "Education Level": ['Primary school', 'Secondary/High school', 'College/university', 'Graduate school', 'Other'],
    "Work Industry": ['Banking / Finance', 'Education', 'Healthcare', 'Manufacturing', 'Retail / Hypermarket', 'Other'],
    "Work Position": ['Junior management', 'Middle management', 'Top management', 'Professional', 'Other']
}

In [22]:
# Short names for the six demographic columns, in the order they occupy
# positions 6..11 of the raw data.
column_arr = ["Age", "Gender", "Marital Status", "Education Level", "Work Industry", "Work Position"]

# Pull each column out of `df` by position and label it with its short name.
mod_fac_df = pd.DataFrame(
    {col_name: df.iloc[:, 6 + ind] for (ind, col_name) in enumerate(column_arr)}
)

# Fix the misspelled "Baking / Finance" answers so they match index_dict.
mod_fac_df = mod_fac_df.replace(to_replace = "Baking / Finance", value = "Banking / Finance")

# Encode every column as integer codes (position of the label in index_dict).
mod_fac_df = pd.DataFrame(
    {col: convert_nominal(mod_fac_df[col], index_dict[col]) for col in column_arr}
)

In [23]:
# Every column from position 28 onward is an item column; only the part of its
# header before ": " is kept as the column name.
column_arr = [header.split(": ")[0] for header in df.columns[28:]]

# Shift each answer down by one, then store the whole table as int.
utaut_fac_df = pd.DataFrame(
    {col_name: df.iloc[:, 28 + ind] - 1 for (ind, col_name) in enumerate(column_arr)}
).astype(int)

In [24]:
# Place the encoded demographic columns and the item scores side by side,
# aligned on the row index.
all_df_arr = [mod_fac_df, utaut_fac_df]
all_df = pd.concat(objs = all_df_arr, axis = "columns")

all_df

Unnamed: 0,Age,Gender,Marital Status,Education Level,Work Industry,Work Position,PE1,PE2,PE3,PE4,...,AX3,AX4,T1,T2,T3,T4,BI1,BI2,BI3,BI4
0,0,1,0,2,0,4,1,1,1,1,...,3,2,1,0,1,2,2,2,2,2
1,0,1,0,2,5,4,0,0,0,0,...,3,3,1,1,1,1,1,1,1,1
2,2,1,0,2,3,1,3,4,3,4,...,3,2,2,2,2,3,2,3,3,3
3,0,0,0,2,1,4,2,2,2,2,...,2,2,2,2,2,2,3,3,2,2
4,0,1,0,2,5,4,2,2,3,3,...,3,3,2,3,3,3,3,4,3,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
281,2,0,1,1,5,4,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
282,3,0,1,3,1,2,2,2,2,2,...,3,2,2,2,2,2,2,2,2,2
283,3,1,1,2,5,4,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
284,2,0,1,3,1,3,3,3,3,3,...,2,2,3,3,3,3,3,3,3,3


## Get Filter Variables (Convert List of Variables to Unique Variables)

### Get Unique Variables

In [25]:
def df_unique(arr):
    """Expand ";"-separated multi-select answers into "Yes"/"No" columns.

    Each string in ``arr`` is split on ";". Every distinct piece becomes a
    column (in first-seen order) and every string entry becomes a row holding
    "Yes" for the pieces it contains and "No" for the others.

    Parameters
    ----------
    arr : pandas.Series or iterable
        Answers to expand. Entries that are not strings (e.g. NaN for a
        missing answer) produce no row.

    Returns
    -------
    pandas.DataFrame
        One row per string entry. For a Series input the rows keep the
        original index labels, so the result stays aligned with the source
        frame even when some answers are missing. For any other iterable the
        rows get a default 0..m-1 index.
    """
    values = list(arr)

    # Positions of usable (string) answers and their split pieces.
    kept_positions = [pos for pos, val in enumerate(values) if isinstance(val, str)]
    selections = [values[pos].split(";") for pos in kept_positions]

    # dict keys preserve first-seen order and give O(1) duplicate checks.
    option_keys = dict.fromkeys(opt for chosen in selections for opt in chosen)

    rows = []
    for chosen in selections:
        row = dict.fromkeys(option_keys, "No")
        for opt in chosen:
            row[opt] = "Yes"
        rows.append(row)

    if isinstance(arr, pd.Series):
        # Keep the source labels: skipped rows must not shift the later ones.
        return pd.DataFrame(rows, index = arr.index[kept_positions])
    return pd.DataFrame(rows)

#### DataFrame for Variables to Predict Against

In [26]:
# Accumulator for the Yes/No filter variables; the following cells add columns to it.
filter_df = pd.DataFrame()

Do you own any of the following? (Own Electronic Payment)

In [27]:
# Column 12 holds the ";"-separated answers to "Do you own any of the following?";
# df_unique() expands them into one "Yes"/"No" column per option.
tmp_col = df.iloc[:, 12]
tmp_df = df_unique(tmp_col)

# Merge the last option column into the third one. "Yes" sorts after "No", so
# the row-wise max is "Yes" whenever either of the two columns is "Yes".
tmp_df.iloc[:, 2] = pd.concat([tmp_df.iloc[:, 2], tmp_df.iloc[:, -1]], axis = 1).max(axis=1)

# Number of "Yes" answers per kept option. (Calling .sum() directly on the
# "Yes"/"No" strings would only concatenate them.)
tmp_dict = {col : int((tmp_df[col] == "Yes").sum()) for col in tmp_df.columns[:-1]}

# Copy every option except the merged-away last column into filter_df.
for ind in range(tmp_df.shape[1] - 1):
    tmp_col = tmp_df.iloc[:, ind]
    filter_df[tmp_col.name] = tmp_col

filter_df

Unnamed: 0,Mobile Smartphone,"Bank Cards (Credit, Debit, Pre-paid)",Touch n Go,Internet Services (e.g: Broadband),E-wallet account (E.g: MOL or PayPal),"Internet of Things gadget (e.g: Fitbit – measure steps, etc)",Blockchain / Cryptocurrency Coin,HealthCare Gadget (E.g: Blood pressure measure device etc)
0,Yes,Yes,Yes,Yes,Yes,No,No,No
1,Yes,Yes,Yes,Yes,No,No,No,No
2,Yes,Yes,Yes,Yes,Yes,Yes,Yes,No
3,Yes,Yes,Yes,Yes,No,Yes,No,No
4,Yes,Yes,Yes,Yes,Yes,No,Yes,No
...,...,...,...,...,...,...,...,...
281,Yes,Yes,Yes,Yes,No,No,No,No
282,Yes,Yes,No,Yes,No,No,No,No
283,Yes,Yes,No,Yes,No,No,No,No
284,Yes,Yes,Yes,Yes,No,No,Yes,No


Have you made any electronic payments in the past 12 months? (Type of Electronic Payment) (Unique)

In [28]:
# tmp_col = df.iloc[:, 14]
# tmp_df = df_unique(tmp_col)

# tmp_df.iloc[:, 2] = pd.concat([tmp_df.iloc[:, 2], tmp_df.iloc[:, -1]], axis = 1).max(axis=1)

# for ind in range(tmp_df.shape[1] - 2):
#     tmp_col = tmp_df.iloc[:, ind]
#     tmp_y_df[tmp_col.name] = tmp_col

Have you made any electronic payments in the past 12 months? (Type of Electronic Payment) (Combine all into 1 Variable)

In [29]:
# Column 14 lists the payment types used in the past 12 months; df_unique()
# expands it into one "Yes"/"No" column per listed answer, including "No".
tmp_col = df.iloc[:, 14]
tmp_df = df_unique(tmp_col)

# Invert the "No" indicator: a respondent who did not pick "No" has made a payment.
tmp_s = (
    tmp_df["No"]
    .map(lambda answer : "No" if answer != "No" else "Yes")
    .rename("Have you made any electronic payments in the past 12 months?")
)

# Store into filter_df under the question text
filter_df[tmp_s.name] = tmp_s

Have you ever purchased anything using the E-payment mode?\
In the next six months, do you plan to purchase anything using the E-payment mode?

In [30]:
# Columns 17 and 18 hold the two purchase questions; the answers are kept as
# "Yes"/"No" strings (the filter step below compares against 'Yes').
tmp_df = df.iloc[:, [17, 18]].copy()

# Change Columns Name: keep the text after the first "[", minus its first four
# characters and its final character.
# NOTE(review): presumably this strips a numbering prefix and the closing "]" - confirm against the CSV header.
tmp_df.columns = [col.split("[")[1][4:-1] for col in tmp_df.columns.tolist()]

# Drop same-named columns that an earlier run of this cell already appended, so
# re-running replaces them instead of creating duplicate column names
# (duplicates would break the `tmp_df[filter_var] == 'Yes'` filter later on).
arr_df = [filter_df.drop(columns = tmp_df.columns, errors = "ignore"), tmp_df]

filter_df = pd.concat(arr_df, axis = 1)

In [31]:
# Display the assembled filter variables.
filter_df

Unnamed: 0,Mobile Smartphone,"Bank Cards (Credit, Debit, Pre-paid)",Touch n Go,Internet Services (e.g: Broadband),E-wallet account (E.g: MOL or PayPal),"Internet of Things gadget (e.g: Fitbit – measure steps, etc)",Blockchain / Cryptocurrency Coin,HealthCare Gadget (E.g: Blood pressure measure device etc),Have you made any electronic payments in the past 12 months?,Have you ever purchased anything using the E-payment mode?,"In the next six months, do you plan to purchase anything using the E-payment mode?"
0,Yes,Yes,Yes,Yes,Yes,No,No,No,Yes,Yes,Yes
1,Yes,Yes,Yes,Yes,No,No,No,No,Yes,Yes,Yes
2,Yes,Yes,Yes,Yes,Yes,Yes,Yes,No,Yes,Yes,Yes
3,Yes,Yes,Yes,Yes,No,Yes,No,No,Yes,No,Yes
4,Yes,Yes,Yes,Yes,Yes,No,Yes,No,Yes,Yes,Yes
...,...,...,...,...,...,...,...,...,...,...,...
281,Yes,Yes,Yes,Yes,No,No,No,No,Yes,Yes,Yes
282,Yes,Yes,No,Yes,No,No,No,No,Yes,Yes,Yes
283,Yes,Yes,No,Yes,No,No,No,No,Yes,Yes,Yes
284,Yes,Yes,Yes,Yes,No,No,Yes,No,Yes,Yes,Yes


## Step 1: Choose Variable to Filter

#### Create temporary Dataframe with Variable to Filter

In [32]:
# Question whose 'Yes' respondents are kept for the steps below.
filter_var = "Have you ever purchased anything using the E-payment mode?"

# Put the chosen filter column next to the encoded features (row-index aligned).
arr_df = [all_df, filter_df.loc[:, [filter_var]]]
tmp_df = pd.concat(arr_df, axis = "columns")

# Keep only the rows answered 'Yes'
answered_yes = tmp_df[filter_var].eq('Yes')
tmp_df = tmp_df.loc[answered_yes]

tmp_df

Unnamed: 0,Age,Gender,Marital Status,Education Level,Work Industry,Work Position,PE1,PE2,PE3,PE4,...,AX4,T1,T2,T3,T4,BI1,BI2,BI3,BI4,Have you ever purchased anything using the E-payment mode?
0,0,1,0,2,0,4,1,1,1,1,...,2,1,0,1,2,2,2,2,2,Yes
1,0,1,0,2,5,4,0,0,0,0,...,3,1,1,1,1,1,1,1,1,Yes
2,2,1,0,2,3,1,3,4,3,4,...,2,2,2,2,3,2,3,3,3,Yes
4,0,1,0,2,5,4,2,2,3,3,...,3,2,3,3,3,3,4,3,3,Yes
5,1,0,1,2,2,3,3,2,2,2,...,3,2,2,2,2,2,2,2,2,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
281,2,0,1,1,5,4,2,2,2,2,...,2,2,2,2,2,2,2,2,2,Yes
282,3,0,1,3,1,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,Yes
283,3,1,1,2,5,4,2,2,2,2,...,2,2,2,2,2,2,2,2,2,Yes
284,2,0,1,3,1,3,3,3,3,3,...,2,3,3,3,3,3,3,3,3,Yes


## Step 2 : Select Target Variable (Moderated Factor)

Targeted variable => Age

In [41]:
# Predictors: the item columns; target: the encoded Age column. Both cast to int.
df_X = tmp_df.loc[:, utaut_fac_df.columns].astype(int)
df_Y = tmp_df.loc[:, "Age"].astype(int)

#### Remove Features that are not significant with Target Variable

In [45]:
arr_list = []

# Confidence level of the test and the matching significance level.
prob = 0.95
alpha = 1 - prob

# Chi-square test of independence between each feature and the target
for col in df_X.columns:
    # Contingency table: feature values (rows) against target classes (columns).
    chi_df = pd.crosstab(df_X.loc[:, col], df_Y)
    stat, p, dof, expected = chi2_contingency(chi_df)

    # Critical chi-square value for this table's degrees of freedom.
    critical = chi2.ppf(prob, dof)

    # Keep the feature only when the test rejects independence.
    if abs(stat) >= critical and p <= alpha:
        arr_list.append((col, stat))

# Sort the significant variables by ascending chi-square statistic
arr_list = sorted(arr_list, key = lambda x : x[1])

arr_df = pd.DataFrame(arr_list, columns = ["Variables", "Chi-Square Value"])

# Styler.hide_index() was removed in pandas 2.0; prefer Styler.hide(axis="index")
# and fall back to the old method on pandas versions that lack it.
arr_styler = arr_df.style
arr_styler.hide(axis = "index") if hasattr(arr_styler, "hide") else arr_styler.hide_index()

Variables,Chi-Square Value
FC4,22.556139
SE3,22.87069
BI1,23.1271
PE4,23.354385
EE2,23.529619
T2,23.754333
AT1,24.290979
BI2,25.006277
SI1,25.133856
EE4,26.58861


### Merit Based Ranking

In [46]:
arr_list = []

tmp_X = df_X
tmp_Y = df_Y

# Score every feature on its own against the target.
# NOTE(review): merit_calculation is not defined in this notebook (presumably it
# comes from Utilities.CFS) - confirm its contract there.
for col_ind in range(tmp_X.shape[1]):
    name = tmp_X.columns[col_ind]
    merit = merit_calculation(tmp_X.iloc[:, [col_ind]], tmp_Y, pointbiserialr)
    arr_list.append((col_ind ,name, round(merit, 4)))

# Sort Column By Merit Value (highest first)
arr_list = sorted(arr_list, key = lambda x : x[2], reverse = True)

rank_arr = list(range(1, len(arr_list) + 1))

# The first tuple field is the original column position; it lands in "Rank"
# here and is overwritten with the real 1-based rank right below.
mbf_df = pd.DataFrame(arr_list, index = rank_arr, columns = ["Rank" ,"Variables", "Merit"])

mbf_df["Rank"] = rank_arr

# Styler.hide_index() was removed in pandas 2.0; prefer Styler.hide(axis="index")
# and fall back to the old method on pandas versions that lack it.
mbf_styler = mbf_df.style
mbf_styler.hide(axis = "index") if hasattr(mbf_styler, "hide") else mbf_styler.hide_index()

Rank,Variables,Merit
1,SI2,0.1716
2,AT3,0.1386
3,SI1,0.1129
4,SE3,0.1093
5,FC4,0.1025
6,T4,0.0913
7,SI3,0.0886
8,T3,0.0881
9,PE1,0.0863
10,EE3,0.0704


## Step 3: CFS

#### Feature Intersection