# Title: CORRELATION MODEL IN THE ADOPTION OF E-PAYMENT SERVICES

## Load Libraries

In [46]:
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import svm, tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split, KFold
from scipy.stats import pearsonr, spearmanr, kendalltau, pointbiserialr
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, matthews_corrcoef

%matplotlib inline

## Load Custom Made Libraries

In [2]:
from Utilities.CFS import *
from Utilities.accuracy import *
from Utilities.corr_matrix import *
from Utilities.forward_selection import *
from Utilities.backward_elimination import *

from Visualization.model_graph import *
from Visualization.network_graph import *

## Load Dataset

In [3]:
# Load the e-payment / cryptocurrency coin dataset from CSV.
# The path is relative, so it resolves against the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="Dataset/E-payment Cryptocurrency Coin.csv")

## Label Binarizer

In [4]:
def convert_nominal(arr, term_arr):
    """Replace every label in ``arr`` with its position in ``term_arr``.

    Parameters
    ----------
    arr : pandas.Series
        Column of labels to encode (anything exposing ``.map`` works).
    term_arr : iterable
        Labels in the order that defines their codes: the first label is
        encoded as 0, the second as 1, and so on. If a label is repeated,
        its last position wins.

    Returns
    -------
    The result of ``arr.map`` with each label swapped for its integer code.

    Raises
    ------
    KeyError
        If ``arr`` holds a value that does not appear in ``term_arr``.
    """
    position_of = {}
    for position, label in enumerate(term_arr):
        position_of[label] = position

    def encode(label):
        # Plain dict lookup: an unknown label raises KeyError instead of
        # being silently turned into a missing value.
        return position_of[label]

    return arr.map(encode)

## Split Dataset

In [5]:
# The six demographic columns sit side by side in the raw frame, starting at
# positional index 6; copy them out under short names.
column_arr = ["Age", "Gender", "Marital Status", "Education Level", "Work Industry", "Work Position"]

mod_fac_df = pd.DataFrame(
    {short_name: df.iloc[:, 6 + offset] for offset, short_name in enumerate(column_arr)}
)

# Correct the misspelled "Baking / Finance" answer so that it matches the
# "Banking / Finance" label encoded below.
mod_fac_df = mod_fac_df.replace(to_replace="Baking / Finance", value="Banking / Finance")

# Answer labels per column, in encoding order: convert_nominal replaces each
# label with its position in the list (first label -> 0).
label_orders = {
    "Age": ["< 25 years", "26 - 40 years", "41 - 55 years", "above 55 years"],
    "Gender": ["Male", "Female"],
    "Marital Status": ["Single", "Married", "Other"],
    "Education Level": ['Primary school', 'Secondary/High school', 'College/university', 'Graduate school', 'Other'],
    "Work Industry": ['Banking / Finance', 'Education', 'Healthcare', 'Manufacturing', 'Retail / Hypermarket', 'Other'],
    "Work Position": ['Junior management', 'Middle management', 'Top management', 'Professional', 'Other'],
}

for short_name, labels in label_orders.items():
    mod_fac_df[short_name] = convert_nominal(mod_fac_df[short_name], labels)



In [6]:
# Questionnaire item columns: everything from positional index 28 onward.
utaut_raw = df.iloc[:, 28:]

# Keep only the part of each header that precedes the first ": "
# (presumably the short item code, e.g. "BI1" -- verify against the CSV header).
column_arr = [header.split(": ")[0] for header in utaut_raw.columns]

utaut_fac_df = pd.DataFrame(
    {item_code: utaut_raw.iloc[:, position] for position, item_code in enumerate(column_arr)}
)

# Change Data Type to int
utaut_fac_df = utaut_fac_df.astype(int)

## Correlation Network Graph

In [27]:
# "Winner Takes All Method": only pairs whose absolute correlation reaches
# this threshold are kept for the graph.
threshold = 0.75

# Pairwise correlations between the questionnaire items, computed by the
# project helper with scipy's pearsonr.
all_pairs_df = pairwise_correlation(utaut_fac_df, pearsonr)

# Retain the rows whose |correlation| is at or above the threshold.
is_strong = all_pairs_df["correlation"].abs() >= threshold
corr_df = all_pairs_df.loc[is_strong]

network_graph(corr_df, "Important Features")

## Target Variables

### Have you ever purchased anything using the E-payment mode?

In [32]:
# Target variable: column 17 of the raw frame, recoded from "Yes"/"No" to 1/0
# and then cast to int.
df_Y = (
    df.iloc[:, 17]
    .replace("Yes", 1)
    .replace("No", 0)
    .astype(int)
)

# Display the column header so the reader can see which question this is.
df_Y.name

'6. E-payment purchasing, Loyalty Points and Crypto Coin [(1) Have you ever purchased anything using the E-payment mode?]'

#### Moderated Variables

In [35]:
# Use PointBiserialR as there are only 2 Classes
arr_list = []

for col in mod_fac_df.columns:
    corr, p_val = pointbiserialr(mod_fac_df.loc[:, col], df_Y)
    if p_val <= 0.05:
        arr_list.append((col, p_val))
        
tmp_df = pd.DataFrame(arr_list, columns = ["Moderated Variable", "P-Value"])

tmp_df.style.hide_index()

Moderated Variable,P-Value
Education Level,0.000754


#### Remove Features That Are Not Significantly Correlated with the Target Variable

In [36]:
# Candidate predictors: two demographic columns plus every questionnaire item
# except BI1-BI4.
# NOTE(review): the demographic column list is hard-coded rather than taken
# from the screening step -- confirm it matches the variables found
# significant for this target.
pd_arr = [
    mod_fac_df.loc[:, ["Education Level", "Work Position"]],
    utaut_fac_df.drop(["BI1", "BI2", "BI3", "BI4"], axis = 1),
]
df_X = pd.concat(pd_arr, axis = 1)

# (variable, p-value) pairs that pass the significance filter
arr_list = []

# Spearman rank correlation of each candidate against the target
for col in df_X.columns:
    corr, p_val = spearmanr(df_X.loc[:, col], df_Y)
    # Threshold => keep variables whose p-value is at most 0.05
    if p_val <= 0.05:
        arr_list.append((col, p_val))

# Sort ascending by p-value, i.e. most significant variable first
arr_list = sorted(arr_list, key = lambda x : x[1])

arr_df = pd.DataFrame(arr_list, columns = ["Variables", "P Value"])

# Styler.hide_index() was removed in pandas 2.0. Use Styler.hide(axis="index")
# where it exists (pandas >= 1.4) and fall back to the old method otherwise.
styler = arr_df.style
styler.hide(axis="index") if hasattr(styler, "hide") else styler.hide_index()

Variables,P Value
Education Level,0.00036
T4,0.001337
FC4,0.009788
SE4,0.010777
T2,0.015056
T1,0.018241
SE3,0.020097
SE2,0.028713
AT1,0.039143
T3,0.039644


#### Decision Tree

In [47]:
# Restrict the predictors to the variables that passed the significance filter.
# (The former `df_Y = df_Y` self-assignment was a no-op and has been dropped.)
df_X = df_X.loc[:, arr_df["Variables"]]

# Seed the tree: scikit-learn permutes features randomly at each split, so an
# unseeded model can give a different score on every run.
# NOTE(review): get_acc_score_kcv may add its own randomness when splitting
# folds -- confirm it is seeded as well.
model = DecisionTreeClassifier(random_state = 42)

# Accuracy from the project's cross-validation helper
acc_score = get_acc_score_kcv(df_X, df_Y, model)

print(f"Accuracy Score: {round(acc_score, 2)}%")

Accuracy Score: 80.12%


#### Decision Tree (With Feature Selection)

In [44]:
# Default Value
num_of_features = [*range(5, df_X.shape[1])]
arr_list = []

# Declare Model
model = DecisionTreeClassifier()

for num in num_of_features:
    # Get Number of Features
    feature_set = forward_selection(df_X.loc[:, arr_df["Variables"]], df_Y, model, num)
    
    # Get Accuracy Score From Cross Validation
    acc_score = get_acc_score_kcv(df_X.loc[:, feature_set], df_Y, model)
    
    arr_list.append((num, ", ".join(feature_set), acc_score))
    
tmp_df = pd.DataFrame(arr_list, columns = ["Features Num", "Feature Set", "Accuracy"])

tmp_df.style.hide_index()

Features Num,Feature Set,Accuracy
5,"SE4, T2, SE3, SE2, T3",86.724138
6,"SE4, T2, T1, SE3, SE2, AT2",83.559113
7,"T4, FC4, T2, T1, SE2, AT1, AT2",81.453202
8,"T4, SE4, T2, T1, SE2, AT1, T3, AT2",83.87931
9,"T4, FC4, SE4, T2, T1, SE3, SE2, AT1, T3",84.630542
10,"T4, FC4, SE4, T2, T1, SE3, SE2, AT1, T3, AT2",82.573892


### In the next six months, do you plan to purchase anything using the E-payment mode?

In [58]:
# Target variable: column 18 of the raw frame, recoded from "Yes"/"No" to 1/0
# and then cast to int.
df_Y = (
    df.iloc[:, 18]
    .replace("Yes", 1)
    .replace("No", 0)
    .astype(int)
)

# Display the column header so the reader can see which question this is.
df_Y.name

'6. E-payment purchasing, Loyalty Points and Crypto Coin [(2) In the next six months, do you plan to purchase anything using the E-payment mode?]'

#### Moderated Variables

In [59]:
# Use PointBiserialR as there are only 2 Classes
arr_list = []

for col in mod_fac_df.columns:
    corr, p_val = pointbiserialr(mod_fac_df.loc[:, col], df_Y)
    if p_val <= 0.05:
        arr_list.append((col, p_val))
        
tmp_df = pd.DataFrame(arr_list, columns = ["Moderated Variable", "P-Value"])

tmp_df.style.hide_index()

Moderated Variable,P-Value
Education Level,0.000652
Work Position,0.000884


#### Remove Features That Are Not Significantly Correlated with the Target Variable

In [60]:
# Candidate predictors: two demographic columns plus every questionnaire item
# except BI1-BI4.
# NOTE(review): the demographic column list is hard-coded rather than taken
# from the screening step -- confirm it matches the variables found
# significant for this target.
pd_arr = [
    mod_fac_df.loc[:, ["Education Level", "Work Position"]],
    utaut_fac_df.drop(["BI1", "BI2", "BI3", "BI4"], axis = 1),
]
df_X = pd.concat(pd_arr, axis = 1)

# (variable, p-value) pairs that pass the significance filter
arr_list = []

# Spearman rank correlation of each candidate against the target
for col in df_X.columns:
    corr, p_val = spearmanr(df_X.loc[:, col], df_Y)
    # Threshold => keep variables whose p-value is at most 0.05
    if p_val <= 0.05:
        arr_list.append((col, p_val))

# Sort ascending by p-value, i.e. most significant variable first
arr_list = sorted(arr_list, key = lambda x : x[1])

arr_df = pd.DataFrame(arr_list, columns = ["Variables", "P Value"])

# Styler.hide_index() was removed in pandas 2.0. Use Styler.hide(axis="index")
# where it exists (pandas >= 1.4) and fall back to the old method otherwise.
styler = arr_df.style
styler.hide(axis="index") if hasattr(styler, "hide") else styler.hide_index()

SyntaxError: invalid syntax (<ipython-input-60-443565460fda>, line 11)

#### Decision Tree

In [47]:
# Restrict the predictors to the variables that passed the significance filter.
# (The former `df_Y = df_Y` self-assignment was a no-op and has been dropped.)
df_X = df_X.loc[:, arr_df["Variables"]]

# Seed the tree: scikit-learn permutes features randomly at each split, so an
# unseeded model can give a different score on every run.
# NOTE(review): get_acc_score_kcv may add its own randomness when splitting
# folds -- confirm it is seeded as well.
model = DecisionTreeClassifier(random_state = 42)

# Accuracy from the project's cross-validation helper
acc_score = get_acc_score_kcv(df_X, df_Y, model)

print(f"Accuracy Score: {round(acc_score, 2)}%")

Accuracy Score: 80.12%


#### Decision Tree (With Feature Selection)

In [44]:
# Default Value
# Feature-set sizes to try: 5 up to, but not including, the number of columns
# currently in df_X.
num_of_features = [*range(5, df_X.shape[1])]
arr_list = []

# Seed the tree: scikit-learn permutes features randomly at each split, so an
# unseeded model can rank feature sets differently on every run.
model = DecisionTreeClassifier(random_state = 42)

for num in num_of_features:
    # Ask the project's forward-selection helper for a set of `num` features
    feature_set = forward_selection(df_X.loc[:, arr_df["Variables"]], df_Y, model, num)

    # Get Accuracy Score From Cross Validation on that feature set only
    acc_score = get_acc_score_kcv(df_X.loc[:, feature_set], df_Y, model)

    arr_list.append((num, ", ".join(feature_set), acc_score))

tmp_df = pd.DataFrame(arr_list, columns = ["Features Num", "Feature Set", "Accuracy"])

# Styler.hide_index() was removed in pandas 2.0. Use Styler.hide(axis="index")
# where it exists (pandas >= 1.4) and fall back to the old method otherwise.
styler = tmp_df.style
styler.hide(axis="index") if hasattr(styler, "hide") else styler.hide_index()

Features Num,Feature Set,Accuracy
5,"SE4, T2, SE3, SE2, T3",86.724138
6,"SE4, T2, T1, SE3, SE2, AT2",83.559113
7,"T4, FC4, T2, T1, SE2, AT1, AT2",81.453202
8,"T4, SE4, T2, T1, SE2, AT1, T3, AT2",83.87931
9,"T4, FC4, SE4, T2, T1, SE3, SE2, AT1, T3",84.630542
10,"T4, FC4, SE4, T2, T1, SE3, SE2, AT1, T3, AT2",82.573892


### BI1: I intend to use Blockchain / Cryptocurrency Coin.

In [20]:
# Target variable: the "BI1" column of the integer-typed questionnaire frame.
df_Y = utaut_fac_df["BI1"]

# Display the column name to confirm which item was selected.
df_Y.name

'BI1'

#### Moderated Variables

In [21]:
# Use PointBiserialR as there are only 2 Classes
# Screen every demographic column against the target with a point-biserial
# correlation and keep the columns whose p-value is at most 0.05.
# NOTE(review): the earlier comment justified point-biserial by a target with
# only 2 classes; here the target is an integer-coded questionnaire item --
# confirm it is binary or switch to a rank-based test.
arr_list = []

for col in mod_fac_df.columns:
    # Only the p-value is used below; the coefficient is not reported.
    corr, p_val = pointbiserialr(mod_fac_df.loc[:, col], df_Y)
    if p_val <= 0.05:
        arr_list.append((col, p_val))

tmp_df = pd.DataFrame(arr_list, columns = ["Moderated Variable", "P-Value"])

# Styler.hide_index() was removed in pandas 2.0. Use Styler.hide(axis="index")
# where it exists (pandas >= 1.4) and fall back to the old method otherwise.
styler = tmp_df.style
styler.hide(axis="index") if hasattr(styler, "hide") else styler.hide_index()

Moderated Variable,P-Value
Work Position,0.002978


### BI4: I plan to use Blockchain / Cryptocurrency Coin for E-payment transaction to buy stuff.

In [22]:
# Target variable: the "BI4" column of the integer-typed questionnaire frame.
df_Y = utaut_fac_df["BI4"]

# Display the column name to confirm which item was selected.
df_Y.name

'BI4'

#### Moderated Variables

In [23]:
# Use PointBiserialR as there are only 2 Classes
# Screen every demographic column against the target with a point-biserial
# correlation and keep the columns whose p-value is at most 0.05.
# NOTE(review): the earlier comment justified point-biserial by a target with
# only 2 classes; here the target is an integer-coded questionnaire item --
# confirm it is binary or switch to a rank-based test.
arr_list = []

for col in mod_fac_df.columns:
    # Only the p-value is used below; the coefficient is not reported.
    corr, p_val = pointbiserialr(mod_fac_df.loc[:, col], df_Y)
    if p_val <= 0.05:
        arr_list.append((col, p_val))

tmp_df = pd.DataFrame(arr_list, columns = ["Moderated Variable", "P-Value"])

# Styler.hide_index() was removed in pandas 2.0. Use Styler.hide(axis="index")
# where it exists (pandas >= 1.4) and fall back to the old method otherwise.
styler = tmp_df.style
styler.hide(axis="index") if hasattr(styler, "hide") else styler.hide_index()

Moderated Variable,P-Value
Work Position,0.004845
