# Title: CORRELATION MODEL IN THE ADOPTION OF E-PAYMENT SERVICES

## Load Libraries

In [26]:
import time
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats

from sklearn import svm, tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, matthews_corrcoef, precision_recall_curve, roc_curve

from scipy.stats import pearsonr, spearmanr, kendalltau, pointbiserialr, chi2, chi2_contingency

from xgboost import XGBClassifier

%matplotlib inline

## Load Custom Made Libraries

In [2]:
from Utilities.CFS import *
from Utilities.reliefF import *
from Utilities.accuracy import *
from Utilities.corr_matrix import *
from Utilities.forward_selection import *
from Utilities.backward_elimination import *

from Visualization.model_graph import *
from Visualization.network_graph import *

from skfeature.utility.mutual_information import *

## Load Dataset

In [3]:
# Read the e-payment / cryptocurrency-coin dataset into `df`.
# Later cells slice this frame by column position (columns 6-11 and 28 onward),
# so the column order of the CSV matters.
df = pd.read_csv("Dataset/E-payment Cryptocurrency Coin.csv")

## Label Binarizer

In [4]:
def convert_nominal(arr, term_arr):
    """Encode each label in `arr` as its position within `term_arr`.

    `arr` must expose a ``map`` method (the notebook passes DataFrame
    columns). `term_arr` lists the accepted labels; a label's code is its
    index in that list. A value that is not listed raises ``KeyError``.
    """
    codes = {}
    for position, term in enumerate(term_arr):
        codes[term] = position

    def to_code(value):
        # Plain indexing (not .get) so unknown labels fail loudly.
        return codes[value]

    return arr.map(to_code)

## Split Dataset

In [5]:
# The six demographic columns sit at positional columns 6..11 of the raw
# dataset; they are copied out under short, fixed names.
column_arr = ["Age", "Gender", "Marital Status", "Education Level", "Work Industry", "Work Position"]

mod_fac_df = pd.DataFrame(
    {col_name: df.iloc[:, 6 + ind] for ind, col_name in enumerate(column_arr)}
)

# Repair the misspelled category before encoding (applied to every column).
mod_fac_df = mod_fac_df.replace("Baking / Finance", "Banking / Finance")

# Integer-encode every column: a label's code is its position in the list
# below. convert_nominal raises KeyError for any label missing from its list.
category_order = {
    "Age": ["< 25 years", "26 - 40 years", "41 - 55 years", "above 55 years"],
    "Gender": ["Male", "Female"],
    "Marital Status": ["Single", "Married", "Other"],
    "Education Level": ["Primary school", "Secondary/High school", "College/university", "Graduate school", "Other"],
    "Work Industry": ["Banking / Finance", "Education", "Healthcare", "Manufacturing", "Retail / Hypermarket", "Other"],
    "Work Position": ["Junior management", "Middle management", "Top management", "Professional", "Other"],
}

for col_name, categories in category_order.items():
    mod_fac_df[col_name] = convert_nominal(mod_fac_df[col_name], categories)


In [6]:
# Every column from positional column 28 onward is copied into utaut_fac_df.
# Only the part of each header before ": " is kept as the column name
# (e.g. a header starting "BI1: ..." becomes "BI1").
column_arr = [header.split(": ")[0] for header in df.columns[28:]]

utaut_fac_df = pd.DataFrame()
for position, col_name in enumerate(column_arr, start=28):
    # Shift each response down by one so the smallest code becomes 0.
    utaut_fac_df[col_name] = df.iloc[:, position] - 1

# Change Data Type to int
utaut_fac_df = utaut_fac_df.astype(int)

## Target Variables

### BI1: I intend to use Blockchain / Cryptocurrency Coin.

In [7]:
# Predictors: the encoded demographic columns plus every questionnaire item
# except the four BI* items. Everything is cast to int.
arr_df = [mod_fac_df, utaut_fac_df.drop(columns=["BI1", "BI2", "BI3", "BI4"])]
df_X = pd.concat(arr_df, axis=1).astype(int)

# Target: the BI1 item, cast to int.
df_Y = utaut_fac_df["BI1"].astype(int)

#### Moderated Variables

In [8]:
# Spearman's rank correlation is used because df_Y is an ordinal variable.
# Compute the p-value of every demographic column against the target ...
p_values = [(col, spearmanr(values, df_Y)[1]) for col, values in mod_fac_df.items()]

# ... and keep only the columns whose p-value is at most 0.05.
arr_list = [(col, p_val) for col, p_val in p_values if p_val <= 0.05]

tmp_df = pd.DataFrame(arr_list, columns=["Moderated Variable", "P-Value"])

# NOTE(review): Styler.hide_index() is deprecated since pandas 1.4 in favour
# of Styler.hide(axis="index"); revisit when the pandas version is bumped.
tmp_df.style.hide_index()

Moderated Variable,P-Value
Gender,0.014702
Work Position,0.002541


### CFS

In [9]:
# Candidate predictors: demographics plus all questionnaire items except BI*.
tmp_Y = df_Y
arr_df = [mod_fac_df, utaut_fac_df.drop(columns=["BI1", "BI2", "BI3", "BI4"])]
tmp_X = pd.concat(arr_df, axis=1)

# CFS comes from Utilities.CFS and is given spearmanr as its correlation
# function. NOTE(review): presumably correlation-based feature selection -
# confirm against the helper's implementation.
feature_set = CFS(tmp_X, tmp_Y, spearmanr)

feature_set

['PE1', 'AT1', 'FC4', 'SE4', 'AX4', 'T1', 'T2', 'T4']

### Weka Based Calculation

In [22]:
# The chi-square *scorer* is imported explicitly under an alias. The only
# visible binding of the bare name `chi2` is `scipy.stats.chi2`, which is a
# probability distribution: calling it as chi2(X, y) does not return a
# (scores, p-values) pair, so this cell only worked when a star import
# happened to shadow the name.
# NOTE(review): assumes the shadowing function was sklearn's chi2 - confirm.
from sklearn.feature_selection import chi2 as sklearn_chi2

arr_df = [mod_fac_df, utaut_fac_df.drop(["BI1", "BI2", "BI3", "BI4"], axis = 1)]

tmp_X = pd.concat(arr_df, axis = 1)
tmp_Y = df_Y

tmp_df = pd.DataFrame()

# Chi Square: one score per column of tmp_X (the p-values are not used).
corr, p_val = sklearn_chi2(tmp_X, tmp_Y)
tmp_df["Chi-Square"] = corr
# Scores are negated so that rank 1 is the highest score. rankdata averages
# tied ranks by default and astype(int) then truncates them.
tmp_df["Chi-Square (Rank)"] = [*stats.rankdata(tmp_df["Chi-Square"] * -1).astype(int)]

# Information Gain: computed column by column against the target.
info_gain_arr = [information_gain(tmp_X.loc[:, col], tmp_Y) for col in tmp_X.columns]

tmp_df["Info Gain"] = np.array(info_gain_arr)
tmp_df["Info Gain (Rank)"] = [*stats.rankdata(tmp_df["Info Gain"] * -1).astype(int)]

# ReliefF Algorithm (helper from Utilities.reliefF)
tmp_df["ReliefF"] = reliefF(tmp_X, tmp_Y)
tmp_df["ReliefF (Rank)"] = [*stats.rankdata(tmp_df["ReliefF"] * -1).astype(int)]

# Rows were filled in tmp_X column order, so label them with the column names
# only after all scores are in place.
tmp_df.index = tmp_X.columns

tmp_df = tmp_df.sort_values("Chi-Square (Rank)")

In [23]:
# Display the current contents of tmp_df as the cell output.
tmp_df

Unnamed: 0,Chi-Square,Chi-Square (Rank),Info Gain,Info Gain (Rank),ReliefF,ReliefF (Rank)
T2,66.851217,1,0.668622,4,6737.7459,31
T1,64.886247,2,0.74778,1,7500.583892,16
AT1,63.584937,3,0.697494,2,8446.916456,7
FC2,60.895975,4,0.58834,10,6970.816418,25
PE1,59.070364,5,0.628796,7,8304.271269,9
EE2,58.100144,6,0.501494,20,6810.772602,30
EE3,56.785521,7,0.541375,14,7210.201191,21
T3,56.684381,8,0.655675,5,6820.147232,28
EE4,56.108935,9,0.522568,17,7860.855077,11
AT2,54.604721,10,0.520198,18,7163.301746,22


### Wrapper Based Methods

In [44]:
# Candidate predictors: demographics plus all questionnaire items except BI*.
tmp_Y = df_Y
arr_df = [mod_fac_df, utaut_fac_df.drop(columns=["BI1", "BI2", "BI3", "BI4"])]
tmp_X = pd.concat(arr_df, axis=1)

# Wrapper-based selection driven by a decision tree.
# NOTE(review): the 8 is presumably the number of features to keep - confirm
# in Utilities.backward_elimination.
model = DecisionTreeClassifier()

# This rebinds `feature_set`, replacing the list produced by the CFS cell.
feature_set = backward_selection(tmp_X, tmp_Y, model, 8)

feature_set

['PE2', 'EE1', 'EE4', 'AT1', 'SE1', 'T1', 'T3', 'T4']

#### Remove Features that are not significant with Target Variable

In [None]:
# Spearman p-value of every predictor column against the target.
p_values = ((col, spearmanr(df_X.loc[:, col], df_Y)[1]) for col in df_X.columns)

# Threshold => keep only variables with a p-value of at most 0.05,
# sorted ascending by p-value.
arr_list = sorted(
    (pair for pair in p_values if pair[1] <= 0.05),
    key=lambda pair: pair[1],
)

# `arr_df` is rebound here from a list of frames to the table of significant
# variables; the model cells below select columns via arr_df["Variables"].
arr_df = pd.DataFrame(arr_list, columns=["Variables", "P Value"])

# NOTE(review): Styler.hide_index() is deprecated since pandas 1.4 in favour
# of Styler.hide(axis="index"); revisit when the pandas version is bumped.
arr_df.style.hide_index()

### Model Object

In [None]:
class ModelObj:
    """Plain record bundling a model with the metrics computed for it.

    Every constructor argument is stored unchanged on the attribute of the
    same name: ``model``, ``name``, ``accuracy``, ``clf_report``,
    ``confusion_matrix`` and ``mcc``.
    """

    def __init__(self, model, name, accuracy, clf_report, confusion_matrix, mcc):
        # Identity of the entry.
        self.model, self.name = model, name
        # Evaluation results.
        self.accuracy, self.clf_report = accuracy, clf_report
        self.confusion_matrix, self.mcc = confusion_matrix, mcc

In [None]:
def create_ModelObj(model, name, X, Y, class_arr, random_state=None):
    """Fit `model` on a hold-out split and bundle its evaluation metrics.

    The data is split 80/20 with ``train_test_split``. `model` is fitted in
    place on the training part and every metric is computed on the held-out
    part.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``.
    name : label stored on the returned object.
    X, Y : features and target handed to ``train_test_split``.
    class_arr : sequence of display names; entry ``i`` names class code ``i``
        (class codes are therefore expected to be ``0 .. len(class_arr) - 1``).
    random_state : optional seed forwarded to ``train_test_split``. The
        default ``None`` keeps the original unseeded behaviour.

    Returns
    -------
    ModelObj
        Holds the fitted model, `name`, accuracy in percent, the
        classification report (one row per class / average), the confusion
        matrix as a labelled DataFrame, and the Matthews correlation
        coefficient.
    """
    # Train Test Split
    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, test_size=0.2, random_state=random_state
    )

    # Train Model and predict the held-out rows
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Accuracy (percent)
    acc_score = accuracy_score(y_test, y_pred) * 100.0

    # Classification Report: its keys are the stringified class codes, so
    # renaming by code stays correct even if a class is absent from the split.
    code_to_name = {str(code): label for code, label in enumerate(class_arr)}
    clf_report = pd.DataFrame(classification_report(y_test, y_pred, output_dict=True))
    clf_report = clf_report.rename(code_to_name, axis=1).T

    # Confusion Matrix: the full list of class codes is passed explicitly.
    # Without `labels`, a class missing from the split shrinks the matrix and
    # a positional relabel would attach the wrong names to rows and columns.
    class_codes = list(range(len(class_arr)))
    confusion_matrix_df = pd.DataFrame(
        confusion_matrix(y_test, y_pred, labels=class_codes),
        index=list(class_arr),
        columns=list(class_arr),
    )

    # Matthew Correlation Coefficient
    mcc = matthews_corrcoef(y_test, y_pred)

    return ModelObj(model, name, acc_score, clf_report, confusion_matrix_df, mcc)

In [None]:
# Registry of evaluated models: the cells below store the ModelObj returned by
# create_ModelObj under the model's display name.
model_dict = {}

### Decision Tree

#### Without Feature Selection

In [None]:
name = "Decision Tree"
model = DecisionTreeClassifier()

# Predictors are restricted to the columns listed in arr_df["Variables"].
tmp_X, tmp_Y = df_X.loc[:, arr_df["Variables"]], df_Y

# NOTE(review): get_acc_score_kcv presumably returns a k-fold cross-validated
# accuracy in percent - confirm in Utilities.accuracy.
acc_score = get_acc_score_kcv(tmp_X, tmp_Y, model)
print(f"Accuracy Score: {round(acc_score, 2)}%")

# Refit on a hold-out split and register the model with its metrics.
model_dict[name] = create_ModelObj(
    model,
    name,
    tmp_X,
    tmp_Y,
    ["Strongly Disagree", "Disagree", "Neutral", "Agree", "Strongly Agree"],
)

#### Decision Tree Visualization

In [None]:
# The tree stored under "Decision Tree" was fitted inside create_ModelObj on
# the training part of its hold-out split. tmp_X / tmp_Y are whatever the most
# recently executed model cell left behind.
fitted_tree = model_dict["Decision Tree"].model

viz = dtreeviz(
    fitted_tree,
    tmp_X,
    tmp_Y,
    feature_names=tmp_X.columns,
    class_names=["Strongly Disagree", "Disagree", "Neutral", "Agree", "Strongly Agree"],
    fancy=False,
)

In [None]:
# Show the decision-tree visualization object as the cell output.
viz

#### With Feature Selection (Wrapper Based Method)

In [None]:
name = "Decision Tree (CFS)"
model = DecisionTreeClassifier()

# NOTE(review): `feature_set` is assigned by both the CFS cell and the
# backward-selection cell, so it holds whichever ran last - confirm that it
# matches the "(CFS)" label used here.
tmp_X, tmp_Y = df_X.loc[:, feature_set], df_Y

acc_score = get_acc_score_kcv(tmp_X, tmp_Y, model)
print(f"Accuracy Score: {round(acc_score, 2)}%")

# Refit on a hold-out split and register the model with its metrics.
model_dict[name] = create_ModelObj(
    model,
    name,
    tmp_X,
    tmp_Y,
    ["Strongly Disagree", "Disagree", "Neutral", "Agree", "Strongly Agree"],
)

### Random Forest (Bagging Method)

#### Without Feature Selection

In [None]:
name = "Random Forest"
model = RandomForestClassifier()

# Predictors are restricted to the columns listed in arr_df["Variables"].
tmp_X, tmp_Y = df_X.loc[:, arr_df["Variables"]], df_Y

acc_score = get_acc_score_kcv(tmp_X, tmp_Y, model)
print(f"Accuracy Score: {round(acc_score, 2)}%")

# Refit on a hold-out split and register the model with its metrics.
model_dict[name] = create_ModelObj(
    model,
    name,
    tmp_X,
    tmp_Y,
    ["Strongly Disagree", "Disagree", "Neutral", "Agree", "Strongly Agree"],
)

#### With Feature Selection (CFS Based)

In [None]:
name = "Random Forest (CFS)"
model = RandomForestClassifier()

# NOTE(review): `feature_set` is assigned by both the CFS cell and the
# backward-selection cell, so it holds whichever ran last - confirm that it
# matches the "(CFS)" label used here.
tmp_X, tmp_Y = df_X.loc[:, feature_set], df_Y

acc_score = get_acc_score_kcv(tmp_X, tmp_Y, model)
print(f"Accuracy Score: {round(acc_score, 2)}%")

# Refit on a hold-out split and register the model with its metrics.
model_dict[name] = create_ModelObj(
    model,
    name,
    tmp_X,
    tmp_Y,
    ["Strongly Disagree", "Disagree", "Neutral", "Agree", "Strongly Agree"],
)

### XGBoost (Boosting Method)

#### Without Feature Selection

In [None]:
# Predictors are restricted to the columns listed in arr_df["Variables"].
tmp_X = df_X.loc[:, arr_df["Variables"]]
tmp_Y = df_Y

# The target has five classes, so the multiclass error rate ('merror') is the
# matching evaluation metric; 'error' is the binary-classification metric.
model = XGBClassifier(eval_metric='merror', use_label_encoder=False)
name = "XGBoost"

acc_score = get_acc_score_kcv(tmp_X, tmp_Y, model)

print(f"Accuracy Score: {round(acc_score, 2)}%")

# Refit on a hold-out split and register the model with its metrics.
model_dict[name] = create_ModelObj(model, name, tmp_X, tmp_Y, ["Strongly Disagree", "Disagree", "Neutral", "Agree", "Strongly Agree"])

#### With Feature Selection (CFS Based)

In [None]:
# NOTE(review): `feature_set` is assigned by both the CFS cell and the
# backward-selection cell, so it holds whichever ran last - confirm that it
# matches the "(CFS)" label used here.
tmp_X = df_X.loc[:, feature_set]
tmp_Y = df_Y

# The target has five classes, so the multiclass error rate ('merror') is the
# matching evaluation metric; 'error' is the binary-classification metric.
model = XGBClassifier(eval_metric='merror', use_label_encoder=False)
name = "XGBoost (CFS)"

acc_score = get_acc_score_kcv(tmp_X, tmp_Y, model)

print(f"Accuracy Score: {round(acc_score, 2)}%")

# Refit on a hold-out split and register the model with its metrics.
model_dict[name] = create_ModelObj(model, name, tmp_X, tmp_Y, ["Strongly Disagree", "Disagree", "Neutral", "Agree", "Strongly Agree"])

## Plot Graph

In [None]:
# Train Test Split
X_train, X_test, y_train, y_test = train_test_split(df_X.loc[:, arr_df["Variables"]], df_Y, test_size = 0.2)

### Result

In [None]:
# One (name, classification report, MCC) triple per registered model, in
# registration order; the helper is asked for the "weighted avg" row.
m_arr = [(name, entry.clf_report, entry.mcc) for name, entry in model_dict.items()]
cmp_result_tbl(m_arr, "weighted avg")

### Precision

In [None]:
# Pair every registered model name with its classification report, extract
# the precision figures and plot them. The chart title previously read
# "Precison Comparison" (typo).
clf_report_arr = [(name, model_dict[name].clf_report) for name in model_dict]
tmp_df = get_df_type(clf_report_arr, "Precision")
pfr_graph(tmp_df, "Model", "Score", "Precision Comparison")

### Recall

In [None]:
# Pair every registered model name with its classification report, extract
# the recall figures and plot them.
clf_report_arr = [(name, entry.clf_report) for name, entry in model_dict.items()]
tmp_df = get_df_type(clf_report_arr, "Recall")
pfr_graph(tmp_df, "Model", "Score", "Recall Comparison")

### F1-Score

In [None]:
# Pair every registered model name with its classification report, extract
# the F1 figures and plot them.
clf_report_arr = [(name, entry.clf_report) for name, entry in model_dict.items()]
tmp_df = get_df_type(clf_report_arr, "F1-Score")
pfr_graph(tmp_df, "Model", "Score", "F1-Score Comparison")

### Accuracy

In [None]:
# Hold-out accuracy (percent) of every registered model, in registration order.
acc_arr = [(key, entry.accuracy) for key, entry in model_dict.items()]
acc_graph(acc_arr, "Accuracy Score Comparison", "Accuracy Score", "Types of Model")