# Title: CORRELATION MODEL IN THE ADOPTION OF E-PAYMENT SERVICES

## Load Libraries

In [1]:
import time
import numpy as np
import pandas as pd

from scipy.stats import pearsonr, spearmanr, kendalltau, pointbiserialr
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, matthews_corrcoef

## Load Custom Made Libraries

In [2]:
from Utilities.CFS import *
from Utilities.accuracy import *
from Utilities.corr_matrix import *
from Utilities.utils import *

from Visualization.model_graph import *
from Visualization.network_graph import *

## Load Dataset

In [3]:
# Load the dataset CSV into a DataFrame; the path is relative to the
# notebook's working directory.
df = pd.read_csv("../Dataset/train_test_df.csv")

In [4]:
# Show the loaded frame as the cell output (rendered truncated for large frames).
df

Unnamed: 0,Q2. Gender,PE1: I find Blockchain / Cryptocurrency Coin useful in me.,PE2: Using Blockchain / Cryptocurrency Coin enables me to accomplish tasks more quickly.,PE3: Using Blockchain / Cryptocurrency Coin increases my productivity.,PE4: Using Blockchain / Cryptocurrency Coin increases my chances of getting more choices.,EE1: My interaction with Blockchain / Cryptocurrency Coin is clear and understandable.,EE2: It is easy for me to become skillful at using Blockchain / Cryptocurrency Coin.,EE3: I find Blockchain / Cryptocurrency Coin easy to use.,EE4: Learning to operate Blockchain / Cryptocurrency Coin is easy for me.,AT1: Using Blockchain / Cryptocurrency Coin is a good idea.,...,AX3: I hesitate to use Blockchain / Cryptocurrency Coin for fear of making mistakes I cannot correct.,AX4: Blockchain / Cryptocurrency Coin is somewhat intimidating to me.,T1: I feel comfortable using Blockchain / Cryptocurrency Coin.,T2: I feel the reliability to use Blockchain / Cryptocurrency Coin.,T3: I am glad about the service quality to use Blockchain / Cryptocurrency Coin.,T4: Blockchain / Cryptocurrency Coin Integrity is vital to me.,BI1: I intend to use Blockchain / Cryptocurrency Coin.,BI2: I plan to invest Blockchain / Cryptocurrency Coin.,BI3: I plan to have Blockchain / Cryptocurrency Coin as an E-Wallet.,BI4: I plan to use Blockchain / Cryptocurrency Coin for E-payment transaction to buy stuff.
0,Female,2,2,2,2,3,3,3,3,4,...,4,3,2,1,2,3,3,3,3,3
1,Female,1,1,1,1,1,1,1,1,3,...,4,4,2,2,2,2,2,2,2,2
2,Female,4,5,4,5,3,2,2,2,4,...,4,3,3,3,3,4,3,4,4,4
3,Female,3,3,4,4,3,4,3,4,5,...,4,4,3,4,4,4,4,5,4,4
4,Male,4,3,3,3,4,3,3,3,3,...,3,4,3,3,3,3,3,3,3,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
247,Male,3,3,3,3,3,3,3,3,3,...,3,3,3,3,3,3,3,3,3,3
248,Male,3,3,3,3,3,3,3,3,3,...,4,3,3,3,3,3,3,3,3,3
249,Female,3,3,3,3,3,3,3,3,3,...,3,3,3,3,3,3,3,3,3,3
250,Male,4,4,4,4,4,4,4,4,4,...,3,3,4,4,4,4,4,4,4,4


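1. Split the dataset into df_X (features) and df_Y (target)
2. Convert the target variable to a nominal (integer-coded) variable
3. Feature selection with CFS
4. Oversample imbalanced classes with SMOTE
5. Train and test the model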

### Split Dataset

In [5]:
# The first column is taken as the target; all remaining columns are features.
df_Y, df_X = df.iloc[:, 0], df.iloc[:, 1:]

### Convert Nominal

In [6]:
# Remember the distinct target values in order of first appearance; this list
# is reused later as the class labels for reports and plots.
df_Y_unique_arr = df_Y.unique().tolist()

# Encode the target through convert_nominal and store it as integers.
# NOTE(review): df_Y is overwritten here, so re-running this cell would rebuild
# df_Y_unique_arr from the already-encoded integers instead of the raw labels.
df_Y = convert_nominal(df_Y, df_Y_unique_arr).astype(int)

### Remove Features That Are Not Significantly Associated with the Target Variable

In [7]:
from scipy.stats import chi2, chi2_contingency

# Chi-square test of independence between each feature and the target.
# A feature is retained when its statistic reaches the critical value for the
# chosen confidence level AND its p-value does not exceed alpha.
prob = 0.95
alpha = 1 - prob

arr_list = []

for col in df_X.columns:
    # Contingency table: feature levels (rows) x target classes (columns).
    chi_df = pd.crosstab(df_X[col], df_Y)
    stat, p, dof, expected = chi2_contingency(chi_df)

    # Critical value of the chi-square distribution for this table's dof.
    critical = chi2.ppf(prob, dof)

    if abs(stat) >= critical and p <= alpha:
        arr_list.append((col, stat))

# Order the retained features by ascending chi-square statistic.
arr_list.sort(key=lambda pair: pair[1])

arr_df = pd.DataFrame(arr_list, columns=["Variables", "Chi-Square Value"])

In [8]:
# Restrict df_X to the chi-square-selected columns only when more than four
# features passed the test; otherwise all columns are kept. df_Y is unchanged.
if len(arr_df) > 4:
    df_X = df_X.loc[:, arr_df["Variables"]]

### CFS

In [9]:
# Correlation functions handed to the CFS helpers in the cells below.
func_arr = [pearsonr, spearmanr, pointbiserialr]

# Working aliases for the current feature matrix and target.
tmp_X, tmp_Y = df_X, df_Y

# Collects one result entry per feature-set strategy (filled in below).
cfs_dict = dict()

#### Feature Intersection

In [10]:
# Build the "intersection" CFS entry: the feature set returned by
# CFS_Intersection plus the lower-triangular Pearson matrix of those features.
# (Presumably the features selected by every function in func_arr - confirm
# against Utilities.CFS.)
corr_dict = {}

name = "inter_feature_set"
corr_dict["feature_set"] = []
corr_dict["corr_dict"] = {}

inter_feature_set = CFS_Intersection(tmp_X, tmp_Y, func_arr)

print(inter_feature_set)

if len(inter_feature_set) > 0:
    # Always index with a list so the selection works whatever collection
    # type the helper returns (pandas 2.x rejects set indexers).
    tmp_df = df_X.loc[:, list(inter_feature_set)]

    # Keep only the short item code before the colon (e.g. "SE1") as label.
    tmp_df.columns = [col.split(":")[0] for col in tmp_df.columns]
    corr_df = create_corr_metric_matrix(tmp_df, pearsonr)

    # Keep the lower triangle only. The built-in `bool` is used because the
    # `np.bool` alias was removed in NumPy 1.24 and raises AttributeError there.
    corr_df = corr_df.where(np.tril(np.ones(corr_df.shape)).astype(bool))

    # NaN is not valid JSON; store masked cells as None (serialised as null).
    corr_df = corr_df.replace({np.nan: None})

    display(corr_df)

    corr_dict["feature_set"] = list(inter_feature_set)
    corr_dict["corr_dict"] = corr_df.to_dict()

cfs_dict[name] = corr_dict

['SE1: I can complete a job or task using Blockchain / Cryptocurrency Coin , if there is no one around to tell me what to do.', 'EE1: My interaction with Blockchain / Cryptocurrency Coin is clear and understandable.', 'FC3: Blockchain / Cryptocurrency Coin is compatible with other systems I use.']


Unnamed: 0,SE1,EE1,FC3
SE1,1.0,,
EE1,0.671106,1.0,
FC3,0.536561,0.58373,1.0


#### Feature Non Intersection

In [11]:
# Build the "non-intersection" CFS entry: the feature set returned by
# CFS_Non_Intersection plus the lower-triangular Pearson matrix of those
# features. The entry keeps its empty defaults when no feature is returned.
corr_dict = {}

name = "non_inter_feature_set"
corr_dict["feature_set"] = []
corr_dict["corr_dict"] = {}

non_inter_feature_set = CFS_Non_Intersection(tmp_X, tmp_Y, func_arr)

print(non_inter_feature_set)

if len(non_inter_feature_set) > 0:
    # Always index with a list so the selection works whatever collection
    # type the helper returns (pandas 2.x rejects set indexers).
    tmp_df = df_X.loc[:, list(non_inter_feature_set)]

    # Keep only the short item code before the colon as the label.
    tmp_df.columns = [col.split(":")[0] for col in tmp_df.columns]
    corr_df = create_corr_metric_matrix(tmp_df, pearsonr)

    # Keep the lower triangle only. The built-in `bool` is used because the
    # `np.bool` alias was removed in NumPy 1.24 and raises AttributeError there.
    corr_df = corr_df.where(np.tril(np.ones(corr_df.shape)).astype(bool))

    # NaN is not valid JSON; store masked cells as None (serialised as null).
    corr_df = corr_df.replace({np.nan: None})

    display(corr_df)

    corr_dict["feature_set"] = list(non_inter_feature_set)
    corr_dict["corr_dict"] = corr_df.to_dict()

cfs_dict[name] = corr_dict

[]


#### Feature Union

In [12]:
# Build the "union" CFS entry: the feature set returned by CFS_Union plus the
# lower-triangular Pearson matrix of those features.
name = "union_feature_set"

corr_dict = {}
corr_dict["feature_set"] = []
corr_dict["corr_dict"] = {}

union_feature_set = CFS_Union(tmp_X, tmp_Y, func_arr)

print(union_feature_set)

if len(union_feature_set) > 0:
    # CFS_Union returns a set (see the printed output). pandas 2.x raises a
    # TypeError for set indexers, so materialise it once as a list and use the
    # same ordering for both the matrix and the stored feature list.
    union_columns = list(union_feature_set)
    tmp_df = df_X.loc[:, union_columns]

    # Keep only the short item code before the colon (e.g. "SE1") as label.
    tmp_df.columns = [col.split(":")[0] for col in tmp_df.columns]
    corr_df = create_corr_metric_matrix(tmp_df, pearsonr)

    # Keep the lower triangle only. The built-in `bool` is used because the
    # `np.bool` alias was removed in NumPy 1.24 and raises AttributeError there.
    corr_df = corr_df.where(np.tril(np.ones(corr_df.shape)).astype(bool))

    # NaN is not valid JSON; store masked cells as None (serialised as null).
    corr_df = corr_df.replace({np.nan: None})

    display(corr_df)

    corr_dict["feature_set"] = union_columns
    corr_dict["corr_dict"] = corr_df.to_dict()

cfs_dict[name] = corr_dict

{'SE1: I can complete a job or task using Blockchain / Cryptocurrency Coin , if there is no one around to tell me what to do.', 'FC3: Blockchain / Cryptocurrency Coin is compatible with other systems I use.', 'EE1: My interaction with Blockchain / Cryptocurrency Coin is clear and understandable.'}


Unnamed: 0,SE1,FC3,EE1
SE1,1.0,,
FC3,0.536561,1.0,
EE1,0.671106,0.58373,1.0


In [13]:
import json

# Persist every CFS entry (feature sets and correlation dictionaries) as
# pretty-printed JSON followed by a trailing newline.
with open("res_data/corr_dataFrame.json", 'w') as fout:
    json_dumps_str = json.dumps(cfs_dict, indent=4)
    fout.write(json_dumps_str + "\n")

In [14]:
# Echo the in-memory CFS results for a quick visual check against the JSON file.
print(cfs_dict)

{'inter_feature_set': {'feature_set': ['SE1: I can complete a job or task using Blockchain / Cryptocurrency Coin , if there is no one around to tell me what to do.', 'EE1: My interaction with Blockchain / Cryptocurrency Coin is clear and understandable.', 'FC3: Blockchain / Cryptocurrency Coin is compatible with other systems I use.'], 'corr_dict': {'SE1': {'SE1': 1.0, 'EE1': 0.6711056982203533, 'FC3': 0.5365611986075159}, 'EE1': {'SE1': None, 'EE1': 1.0, 'FC3': 0.5837298750514555}, 'FC3': {'SE1': None, 'EE1': None, 'FC3': 1.0}}}, 'non_inter_feature_set': {'feature_set': [], 'corr_dict': {}}, 'union_feature_set': {'feature_set': ['SE1: I can complete a job or task using Blockchain / Cryptocurrency Coin , if there is no one around to tell me what to do.', 'FC3: Blockchain / Cryptocurrency Coin is compatible with other systems I use.', 'EE1: My interaction with Blockchain / Cryptocurrency Coin is clear and understandable.'], 'corr_dict': {'SE1': {'SE1': 1.0, 'FC3': 0.5365611986075159, 'E

### SMOTE Imbalanced

In [15]:
from imblearn.over_sampling import SMOTE

# Tabulate how many samples each target class has (absolute and in percent),
# then oversample with SMOTE when the smallest class is large enough.
tmp_Y = df_Y

# Counts are sorted by encoded class value and relabelled with the original
# class names. Assumes the encoding follows the order of df_Y_unique_arr -
# TODO confirm against convert_nominal.
class_counts = tmp_Y.value_counts().sort_index().set_axis(df_Y_unique_arr)

imb_df = pd.DataFrame(index=df_Y_unique_arr)
imb_df["Count"] = class_counts
imb_df["Count (%)"] = round(imb_df["Count"] / imb_df["Count"].sum() * 100.0, 2)

# NOTE(review): this guard only checks that the smallest class has more than
# 2 * n_classes samples; it does not test whether the classes are imbalanced.
# NOTE(review): resampling happens before any train/test split, and SMOTE is
# created without a random_state, so results vary between runs.
smallest_class_size = min(imb_df["Count"])
if smallest_class_size > len(df_Y_unique_arr) * 2:
    oversample = SMOTE()
    df_X, df_Y = oversample.fit_resample(df_X, df_Y)

## Decision Tree

### Model Object

In [16]:
class ModelObj:
    """Plain record bundling a model with its evaluation artefacts.

    All arguments are stored unchanged under attributes of the same name:
    model, name, accuracy, clf_report, confusion_matrix, mcc, time_taken.
    """

    def __init__(self, model, name, accuracy, clf_report, confusion_matrix, mcc, time_taken):
        # The estimator and its display label.
        self.model, self.name = model, name
        # Evaluation artefacts.
        self.accuracy, self.clf_report = accuracy, clf_report
        self.confusion_matrix, self.mcc = confusion_matrix, mcc
        # Measured duration for producing this entry.
        self.time_taken = time_taken

In [17]:
def create_ModelObj(model, name, X, Y, class_arr, test_size=0.3, random_state=None):
    """Fit ``model`` on a train split and bundle its evaluation into a ModelObj.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``; fitted in place.
    name : label stored on the returned ModelObj.
    X, Y : feature matrix and target passed to ``train_test_split``.
    class_arr : class names; position ``i`` is used as the label for encoded
        class ``i`` in the classification report and confusion matrix.
    test_size : fraction of the data held out for testing (default 0.3,
        the value previously hard-coded).
    random_state : seed forwarded to ``train_test_split``; the default
        ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    ModelObj holding the fitted model, the score from ``get_acc_score_kcv``
    on the training split, the classification report and confusion matrix as
    DataFrames, the Matthews correlation coefficient and the CPU time spent.
    """
    # Imported here so the function does not depend on a star import
    # happening to expose train_test_split.
    from sklearn.model_selection import train_test_split

    # CPU time for split + fit + evaluation.
    start = time.process_time()

    # Train/test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, test_size=test_size, random_state=random_state
    )

    # Train model
    model.fit(X_train, y_train)

    # Predictions on the held-out split
    y_pred = model.predict(X_test)

    # Accuracy (computed by the project helper on the training split only)
    acc_score = get_acc_score_kcv(X_train, y_train, model)

    # Classification report: the report keys classes by their string form,
    # so map "0", "1", ... to the readable class names.
    report_labels = {str(ind): val for ind, val in enumerate(class_arr)}
    clf_report = pd.DataFrame(classification_report(y_test, y_pred, output_dict=True))
    clf_report = clf_report.rename(columns=report_labels).T

    # Confusion matrix: rows/columns are positional integers here.
    matrix_labels = {ind: val for ind, val in enumerate(class_arr)}
    confusion_matrix_df = pd.DataFrame(confusion_matrix(y_test, y_pred))
    confusion_matrix_df = confusion_matrix_df.rename(index=matrix_labels, columns=matrix_labels)

    # Matthews correlation coefficient
    mcc = matthews_corrcoef(y_test, y_pred)

    time_taken = time.process_time() - start

    return ModelObj(model, name, acc_score, clf_report, confusion_matrix_df, mcc, time_taken)

In [18]:
# Results of create_ModelObj, keyed by model display name (filled in below).
model_dict = {}

### Decision Tree

#### All Feature Set

In [19]:
from sklearn.tree import DecisionTreeClassifier

# Baseline: decision tree trained on every column currently in df_X.
tmp_X, tmp_Y = df_X, df_Y

name = "Decision Tree"
model = DecisionTreeClassifier()

model_dict[name] = create_ModelObj(model, name, tmp_X, tmp_Y, df_Y_unique_arr)

Decision Tree Visualization

In [20]:
# Write the full feature column names to JSON (pretty-printed, trailing newline).
columns_payload = {"columns": df_X.columns.tolist()}

with open("df_X_columns.json", 'w') as fout:
    json_dumps_str = json.dumps(columns_payload, indent=4)
    fout.write(json_dumps_str + "\n")

print("Successfully output df_X_columns.json!")

Successfully output df_X_columns.json!


In [21]:
from dtreeviz.trees import *

# Short item codes (text before the colon) serve as feature names in the plot.
viz_col = [col.split(":")[0] for col in tmp_X.columns]

# `name`, `tmp_X` and `tmp_Y` still hold the values set by the
# "All Feature Set" cell, so this renders that baseline tree.
fitted_tree = model_dict[name].model

viz = dtreeviz(
    fitted_tree,
    tmp_X,
    tmp_Y,
    feature_names=viz_col,
    class_names=df_Y_unique_arr,
    fancy=True,
)

viz.save("res_data/decision_tree_viz.svg")



#### Intersection Feature Set

In [22]:
# Train a tree on the intersection feature set; skipped when that set is empty.
if len(inter_feature_set) > 0:
    name = "Decision Tree (IFS)"
    model = DecisionTreeClassifier()

    tmp_X, tmp_Y = df_X.loc[:, inter_feature_set], df_Y

    model_dict[name] = create_ModelObj(model, name, tmp_X, tmp_Y, df_Y_unique_arr)

#### Non-Intersection Feature Set

In [23]:
# Train a tree on the non-intersection feature set; skipped when that set is empty.
if len(non_inter_feature_set) > 0:
    name = "Decision Tree (NIFS)"
    model = DecisionTreeClassifier()

    tmp_X, tmp_Y = df_X.loc[:, non_inter_feature_set], df_Y

    model_dict[name] = create_ModelObj(model, name, tmp_X, tmp_Y, df_Y_unique_arr)

#### Union Feature Set

In [24]:
# Train a tree on the union feature set; skipped when that set is empty.
if len(union_feature_set) > 0:
    # union_feature_set is a set (see its printed output); pandas 2.x raises
    # a TypeError for set indexers, so convert it to a list before selecting.
    tmp_X = df_X.loc[:, list(union_feature_set)]
    tmp_Y = df_Y

    model = DecisionTreeClassifier()
    name = "Decision Tree (UFS)"

    model_dict[name] = create_ModelObj(model, name, tmp_X, tmp_Y, df_Y_unique_arr)

### Result

In [25]:
# One tuple per trained model: (name, classification report, accuracy, MCC, time).
m_arr = [
    (label, entry.clf_report, entry.accuracy, entry.mcc, entry.time_taken)
    for label, entry in model_dict.items()
]

# Comparison table built by the project helper from the "weighted avg" rows.
res_df = cmp_result_tbl(m_arr, "weighted avg")

# Persist the table as pretty-printed JSON followed by a trailing newline.
with open("res_data/model_result_dataFrame.json", 'w') as fout:
    json_dumps_str = json.dumps(res_df.to_dict(), indent=4)
    fout.write(json_dumps_str + "\n")

### Precision

In [26]:
# Compare precision across all trained models and export the chart.
clf_report_arr = [(name, model_dict[name].clf_report) for name in model_dict]
tmp_df = get_df_type(clf_report_arr, "Precision")
# Chart title previously read "Precison Comparison" (typo).
fig = pfr_graph(tmp_df, "Model", "Score", "Precision Comparison")
fig.write_image("res_data/precision-graph.jpeg")

### Recall

In [27]:
# Compare recall across all trained models and export the chart.
clf_report_arr = [(label, entry.clf_report) for label, entry in model_dict.items()]
tmp_df = get_df_type(clf_report_arr, "Recall")

fig = pfr_graph(tmp_df, "Model", "Score", "Recall Comparison")
fig.write_image("res_data/recall-graph.jpeg")

### F1-Score

In [28]:
# Compare F1-score across all trained models and export the chart.
clf_report_arr = [(label, entry.clf_report) for label, entry in model_dict.items()]
tmp_df = get_df_type(clf_report_arr, "F1-Score")

fig = pfr_graph(tmp_df, "Model", "Score", "F1-Score Comparison")
fig.write_image("res_data/f1-score-graph.jpeg")

### Accuracy

In [29]:
# Pair every model name with its stored accuracy score, then chart and export.
acc_arr = [(label, entry.accuracy) for label, entry in model_dict.items()]

fig = acc_graph(acc_arr, "Accuracy Score Comparison", "Accuracy Score", "Types of Model")
fig.write_image("res_data/acc_score.jpeg")