In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier

import time
import datetime

def load_ccfraud(path="dataset/fraudTrain.csv", target_col="is_fraud"):
    """Read the fraud CSV and split its columns into features and target.

    Args:
        path: Location of the CSV file. Defaults to the training set this
            notebook was written against.
        target_col: Name of the label column. Every other column is treated
            as a feature.

    Returns:
        A 5-tuple ``(ds, data_cols, data, target_col, target)``:
        the full DataFrame, the list of feature column names, a list with one
        Series per feature column (same order as ``data_cols``), the target
        column name, and the target Series.

    Raises:
        KeyError: if ``target_col`` is not a column of the file.
        Whatever ``pandas.read_csv`` raises for a missing or unreadable file.
    """
    import os  # local import: only needed to report the file name

    ds = pd.read_csv(path)
    print("File \"" + os.path.basename(path) + "\" loaded...")

    data_cols = [col for col in ds.columns if col != target_col]

    return ds, data_cols, [ds[col] for col in data_cols], target_col, ds[target_col]

# Load the dataset once. The names bound here are module-level state:
# removeFeature reads `dataframe` and `data_cols` directly, and the
# preprocessing statements below rewrite columns of `dataframe`.
# NOTE(review): `data` holds the Series objects as they were at load time;
# it presumably does not reflect later column reassignments - confirm before use.
dataframe, data_cols, data, target_col, target = load_ccfraud()



# Filled by labelEncode: feature name -> the LabelEncoder fitted on that column.
labelEncoderTransformTypes = {}

# constants
const_transac_date_format = "%Y-%m-%d %H:%M:%S"
# const_dob_format = "%Y-%m-%d"
def getTransactionDateUnix(string):
    """Convert a transaction timestamp string into seconds since the epoch.

    The string is parsed with ``const_transac_date_format`` and handed to
    ``time.mktime``, which interprets the fields as local time, so the
    result depends on the time zone of the machine running the notebook.

    Returns a float. Raises ValueError if the string does not match the format.
    """
    parsed = datetime.datetime.strptime(string, const_transac_date_format)
    local_fields = parsed.timetuple()
    return time.mktime(local_fields)

def getDobFormat(string):
    """Collapse a dash-separated date string into one integer.

    Example: "1988-03-09" -> 19880309. Raises ValueError if what remains
    after removing the dashes is not a valid integer literal.
    """
    digits = "".join(string.split("-"))
    return int(digits)





def labelEncode(feature: str, dataframe, map):
    """Replace one column of `dataframe` with integer labels, in place.

    A fresh LabelEncoder is fitted on ``dataframe[feature]``, the column is
    overwritten with the encoded values, and the fitted encoder is stored
    under ``map[feature]`` so it can be looked up later.
    """
    column = dataframe[feature]
    encoder = LabelEncoder().fit(column)  # fit() returns the encoder itself
    dataframe[feature] = encoder.transform(column)
    map[feature] = encoder

def removeFeature(feature: str, frame=None, cols=None):
    """Drop a feature column from the frame and from the feature-name list.

    Args:
        feature: Name of the column to remove.
        frame: DataFrame to modify in place. Defaults to the module-level
            ``dataframe``.
        cols: List of feature names to update in place. Defaults to the
            module-level ``data_cols``.

    Raises:
        KeyError: if ``feature`` is not a column of the frame (the name list
            is left untouched in that case).
        ValueError: if ``feature`` is not present in the name list.
    """
    target_frame = dataframe if frame is None else frame
    target_cols = data_cols if cols is None else cols

    # DataFrame.drop returns a new frame by default; without inplace=True the
    # result was discarded and the column silently stayed in the frame.
    target_frame.drop(columns=feature, inplace=True)
    target_cols.remove(feature)


# Categorical columns that are replaced by integer labels; the fitted encoders
# are collected in labelEncoderTransformTypes.
for encoded_feature in ("merchant", "category", "gender", "state"):
    labelEncode(encoded_feature, dataframe, labelEncoderTransformTypes)

# Columns excluded from the feature list.
for dropped_feature in ("first", "last", "street", "city", "trans_num", "job"):
    removeFeature(dropped_feature)

# Turn the two date-like string columns into plain numbers.
dataframe["trans_date_trans_time"] = list(map(getTransactionDateUnix, dataframe["trans_date_trans_time"]))
dataframe["dob"] = list(map(getDobFormat, dataframe["dob"]))

# Report the dtype of every remaining feature, names padded to 50 characters.
for col_name in data_cols:
    print(col_name.ljust(50), "\t", dataframe[col_name].dtype)

File "fraudTrain.csv" loaded...
#                                                  	 int64
trans_date_trans_time                              	 float64
cc_num                                             	 int64
merchant                                           	 int32
category                                           	 int32
amt                                                	 float64
gender                                             	 int32
state                                              	 int32
zip                                                	 int64
lat                                                	 float64
long                                               	 float64
city_pop                                           	 int64
dob                                                	 int64
unix_time                                          	 int64
merch_lat                                          	 float64
merch_long                                         	 float64


In [21]:
# calculate Spearman's rank correlation between each feature and the target
from numpy.random import seed
from scipy.stats import spearmanr
seed(1)

# Absolute Spearman correlation of every feature column against the target,
# keyed as "<feature>:<target>".
# spearmanr is called directly here so that this cell does not rely on a
# helper function that is only defined in a later cell of the notebook.
correlations = {}

for one in data_cols:
    stat, _ = spearmanr(dataframe[target_col], dataframe[one])
    correlations[one + ":" + target_col] = abs(stat)

# Strongest correlation first; dicts keep insertion order.
correlations = dict(sorted(correlations.items(), key=lambda item: item[1], reverse=True))

for pair in correlations:
    print(pair, (" " * (70 - len(pair))), correlations[pair])


amt:is_fraud                                                            0.08792435760527545
category:is_fraud                                                       0.019713699518539417
dob:is_fraud                                                            0.011320164297940579
gender:is_fraud                                                         0.007641534190320536
#:is_fraud                                                              0.004767475532650962
unix_time:is_fraud                                                      0.004767475193174021
trans_date_trans_time:is_fraud                                          0.004767185918075516
long:is_fraud                                                           0.003209660840481879
merch_long:is_fraud                                                     0.0032052657928989636
zip:is_fraud                                                            0.002345518380890518
state:is_fraud                                                        

In [7]:
def getCorrelation(x_arr, y_arr):
    """Return the Spearman correlation statistic of the two inputs.

    Only the statistic is returned; the p-value computed by ``spearmanr``
    is discarded.
    """
    return spearmanr(x_arr, y_arr)[0]