In [None]:
import numpy as np
import pandas as pd
import os
# Print the full path of every file found under /kaggle/input.
# NOTE(review): the dataset itself is read from /content further down, not from
# this directory — confirm which location is the intended one.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
from matplotlib import pyplot as plt
from sklearn.model_selection import cross_val_score

from sklearn import metrics
from collections import Counter

In [None]:
# Load the training transactions; the first CSV column is used as the row index.
# NOTE(review): hard-coded absolute path — adjust it when running in another environment.
train_data = pd.read_csv("/content/fraudTrain.csv", index_col=0)

In [None]:
train_data.columns

Index(['trans_date_trans_time', 'cc_num', 'merchant', 'category', 'amt',
       'first', 'last', 'gender', 'street', 'city', 'state', 'zip', 'lat',
       'long', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time', 'merch_lat',
       'merch_long', 'is_fraud'],
      dtype='object')

In [None]:
train_data.head()

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,Moravian Falls,...,36.0788,-81.1781,3495.0,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376000.0,36.011293,-82.048315,0.0
1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,Orient,...,48.8878,-118.2105,149.0,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376000.0,49.159047,-118.186462,0.0
2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,Malad City,...,42.1808,-112.262,4154.0,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376000.0,43.150704,-112.154481,0.0
3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,Boulder,...,46.2306,-112.1138,1939.0,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376000.0,47.034331,-112.561071,0.0
4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,Doe Hill,...,38.4207,-79.4629,99.0,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376000.0,38.674999,-78.632459,0.0


In [None]:
# Work on a copy so the columns added below do not modify the loaded frame.
subset_data = train_data.copy()

In [None]:
from math import radians, cos, sin, asin, sqrt
def calculate_distance(row):
    """Return the haversine (great-circle) distance in kilometres for one record.

    row: mapping-like record (for example a DataFrame row) that provides the
        first point as "lat"/"long" and the second point as
        "merch_lat"/"merch_long", all in degrees.

    Returns the distance as a float, computed on a sphere of radius 6371 km.
    NaN coordinates propagate to a NaN result.
    """
    lat1 = radians(row["lat"])
    lat2 = radians(row["merch_lat"])
    lon1 = radians(row["long"])
    lon2 = radians(row["merch_long"])
    dlat = lat2 - lat1
    dlon = lon2 - lon1

    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2

    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make asin raise "math domain error".
    # Explicit comparisons are used (rather than min/max) so NaN passes through.
    if a > 1.0:
        a = 1.0
    elif a < 0.0:
        a = 0.0

    c = 2 * asin(sqrt(a))
    r = 6371  # sphere radius in kilometres
    return c * r

In [None]:
# Row-wise apply (axis=1): calculate_distance is called once per row and its
# result is stored in the new "distance" column.
subset_data["distance"] = subset_data.apply(calculate_distance, axis=1)

In [None]:
subset_data.head()

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,...,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud,distance
0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,Moravian Falls,...,-81.1781,3495.0,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376000.0,36.011293,-82.048315,0.0,78.597568
1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,Orient,...,-118.2105,149.0,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376000.0,49.159047,-118.186462,0.0,30.212176
2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,Malad City,...,-112.262,4154.0,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376000.0,43.150704,-112.154481,0.0,108.206083
3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,Boulder,...,-112.1138,1939.0,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376000.0,47.034331,-112.561071,0.0,95.673231
4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,Doe Hill,...,-79.4629,99.0,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376000.0,38.674999,-78.632459,0.0,77.556744


In [None]:
import datetime
# Keep only the columns used by the following cells. The explicit .copy() makes
# the result an independent frame, so adding columns to it afterwards does not
# raise pandas' SettingWithCopyWarning.
subset_data = subset_data[["trans_date_trans_time", "dob", "amt", "city_pop", "distance", "is_fraud"]].copy()
subset_data.head()

Unnamed: 0,trans_date_trans_time,dob,amt,city_pop,distance,is_fraud
0,2019-01-01 00:00:18,1988-03-09,4.97,3495.0,78.597568,0.0
1,2019-01-01 00:00:44,1978-06-21,107.23,149.0,30.212176,0.0
2,2019-01-01 00:00:51,1962-01-19,220.11,4154.0,108.206083,0.0
3,2019-01-01 00:01:16,1967-01-12,45.0,1939.0,95.673231,0.0
4,2019-01-01 00:03:06,1986-03-28,41.96,99.0,77.556744,0.0


In [None]:
subset_data.dtypes

trans_date_trans_time     object
dob                       object
amt                      float64
city_pop                 float64
distance                 float64
is_fraud                 float64
dtype: object

In [None]:
# Parse the transaction timestamp and truncate it to midnight. dt.normalize()
# keeps the datetime64 dtype, avoiding the round trip through Python date
# objects and a second parse.
subset_data["trans_date"] = pd.to_datetime(subset_data["trans_date_trans_time"], format="%Y-%m-%d %H:%M:%S").dt.normalize()
# Parse the date-of-birth strings into datetimes.
subset_data["dob_date"] = pd.to_datetime(subset_data["dob"], format="%Y-%m-%d")
subset_data.head()

Unnamed: 0,trans_date_trans_time,dob,amt,city_pop,distance,is_fraud,trans_date,dob_date
0,2019-01-01 00:00:18,1988-03-09,4.97,3495.0,78.597568,0.0,2019-01-01,1988-03-09
1,2019-01-01 00:00:44,1978-06-21,107.23,149.0,30.212176,0.0,2019-01-01,1978-06-21
2,2019-01-01 00:00:51,1962-01-19,220.11,4154.0,108.206083,0.0,2019-01-01,1962-01-19
3,2019-01-01 00:01:16,1967-01-12,45.0,1939.0,95.673231,0.0,2019-01-01,1967-01-12
4,2019-01-01 00:03:06,1986-03-28,41.96,99.0,77.556744,0.0,2019-01-01,1986-03-28


In [None]:
subset_data.dtypes

trans_date_trans_time            object
dob                              object
amt                             float64
city_pop                        float64
distance                        float64
is_fraud                        float64
trans_date               datetime64[ns]
dob_date                 datetime64[ns]
dtype: object

In [None]:
# Age in years at transaction time. Whole days are divided by 365.25 so that
# leap days do not inflate the result (dividing by 365 overstates age by about
# one day for every four years). .dt.days is used because multiplying a
# timedelta64 by a float would truncate the year length back to 365 days.
subset_data["age"] = (subset_data["trans_date"] - subset_data["dob_date"]).dt.days / 365.25
subset_data.head()

Unnamed: 0,trans_date_trans_time,dob,amt,city_pop,distance,is_fraud,trans_date,dob_date,age
0,2019-01-01 00:00:18,1988-03-09,4.97,3495.0,78.597568,0.0,2019-01-01,1988-03-09,30.835616
1,2019-01-01 00:00:44,1978-06-21,107.23,149.0,30.212176,0.0,2019-01-01,1978-06-21,40.558904
2,2019-01-01 00:00:51,1962-01-19,220.11,4154.0,108.206083,0.0,2019-01-01,1962-01-19,56.989041
3,2019-01-01 00:01:16,1967-01-12,45.0,1939.0,95.673231,0.0,2019-01-01,1967-01-12,52.005479
4,2019-01-01 00:03:06,1986-03-28,41.96,99.0,77.556744,0.0,2019-01-01,1986-03-28,32.786301


In [None]:
import numpy as np
import pandas as pd
import math

def f1_score(data, y, mask, total_frd, min_recall, fraud_col_name, amt_col_name):
    '''
    Return the amount-based F1 score of the rows selected by a candidate split.

    data: DataFrame holding the fraud-amount and total-amount columns.
    y: target variable (not used here; kept so existing callers keep working).
    mask: boolean selector applied as data[mask]; the selected rows are the hits.
    total_frd: total fraud in the starting population (not used here; the
        capture rate is measured against the fraud amount present in data).
    min_recall: minimum capture rate to be achieved; a split below it scores 0.
        None means no minimum is enforced.
    fraud_col_name: column that contains the fraud amount.
    amt_col_name: column that contains the total amount.

    The hit rate is the fraud amount in the selected rows divided by their
    total amount; the capture rate is the same fraud amount divided by the
    fraud amount in data. Returns their harmonic mean, or 0 when both rates
    are 0 or the capture rate is below min_recall.
    '''
    hits = data[mask]

    captured = hits[fraud_col_name].sum()
    available = data[fraud_col_name].sum()
    flagged_amt = hits[amt_col_name].sum()

    # Non-positive denominators yield a rate of 0 instead of a division error/NaN.
    hit_rate = captured / flagged_amt if flagged_amt > 0 else 0
    capture_rate = captured / available if available > 0 else 0

    if hit_rate + capture_rate == 0:
        return 0

    # Comparing against None raises TypeError, so None is treated as "no minimum".
    if min_recall is not None and capture_rate < min_recall:
        return 0

    return (2 * hit_rate * capture_rate) / (hit_rate + capture_rate)

def max_f1_score_split(data, x, y, total_frd, subset_x, min_recall, fraud_col_name, amt_col_name):
    '''
    Find the threshold on one predictor that maximises f1_score.

    data: DataFrame the candidate splits are scored on.
    x: predictor variable as a pandas Series (same rows as data).
    y: target variable as a pandas Series (forwarded to f1_score).
    total_frd: total fraud that can be captured in the starting population
        (forwarded to f1_score).
    subset_x: Series of predictor values the candidate thresholds are drawn
        from (the caller passes the values of the fraudulent rows).
    min_recall: minimum recall to be achieved (forwarded to f1_score).
    fraud_col_name: column that contains the fraud amount.
    amt_col_name: column that contains the total amount.

    Candidate thresholds are the sorted unique non-negative values of subset_x
    without the smallest one. When there are more than 100 of them they are
    replaced by their 0th..99th percentiles, and those percentiles are what
    gets evaluated. For every candidate both "x < value" and "x > value" are
    scored and the better side is kept.

    Returns (best_f1, best_split, best_ineq, True), where best_ineq is 1 when
    "x < best_split" scored strictly higher and 0 otherwise, or
    (None, None, None, False) when there is no candidate threshold.
    '''
    print("Checking threshold for {}".format(x.name))
    print("Old size was {}".format(len(subset_x.sort_values().unique()[1:])))

    # Negative values are not considered as thresholds.
    subset_x = subset_x[subset_x >= 0]
    options = subset_x.sort_values().unique()[1:]

    if len(options) > 100:
        final_options = np.percentile(options, np.arange(0, 100))
    else:
        final_options = options
    print("Current size is {}".format(len(final_options)))

    # Loop-invariant work: drop rows at or below -9998 once (presumably a
    # missing-value sentinel — confirm) and keep the predictor aligned with
    # the filtered frame so the masks share its index.
    valid = x > -9998
    nonull_data = data[valid]
    x_valid = x[valid]

    split_value = []
    f1 = []
    le_gr = []

    # Score both sides of every candidate threshold.
    for val in final_options:
        below_f1 = f1_score(nonull_data, y, x_valid < val, total_frd, min_recall, fraud_col_name, amt_col_name)
        above_f1 = f1_score(nonull_data, y, x_valid > val, total_frd, min_recall, fraud_col_name, amt_col_name)
        if above_f1 < below_f1:
            le_gr.append(1)
            f1.append(below_f1)
        else:
            le_gr.append(0)
            f1.append(above_f1)
        split_value.append(val)

    # No candidate thresholds: report that nothing was found.
    if len(f1) == 0:
        return (None, None, None, False)

    # First candidate with the highest score wins.
    best_f1 = max(f1)
    best_f1_index = f1.index(best_f1)
    return (best_f1, split_value[best_f1_index], le_gr[best_f1_index], True)

def get_best_split(y, data, x_vars, total_frd, min_recall, fraud_col_name, amt_col_name):
    '''
    Pick, among the given predictors, the single threshold with the highest F1 score.

    y: name of the target variable column.
    data: DataFrame in which to look for the best split.
    x_vars: list of predictor column names to try.
    total_frd: total fraud that can be captured in the starting population.
    min_recall: minimum recall to be achieved by a candidate split.
    fraud_col_name: column that contains the fraud amount.
    amt_col_name: column that contains the total amount.

    For each predictor, rows with a missing value in that predictor are
    dropped and max_f1_score_split is run with the values of the rows where
    y == 1 as candidate thresholds. A predictor without any candidate is
    recorded with a score of 0.

    Returns (best_var, best_split, best_f1, best_ineq) for the first predictor
    reaching the highest score.
    '''
    scores = []
    thresholds = []
    directions = []

    for col in x_vars:
        usable = data.dropna(axis=0, subset=[col])
        fraud_rows = usable[usable[y] == 1]
        score, threshold, direction, _ = max_f1_score_split(
            usable, usable[col], usable[y], total_frd, fraud_rows[col],
            min_recall, fraud_col_name, amt_col_name,
        )
        if score is None:
            print("Found None")
            score = 0
        directions.append(direction)
        scores.append(score)
        thresholds.append(threshold)

    top_score = max(scores)
    winner = scores.index(top_score)
    return (x_vars[winner], thresholds[winner], top_score, directions[winner])

def make_split(variable, value, data, ineq):
    '''
    Keep only the rows of data that lie on the chosen side of a threshold.

    variable: name of the column to split on.
    value: threshold value for the split.
    data: DataFrame to be filtered.
    ineq: 1 keeps rows with variable < value; anything else keeps rows with
        variable > value. Rows equal to value are dropped in both cases.

    Prints the variable and the value, then returns the filtered DataFrame.
    '''
    print(variable)
    print(value)
    column = data[variable]
    keep = column < value if ineq == 1 else column > value
    return data[keep]

def calc_metrics(data, total_frd, fraud_col_name, amt_col_name):
    '''
    Return the amount-based hit rate and capture rate of a population.

    data: DataFrame holding the fraud-amount and total-amount columns.
    total_frd: total fraud that can be captured in the starting population.
    fraud_col_name: column that contains the fraud amount.
    amt_col_name: column that contains the total amount.

    Returns (hit_rate, capture_rate): the fraud amount in data divided by the
    total amount in data, and the same fraud amount divided by total_frd.
    A rate whose denominator is not positive is reported as 0, mirroring
    f1_score, so an empty population does not produce nan/inf.
    '''
    capture = data[fraud_col_name].sum()
    amt_total = data[amt_col_name].sum()

    hit_rate = capture / amt_total if amt_total > 0 else 0
    capture_rate = capture / total_frd if total_frd > 0 else 0

    return hit_rate, capture_rate

def train_tree(data,y, total_frd, x_vars, fraud_col_name="fraud_amount", amt_col_name="transaction_amount", max_depth = None, min_samples_split = None, min_recall = None, min_recall_overall = None, min_precision = 0.1, counter = 0):
    '''
    Greedily build a chain of threshold rules, each applied to the rows kept by the previous one.

    data: DataFrame used to train the rules.
    y: target variable column name.
    total_frd: total fraud that can be captured in the starting population.
    x_vars: the variables used for decision making.
    fraud_col_name: column that contains the fraud amount.
    amt_col_name: column that contains the total amount.
    max_depth: maximum number of rules; None means no limit.
    min_samples_split: a split is only attempted while the data has more rows
        than this; None means no limit.
    min_recall: minimum recall for each threshold selection; None means no minimum.
    min_recall_overall: a split is only attempted while the recall of the
        current data is above this value; None means no limit.
    min_precision: accepted for backward compatibility; it is not used by the
        current implementation.
    counter: depth at which to start counting (number of rules already applied).

    Prints the depth before every step, the chosen rule after every split and
    "Run Complete" at the end. Returns the list of rule descriptions in the
    order they were applied, or None when no split was made.
    '''
    # f1_score compares the capture rate against this value directly; 0 is the
    # neutral "no minimum" value and avoids comparing a number with None.
    split_min_recall = 0 if min_recall is None else min_recall

    path = []
    while True:
        print(counter)
        _, recall = calc_metrics(data, total_frd, fraud_col_name, amt_col_name)

        # Stopping conditions: depth, sample size and overall recall.
        depth_cond = max_depth is None or counter < max_depth
        sample_cond = min_samples_split is None or data.shape[0] > min_samples_split
        recall_cond = min_recall_overall is None or min_recall_overall < recall
        if not (depth_cond and sample_cond and recall_cond):
            break

        var, val, _, ineq = get_best_split(y, data, x_vars, total_frd, split_min_recall, fraud_col_name, amt_col_name)
        if val is None:
            # No candidate threshold is left; filtering on None would fail.
            break
        counter += 1

        # Continue with the rows kept by the chosen rule.
        data = make_split(var, val, data, ineq)

        split_type = "<" if ineq == 1 else ">"
        question = "{} {} {}".format(var, split_type, val)
        new_precision, new_recall = calc_metrics(data, total_frd, fraud_col_name, amt_col_name)
        question = question + " Precision:{}, Recall:{}".format(new_precision, new_recall)
        print(question)
        path.append(question)

    print("Run Complete")
    return path if path else None

In [None]:
# Build the modelling frame: "fraud_amount" carries the transaction amount on
# rows where is_fraud == 1 and 0 elsewhere; "amt" is kept as the total amount.
algo_data = subset_data.assign(
    fraud_amount=lambda frame: np.where(frame["is_fraud"] == 1, frame["amt"], 0)
)[["amt", "city_pop", "age", "fraud_amount", "distance", "is_fraud"]]
algo_data.head()

Unnamed: 0,amt,city_pop,age,fraud_amount,distance,is_fraud
0,4.97,3495.0,30.835616,0.0,78.597568,0.0
1,107.23,149.0,40.558904,0.0,30.212176,0.0
2,220.11,4154.0,56.989041,0.0,108.206083,0.0
3,45.0,1939.0,52.005479,0.0,95.673231,0.0
4,41.96,99.0,32.786301,0.0,77.556744,0.0


In [None]:
# Total fraud amount in the starting population; it is passed to train_tree,
# where calc_metrics uses it as the denominator of the capture rate.
total_frd = algo_data["fraud_amount"].sum()
print(total_frd)

204363.65000000002


In [None]:
algo_data.shape

(38933, 6)

In [None]:
# Predictors the rule search may split on.
rf_vars = ["amt", "city_pop", "age", "distance"]

# Train the rule chain; "amt" doubles as the total-amount column.
decisions = train_tree(
    algo_data,
    "is_fraud",
    total_frd,
    rf_vars,
    fraud_col_name="fraud_amount",
    amt_col_name="amt",
    max_depth=50,
    min_samples_split=None,
    min_recall=0.9,
    min_recall_overall=0.3,
    min_precision=None,
    counter=0,
)

0
Checking threshold for amt
Old size was 385
Current size is 100
Checking threshold for city_pop
Old size was 34
Current size is 34
Checking threshold for age
Old size was 71
Current size is 71
Checking threshold for distance
Old size was 386
Current size is 100
amt
275.15
amt > 275.15 Precision:0.2708271321184529, Recall:0.974310108475749
1
Checking threshold for amt
Old size was 279
Current size is 100
Checking threshold for city_pop
Old size was 34
Current size is 34
Checking threshold for age
Old size was 70
Current size is 70
Checking threshold for distance
Old size was 280
Current size is 100
amt
1191.47
amt < 1191.47 Precision:0.3262601239450596, Recall:0.9558145981440437
2
Checking threshold for amt
Old size was 276
Current size is 100
Checking threshold for city_pop
Old size was 34
Current size is 34
Checking threshold for age
Old size was 70
Current size is 70
Checking threshold for distance
Old size was 277
Current size is 100
city_pop
276002.0
city_pop < 276002.0 Precision

In [None]:
decisions

['amt > 275.15 Precision:0.2708271321184529, Recall:0.974310108475749',
 'amt < 1191.47 Precision:0.3262601239450596, Recall:0.9558145981440437',
 'city_pop < 276002.0 Precision:0.3352771033949777, Recall:0.8975419063028086',
 'amt > 294.63 Precision:0.34002867214751237, Recall:0.8794776370455313',
 'age < 83.98630136986301 Precision:0.34133469096378344, Recall:0.8642296709811161',
 'amt > 296.32 Precision:0.34098444162084574, Recall:0.8598921579253453',
 'amt > 298.99 Precision:0.34061532407861395, Recall:0.8569722159493626',
 'amt > 299.98 Precision:0.34024325625119656, Recall:0.8540409216609703',
 'amt > 308.33 Precision:0.3424630877069338, Recall:0.8377157092271547',
 'amt > 417.97 Precision:0.3715486270517817, Recall:0.7563196292491349',
 'amt > 688.75 Precision:0.6057870669506785, Recall:0.7218717712274173',
 'amt < 1177.79 Precision:0.6109177950939438, Recall:0.7103215762685781',
 'distance < 132.6117054514996 Precision:0.6144068820986724, Recall:0.7001469194741823',
 'distance 