# Neural Network from the ground up - Bank Fraud

Based on Chapter 4 of the Fast.AI Deep Learning Book

### Data Normalization

In [186]:
from fastcore.all import *
from fastai.data.all import *
from pandas import *
import numpy as np
import torch.utils.data as data_utils
import shutil
import requests as req

remote_url = 'https://cwikcode.com/wp-content/uploads/2022/12/data.zip'
local_file_name = 'data.zip'

# my hosting has some dumb security policy and faking a user agent seems to be the only way around it...
data = req.get(remote_url, headers={
    "accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36 Edg/108.0.1462.42"})
with open('./data.zip', 'wb') as f:
    f.write(data.content)

print(data.status_code)
shutil.unpack_archive("data.zip", "./")
df = pd.read_csv("data.csv")

def normalize(x, max):
    return x / max

current_address_months_count_max = df.current_address_months_count.max()
customer_age_max = df.customer_age.max()
days_since_request_max = df.days_since_request.max()
intended_balcon_amount_max = df.intended_balcon_amount.max()
intended_balcon_amount_min = df.intended_balcon_amount.min()
zip_count_4w_max = df.zip_count_4w.max()
velocity_24h_max = df.velocity_24h.max()
velocity_6h_max = df.velocity_6h.max()
velocity_4w_max = df.velocity_4w.max()
bank_branch_count_8w_max = df.bank_branch_count_8w.max()
date_of_birth_distinct_emails_4w_max = df.date_of_birth_distinct_emails_4w.max()
credit_risk_score_max = df.credit_risk_score.max()
credit_risk_score_min = df.credit_risk_score.min()
proposed_credit_limit_max = df.proposed_credit_limit.max()

df["current_address_months_count"] = df.current_address_months_count.apply(lambda x: normalize(x, current_address_months_count_max))
df["customer_age"] = df.customer_age.apply(lambda x: normalize(x, customer_age_max))
df["days_since_request"] = df.days_since_request.apply(lambda x: normalize(x, days_since_request_max))
df["intended_balcon_amount"] = df.intended_balcon_amount.apply(lambda x: normalize(x + abs(intended_balcon_amount_min), intended_balcon_amount_max + abs(intended_balcon_amount_min)))
df["zip_count_4w"] = df.zip_count_4w.apply(lambda x: normalize(x, zip_count_4w_max))
df["velocity_24h"] = df.velocity_24h.apply(lambda x: normalize(x, velocity_24h_max))
df["velocity_6h"] = df.velocity_6h.apply(lambda x: normalize(x, velocity_6h_max))
df["velocity_4w"] = df.velocity_4w.apply(lambda x: normalize(x, velocity_4w_max))
df["bank_branch_count_8w"] = df.bank_branch_count_8w.apply(lambda x: normalize(x, bank_branch_count_8w_max))
df["date_of_birth_distinct_emails_4w"] = df.date_of_birth_distinct_emails_4w.apply(lambda x: normalize(x, date_of_birth_distinct_emails_4w_max))
df["credit_risk_score"] = df.credit_risk_score.apply(lambda x: normalize(x + abs(credit_risk_score_min), credit_risk_score_max + abs(credit_risk_score_min)))
df["proposed_credit_limit"] = df.proposed_credit_limit.apply(lambda x: normalize(x, proposed_credit_limit_max))

df.payment_type = df.payment_type.astype("category")
df.employment_status = df.employment_status.astype("category")
df.housing_status = df.housing_status.astype("category")
df.source = df.employment_status.astype("category")
df.device_os = df.device_os.astype("category")
df.to_csv("normalized.csv")


200


### Setup Training and Validation DataSets

In [187]:
dummies = pd.get_dummies(df)
print(dummies.shape)
fraud = dummies[dummies.fraud_bool == 1].drop('fraud_bool', axis=1)
not_fraud = dummies[dummies.fraud_bool == 0].drop('fraud_bool', axis=1)
print(fraud.shape)

fraud_train = fraud.sample(frac=.8, random_state=100)
fraud_valid = fraud.drop(fraud_train.index)
not_fraud_train = not_fraud.sample(frac=.8, random_state=100)
not_fraud_valid = not_fraud.drop(not_fraud_train.index)

train_x_t = torch.cat([torch.tensor(fraud_train.values).type(torch.FloatTensor), torch.tensor(not_fraud_train.values).type(torch.FloatTensor)])
print(train_x_t.shape)
train_x = train_x_t.view(-1, 52)
train_y = tensor([1]*len(fraud_train) + [0]*len(not_fraud_train))
dset = list(zip(train_x, train_y))

valid_x = torch.cat([torch.tensor(fraud_valid.values).type(torch.FloatTensor), torch.tensor(not_fraud_valid.values).type(torch.FloatTensor)]).view(-1, 52)
valid_y = tensor([1]*len(fraud_valid) + [0]*len(not_fraud_valid))
valid_dset = list(zip(valid_x, valid_y))

(1000000, 53)
(11029, 52)
torch.Size([800000, 52])


### Linear

In [188]:
def init_params(size, std=1.0): return (torch.randn(size)*std).type(torch.FloatTensor).requires_grad_()

weights = init_params(52); print(weights)
bias = init_params(1); print(bias)

print(weights.type())

def linear1(xb):
    cross = xb@weights
    return cross + bias

preds = linear1(train_x); preds

tensor([-0.6037, -0.9673, -1.5852,  0.1833,  1.3569,  2.2564,  0.0095,  0.6033,
         0.0239, -0.4166,  0.4479, -1.9311,  0.9349, -1.0090, -0.8661,  1.2145,
        -2.2250,  0.6223,  0.8982, -0.3833, -1.2193,  0.7720,  2.8475,  1.4876,
         0.0376, -0.5981,  0.2384,  0.3828, -0.8168, -0.3705,  0.9810,  0.8479,
         0.1698, -0.6509,  1.1193, -0.3699,  0.5117,  0.3170, -0.7046,  0.7504,
        -0.8489, -1.9622,  1.4156,  0.3581, -1.5099, -0.5391,  0.2263,  0.4392,
        -0.1941, -1.6669, -1.1988, -0.3204], requires_grad=True)
tensor([-0.0889], requires_grad=True)
torch.FloatTensor


tensor([-0.8128, -3.0254, -1.5078,  ..., -5.3643,  0.3158,  0.7568],
       grad_fn=<AddBackward0>)

In [189]:
corrects = (preds>0.0).float() == train_y
corrects

tensor([False, False, False,  ...,  True, False, False])

In [190]:
corrects.float().mean().item()

0.8161349892616272

In [191]:
def mse(preds, targets): return ((preds-targets)**2).mean().sqrt()

def measure_loss(predictions, targets):
    predictions = predictions.sigmoid()
    return torch.where(targets==1, 1-predictions, predictions).mean()

weights = init_params((52,1))
bias = init_params(1)

dl = DataLoader(dset, batch_size=512)
xb, yb = first(dl)
xb.shape, yb.shape

valid_dl = DataLoader(valid_dset, batch_size=512)

In [192]:
def calc_grad(xb, yb, model):
    preds = model(xb)
    loss = measure_loss(preds, yb)
    loss.backward()

def train_epoch(model, lr, params):
    for xb, yb in dl:
        calc_grad(xb, yb, model)
        for p in params:
            p.data -= p.grad*lr
            p.grad.zero_()

def batch_accuracy(xb, yb):
    preds = xb.sigmoid()
    correct = (preds>0.5) == yb
    return correct.float().mean()

def validate_epoch(model):
    accs = [batch_accuracy(model(xb), yb) for xb, yb in valid_dl]
    return round(torch.stack(accs).mean().item(), 4)

batch_accuracy(linear1(train_x[:12]), train_y[:12])


tensor(1.)

In [193]:
validate_epoch(linear1)

0.0758

In [194]:
lr = 0.001
params = weights, bias
train_epoch(linear1, lr, params)
validate_epoch(linear1)

for i in range(20):
    train_epoch(linear1, lr, params)
    print(validate_epoch(linear1), end=' ')
    

0.722 0.8808 0.9356 0.9587 0.9698 0.9764 0.9801 0.9824 0.984 0.9851 0.986 0.9867 0.9872 0.9875 0.9878 0.9881 0.9882 0.9884 0.9885 0.9886 

In [195]:
from fastai.vision.all import *

simple_net = nn.Sequential(
    nn.Linear(52, 30),
    nn.ReLU(),
    nn.Linear(30,1)
)

dls = DataLoaders(dl, valid_dl)
learn = Learner(dls, simple_net, opt_func=SGD, loss_func=measure_loss, metrics=batch_accuracy)
learn.fit(10, 0.0001)


epoch,train_loss,valid_loss,batch_accuracy,time
0,0.432043,0.432598,0.98897,00:05
1,0.4023,0.403524,0.98897,00:05
2,0.372417,0.37431,0.98897,00:05
3,0.342293,0.34486,0.98897,00:05
4,0.312027,0.315276,0.98897,00:05
5,0.281931,0.285872,0.98897,00:04
6,0.252506,0.25714,0.98897,00:05
7,0.224351,0.229666,0.98897,00:05
8,0.198047,0.204016,0.98897,00:05
9,0.174044,0.180627,0.98897,00:05
