In [58]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from math import radians, cos, sin, asin, sqrt

# Data Loading and Preprocessing

In [59]:
data_test = pd.read_csv('Data/fraudTest.csv')
data_train = pd.read_csv('Data/fraudTrain.csv')

data = pd.concat([data_train,data_test])
data['trans_date_trans_time'] = pd.to_datetime(data['trans_date_trans_time'])
data['hour'] = data['trans_date_trans_time'].dt.hour

data['day_of_week'] = data['trans_date_trans_time'].dt.day_name()

data['year_month'] = data['trans_date_trans_time'].dt.to_period('M')

data['dob'] = pd.to_datetime(data['dob'])
data['age'] = np.round((data['trans_date_trans_time'] - data['dob'])/np.timedelta64(1,'Y'))

In [123]:
data_test[0:20]['is_fraud']

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
16    0
17    0
18    0
19    0
Name: is_fraud, dtype: int64

In [60]:
len(data_train.category.unique())

14

Calculating Distance between merchant and customer

In [61]:
def distance(lat1, lat2, lon1, lon2):
     
    lon1 = np.radians(lon1)
    lon2 = np.radians(lon2)
    lat1 = np.radians(lat1)
    lat2 = np.radians(lat2)
      
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
 
    c = 2 * asin(sqrt(a))
    
    # Radius of earth in kilometers
    r = 6371
      
    # calculate the result
    return(c * r)

In [62]:
data['dist'] = data.apply(lambda x: distance(x.merch_lat, x.lat, x.merch_long, x.long), axis=1)

Calculating the average of fraud happened with one customer

In [63]:
Fraud_avg = {}
for i in data['cc_num'].unique():
    Fraud_avg[i] = len(data[(data['cc_num'] == i) & data['is_fraud'] == 1]) / len(data[data["cc_num"] == i])

In [64]:
data['Fraud_avg'] = data['cc_num'].map(Fraud_avg)

Calculating average amt of fraud and non-fraud cases for specific person

In [65]:
Amt_Avg_Fraud = {}
Amt_Avg_Legit = {}

for i in data['cc_num'].unique():
    Amt_Avg_Fraud[i] = data[(data['cc_num'] == i) & (data['is_fraud'] == 1)]['amt'].mean()
    Amt_Avg_Legit[i] = data[(data['cc_num'] == i) & (data['is_fraud'] == 0)]['amt'].mean()

In [66]:
data['Amt_Avg_Fraud'] = data['cc_num'].map(Amt_Avg_Fraud)
data['Amt_Avg_Legit'] = data['cc_num'].map(Amt_Avg_Legit)

In [67]:
data = data.replace(np.NaN,0)

Normalising the amount

In [68]:
data['amt_normalised'] = data['amt'] / max(data['amt'])

Taking care of categorical variable by label encoding and normalising

In [69]:
cities = {}
states = {}

i = 0
for city in data.city.unique():
    cities[city] = i
    i += 1
i = 0
for state in data.state.unique():
    states[state] = i
    i += 1
data['city'] = data.city.map(cities)
data['state'] = data.state.map(states)

In [70]:
data['city'] = data['city'] / len(cities)
data['state'] = data['state'] / len(states)

In [71]:
i = 0
merchants = {}
for merchant in data.merchant.unique():
    merchants[merchant] = i
    i += 1
data.merchant = data.merchant.map(merchants) / len(merchants)

In [72]:
i = 0 
days = {}
for day in data.day_of_week.unique():
    days[day] = i
    i += 1
data.day_of_week = data.day_of_week.map(days)

In [73]:
i = 0
jobs = {}
for job in data.job.unique():
    jobs[job] = i
    i += 1
data.job = data.job.map(jobs)

In [53]:
Category = {}
for i, cate in enumerate(data_copy.category.unique()):
    Category[cate] = i
Category

{'misc_net': 0,
 'grocery_pos': 1,
 'entertainment': 2,
 'gas_transport': 3,
 'misc_pos': 4,
 'grocery_net': 5,
 'shopping_net': 6,
 'shopping_pos': 7,
 'food_dining': 8,
 'personal_care': 9,
 'health_fitness': 10,
 'travel': 11,
 'kids_pets': 12,
 'home': 13}

In [76]:
data['category'] = data.category.map(Category)

In [77]:
data['year_month'] = data['year_month'].astype('str')

In [78]:
data_train_col = ['hour','state','merchant','age','day_of_week','job','category','amt_normalised','dist','Fraud_avg','Amt_Avg_Legit','Amt_Avg_Fraud','is_fraud']

In [86]:
data = data[data_train_col]

Separting the train and test data

In [87]:
train = data[:1296675]
test = data[1296675:]

In [88]:
train_x = train.drop('is_fraud',axis = 1)
train_y = train['is_fraud']

test_x = test.drop('is_fraud',axis = 1)
test_y = test['is_fraud']

In [89]:
train_x

Unnamed: 0,hour,state,merchant,age,day_of_week,job,category,amt_normalised,dist,Fraud_avg,Amt_Avg_Legit,Amt_Avg_Fraud
0,0,0.000000,0.000000,31.0,0,0,0,0.000172,78.597568,0.003758,88.203104,409.012727
1,0,0.019608,0.001443,41.0,0,1,1,0.003704,30.212176,0.001605,55.602852,351.758571
2,0,0.039216,0.002886,57.0,0,2,2,0.007603,108.206083,0.010884,65.620633,461.017500
3,0,0.058824,0.004329,52.0,0,3,3,0.001554,95.673231,0.020188,70.613365,540.024667
4,0,0.078431,0.005772,33.0,0,4,4,0.001449,77.556744,0.004449,93.270633,558.658462
...,...,...,...,...,...,...,...,...,...,...,...,...
1296670,12,0.568627,0.274170,59.0,5,331,2,0.000537,119.752136,0.004098,59.491651,574.250000
1296671,12,0.627451,0.930736,41.0,5,250,8,0.001786,75.104085,0.010840,92.068986,647.965000
1296672,12,0.235294,0.584416,53.0,5,14,8,0.003659,99.047734,0.003765,61.814473,662.728182
1296673,12,0.882353,0.740260,40.0,5,194,8,0.002587,84.627652,0.002743,92.095299,623.337500


In [90]:
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state=42) 

train_x, train_y = ros.fit_resample(train_x, train_y)

# Testing Classification models 

In [91]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, classification_report

In [92]:
x_train, x_val, y_train, y_val = train_test_split(train_x, train_y, random_state = 42)

In [93]:
x_train

Unnamed: 0,hour,state,merchant,age,day_of_week,job,category,amt_normalised,dist,Fraud_avg,Amt_Avg_Legit,Amt_Avg_Fraud
1780783,23,0.607843,0.402597,51.0,5,139,1,0.012105,108.784231,0.013605,59.031945,464.676000
59304,19,0.725490,0.049062,18.0,6,31,1,0.002981,69.105096,0.017497,55.562288,655.350000
2133391,22,0.372549,0.427128,25.0,2,51,6,0.029483,90.687483,0.002743,93.250378,291.941250
2417226,23,0.254902,0.907648,40.0,6,228,7,0.032363,84.664521,0.005016,63.810353,506.398182
851986,20,0.235294,0.972583,52.0,5,14,8,0.000801,73.340382,0.003765,61.814473,662.728182
...,...,...,...,...,...,...,...,...,...,...,...,...
110268,2,0.352941,0.292929,54.0,6,295,3,0.002500,106.885759,0.004778,63.853855,399.380000
1692743,1,0.705882,0.326118,32.0,4,330,1,0.010151,107.882443,0.008169,53.738195,747.288333
2356330,23,0.647059,0.741703,21.0,4,216,6,0.033009,70.993045,0.007269,44.550966,639.461875
2229084,19,0.098039,0.704185,46.0,1,221,12,0.000793,45.414551,0.003015,54.287031,545.840000


In [94]:
x_train.to_csv("Flask\Data.csv")

In [95]:
import pickle

In [110]:
with open('Flask\Dicts\Cities.pkl', 'wb') as f:
    pickle.dump(cities, f)

with open('Flask\Dicts\States.pkl', 'wb') as f:
    pickle.dump(states, f)
    
with open('Flask\Dicts\Merchants.pkl', 'wb') as f:
    pickle.dump(merchants, f)

with open('Flask\Dicts\Jobs.pkl', 'wb') as f:
    pickle.dump(jobs, f)
    
with open('Flask\Dicts\Days.pkl', 'wb') as f:
    pickle.dump(days, f)

with open('Flask\Dicts\Category.pkl', 'wb') as f:
    pickle.dump(Category, f)

# Datasets
#### Training Data -> x_train y_train
#### Val Data -> x_val y_val
#### Test Data -> test_x test_y

## Logistic Regression 

In [97]:
logreg = LogisticRegression(C = 0.1)
logreg.fit(x_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(C=0.1)

In [114]:
y_log_val = logreg.predict(x_val)
y_log_tra = logreg.predict(x_train)
y_log_test = logreg.predict(test_x)
print("Training F1 -> {}".format(f1_score(y_train,y_log_tra)))
print("Val F1 -> {}".format(f1_score(y_val,y_log_val)))
print("Test F1 -> {}".format(f1_score(test_y,y_log_test)))

Training F1 -> 0.7056536857578936
Val F1 -> 0.7069006162017182
Test F1 -> 0.01780068769909852


In [99]:
print("Training Acc -> {}".format(logreg.score(x_train,y_train)))
print("Val Acc -> {}".format(logreg.score(x_val,y_val)))
print("Test Acc -> {}".format(logreg.score(test_x,test_y)))

Training Acc -> 0.6523020261636311
Val Acc -> 0.652782798234523
Test Acc -> 0.6424110026830107


In [100]:
y_log = logreg.predict(test_x)
print("f1_score test -> {}".format(f1_score(test_y,y_log))) 

f1_score test -> 0.01369869812734826


## Random Forest

In [63]:
from sklearn.ensemble import RandomForestClassifier

In [68]:
rfc = RandomForestClassifier(n_estimators = 100,
                            max_depth = 8,
                            max_features = "sqrt",
                            bootstrap = True,
                            criterion = "entropy")

In [69]:
rfc.fit(x_train,y_train)

RandomForestClassifier(criterion='entropy', max_depth=8, max_features='sqrt')

In [70]:
y_rfc_val = rfc.predict(x_val)
y_rfc_tra = rfc.predict(x_train)
y_rfc_test = rfc.predict(test_x)
print("Training F1 -> {}".format(f1_score(y_train,y_rfc_tra)))
print("Val F1 -> {}".format(f1_score(y_val,y_rfc_val)))
print("Test F1 -> {}".format(f1_score(test_y,y_rfc_test)))

Training F1 -> 0.8909333250674188
Val F1 -> 0.8909710017945562
Test F1 -> 0.24648805213613323


In [71]:
print("Training Acc -> {}".format(accuracy_score(y_train,y_rfc_tra)))
print("Val Acc -> {}".format(accuracy_score(y_val,y_rfc_val)))
print("Test Acc -> {}".format(accuracy_score(test_y,y_rfc_test)))

Training Acc -> 0.8999236200279974
Val Acc -> 0.8998068524709697
Test Acc -> 0.9812747089806179


## LightGBM 

In [72]:
from lightgbm import LGBMClassifier

In [97]:
lgbmc = LGBMClassifier(n_estimators = 100,
                      max_depth = 32,
                      learning_rate = 0.02,
                      num_leaves = 20,
                       #reg_lambda  = True,
                       random_state = 42,
                       subsample = 0.5
                      )

In [98]:
lgbmc.fit(x_train,y_train)

LGBMClassifier(learning_rate=0.02, max_depth=32, num_leaves=20, random_state=42,
               subsample=0.5)

In [99]:
y_lgbmc_val = lgbmc.predict(x_val,num_iteration = 100)
y_lgbmc_tra = lgbmc.predict(x_train,num_iteration = 100)
y_lgbmc_test = lgbmc.predict(test_x,num_iteration = 100)
print("Training F1 -> {}".format(f1_score(y_train,y_lgbmc_tra)))
print("Val F1 -> {}".format(f1_score(y_val,y_lgbmc_val)))
print("Test F1 -> {}".format(f1_score(test_y,y_lgbmc_test)))

Training F1 -> 0.9610377568907266
Val F1 -> 0.9610960543831532
Test F1 -> 0.14535255272314138


In [100]:
print("Training Acc -> {}".format(accuracy_score(y_train,y_lgbmc_tra)))
print("Val Acc -> {}".format(accuracy_score(y_val,y_lgbmc_val)))
print("Test Acc -> {}".format(accuracy_score(test_y,y_lgbmc_test)))

Training Acc -> 0.960884869991152
Val Acc -> 0.9608724993600534
Test Acc -> 0.9566831438190885


## Xgboost Classifier

In [101]:
from xgboost import XGBClassifier

xgbc = XGBClassifier(
                    max_depth = 4,
                    min_child_wight = 6,
                    learning_rate = 0.1,
                    n_estimators = 100,
                    gamma = 0,
                    subsample = 0.4,
                    nthread = 4,
                     colsample_bytree=0.6,
                     scale_pos_weight=1
                    )

In [102]:
xgbc = XGBClassifier(max_depth = 10,
                    n_estimators = 100)

In [103]:
eval_set = [(x_val, y_val)]
xgbc.fit(train_x,train_y,early_stopping_rounds = 10,eval_set = eval_set)



[0]	validation_0-logloss:0.45194
[1]	validation_0-logloss:0.31872
[2]	validation_0-logloss:0.23429
[3]	validation_0-logloss:0.17680
[4]	validation_0-logloss:0.13653
[5]	validation_0-logloss:0.10551
[6]	validation_0-logloss:0.08422
[7]	validation_0-logloss:0.06826
[8]	validation_0-logloss:0.05535
[9]	validation_0-logloss:0.04647
[10]	validation_0-logloss:0.03911
[11]	validation_0-logloss:0.03435
[12]	validation_0-logloss:0.02946
[13]	validation_0-logloss:0.02663
[14]	validation_0-logloss:0.02446
[15]	validation_0-logloss:0.02222
[16]	validation_0-logloss:0.02093
[17]	validation_0-logloss:0.01979
[18]	validation_0-logloss:0.01816
[19]	validation_0-logloss:0.01692
[20]	validation_0-logloss:0.01637
[21]	validation_0-logloss:0.01593
[22]	validation_0-logloss:0.01564
[23]	validation_0-logloss:0.01443
[24]	validation_0-logloss:0.01371
[25]	validation_0-logloss:0.01309
[26]	validation_0-logloss:0.01282
[27]	validation_0-logloss:0.01198
[28]	validation_0-logloss:0.01120
[29]	validation_0-loglos

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=10,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=8, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [104]:
y_xgbc_val = xgbc.predict(x_val)
y_xgbc_tra = xgbc.predict(x_train)
y_xgbc_test = xgbc.predict(test_x)
print("Training F1 -> {}".format(f1_score(y_train,y_xgbc_tra)))
print("Val F1 -> {}".format(f1_score(y_val,y_xgbc_val)))
print("Test F1 -> {}".format(f1_score(test_y,y_xgbc_test)))

Training F1 -> 0.9999270633695172
Val F1 -> 0.9999364671062316
Test F1 -> 0.7965169569202566


In [105]:
print(classification_report(test_y, y_xgbc_test))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    553574
           1       0.78      0.81      0.80      2145

    accuracy                           1.00    555719
   macro avg       0.89      0.90      0.90    555719
weighted avg       1.00      1.00      1.00    555719



In [106]:
print("Training Acc -> {}".format(accuracy_score(y_train,y_xgbc_tra)))
print("Val Acc -> {}".format(accuracy_score(y_val,y_xgbc_val)))
print("Test Acc -> {}".format(accuracy_score(test_y,y_xgbc_test)))

Training Acc -> 0.9999270847931457
Val Acc -> 0.9999363931832109
Test Acc -> 0.9984020701109734


# Saving Best Model

In [124]:
import pickle

pickl = {'model': xgbc}
pickle.dump( pickl, open( 'Flask\model_XGBC' + ".p", "wb" ) )

In [119]:
import pickle
file_name = "model_XGBC.p"
with open(file_name, "rb") as pickled:
    data = pickle.load(pickled)
    model = data['model']
model

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=10,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=8, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [120]:
xgbc.predict(np.array(x_train[0:1]))

array([1], dtype=int64)

In [121]:
np.array(x_train[0:1])

array([[2.30000000e+01, 6.07843137e-01, 4.02597403e-01, 5.10000000e+01,
        5.00000000e+00, 1.39000000e+02, 1.00000000e+00, 1.21054686e-02,
        1.08784231e+02, 1.36054422e-02, 5.90319448e+01, 4.64676000e+02]])

In [122]:
np.shape(np.array(x_train[0:1]))

(1, 12)