<a href="https://colab.research.google.com/github/Kennedy87670/DatafestAfrica2023_Fruad_Detection_system/blob/main/Fraud_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
!pip install catboost optuna
!pip install gwpy &> /dev/null

!command > /dev/null 2>&1





## Fraud Detection for Online Payment Platform
The Fraud Detection dataset is a crucial asset for our business, providing valuable insights and opportunities for enhancing the security and trustworthiness of our online payment platform. This dataset represents transactions and user-related data collected over time from our platform.
1. Problem Definition
2. Data
3. Evaluation
4. Features
5. Modelling
6. Experimentation

## Problem Definition
The primary goal is to develop an advanced predictive model to identify potentially fraudulent transactions.

## Data
The data was shared at https://portfolio.diceytech.co.uk/project-opportunity/1694535858265x547950582392422400

## Evaluation

## Features
https://portfolio.diceytech.co.uk/project-opportunity/1694535858265x547950582392422400


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
tqdm.pandas()

from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import StratifiedKFold

from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score

import warnings
# Hide FutureWarning messages so they do not clutter cell output.
warnings.simplefilter(action='ignore', category=FutureWarning)
# Disable pandas' chained-assignment (SettingWithCopy) warning.
pd.options.mode.chained_assignment = None  # default='warn'
# Render floats with three decimal places when DataFrames are displayed.
pd.set_option('display.float_format', lambda x: '%.3f' % x)
# Default matplotlib figure size as (width, height) in inches.
plt.rcParams["figure.figsize"] = (12, 8)
# Show every column when displaying wide DataFrames (no truncation).
pd.set_option('display.max_columns', None)

In [3]:
#Plot the Features Importances
def plotImp(model, X , num = 30, fig_size = (60, 30)):
    """Draw a bar chart of the ``num`` largest feature importances.

    NOTE: a function with this same name is defined again in the following
    cell; when the notebook is run top to bottom, that later definition is
    the one in effect.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``feature_importances_``.
    X : pandas.DataFrame
        Frame whose ``columns`` are paired positionally with
        ``model.feature_importances_``.
    num : int
        Number of top-ranked features to draw.
    fig_size : tuple
        Figure size passed to ``plt.figure``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Value`` and ``Feature``, sorted by ``Value`` descending
        (all features, not only the plotted ones).
    """
    importance_table = pd.DataFrame(
        {'Value': model.feature_importances_, 'Feature': X.columns}
    )
    # Rows that actually get drawn: the `num` highest-valued features.
    top_rows = importance_table.sort_values(by="Value", ascending=False).iloc[:num]

    plt.figure(figsize=fig_size)
    # Large font scale to match the very large default figure size.
    sns.set(font_scale=5)
    sns.barplot(x="Value", y="Feature", data=top_rows)
    plt.title('Catboost Features (avg over folds)')
    plt.tight_layout()
    # The chart is written to the working directory before being shown.
    plt.savefig('cb_importances-01.png')
    plt.show()

    return importance_table.sort_values(by="Value", ascending=False)

In [4]:
#Plot the Features Importances
def plotImp(model, X , num = 30, fig_size = (60, 30), save_path = 'cb_importances-01.png'):
    """Plot the top ``num`` feature importances of a fitted model.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``feature_importances_``.
    X : pandas.DataFrame
        Frame whose ``columns`` are paired positionally with
        ``model.feature_importances_`` (so both must have the same length).
    num : int, default 30
        Number of highest-ranked features to draw.
    fig_size : tuple, default (60, 30)
        Figure size passed to ``plt.figure``.
    save_path : str or None, default 'cb_importances-01.png'
        Where the chart is saved. Pass ``None`` to skip saving.

    Returns
    -------
    pandas.DataFrame
        Columns ``Value`` and ``Feature`` for every feature, sorted by
        ``Value`` in descending order.
    """
    feature_imp = pd.DataFrame({'Value': model.feature_importances_, 'Feature': X.columns})
    # Sort once and reuse the result for both the chart and the return value.
    ranked = feature_imp.sort_values(by="Value", ascending=False)

    plt.figure(figsize=fig_size)
    # NOTE: sns.set changes seaborn's global theme, so the enlarged font
    # scale also applies to plots drawn after this call.
    sns.set(font_scale = 5)
    sns.barplot(x="Value", y="Feature", data=ranked.head(num))
    plt.title('Catboost Features (avg over folds)')
    plt.tight_layout()
    if save_path is not None:
        plt.savefig(save_path)
    plt.show()
    return ranked




#Reduce Memory Usage
def reduce_memory_usage(df):
    """Downcast the columns of ``df`` in place to reduce memory usage.

    Column handling:

    * object / pandas string columns are converted to ``category``;
    * NumPy signed and unsigned integer columns are cast to the smallest
      integer type of the same signedness that holds their min and max
      (bounds are inclusive);
    * NumPy float columns are cast to ``float16`` if their range fits,
      otherwise to ``float32`` if that fits, otherwise left unchanged;
    * every other dtype (bool, datetime, timedelta, category, nullable
      extension dtypes, ...) is left untouched.

    Columns that are empty or entirely NaN are left unchanged.

    Caution: the float check looks at the value *range* only. ``float16``
    is half precision (roughly three significant decimal digits), so
    values may be rounded by the downcast.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for convenient chaining.
    """
    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_types = (np.float16, np.float32)

    for col in df.columns:
        dtype = df[col].dtype

        # Text columns become categoricals.
        if pd.api.types.is_object_dtype(dtype) or isinstance(dtype, pd.StringDtype):
            df[col] = df[col].astype('category')
            continue

        # Dispatch on the NumPy dtype kind rather than the dtype name, so
        # bool, unsigned, datetime, timedelta and extension dtypes are not
        # mistaken for floats.
        if not isinstance(dtype, np.dtype) or dtype.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        # min()/max() are NaN for empty or all-NaN columns: nothing to size.
        if pd.isna(c_min) or pd.isna(c_max):
            continue

        if dtype.kind == 'f':
            candidates, limits = float_types, np.finfo
        elif dtype.kind == 'i':
            candidates, limits = signed_types, np.iinfo
        else:
            candidates, limits = unsigned_types, np.iinfo

        # Candidates are ordered smallest first; take the first that fits.
        for candidate in candidates:
            bounds = limits(candidate)
            if bounds.min <= c_min and c_max <= bounds.max:
                df[col] = df[col].astype(candidate)
                break

    return df

In [5]:
#@markdown <br><center><img src='https://upload.wikimedia.org/wikipedia/commons/thumb/d/da/Google_Drive_logo.png/600px-Google_Drive_logo.png' height="150" alt="Gdrive-logo"/></center>
#@markdown <center><h2>Mount GDrive to /content/drive</h2></center><br>
# Colab form field: selects whether this cell mounts or unmounts Google Drive.
MODE = "MOUNT" #@param ["MOUNT", "UNMOUNT"]
#Mount your Gdrive!
from google.colab import drive
# NOTE(review): sets a private attribute on drive.mount; presumably it
# silences the helper's debug output — confirm.
drive.mount._DEBUG = False
if MODE == "MOUNT":
  # Mount Drive at /content/drive, asking for a remount if it is already mounted.
  drive.mount('/content/drive', force_remount=True)
elif MODE == "UNMOUNT":
  try:
    drive.flush_and_unmount()
  except ValueError:
    # Best-effort unmount: a ValueError from flush_and_unmount() is ignored.
    pass
  # Remove the DriveFS configuration directory under /root/.config.
  get_ipython().system_raw("rm -rf /root/.config/Google/DriveFS")

Mounted at /content/drive


In [15]:
# Folder on the mounted Google Drive that contains the dataset CSV
# (this cell only sets the location; the file is read in the next cell).
path = '/content/drive/MyDrive/datafest' # alternative location: /content/drive/MyDrive/DatafestAfrica2023 Data Science Datathon/Fraud Detection Dataset.csv

In [16]:
# Read the whole CSV into memory (the recorded output below reports
# 6,000,000 rows x 32 columns).
data = pd.read_csv(f'{path}/Fraud Detection Dataset.csv')

## Data exploration (exploratory data analysis or EDA)



In [17]:
data.shape

(6000000, 32)

In [18]:
data.head()

Unnamed: 0,Transaction ID,User ID,Transaction Amount,Transaction Date and Time,Merchant ID,Payment Method,Country Code,Transaction Type,Device Type,IP Address,Browser Type,Operating System,Merchant Category,User Age,User Occupation,User Income,User Gender,User Account Status,Transaction Status,Location Distance,Time Taken for Transaction,Transaction Time of Day,User's Transaction History,Merchant's Reputation Score,User's Device Location,Transaction Currency,Transaction Purpose,User's Credit Score,User's Email Domain,Merchant's Business Age,Transaction Authentication Method,Fraudulent Flag
0,51595306,9822,163.08,2023-01-02 07:47:54,4044,ACH Transfer,KOR,Charity,GPS Device,42.23.223.120,Links,Tizen,Industrial & Scientific,68,Doctor,66826.21,Prefer Not to Say,Pro,In Transit,9.34,24.22,Daytime,26,2.71,United Kingdom,NOK,Consultation Fee,343,cox.co.uk,3,Bluetooth Authentication,0
1,85052974,4698,430.74,2021-09-12 15:15:41,4576,2Checkout,VNM,Cashback,Medical Device,39.52.212.120,Beaker,Windows Server,Beauty & Cosmetics,22,Chemist,89356.71,Genderqueer,Pending Approval,Resolved,65.28,55.11,Daytime,60,3.95,Mexico,EGP,Cashback Reward,688,gmail.com,13,NFC Tag,1
2,23954324,8666,415.74,2023-01-12 17:25:58,4629,Google Wallet,MEX,Reward,Vehicle Infotainment System,243.180.236.29,Opera,Android,Real Estate,71,Nurse,58438.63,Male,Pro,Posted,44.05,53.84,Daytime,81,3.81,Qatar,MXN,Acquisition,371,rocketmail.com,7,Token,1
3,44108303,9012,565.89,2021-02-27 11:31:00,3322,Check,SGP,Purchase,Kiosk,212.186.227.14,Konqueror,CentOS,Appliances,78,Nurse,3426.92,Agender,Premium,Closed,21.7,21.62,Daytime,18,2.67,Spain,CLP,Loan Repayment,687,roadrunner.co.uk,15,Time-Based OTP,1
4,66622683,5185,955.49,2022-09-24 04:06:38,7609,Worldpay,HKG,Acquisition,Smart Mirror,166.113.10.199,Basilisk,Ubuntu,Jewelry,31,Physicist,53080.12,Male,Free,Refunded,56.63,53.71,Daytime,98,3.19,Israel,RUB,Dividend Reinvestment,605,protonmail.co.uk,17,Password,1


In [19]:
data.tail()

Unnamed: 0,Transaction ID,User ID,Transaction Amount,Transaction Date and Time,Merchant ID,Payment Method,Country Code,Transaction Type,Device Type,IP Address,Browser Type,Operating System,Merchant Category,User Age,User Occupation,User Income,User Gender,User Account Status,Transaction Status,Location Distance,Time Taken for Transaction,Transaction Time of Day,User's Transaction History,Merchant's Reputation Score,User's Device Location,Transaction Currency,Transaction Purpose,User's Credit Score,User's Email Domain,Merchant's Business Age,Transaction Authentication Method,Fraudulent Flag
5999995,61037029,7480,448.99,2021-10-20 15:56:32,3346,Discover,SGP,Scholarship,Server,255.134.160.201,Chrome,Windows Mobile,Fitness & Nutrition,77,Plumber,81533.28,Genderqueer,Standard,Held for Security,8.15,59.18,Daytime,34,2.78,Russia,CHF,Invoice Payment,679,aim.com,14,Retina Scan,0
5999996,56515851,5636,841.39,2021-06-14 02:10:00,8415,Alipay,ZAF,Loan,Digital Camera,48.190.84.14,Safari,iOS,Appliances,42,Researcher,85016.11,Other,Active,Hold,15.01,19.92,Nighttime,80,2.6,Malaysia,HUF,Membership,706,cox.net,10,Social Media Login,1
5999997,66863972,5554,197.28,2021-11-06 22:33:19,4231,Afterpay,CAN,Service Charge,Barcode Scanner,7.21.196.39,Internet Explorer,Sailfish OS,Sporting Goods,45,Nurse,33282.35,Agender,Pending Approval,Void,91.18,25.83,Evening,12,1.35,Egypt,HKD,Admission,310,live.co.uk,14,Mobile App Notification,0
5999998,13449701,1275,358.33,2022-03-13 15:02:35,9614,JCB,UK,Fine,Robot,211.202.242.100,Waterfox,Chrome OS,Baby & Maternity,24,Biologist,6896.13,Non-Binary,Unverified,Processed,90.3,38.15,Nighttime,57,1.29,China,AED,Expense Reimbursement,460,rediffmail.com,16,Authentication App,0
5999999,89299426,5418,655.09,2022-10-20 12:04:27,7515,Payoneer,QAT,Refund,Smart Appliance,44.101.127.186,Lynx,QNX,Beauty & Cosmetics,57,Chef,40323.01,Male,Trial,In Transit,53.92,55.57,Evening,57,2.35,New Zealand,NOK,Rental Payment,340,live.co.uk,4,Password,1


In [20]:
data.describe()

Unnamed: 0,Transaction ID,User ID,Transaction Amount,Merchant ID,User Age,User Income,Location Distance,Time Taken for Transaction,User's Transaction History,Merchant's Reputation Score,User's Credit Score,Merchant's Business Age,Fraudulent Flag
count,6000000.0,6000000.0,6000000.0,6000000.0,6000000.0,6000000.0,6000000.0,6000000.0,6000000.0,6000000.0,6000000.0,6000000.0,6000000.0
mean,55019059.604,5499.571,500.552,5500.817,49.002,50493.28,50.506,30.495,50.484,2.999,574.953,10.497,0.5
std,25980671.714,2598.744,288.417,2598.35,18.185,28582.234,28.576,17.036,28.867,1.154,159.02,5.765,0.5
min,10000041.0,1000.0,1.0,1000.0,18.0,1000.02,1.0,1.0,1.0,1.0,300.0,1.0,0.0
25%,32516659.75,3248.0,250.79,3251.0,33.0,25734.458,25.76,15.74,25.0,2.0,437.0,5.0,0.0
50%,55029874.0,5499.0,500.51,5501.0,49.0,50502.94,50.51,30.5,50.0,3.0,575.0,10.0,0.0
75%,77524866.25,7751.0,750.44,7752.0,65.0,75234.535,75.25,45.26,75.0,4.0,713.0,15.0,1.0
max,99999883.0,9999.0,1000.0,9999.0,80.0,100000.0,100.0,60.0,100.0,5.0,850.0,20.0,1.0


In [21]:
#data.rename(columns = {'Fraudulent Flag':'Fraudulent'}, inplace = True)

In [22]:
# Check for missing values: number of nulls per column.
# sep="" stops print() from inserting a space after the newline, which
# otherwise shifts the first row of the table one character to the right.
print("Missing Values:\n", data.isnull().sum(), sep="")


Missing Values:
 Transaction ID                       0
User ID                              0
Transaction Amount                   0
Transaction Date and Time            0
Merchant ID                          0
Payment Method                       0
Country Code                         0
Transaction Type                     0
Device Type                          0
IP Address                           0
Browser Type                         0
Operating System                     0
Merchant Category                    0
User Age                             0
User Occupation                      0
User Income                          0
User Gender                          0
User Account Status                  0
Transaction Status                   0
Location Distance                    0
Time Taken for Transaction           0
Transaction Time of Day              0
User's Transaction History           0
Merchant's Reputation Score          0
User's Device Location               0
Transact

## Adding new Features


In [25]:
# Parse the timestamp column once and reuse it; parsing millions of strings
# separately for each derived feature doubles the cost for no benefit.
transaction_ts = pd.to_datetime(data['Transaction Date and Time'])

# Transaction Month (calendar month number; the year is not kept)
data['Transaction Month'] = transaction_ts.dt.month

# Transaction Hour (hour of day)
data['Transaction Hour'] = transaction_ts.dt.hour

# The parsed timestamps are only needed for the two features above.
del transaction_ts

# Time taken for transaction, in minutes
# NOTE(review): assumes 'Time Taken for Transaction' is recorded in seconds — confirm.
data['Time Taken for Transaction (Minutes)'] = data['Time Taken for Transaction'] / 60

# Age grouping. pd.cut intervals are right-closed by default, so the first
# edge is 17 rather than 18: with an edge at 18, every 18-year-old (the
# minimum age in data.describe()) was labelled 'Child'. The other
# boundaries are unchanged: (17, 35] Young Adult, (35, 60] Adult, (60, 100] Elders.
data['Age Group'] = pd.cut(data['User Age'], bins=[0, 17, 35, 60, 100], labels=['Child', 'Young Adult', 'Adult', 'Elders'])

#



In [26]:
data.head()

Unnamed: 0,Transaction ID,User ID,Transaction Amount,Transaction Date and Time,Merchant ID,Payment Method,Country Code,Transaction Type,Device Type,IP Address,Browser Type,Operating System,Merchant Category,User Age,User Occupation,User Income,User Gender,User Account Status,Transaction Status,Location Distance,Time Taken for Transaction,Transaction Time of Day,User's Transaction History,Merchant's Reputation Score,User's Device Location,Transaction Currency,Transaction Purpose,User's Credit Score,User's Email Domain,Merchant's Business Age,Transaction Authentication Method,Fraudulent Flag,Transaction Month,Transaction Hour,Time Taken for Transaction (Minutes),Age Group
0,51595306,9822,163.08,2023-01-02 07:47:54,4044,ACH Transfer,KOR,Charity,GPS Device,42.23.223.120,Links,Tizen,Industrial & Scientific,68,Doctor,66826.21,Prefer Not to Say,Pro,In Transit,9.34,24.22,Daytime,26,2.71,United Kingdom,NOK,Consultation Fee,343,cox.co.uk,3,Bluetooth Authentication,0,1,7,0.404,Elders
1,85052974,4698,430.74,2021-09-12 15:15:41,4576,2Checkout,VNM,Cashback,Medical Device,39.52.212.120,Beaker,Windows Server,Beauty & Cosmetics,22,Chemist,89356.71,Genderqueer,Pending Approval,Resolved,65.28,55.11,Daytime,60,3.95,Mexico,EGP,Cashback Reward,688,gmail.com,13,NFC Tag,1,9,15,0.918,Young Adult
2,23954324,8666,415.74,2023-01-12 17:25:58,4629,Google Wallet,MEX,Reward,Vehicle Infotainment System,243.180.236.29,Opera,Android,Real Estate,71,Nurse,58438.63,Male,Pro,Posted,44.05,53.84,Daytime,81,3.81,Qatar,MXN,Acquisition,371,rocketmail.com,7,Token,1,1,17,0.897,Elders
3,44108303,9012,565.89,2021-02-27 11:31:00,3322,Check,SGP,Purchase,Kiosk,212.186.227.14,Konqueror,CentOS,Appliances,78,Nurse,3426.92,Agender,Premium,Closed,21.7,21.62,Daytime,18,2.67,Spain,CLP,Loan Repayment,687,roadrunner.co.uk,15,Time-Based OTP,1,2,11,0.36,Elders
4,66622683,5185,955.49,2022-09-24 04:06:38,7609,Worldpay,HKG,Acquisition,Smart Mirror,166.113.10.199,Basilisk,Ubuntu,Jewelry,31,Physicist,53080.12,Male,Free,Refunded,56.63,53.71,Daytime,98,3.19,Israel,RUB,Dividend Reinvestment,605,protonmail.co.uk,17,Password,1,9,4,0.895,Young Adult
