<a href="https://colab.research.google.com/github/GraciousWeb/credit-risk-fraud-detection/blob/main/credit_risk_fraud_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [22]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import KNNImputer

In [23]:
df = pd.read_csv('fraudTrain.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,...,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376000.0,36.011293,-82.048315,0.0
1,1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,...,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376000.0,49.159047,-118.186462,0.0
2,2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,...,42.1808,-112.262,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376000.0,43.150704,-112.154481,0.0
3,3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,...,46.2306,-112.1138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376000.0,47.034331,-112.561071,0.0
4,4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,...,38.4207,-79.4629,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376000.0,38.674999,-78.632459,0.0


In [24]:
print(df.columns)

Index(['Unnamed: 0', 'trans_date_trans_time', 'cc_num', 'merchant', 'category',
       'amt', 'first', 'last', 'gender', 'street', 'city', 'state', 'zip',
       'lat', 'long', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time',
       'merch_lat', 'merch_long', 'is_fraud'],
      dtype='object')


In [25]:
df.shape

(105015, 23)

# From this dataset, these are the necessary information needed to predict if fraud occured:


==>  amount, city population, age, hour of transaction, distance of merchant from customer, merchant risk, category of transaction, gender




In [26]:
#extract the hour of transaction from the transaction date and time
df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])
df['hour'] = df['trans_date_trans_time'].dt.hour

In [27]:
#calculate the age of customer
df['dob'] = pd.to_datetime(df['dob'])
df['age'] = df['trans_date_trans_time'].dt.year - df['dob'].dt.year #subtract the year in dob from the year in the transaction date

In [28]:
#calculate the distance between merchant and customer using the haversine formula
import numpy as np

def haversine_distance(lat1, lon1, lat2, lon2):
    #radius of the Earth in miles
    r = 3958.8

    #convert degrees to radians
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])

    #differences
    dlat = lat2 - lat1
    dlon = lon2 - lon1

    #haversine formula math
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    c = 2 * np.arcsin(np.sqrt(a))

    return c * r

#create the 'distance' column
df['distance'] = haversine_distance(df['lat'], df['long'], df['merch_lat'], df['merch_long'])

In [29]:
columns_to_drop = ['Unnamed: 0', 'trans_num', 'cc_num', 'first', 'last', 'unix_time', 'street', 'zip', 'city',
                   'state', 'lat', 'long', 'dob', 'trans_date_trans_time', 'merch_lat', 'merch_long']
df.drop(columns=columns_to_drop, inplace=True)
df.head()

Unnamed: 0,merchant,category,amt,gender,city_pop,job,is_fraud,hour,age,distance
0,"fraud_Rippin, Kub and Mann",misc_net,4.97,F,3495,"Psychologist, counselling",0.0,0,31,48.838809
1,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,F,149,Special educational needs teacher,0.0,0,41,18.773185
2,fraud_Lind-Buckridge,entertainment,220.11,M,4154,Nature conservation officer,0.0,0,57,67.236892
3,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,M,1939,Patent attorney,0.0,0,52,59.449252
4,fraud_Keeling-Crist,misc_pos,41.96,M,99,Dance movement psychotherapist,0.0,0,33,48.192064


In [30]:
df = df.dropna(subset=['is_fraud'])

In [31]:
#split between training and test sets
X = df.drop('is_fraud', axis=1)
y = df['is_fraud']

In [32]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
#stratify is used to keep the training and test sets(validation set in this case) in the target variable proportional in the case of class imbalance

##Perform target encoding to create merchant_risk and job_risk columns

In [33]:
#create a mapping for merchant risk
merchant_risk = y_train.groupby(X_train['merchant']).mean()

X_train['merchant_risk'] = X_train['merchant'].map(merchant_risk)
X_val['merchant_risk'] = X_val['merchant'].map(merchant_risk)

In [34]:
print(merchant_risk)

merchant
fraud_Abbott-Rogahn                 0.007042
fraud_Abbott-Steuber                0.008929
fraud_Abernathy and Sons            0.009524
fraud_Abshire PLC                   0.000000
fraud_Adams, Kovacek and Kuhlman    0.018519
                                      ...   
fraud_Zemlak Group                  0.015152
fraud_Zemlak, Tillman and Cremin    0.000000
fraud_Ziemann-Waters                0.000000
fraud_Zieme, Bode and Dooley        0.010471
fraud_Zulauf LLC                    0.000000
Name: is_fraud, Length: 693, dtype: float64


In [35]:
X_val['merchant_risk'] = X_val['merchant_risk'].fillna(y_train.mean())
#considering that you are splitting by rows and not by merchants, some merchants may be in the validation set but not in the training set
#in this case, if the model finds a new merchant in the validation set, do not disregard them, instead replace them with the average fraud risk of any transaction

In [36]:
job_risk = y_train.groupby(X_train['job']).mean()
X_train['job_risk'] = X_train['job'].map(job_risk)
X_val['job_risk'] = X_val['job'].map(job_risk)

In [37]:
X_val['job_risk'] = X_val['job_risk'].fillna(y_train.mean())

In [38]:
#drop the old text columns
X_train = X_train.drop(['merchant', 'job'], axis=1)
X_val = X_val.drop(['merchant', 'job'], axis=1)

In [39]:
X_train = pd.get_dummies(X_train, columns=['category', 'gender'])
X_val = pd.get_dummies(X_val, columns=['category', 'gender'])

X_train, X_val = X_train.align(X_val, join='left', axis=1, fill_value=0) #this means align X_tain and X_val in such a way that both
#tables have the exact same labels. Use the left column as the reference. So it aligns X-val with X_train and drop extra columns in X_val

In [40]:
print(X_val.dtypes)

amt                        float64
city_pop                     int64
hour                         int32
age                          int32
distance                   float64
merchant_risk              float64
job_risk                   float64
category_entertainment        bool
category_food_dining          bool
category_gas_transport        bool
category_grocery_net          bool
category_grocery_pos          bool
category_health_fitness       bool
category_home                 bool
category_kids_pets            bool
category_misc_net             bool
category_misc_pos             bool
category_personal_care        bool
category_shopping_net         bool
category_shopping_pos         bool
category_travel               bool
gender_F                      bool
gender_M                      bool
dtype: object


In [41]:
#Scaling and KNN imputation
#because KNN requires all values to be on the same scale (0-1) that is why we are starting with scaling, MinMax scaling in particular
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

#Apply KNN Imputer to fill missing values
imputer = KNNImputer(n_neighbors=5)
X_train_imputed = imputer.fit_transform(X_train_scaled)
X_val_imputed = imputer.transform(X_val_scaled)

In [42]:
#Use Isolation Forest to perform anomaly detection. It assigns an anomaly score when anomalies are detected
iso_forest = IsolationForest(contamination=0.01, random_state=42) #The contamination rate is your estimate of how much data is anomalous. 0.01 means 1%

train_outliers = iso_forest.fit_predict(X_train_imputed)
val_outliers = iso_forest.predict(X_val_imputed)

X_train_final = np.column_stack([X_train_imputed, (train_outliers == -1).astype(int)]) #if train_outliers == -1 will give "True/False" and  will be converted to 1/0 binary
X_val_final = np.column_stack([X_val_imputed, (val_outliers == -1).astype(int)]) #sames applies here

In [43]:
#perform SMOTE for class imbalance
from imblearn.over_sampling import SMOTE

sm = SMOTE(random_state=42)

X_train_resampled, y_train_resampled = sm.fit_resample(X_train_final, y_train)
print(f"Original fraud count: {sum(y_train == 1)}")
print(f"New fraud count after SMOTE: {sum(y_train_resampled == 1)}")

Original fraud count: 824
New fraud count after SMOTE: 83187


In [44]:
#Train with XGBOOST
from xgboost import XGBClassifier

xgb_model = XGBClassifier(random_state=42)
xgb_model.fit(X_train_resampled, y_train_resampled)

y_pred_xgb = xgb_model.predict(X_val_final) #predict gives a single yes or no answer to whether the transaction is fraud or not
y_proba_xgb = xgb_model.predict_proba(X_val_final)[:, 1] #predict_proba = how sure are you that this is fraud? 0.01 means 1 percent chance
# predict_proba returns a table with the probability of not fraud first and then the probability of fraud.
# [:, 1] means, give me the first index which is the fraud column

In [45]:
from sklearn.svm import SVC
svm_model = SVC(kernel='linear', class_weight='balanced', probability=True, random_state=42)

svm_model.fit(X_train_resampled, y_train_resampled)

y_pred_svm = svm_model.predict(X_val_final)
y_proba_svm = svm_model.predict_proba(X_val_final)[:, 1]

In [46]:
from sklearn.metrics import classification_report, precision_recall_curve, auc

precision, recall, _ = precision_recall_curve(y_val, y_proba_xgb)
auc_score = auc(recall, precision)

print(f"XGBoost PR-AUC: {auc_score:.4f}")
print("\nClassification Report for XGBoost:")
print(classification_report(y_val, y_pred_xgb))

XGBoost PR-AUC: 0.8444

Classification Report for XGBoost:
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     20797
         1.0       0.80      0.80      0.80       206

    accuracy                           1.00     21003
   macro avg       0.90      0.90      0.90     21003
weighted avg       1.00      1.00      1.00     21003

