In [1]:
import pandas as pd
import numpy as np 
import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv('cleaned_data.csv')
df.head()

Unnamed: 0,Transaction_Type,Amount,initial_balance,new_balance,recipient_initial_balance,recipient_new_balance,Fraud
0,TRANSFER,181.0,181.0,0.0,0.0,0.0,1.0
1,CASH_OUT,181.0,181.0,0.0,21182.0,0.0,1.0
2,DEBIT,5337.77,41720.0,36382.23,41898.0,40348.79,0.0
3,DEBIT,9644.94,4465.0,0.0,10845.0,157982.12,0.0
4,CASH_OUT,229133.94,15325.0,0.0,5083.0,51513.44,0.0


In [3]:
df.shape

(1587569, 7)

In [4]:
df.isnull().sum()

Transaction_Type             0
Amount                       0
initial_balance              0
new_balance                  0
recipient_initial_balance    0
recipient_new_balance        0
Fraud                        0
dtype: int64

In [5]:
df.dtypes

Transaction_Type              object
Amount                       float64
initial_balance              float64
new_balance                  float64
recipient_initial_balance    float64
recipient_new_balance        float64
Fraud                        float64
dtype: object

In [6]:
df['Fraud'] = df['Fraud'].astype(int)

In [7]:
df.dtypes

Transaction_Type              object
Amount                       float64
initial_balance              float64
new_balance                  float64
recipient_initial_balance    float64
recipient_new_balance        float64
Fraud                          int32
dtype: object

In [8]:
df['Fraud'].value_counts(normalize=True)*100

Fraud
0    99.85777
1     0.14223
Name: proportion, dtype: float64

In [9]:
#define numerical & categorical columns
numeric_features = {feature for feature in df.columns if df[feature].dtype != 'O'}

categorical_features = {feature for feature in df.columns if df[feature].dtype == 'O'}

#print columns
print('We have {} numerical features : {}'.format(len(numeric_features), numeric_features))
print('We have {} categorical features : {}'.format(len(categorical_features), categorical_features))

We have 6 numerical features : {'Amount', 'initial_balance', 'recipient_initial_balance', 'recipient_new_balance', 'new_balance', 'Fraud'}
We have 1 categorical features : {'Transaction_Type'}


In [10]:
x = df.drop(columns=['Fraud'], axis=1)
y = df['Fraud']

In [11]:
#Partioning dataset. Due to inbalance dataset value, use stratified sampling
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.3,random_state=42,stratify = y)

In [12]:
# Create Column Transformer with 3 types of transformers
num_features = x_train.select_dtypes(exclude="object").columns
cat_features = x_train.select_dtypes(include="object").columns

In [13]:
# Preprocessing of train data

from sklearn.preprocessing import StandardScaler, LabelEncoder

num_features = x_train.select_dtypes(exclude="object").columns
cat_features = x_train.select_dtypes(include="object").columns

numeric_transformer = StandardScaler()
le = LabelEncoder()

# Encoding categorical features
for feature in cat_features:
    x_train[feature] = le.fit_transform(x_train[feature])

# Scaling numeric features
x_train[num_features] = numeric_transformer.fit_transform(x_train[num_features])


In [14]:
# Preprocessing of test data

num_features_test = x_test.select_dtypes(exclude="object").columns
cat_features_test = x_test.select_dtypes(include="object").columns

# Encoding categorical features
for feature in cat_features_test:
    x_test[feature] = le.fit_transform(x_test[feature])

# Scaling numeric features
x_test[num_features_test] = numeric_transformer.fit_transform(x_test[num_features_test])

In [16]:
# Label encoding target variable 
y_train = le.fit_transform(y_train)
y_test = le.fit_transform(y_test)

In [17]:
from sklearn.ensemble import RandomForestClassifier
model_rf=RandomForestClassifier(n_estimators=100, criterion='gini', random_state = 100,max_depth=6, min_samples_leaf=8)

In [18]:
model_rf.fit(x_train,y_train)

In [19]:
y_pred=model_rf.predict(x_test)

In [20]:
model_rf.score(x_test,y_test)

0.9989816721992311

In [22]:
from sklearn.metrics import classification_report
from sklearn import metrics
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    475594
           1       1.00      0.28      0.44       677

    accuracy                           1.00    476271
   macro avg       1.00      0.64      0.72    476271
weighted avg       1.00      1.00      1.00    476271



In [23]:
df.dtypes

Transaction_Type              object
Amount                       float64
initial_balance              float64
new_balance                  float64
recipient_initial_balance    float64
recipient_new_balance        float64
Fraud                          int32
dtype: object

In [25]:
import pickle

In [26]:
# Create label encoder mappings
label_encoder_mappings = {}

# Define categorical features and their mappings
categorical_features = ['Transaction_Type']
for feature in categorical_features:
    label_encoder = LabelEncoder()
    label_encoder_mappings[feature] = label_encoder

# Save the label encoder mappings to a file
with open('label_encoder_mappings2.pkl', 'wb') as file:
    pickle.dump(label_encoder_mappings, file)

In [27]:
filename = 'fraud_predict.pkl'

pickle.dump(model_rf, open(filename, 'wb'))

In [28]:
df.head()

Unnamed: 0,Transaction_Type,Amount,initial_balance,new_balance,recipient_initial_balance,recipient_new_balance,Fraud
0,TRANSFER,181.0,181.0,0.0,0.0,0.0,1
1,CASH_OUT,181.0,181.0,0.0,21182.0,0.0,1
2,DEBIT,5337.77,41720.0,36382.23,41898.0,40348.79,0
3,DEBIT,9644.94,4465.0,0.0,10845.0,157982.12,0
4,CASH_OUT,229133.94,15325.0,0.0,5083.0,51513.44,0


In [29]:
df['Transaction_Type'].value_counts()

Transaction_Type
CASH_OUT    848913
CASH_IN     525111
TRANSFER    197767
DEBIT        15778
Name: count, dtype: int64

In [31]:
df.describe()

Unnamed: 0,Amount,initial_balance,new_balance,recipient_initial_balance,recipient_new_balance,Fraud
count,1587569.0,1587569.0,1587569.0,1587569.0,1587569.0,1587569.0
mean,233645.0,1240285.0,1275518.0,1496481.0,1666234.0,0.0014223
std,301151.8,3518749.0,3559572.0,2687266.0,2768032.0,0.03768658
min,0.37,0.0,0.0,0.0,0.0,0.0
25%,77750.92,0.0,0.0,140046.5,228847.1,0.0
50%,162625.7,18075.28,0.0,554866.1,708717.4,0.0
75%,284506.2,199616.0,292653.1,1654012.0,1900449.0,0.0
max,10000000.0,38939420.0,38946230.0,42283780.0,42655770.0,1.0


In [40]:
print(max(df['recipient_new_balance']))
print(min(df['recipient_new_balance']))
print(df['recipient_new_balance'].mean())

42655769.2
0.0
1666233.6589790047
