In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [9]:
df = pd.read_csv("D:/Semester 6/Artificial Intelligence/AI Project/Synthetic_Financial_datasets_log.csv")

In [14]:
try:
    # Attempt to select features
    X = df[["amount", "oldbalanceOrg", "newbalanceOrig"]]
    print("Selected features:", X.columns.tolist())
except KeyError:
    print("Error: Features not found in DataFrame. Available columns:")
    print(df.columns.tolist())  # Print available columns for reference

Selected features: ['amount', 'oldbalanceOrg', 'newbalanceOrig']


In [15]:
# Seperating data for analysis
legit = df[df.isFlaggedFraud == 0] # Entire row will be stored in legit
fraud = df[df.isFlaggedFraud == 1]
# Pandas Series

In [16]:
print(legit.shape)
print(fraud.shape)

(6362604, 11)
(16, 11)


In [18]:
# Statistical measures about the data
legit.amount.describe()
# Mean transaction is about $88
# 25% of transaction amount are less than $5.65
# 50% of transaction amount are less than $22

count    6.362604e+06
mean     1.798501e+05
std      6.037884e+05
min      0.000000e+00
25%      1.338955e+04
50%      7.487127e+04
75%      2.087205e+05
max      9.244552e+07
Name: amount, dtype: float64

In [20]:
fraud.amount.describe()
# Mean transaction is about $122.211
# Higher than normal transcation
# Important insight

count    1.600000e+01
mean     4.861598e+06
std      3.572499e+06
min      3.538742e+05
25%      2.242749e+06
50%      4.234245e+06
75%      7.883451e+06
max      1.000000e+07
Name: amount, dtype: float64

In [30]:
# Dealing with unbalanced data3
# We will use undersampling
# Build sample dataset containing similar distribution of normal transcations and fraud transaction
# Take randomly 492 transaction from legit one
# Then we will have 492 normal transaction and 492 fraudlent transcation
# Very good dataset
# We can make better prediction using Machine Learning
legit_sample = legit.sample(n=10000)
# Random 492 data points

In [31]:
# Concatenating two data frames (legit_sample and fraud)
new_dataset = pd.concat([legit_sample, fraud], axis=0)
# Frames will be added one by one
# axis = 0 row wise
# axis = 1 column wise
# all 492 will be added below legit_sample

In [32]:
new_dataset['isFlaggedFraud'].value_counts()

0    10000
1       16
Name: isFlaggedFraud, dtype: int64

In [33]:
X = new_dataset[["amount", "oldbalanceOrg", "newbalanceOrig"]]
X_train, X_test, y_train, y_test = train_test_split(X, new_dataset['isFraud'], test_size=0.2, random_state=42)

# Define the Random Forest model
model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Predict on unseen data
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_pred, y_test)
precision = precision_score(y_pred, y_test)
recall = recall_score(y_pred, y_test)
f1 = f1_score(y_pred, y_test)

# Print all the metrics
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")

Accuracy: 0.9995
Precision: 0.7500
Recall: 1.0000
F1-score: 0.8571
