In [None]:

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.base import clone

In [None]:
# Read the transactions CSV into `df` and preview its first five rows.
csv_path = 'AIML Dataset.csv'
df = pd.read_csv(csv_path)
df.head(n=5)

In [None]:
# Print a summary of the frame: column names, dtypes and non-null counts.
df.info()

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# List the column labels available for analysis.
df.columns

In [None]:
# Mean of the 0/1 isFraud label per transaction type, i.e. the fraud rate of each type.
df['isFraud'].groupby(df['type']).mean()

In [None]:
# Count fraudulent rows (isFraud == 1) per transaction type, most frequent first.
df_fraud_value_count = df.loc[df['isFraud'].eq(1), 'type'].value_counts()
df_fraud_value_count


In [None]:
# Bar chart: number of fraudulent transactions for each transaction type.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    x=df_fraud_value_count.index,
    y=df_fraud_value_count.values,
    ax=ax,
)
ax.set_title('Fraud Transaction Types')
ax.set_xlabel('Transaction Type')
ax.set_ylabel('Number of Fraud Transactions')
plt.show()

In [None]:
# Summing the 0/1 isFraud label per `step` gives the fraud count at each time step.
df_fraud_over_time = df.groupby('step')['isFraud'].sum()

# Line chart of that count against the time step.
fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(
    x=df_fraud_over_time.index,
    y=df_fraud_over_time.values,
    ax=ax,
)
ax.set_title('Fraud Transactions Over Time')
ax.set_xlabel('Time Step')
ax.set_ylabel('Number of Fraud Transactions')
plt.show()

In [None]:
# Histogram of transaction amounts using 50 bins.
amount_ax = df['amount'].hist(bins=50, figsize=(10, 6))
amount_ax.set_title('Transaction Amount Distribution')
amount_ax.set_xlabel('Amount')
amount_ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Compare the amount distribution of non-fraud (0) and fraud (1) transactions.
box_ax = sns.boxplot(data=df, x='isFraud', y='amount')
box_ax.set(
    title='Transaction Amount by Fraud Status',
    xlabel='Is Fraud',
    ylabel='Amount',
)
plt.show()

In [None]:
# Bucket transactions into amount deciles and plot the fraud rate per bucket.
# NOTE: this adds an `amount_bin` column to `df`; the modelling split further
# down drops it again before building the feature matrices.
# duplicates='drop' keeps qcut from raising when several quantile edges coincide
# (the plot then simply shows fewer than 10 bins).
df['amount_bin'] = pd.qcut(df['amount'], q=10, duplicates='drop')

# `observed` is passed explicitly: grouping on a categorical key without it
# emits a FutureWarning on recent pandas and relies on a default that is changing.
fraud_rate = df.groupby('amount_bin', observed=False)['isFraud'].mean()

fraud_rate.plot(kind='bar', figsize=(12, 6))
plt.title('Fraud Rate by Transaction Amounts')
plt.xlabel('Transaction Amount Bin')
plt.ylabel('Fraud Rate')
plt.show()

In [None]:
# Scatter of origin balance vs. amount, coloured by fraud label, on a random subset.
# The subset is drawn with a fixed seed so the figure is identical on every
# Restart & Run All, and its size is capped at the frame length so the cell
# also works on datasets with fewer than 50,000 rows.
scatter_sample = df.sample(n=min(50000, len(df)), random_state=42)
sns.scatterplot(x='oldbalanceOrg', y='amount', hue='isFraud', data=scatter_sample)
plt.title('Old Balance vs Amount by Fraud Status')
plt.show()

In [None]:
# Ten `nameOrig` values that appear most often among fraudulent rows.
df_origin_in_fraud = (
    df.loc[df['isFraud'].eq(1), 'nameOrig']
    .value_counts()
    .head(10)
)

origin_ax = df_origin_in_fraud.plot.bar(figsize=(12, 6))
origin_ax.set_title('Top 10 Originating Accounts in Fraud Transactions')
origin_ax.set_xlabel('Account')
origin_ax.set_ylabel('Number of Fraud Transactions')
plt.show()

In [None]:
# Ten `nameDest` values that appear most often among fraudulent rows.
df_dest_in_fraud = (
    df.loc[df['isFraud'].eq(1), 'nameDest']
    .value_counts()
    .head(10)
)

dest_ax = df_dest_in_fraud.plot.bar(figsize=(12, 6))
dest_ax.set_title('Top 10 Destination Accounts in Fraud Transactions')
dest_ax.set_xlabel('Account')
dest_ax.set_ylabel('Number of Fraud Transactions')
plt.show()

In [None]:
# Show the raw counts behind the "top originating accounts" chart.
df_origin_in_fraud

In [None]:
# Show the raw counts behind the "top destination accounts" chart.
df_dest_in_fraud

In [None]:
# Cross-tabulate the isFraud label (rows) against the isFlaggedFraud column (columns).
pd.crosstab(index=df['isFraud'], columns=df['isFlaggedFraud'])
# 8197 fraudulent transactions were not flagged as fraud in the dataset

In [None]:
# Preview rows where oldbalanceOrg is 0 although the transaction amount is positive.
zero_origin_balance = df['oldbalanceOrg'].eq(0)
has_positive_amount = df['amount'].gt(0)
df[zero_origin_balance & has_positive_amount].head()


In [None]:
# Preview rows where the destination balance columns are equal (newbalanceDest ==
# oldbalanceDest) although the transaction amount is positive.
dest_balance_unchanged = df['newbalanceDest'].eq(df['oldbalanceDest'])
amount_is_positive = df['amount'].gt(0)
df[dest_balance_unchanged & amount_is_positive].head()

In [None]:
# Pairwise correlations of the numeric columns, shown as a colour-graded table.
corr_matrix = df.corr(numeric_only=True)
corr_matrix.style.background_gradient(cmap='coolwarm')


In [None]:
# Chronological split on `step`: rows before 80% of the maximum step are used
# for training, the remaining rows for testing.
split_step = df['step'].max() * 0.8  # computed once so both masks share one cutoff

# Columns removed from both splits before modelling.
dropped_columns = ['nameOrig', 'nameDest', 'isFlaggedFraud']
train_set = df[df['step'] < split_step].drop(columns=dropped_columns)
test_set = df[df['step'] >= split_step].drop(columns=dropped_columns)

# Target vectors.
y_train = train_set['isFraud']
y_test = test_set['isFraud']

# Feature matrices: everything except the target and the EDA-only `amount_bin`
# column. errors='ignore' lets this cell run even when the amount-decile EDA
# cell was skipped and `amount_bin` was never added to `df`.
x_train = train_set.drop(columns=['isFraud', 'amount_bin'], errors='ignore')
x_test = test_set.drop(columns=['isFraud', 'amount_bin'], errors='ignore')


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

In [None]:
# Dense one-hot encoder; categories unseen during fit are encoded as all zeros.
onehot = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Split the feature columns by dtype. Numeric columns are detected positively
# and everything else is treated as categorical, so text columns stored as
# `category` or pandas' string dtype (which are not `object`) are one-hot
# encoded instead of being passed to the scaler.
numeric_features = [
    feature for feature in x_train.columns
    if pd.api.types.is_numeric_dtype(x_train[feature])
]
categorical_features = [
    feature for feature in x_train.columns
    if feature not in numeric_features
]
print(f'{categorical_features}, {numeric_features}')

In [None]:
# Column-wise preprocessing: standardize the numeric features and one-hot
# encode the categorical ones.
column_steps = [
    ('num', StandardScaler(), numeric_features),
    ('cat', onehot, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_steps)
preprocessor

# Random Forest

In [None]:
# Random-forest pipeline: preprocessing followed by the classifier.
# The preprocessor is cloned so this pipeline owns its own transformer;
# Pipeline.fit fits its steps in place, and the same `preprocessor` object is
# otherwise shared with (and refit by) the other model pipelines.
# n_jobs=-1 trains the trees on all available cores; with a fixed random_state
# the fitted forest is the same as with a single worker.
rf_pipe = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1))
])
rf_pipe

In [None]:
# Fit the preprocessing + random-forest pipeline on the training split.
rf_pipe.fit(x_train, y_train)

In [None]:
# Hard class predictions on the test split.
rf_y_pred = rf_pipe.predict(x_test)

# Column 1 of predict_proba is kept as the score used for ROC-AUC.
rf_class_probs = rf_pipe.predict_proba(x_test)
rf_y_proba = rf_class_probs[:, 1]

In [None]:
# Test-set report for the random forest: per-class metrics, then ROC-AUC.
print('----- Random Forest -----')
rf_report = classification_report(y_test, rf_y_pred)
print(rf_report)
rf_auc = roc_auc_score(y_test, rf_y_proba)
print(f'ROC-AUC: {rf_auc:.4f}\n')

In [None]:
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report, roc_auc_score


# models = {
#     'Logistic Regression': LogisticRegression(max_iter=2000),
#     'LightGBM': LGBMClassifier(class_weight='balanced'),
#     # 'Random Forest': RandomForestClassifier(class_weight='balanced')
# }

# for name, model in models.items():
#     pipe = Pipeline(steps=[
#         ('Preprocessor', preprocessor),
#         ('Model', model)
#     ])
    
#     # Train the model
#     pipe.fit(x_train, y_train)
    
#     # Predict on test data
#     y_pred = pipe.predict(x_test)
#     y_proba = pipe.predict_proba(x_test)[:, 1]  # needed for ROC-AUC
    
#     # Evaluate
#     print(f'----- {name} -----')
#     print(classification_report(y_test, y_pred))
#     print(f'ROC-AUC: {roc_auc_score(y_test, y_proba):.4f}\n')

# Logistic Regression

In [None]:
# Logistic-regression pipeline: preprocessing followed by the classifier.
# The preprocessor is cloned so this pipeline owns its own transformer;
# Pipeline.fit fits its steps in place, and the same `preprocessor` object is
# otherwise shared with (and refit by) the other model pipelines.
lr_pipe = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('classifier', LogisticRegression(max_iter=2000))
])
lr_pipe

In [None]:
# Fit the preprocessing + logistic-regression pipeline on the training split.
lr_pipe.fit(x_train, y_train)

In [None]:
# Hard class predictions on the test split.
lr_y_pred = lr_pipe.predict(x_test)

# Column 1 of predict_proba is kept as the score used for ROC-AUC.
lr_class_probs = lr_pipe.predict_proba(x_test)
lr_y_proba = lr_class_probs[:, 1]

In [None]:
# Test-set report for logistic regression: per-class metrics, then ROC-AUC.
print('----- Logistic Regression -----')
lr_report = classification_report(y_test, lr_y_pred)
print(lr_report)
lr_auc = roc_auc_score(y_test, lr_y_proba)
print(f'ROC-AUC: {lr_auc:.4f}\n')

# LightGBM

In [None]:
# LightGBM pipeline: preprocessing followed by a class-weight-balanced classifier.
# The preprocessor is cloned so this pipeline owns its own transformer;
# Pipeline.fit fits its steps in place, and the same `preprocessor` object is
# otherwise shared with (and refit by) the other model pipelines.
lightgbm_pipe = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('classifier', LGBMClassifier(class_weight='balanced'))
])
lightgbm_pipe

In [None]:
# Fit the preprocessing + LightGBM pipeline on the training split.
lightgbm_pipe.fit(x_train, y_train)


In [None]:
# Hard class predictions on the test split.
lightgbm_y_pred = lightgbm_pipe.predict(x_test)

# Column 1 of predict_proba is kept as the score used for ROC-AUC.
lightgbm_class_probs = lightgbm_pipe.predict_proba(x_test)
lightgbm_y_proba = lightgbm_class_probs[:, 1]

In [None]:
# Test-set report for LightGBM: per-class metrics, then ROC-AUC.
print('----- LightGBM Classifier -----')
lightgbm_report = classification_report(y_test, lightgbm_y_pred)
print(lightgbm_report)
lightgbm_auc = roc_auc_score(y_test, lightgbm_y_proba)
print(f'ROC-AUC: {lightgbm_auc:.4f}\n')

In [None]:
import joblib

# Serialize each fitted pipeline (preprocessing + classifier) to its own file.
joblib.dump(value=rf_pipe, filename='fraud_detection_model_Random-forest.pkl')
joblib.dump(value=lightgbm_pipe, filename='fraud_detection_model_LightGBM.pkl')
joblib.dump(value=lr_pipe, filename='fraud_detection_model_Logistic-Regression.pkl')