In [1]:
import time

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder


In [2]:
# Load the bank-payments dataset; the relative path is resolved against the
# notebook's current working directory.
df = pd.read_csv('Fraud Detection on Bank Payments.csv')

# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called bare: wrapping it in print() only appended a stray "None" line.
df.info()

# Separate features and target variable
X = df.drop(columns=['fraud'])  # Features: every column except the label
y = df['fraud']  # Target variable: the int64 `fraud` column

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 594643 entries, 0 to 594642
Data columns (total 10 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   step         594643 non-null  int64  
 1   customer     594643 non-null  object 
 2   age          594643 non-null  object 
 3   gender       594643 non-null  object 
 4   zipcodeOri   594643 non-null  object 
 5   merchant     594643 non-null  object 
 6   zipMerchant  594643 non-null  object 
 7   category     594643 non-null  object 
 8   amount       594643 non-null  float64
 9   fraud        594643 non-null  int64  
dtypes: float64(1), int64(2), object(7)
memory usage: 45.4+ MB
None


In [3]:
# Number of rows in the loaded frame (first component of the (rows, columns) shape)
df.shape[0]

594643

In [8]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Column groups: the two numeric columns are standardised, the object-typed
# columns are one-hot encoded.
numeric_features = ['step', 'amount']
categorical_features = [
    'customer',
    'age',
    'gender',
    'zipcodeOri',
    'merchant',
    'zipMerchant',
    'category',
]

# Per-group transformers, each wrapped in a single-step pipeline.
numeric_transformer = Pipeline([('scaler', StandardScaler())])
# handle_unknown='ignore': categories not seen during fit do not raise at transform time.
categorical_transformer = Pipeline([('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Route each column group to its transformer.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Example model: preprocessing followed by a 100-tree random forest that uses
# class_weight='balanced' to re-weight the classes.
rf = Pipeline([
    ('preprocessor', preprocessor),
    (
        'classifier',
        RandomForestClassifier(
            class_weight='balanced',
            n_estimators=100,
            random_state=42,
        ),
    ),
])

In [9]:
# Gradient Boosting model (GradientBoostingClassifier is imported in the
# notebook's top import cell rather than mid-notebook).
# NOTE: `preprocessor` is the same object held by the `rf` pipeline, so fitting
# either pipeline (re)fits that shared transformer.
gb = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', GradientBoostingClassifier(random_state=42))
])

# Train the model. perf_counter() is a monotonic clock intended for measuring
# elapsed intervals, so the reported duration is unaffected by system clock
# adjustments during the fit.
start_time = time.perf_counter()
gb.fit(X_train, y_train)
end_time = time.perf_counter()

# Predict on the held-out set and compute the evaluation metrics
y_pred_gb = gb.predict(X_test)
accuracy_gb = accuracy_score(y_test, y_pred_gb)
precision_gb = precision_score(y_test, y_pred_gb)
recall_gb = recall_score(y_test, y_pred_gb)
f1_gb = f1_score(y_test, y_pred_gb)
cm_gb = confusion_matrix(y_test, y_pred_gb)
print(f"Gradient Boosting Accuracy: {accuracy_gb}")
print(f"Precision: {precision_gb}")
print(f"Recall: {recall_gb}")
print(f"F1 Score: {f1_gb}")
print(f"Confusion Matrix: \n{cm_gb}")
# The timer brackets gb.fit() only; prediction time is not included
print(f"Time taken: {end_time - start_time} seconds")

# Print classification report (per-class precision / recall / F1 / support)
print("\nClassification Report:")
print(classification_report(y_test, y_pred_gb))


Gradient Boosting Accuracy: 0.9961994024429209
Precision: 0.8946212952799122
Recall: 0.7703213610586012
F1 Score: 0.8278313864906044
Confusion Matrix: 
[[176085    192]
 [   486   1630]]
Time taken: 117.67541146278381 seconds

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    176277
           1       0.89      0.77      0.83      2116

    accuracy                           1.00    178393
   macro avg       0.95      0.88      0.91    178393
weighted avg       1.00      1.00      1.00    178393



In [None]:
# Train the random-forest pipeline. perf_counter() is a monotonic clock
# intended for measuring elapsed intervals, so the duration reported by the
# next cell is unaffected by system clock adjustments during the fit.
start_time = time.perf_counter()
rf.fit(X_train, y_train)
end_time = time.perf_counter()

# Predict on the held-out set and compute the metrics that the following cell prints
y_pred = rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

In [None]:
# Summarise the RandomForest hold-out metrics, one item per line.
# The elapsed time comes from the timer wrapped around rf.fit() in the training cell.
print(
    f"RandomForest Accuracy: {accuracy}",
    f"Precision: {precision}",
    f"Recall: {recall}",
    f"F1 Score: {f1}",
    f"Confusion Matrix: \n{cm}",
    f"Time taken: {end_time - start_time} seconds",
    sep="\n",
)

# Per-class precision / recall / F1 / support table
print(
    "\nClassification Report:",
    classification_report(y_test, y_pred),
    sep="\n",
)


In [None]:
# Example input for prediction: a single hand-built transaction with
# deliberately extreme values. Requires `rf` to have been fitted already.
input_data_fraud = pd.DataFrame({
    'step': [300],  # A later step in the simulation
    'customer': ['C9999999999'],  # Assumed to be a new or rare customer ID
    'age': ['1'],  # Age-group code, given as a string like the CSV's object column
    'gender': ['F'],
    'zipcodeOri': ['99999'],  # Uncommon zip code
    'merchant': ['M999999999'],  # Assumed to be a new or rare merchant ID
    'zipMerchant': ['99999'],  # Uncommon merchant zip code
    'category': ['es_other'],  # Assumed less common category
    'amount': [1000000.00]  # Extremely high amount
})

# NOTE(review): the one-hot encoder is configured with handle_unknown='ignore',
# so a categorical literal above that was never seen during fit does not raise.
# Confirm these strings match the CSV's exact formatting; otherwise the
# prediction is driven by `step` and `amount` alone.

# Let the fitted pipeline run its own preprocessing and classifier in one call
# instead of invoking the two steps by hand via named_steps.
predicted_class_fraud = rf.predict(input_data_fraud)

# Print result based on prediction (class 0 = not fraud)
if predicted_class_fraud[0] == 0:
    print("Fraud is not detected")
else:
    print("Fraud is detected")