<a href="https://colab.research.google.com/github/KithminiP/Gambling_Transaction_Detection/blob/main/Gambling_Transaction_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install faker

Collecting faker
  Downloading Faker-36.1.0-py3-none-any.whl.metadata (15 kB)
Downloading Faker-36.1.0-py3-none-any.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faker
Successfully installed faker-36.1.0


In [None]:
import pandas as pd
import numpy as np
from faker import Faker
import random

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Initialize the Faker library for generating random names and other information.
# Both random sources used during data generation (the stdlib `random` module and
# Faker's own generator) are seeded first, so that a fresh "Restart & Run All"
# produces the same synthetic dataset, in line with the random_state=42 used by
# the modelling cells further down.
DATA_SEED = 42
random.seed(DATA_SEED)
Faker.seed(DATA_SEED)
fake = Faker()

In [None]:
# Function to generate synthetic transaction data
def generate_synthetic_data(num_rows=1000, seed=None):
    """Generate a DataFrame of random, synthetic transactions.

    Every field of every row is drawn independently: the merchant is not tied
    to the category, and Amount / Income / Age are not tied to whether the
    transaction is gambling. The resulting table is therefore useful for
    exercising a pipeline, but its features carry no real signal about the
    'Is Gambling' label.

    Parameters
    ----------
    num_rows : int, default 1000
        Number of transactions (rows) to generate.
    seed : int or None, default None
        When given, seeds both the stdlib ``random`` module and Faker before
        generating, so the same seed reproduces the same rows. When None the
        current state of both generators is used unchanged.
        NOTE(review): 'Date' comes from ``fake.date_this_year()``, so it
        presumably still depends on the calendar year in which the code runs.

    Returns
    -------
    pandas.DataFrame
        Columns: 'Transaction ID', 'Customer ID', 'Amount', 'Category',
        'Merchant', 'Date', 'Income', 'Age', plus the binary target
        'Is Gambling' (1 when Category == 'Gambling', else 0).
    """
    if seed is not None:
        # Seed both sources of randomness used below.
        random.seed(seed)
        Faker.seed(seed)

    categories = ['Gambling', 'Shopping', 'Utilities', 'Groceries', 'Entertainment', 'Dining Out']
    merchants = ['CasinoX', 'Walmart', 'Amazon', 'Netflix', 'Starbucks', 'McDonalds']

    data = []
    for _ in range(num_rows):
        # Generate synthetic data (the draw order is kept stable so that a
        # given seed always maps to the same rows)
        transaction_id = fake.uuid4()
        customer_id = fake.uuid4()
        amount = round(random.uniform(5.0, 1000.0), 2)  # Transaction amount between $5 and $1000
        category = random.choice(categories)  # Random category from predefined list
        merchant = random.choice(merchants)  # Random merchant from predefined list
        transaction_date = fake.date_this_year()  # Random date within the current year
        income = round(random.uniform(20000, 150000), 2)  # Random income between $20,000 and $150,000
        age = random.randint(18, 75)  # Random age between 18 and 75

        # Append the generated row to the data list
        data.append([transaction_id, customer_id, amount, category, merchant, transaction_date, income, age])

    # Create a DataFrame
    df = pd.DataFrame(data, columns=[
        'Transaction ID', 'Customer ID', 'Amount', 'Category', 'Merchant', 'Date', 'Income', 'Age'
    ])

    # Create a binary target column 'Is Gambling' (vectorised comparison, which
    # also yields an integer column when num_rows is 0)
    df['Is Gambling'] = (df['Category'] == 'Gambling').astype(int)

    return df

In [None]:
# Materialise the 1,000-row synthetic transaction table used by the rest of the notebook
synthetic_data = generate_synthetic_data(1000)


In [None]:
# Persist the generated table as CSV; index=False keeps the row index out of the file
synthetic_data.to_csv(path_or_buf='synthetic_transaction_data.csv', index=False)

In [None]:
# Preview the first five generated transactions
synthetic_data.head(5)

Unnamed: 0,Transaction ID,Customer ID,Amount,Category,Merchant,Date,Income,Age,Is Gambling
0,0a7c9473-5e96-46cc-81dc-da5eafafe2c7,20af06f5-4eea-44ae-b8ad-ace604de9198,991.56,Dining Out,McDonalds,2025-01-10,103095.42,35,0
1,edbbe2ea-9ca7-4608-9679-c54b039f970a,27993c76-9118-4d4d-af24-785385f00ed6,811.86,Groceries,Amazon,2025-01-17,66022.76,45,0
2,1ee14e4b-fcb9-439b-89a1-a4adce3fe2cf,f91f4fcf-a7e7-45f8-bc3f-5dd9d8c323b0,109.41,Gambling,Amazon,2025-02-03,94673.4,31,1
3,80f71830-30b2-4d76-818e-01b28a8c82ee,aaf9cf56-ceff-4d53-9d96-82573184ac48,990.83,Utilities,Walmart,2025-01-20,33660.75,20,0
4,c7233936-2071-4cd8-9f0f-9abc950cb7e1,4cc186b7-05f2-4b0a-a9a2-53a24d1e4194,722.5,Dining Out,Netflix,2025-01-09,99173.88,56,0


In [None]:
# Model inputs are the three numeric columns; the label is the binary gambling flag
X = synthetic_data.loc[:, ['Amount', 'Income', 'Age']]  # Features
y = synthetic_data.loc[:, 'Is Gambling']  # Target (1 = gambling, 0 = not gambling)

In [None]:
# Split the data into training (70%) and testing (30%) sets.
# stratify=y keeps the share of gambling rows (the minority class) the same in
# both splits, so the test metrics are not distorted by an unlucky draw.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

In [None]:
# Baseline classifier: a 100-tree random forest with class_weight='balanced',
# fitted on the training split
model = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',
    random_state=42,
)
model.fit(X_train, y_train)

In [None]:
# Make predictions on the test set with the baseline model fitted above;
# y_pred holds one predicted label per row of X_test
y_pred = model.predict(X_test)

In [None]:
# Score the baseline on the test split: overall accuracy first, then the
# per-class precision / recall / F1 table
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {test_accuracy}")
print("Classification Report:")
report_text = classification_report(y_test, y_pred)
print(report_text)

Accuracy: 0.8333333333333334
Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.99      0.91       251
           1       0.33      0.02      0.04        49

    accuracy                           0.83       300
   macro avg       0.59      0.51      0.47       300
weighted avg       0.76      0.83      0.77       300



In [None]:
# ROC-AUC is computed from the second predict_proba column (index 1),
# not from the hard 0/1 predictions
gambling_proba = model.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, gambling_proba)
print(f"ROC-AUC: {roc_auc}")

ROC-AUC: 0.5653711683876738


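The ROC-AUC score of 0.565 indicates that the model's ability to distinguish between gambling and non-gambling transactions is slightly better than random guessing, but there's still considerable room for improvement. An ideal ROC-AUC score would be closer to 1, and a score of 0.5 would suggest that the model has no discriminatory power. A near-chance score is expected here: `Amount`, `Income` and `Age` are generated independently of `Category`, so the features carry no real signal about the label, and the 0.83 accuracy mostly reflects the share of non-gambling rows in the test set (251 of 300).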

**Hyperparameter tuning**

In [None]:
# Hyperparameter search space for the random forest
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[3, 5, 7, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    class_weight=['balanced', None],  # compare class-weighted vs. unweighted forests
)

In [None]:
# Initialize the Random Forest classifier used as the base estimator for tuning.
# Only random_state is fixed here; the remaining hyperparameters are left at
# their defaults.
rf_model = RandomForestClassifier(random_state=42)

In [None]:
# Perform Randomized Search with 5-fold cross-validation over 10 sampled candidates.
# Candidates are ranked by ROC-AUC instead of the default accuracy: with an
# imbalanced target, accuracy rewards a model that always predicts the majority
# class, which is not what we want to select for.
# verbose=1 prints only the summary line instead of one log line per fit.
random_search = RandomizedSearchCV(
    rf_model,
    param_grid,
    n_iter=10,
    cv=5,
    scoring='roc_auc',
    verbose=1,
    random_state=42,
)
random_search.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END class_weight=None, max_depth=None, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time=   0.4s
[CV] END class_weight=None, max_depth=None, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time=   0.4s
[CV] END class_weight=None, max_depth=None, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time=   0.4s
[CV] END class_weight=None, max_depth=None, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time=   0.4s
[CV] END class_weight=None, max_depth=None, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time=   0.8s
[CV] END class_weight=None, max_depth=None, min_samples_leaf=4, min_samples_split=10, n_estimators=100; total time=   0.3s
[CV] END class_weight=None, max_depth=None, min_samples_leaf=4, min_samples_split=10, n_estimators=100; total time=   0.3s
[CV] END class_weight=None, max_depth=None, min_samples_leaf=4, min_samples_split=1

In [None]:
# Get the best model found by the randomized search
# (read from the fitted search object's best_estimator_ attribute)
best_model = random_search.best_estimator_

In [None]:
# Evaluate the best model: predict labels for the same held-out X_test that
# was used for the baseline, so the two reports are directly comparable
y_pred_best = best_model.predict(X_test)

In [None]:
# Report accuracy and per-class metrics for the tuned model.
# zero_division=0 makes the report show 0.0 (without emitting warnings) for a
# class that the model never predicts, where precision is otherwise undefined.
print(f"Best Model Accuracy: {accuracy_score(y_test, y_pred_best)}")
print("Best Model Classification Report:")
print(classification_report(y_test, y_pred_best, zero_division=0))

Best Model Accuracy: 0.8366666666666667
Best Model Classification Report:
              precision    recall  f1-score   support

           0       0.84      1.00      0.91       251
           1       0.00      0.00      0.00        49

    accuracy                           0.84       300
   macro avg       0.42      0.50      0.46       300
weighted avg       0.70      0.84      0.76       300



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
