<a href="https://colab.research.google.com/github/MatiasSiles/MastercardSalesOptimization/blob/main/Fraud_Detection_Transactions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Modelo de detección temprana de anomalías y fraudes financieros multivariado con series temporales, usando Deep Learning (LSTM/Transformer) y métodos bayesianos para estimar riesgo dinámico en transacciones Mastercard a nivel global.

Detección en tiempo real de fraudes invisibles de bajo monto, usando anomalías multivariada

In [16]:
import sqlite3
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.over_sampling import SMOTE
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler

import warnings
warnings.filterwarnings("ignore")

In [2]:
connection_db = sqlite3.connect("mastercard.db")

df_customers = pd.read_sql("SELECT * FROM customers", connection_db)
df_cards = pd.read_sql("SELECT * FROM cards", connection_db)
df_merchants = pd.read_sql("SELECT * FROM merchants", connection_db)
df_transactions = pd.read_sql("SELECT * FROM transactions", connection_db)
df_fraud_labels = pd.read_sql("SELECT * FROM fraud_labels", connection_db)

connection_db.close()

In [None]:
# @title
class fraud_analyzer():
  def __init__(self, * ,customers=None, transactions=None, cards=None, fraud_labels=None, merchants=None):
    self.customers = customers
    self.transactions = transactions
    self.cards = cards
    self.fraud_labels = fraud_labels
    self.merchants = merchants

  def Customers(self):
    fig, axes = plt.subplots(3,2, figsize=(16,8))
    fig.suptitle("Customers Analysis")

    # plot1
    axes[0,0].scatter(self.customers["age"], self.customers["income"], alpha=0.4)
    axes[0,0].set_xlabel("Age")
    axes[0,0].set_ylabel("Income")

    # plot2
    idcustomer_idtransaction_amount = pd.merge(self.cards, self.transactions, on="card_id")[["customer_id", "transaction_id","amount"]]
    income_vs_amount = pd.merge(idcustomer_idtransaction_amount, self.customers, on="customer_id")[["income", "amount"]]

    axes[0,1].scatter(income_vs_amount["income"], income_vs_amount["amount"], alpha=0.2)
    axes[0,1].set_xlabel("Income")
    axes[0,1].set_ylabel("Transaction Amount")

    # plot3
    axes[1,0].hist(idcustomer_idtransaction_amount["customer_id"], bins=50)
    axes[1,0].set_xlabel("Customer ID")
    axes[1,0].set_ylabel("Transaction Frequency")

    # plot4
    data = pd.merge(self.cards, self.transactions, on="card_id")[["customer_id","amount"]]
    data = data.groupby("customer_id").mean()

    axes[1,1].scatter(data.index, data["amount"], alpha=0.3)
    axes[1,1].set_xlabel("Customer ID")
    axes[1,1].set_ylabel("Average Transaction Amount")

    # plot5
    self.transactions["timestamp"] = pd.to_datetime(self.transactions["timestamp"])
    frequency_transaction_hour = self.transactions["timestamp"].dt.hour.sort_values()

    axes[2,0].hist(frequency_transaction_hour)
    axes[2,0].set_xlabel("Hour")
    axes[2,0].set_ylabel("Transaction Frequency")

    # plot6
    frequency_transaction_date = self.transactions["timestamp"].sort_values()
    frequency_transaction_date = self.transactions["timestamp"].dt.date
    frequency_transaction_date = pd.to_datetime(self.transactions["timestamp"])

    axes[2,1].hist(frequency_transaction_date, bins=80)
    axes[2,1].set_xlabel("Date")
    axes[2,1].set_ylabel("Transaction Frequency")

    fig.tight_layout()

  def transaction_customer_map(self):
    world_map = gpd.read_file("/content/ne_110m_admin_0_countries.shp")

    countries = self.customers["country"].value_counts().reset_index().rename(columns={"country":"ISO_A2"})
    world_map = pd.merge(world_map, countries, on="ISO_A2")

    world_map.plot(column="count", cmap="OrRd", legend=True, color="lightblue", figsize=(20,8))

  def fraud_distribution(self):
    data = self.fraud_labels["is_fraud"].value_counts()
    data.plot(kind="bar")
    frauds_total = data.iloc[data.index == 1].item()
    fraud_porcent = (frauds_total * 100) / len(self.fraud_labels)

    print(data)
    print(f"\nFraud Porcent: {fraud_porcent}")

  class check_customers_cards():
    def __init__(self, cards):
      fraud_analyzer.cards

    def status():
      data = fraud_analyzer.cards["status"].value_counts().plot(kind="bar")
      plt.xlabel("")
      plt.ylabel("Number of Customers")

    def card_types():
      data = fraud_analyzer.cards["card_type"].value_counts().plot(kind="bar")
      plt.ylabel("Number of Customers")

    def seeker():

      while True:

        prompt = int(input("Enter the client id or 0 to exit: "))
        print()

        if prompt == 0:
          break

        elif prompt in fraud_analyzer.cards["customer_id"].unique():
          print(fraud_analyzer.cards[fraud_analyzer.cards["customer_id"] == prompt])
          print("\n\n")

        else:
          print("Customer not found")
          print("\n\n")

  class Merchants():
    def __init__(self, merchants):
      fraud_analyzer.merchants

    def sold_categories_merchants():
      fraud_analyzer.merchants["category"].value_counts().plot(kind="bar")

    def most_dangerous_merchants():
      fraud_analyzer.merchants["risk_score"].plot(kind="hist") # risk_score indicate how many historical frauds has every merchant
      plt.xlabel("Risk Score")
      plt.title("Distribution Risks Scores by Merchants")

      print("\nTop 10 most dangerous merchants:")
      print(fraud_analyzer.merchants.sort_values(by="risk_score", ascending=False).head(10))

fraud_analyzer = fraud_analyzer(customers=df_customers, transactions=df_transactions, cards=df_cards, fraud_labels=df_fraud_labels, merchants=df_merchants)

In [21]:
class backend_models():
  def __init__(self, data, model, metrics):
    self.data = pd.read_csv(data)
    self.model = model
    self.metrics = metrics

  def data_upload(self):
    # self.data = pd.read_csv(self.data)
    self.X = self.data.drop(["customer_id", "card_id",  "merchant_id",  "transaction_id", "target"], axis=1)
    self.y = self.data["target"]

  def data_split(self):
    self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(self.X, self.y, test_size=0.2, random_state=42)

  def balanced_data(self):
    smote = SMOTE(random_state=42)
    self.X_train_balanced, self.y_train_balanced = smote.fit_resample(self.X_train, self.y_train)

  def preprocessing(self):
    scaler = StandardScaler()
    self.X_train_scaled = scaler.fit_transform(self.X_train)
    self.X_test_scaled = scaler.transform(self.X_test)

    if "Regression" not in str(self.model):
      self.X_train_balanced = scaler.fit_transform(self.X_train_balanced)
      self.y_train_balanced = scaler.transform(self.y_train_balanced)

  def model_train(self):
    model_trained = self.model.fit(self.X_train_scaled, self.y_train)
    self.model = model_trained

    if "Regression" not in str(self.model):
      model_trained = self.model.fit(self.X_train_balanced, self.y_train_balanced)
      self.model_balanced = model_trained

  def model_predict(self):
    self.prediction = self.model.predict(self.X_test_scaled)

    if "Regression" not in str(self.model):
      self.prediction_balanced = self.model_balanced.predict(self.X_test_scaled)

  def model_evaluate(self):
    for metric in self.metrics:
      print(f"{metric}: {eval(metric)(self.y_test, self.prediction):.2f}")

    if "Regression" not in str(self.model):
      # print("\n\n")
      for metric in self.metrics:
        print(f"{metric}: {eval(metric)(self.y_test, self.prediction_balanced):.2f}")

  def ask_tuning(self):
    print("\n1. Yes\n2. No\n")
    choice = int(input("It's necessary tuning?: "))

    if choice == 1:
      params = eval(input("Enter de params like a dict: ")) # eval convert string to object to execute
      grid = GridSearchCV(self.model, params, cv=5)
      grid.fit(self.X_train_scaled, self.y_train)
      best_model = grid.best_estimator_
      print(f"best params: {grid.best_params_}\n")
      print(f"best score: {grid.best_score_}\n")

    else:
      pass

  def predictions(self):
    self.X = StandardScaler().fit_transform(self.X)
    prediction = self.model.predict(self.X)
    self.data["preds_reg_logistic"] = prediction

    if "Regression" not in str(self.model):
      prediction_balanced = self.model_balanced.predict(self.X)
      self.data["preds_reg_logistic"] = prediction_balanced

    # ignore
    self.summary_customers_frauds = self.data[self.data["preds_reg_logistic"] == 1].groupby(
        ["customer_id", "card_id", "income",
         "age", "issue_year","target"])[["merchant_id", "transaction_id",
                                         "amount", "timestamp_hours",   "risk_score", "Active", "Blocked", "Expired", "preds_reg_logistic"]].count()

  def run_model(self):
    self.data_upload()
    self.data_split()
    self.preprocessing()
    self.model_train()
    self.model_predict()
    self.model_evaluate()
    self.ask_tuning()

    return self.predictions()

  def run_desbalanced_models(self):
    self.data_upload()
    self.data_split()
    self.balanced_data()
    self.preprocessing()
    self.model_train()
    self.model_predict()
    self.model_evaluate()
    self.ask_tuning()

    return self.predictions()


In [23]:
class models(backend_models):
  def __init__(self, *, data, model, metrics):
    super().__init__(data, model, metrics)

  def logistic_regression():
    lr = backend_models(
        data="/content/clean_data_customers_for_predict.csv",
        model=LogisticRegression(C= 10, class_weight = "balanced", penalty = "l1", solver = "liblinear"),
        metrics=["precision_score", "recall_score", "f1_score", "roc_auc_score"])

    return lr.run_model()

  def balanced_random_forest():
    rf = backend_models(
        data="/content/clean_data_customers_for_predict.csv",
        model=BalancedRandomForestClassifier(n_estimators=100, max_depth=None, class_weight = "balanced", random_state=42),
        metrics=["precision_score", "recall_score", "f1_score", "roc_auc_score"])

    return rf.run_model()

  def xgboost():
    xgb = backend_models(
        data="/content/clean_data_customers_for_predict.csv",
        model=XGBClassifier(),
        metrics=["precision_score", "recall_score", "f1_score", "roc_auc_score"])

    return xgb.run_desbalanced_models()

  def lightgbm():
    lgbm = backend_models(
        data="/content/clean_data_customers_for_predict.csv",
        model=LGBMClassifier(verbose=-1),
        metrics=["precision_score", "recall_score", "f1_score", "roc_auc_score"])

    return lgbm.run_desbalanced_models()

  # def comparision_trees_models():
  #   rf = backend_models(
  #       data="/content/clean_data_customers_for_predict.csv",
  #       model=RandomForestClassifier(class_weight = "balanced"),
  #       metrics=["precision_score", "recall_score", "f1_score", "roc_auc_score"])

  #   xgb = backend_models(
  #       data="/content/clean_data_customers_for_predict.csv",
  #       model=XGBClassifier(),
  #       metrics=["precision_score", "recall_score", "f1_score", "roc_auc_score"])

  #   lgbm = backend_models(
  #       data="/content/clean_data_customers_for_predict.csv",
  #       model=LGBMClassifier(verbose=-1),
  #       metrics=["precision_score", "recall_score", "f1_score", "roc_auc_score"])

  #   return rf.run_model()

  def neural_networks():
    pass

# balanced because i have 10k labels and there is 219 frauds, that's very desbalanced and probably the model don't will predict good
# i also use "liblinear" for linear problems
# C = regularized the punishment level the target labels when the model fails in some things
# penalty l1 or l2 is the punishment type, this take into account the punishment level from C

In [24]:
models.xgboost()

ValueError: Expected a 2-dimensional container but got <class 'pandas.core.series.Series'> instead. Pass a DataFrame containing a single row (i.e. single sample) or a single column (i.e. single feature) instead.

In [20]:
a = "regressionmodel"
b = BalancedRandomForestClassifier()

"Regression" not in str(b)

True