<a href="https://colab.research.google.com/github/MatiasSiles/MastercardSalesOptimization/blob/main/Fraud_Detection_Transactions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Modelo de detección temprana de anomalías y fraudes financieros multivariado con series temporales, usando Deep Learning (LSTM/Transformer) y métodos bayesianos para estimar riesgo dinámico en transacciones Mastercard a nivel global.

Detección en tiempo real de fraudes invisibles de bajo monto, usando anomalías multivariada

In [57]:
import sqlite3
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression

import warnings
warnings.filterwarnings("ignore")

In [39]:
connection_db = sqlite3.connect("mastercard.db")

df_customers = pd.read_sql("SELECT * FROM customers", connection_db)
df_cards = pd.read_sql("SELECT * FROM cards", connection_db)
df_merchants = pd.read_sql("SELECT * FROM merchants", connection_db)
df_transactions = pd.read_sql("SELECT * FROM transactions", connection_db)
df_fraud_labels = pd.read_sql("SELECT * FROM fraud_labels", connection_db)

connection_db.close()

In [70]:
df_fraud_labels
# income, amount, age, statuscard, issue_yearcard, risk_score, timestamp
# target = is_fraud

Unnamed: 0,transaction_id,is_fraud
0,1,0
1,2,0
2,3,0
3,4,0
4,5,0
...,...,...
9995,9996,0
9996,9997,0
9997,9998,0
9998,9999,0


In [80]:
# timestamp = pd.to_datetime(df_transactions["timestamp"]).dt.time.sort_values()
a = df_customers.merge(df_cards, on="customer_id")[["customer_id","card_id","income", "age", "issue_year", "status"]]
b = df_transactions.merge(a, on="card_id")[["amount", "timestamp"]]
c = df_merchants.merge(df_transactions, on="merchant_id").merge(df_fraud_labels, on="transaction_id")[["transaction_id","risk_score","is_fraud"]]

df_for_predicts = pd.concat([a,b,c], axis=1)
df_for_predicts.sort_values(by="timestamp")

Unnamed: 0,customer_id,card_id,income,age,issue_year,status,amount,timestamp,transaction_id,risk_score,is_fraud
7837,,,,,,,421.87,2022-01-01 02:28:34,2657,0.51,0
8070,,,,,,,2.64,2022-01-01 02:38:39,3593,0.01,1
1687,1547.0,1830.0,50093.0,60.0,2015.0,Active,63.58,2022-01-01 03:37:19,7422,0.54,0
3217,,,,,,,21.44,2022-01-01 04:58:04,2711,0.08,0
116,114.0,1232.0,72455.0,36.0,2016.0,Expired,194.76,2022-01-01 07:51:56,7158,0.21,0
...,...,...,...,...,...,...,...,...,...,...,...
7631,,,,,,,43.65,2023-12-31 15:28:38,9376,0.15,0
9168,,,,,,,73.01,2023-12-31 17:42:55,834,0.78,0
8931,,,,,,,63.63,2023-12-31 17:44:52,1492,0.16,0
8421,,,,,,,41.50,2023-12-31 21:36:10,5689,0.85,0


In [None]:
# @title
class fraud_analyzer():
  def __init__(self, * ,customers=None, transactions=None, cards=None, fraud_labels=None, merchants=None):
    self.customers = customers
    self.transactions = transactions
    self.cards = cards
    self.fraud_labels = fraud_labels
    self.merchants = merchants

  def Customers(self):
    fig, axes = plt.subplots(3,2, figsize=(16,8))
    fig.suptitle("Customers Analysis")

    # plot1
    axes[0,0].scatter(self.customers["age"], self.customers["income"], alpha=0.4)
    axes[0,0].set_xlabel("Age")
    axes[0,0].set_ylabel("Income")

    # plot2
    idcustomer_idtransaction_amount = pd.merge(self.cards, self.transactions, on="card_id")[["customer_id", "transaction_id","amount"]]
    income_vs_amount = pd.merge(idcustomer_idtransaction_amount, self.customers, on="customer_id")[["income", "amount"]]

    axes[0,1].scatter(income_vs_amount["income"], income_vs_amount["amount"], alpha=0.2)
    axes[0,1].set_xlabel("Income")
    axes[0,1].set_ylabel("Transaction Amount")

    # plot3
    axes[1,0].hist(idcustomer_idtransaction_amount["customer_id"], bins=50)
    axes[1,0].set_xlabel("Customer ID")
    axes[1,0].set_ylabel("Transaction Frequency")

    # plot4
    data = pd.merge(self.cards, self.transactions, on="card_id")[["customer_id","amount"]]
    data = data.groupby("customer_id").mean()

    axes[1,1].scatter(data.index, data["amount"], alpha=0.3)
    axes[1,1].set_xlabel("Customer ID")
    axes[1,1].set_ylabel("Average Transaction Amount")

    # plot5
    self.transactions["timestamp"] = pd.to_datetime(self.transactions["timestamp"])
    frequency_transaction_hour = self.transactions["timestamp"].dt.hour.sort_values()

    axes[2,0].hist(frequency_transaction_hour)
    axes[2,0].set_xlabel("Hour")
    axes[2,0].set_ylabel("Transaction Frequency")

    # plot6
    frequency_transaction_date = self.transactions["timestamp"].sort_values()
    frequency_transaction_date = self.transactions["timestamp"].dt.date
    frequency_transaction_date = pd.to_datetime(self.transactions["timestamp"])

    axes[2,1].hist(frequency_transaction_date, bins=80)
    axes[2,1].set_xlabel("Date")
    axes[2,1].set_ylabel("Transaction Frequency")

    fig.tight_layout()

  def transaction_customer_map(self):
    world_map = gpd.read_file("/content/ne_110m_admin_0_countries.shp")

    countries = self.customers["country"].value_counts().reset_index().rename(columns={"country":"ISO_A2"})
    world_map = pd.merge(world_map, countries, on="ISO_A2")

    world_map.plot(column="count", cmap="OrRd", legend=True, color="lightblue", figsize=(20,8))

  def fraud_distribution(self):
    data = self.fraud_labels["is_fraud"].value_counts()
    data.plot(kind="bar")
    frauds_total = data.iloc[data.index == 1].item()
    fraud_porcent = (frauds_total * 100) / len(self.fraud_labels)

    print(data)
    print(f"\nFraud Porcent: {fraud_porcent}")

  class check_customers_cards():
    def __init__(self, cards):
      fraud_analyzer.cards

    def status():
      data = fraud_analyzer.cards["status"].value_counts().plot(kind="bar")
      plt.xlabel("")
      plt.ylabel("Number of Customers")

    def card_types():
      data = fraud_analyzer.cards["card_type"].value_counts().plot(kind="bar")
      plt.ylabel("Number of Customers")

    def seeker():

      while True:

        prompt = int(input("Enter the client id or 0 to exit: "))
        print()

        if prompt == 0:
          break

        elif prompt in fraud_analyzer.cards["customer_id"].unique():
          print(fraud_analyzer.cards[fraud_analyzer.cards["customer_id"] == prompt])
          print("\n\n")

        else:
          print("Customer not found")
          print("\n\n")

  class Merchants():
    def __init__(self, merchants):
      fraud_analyzer.merchants

    def sold_categories_merchants():
      fraud_analyzer.merchants["category"].value_counts().plot(kind="bar")

    def most_dangerous_merchants():
      fraud_analyzer.merchants["risk_score"].plot(kind="hist") # risk_score indicate how many historical frauds has every merchant
      plt.xlabel("Risk Score")
      plt.title("Distribution Risks Scores by Merchants")

      print("\nTop 10 most dangerous merchants:")
      print(fraud_analyzer.merchants.sort_values(by="risk_score", ascending=False).head(10))

fraud_analyzer = fraud_analyzer(customers=df_customers, transactions=df_transactions, cards=df_cards, fraud_labels=df_fraud_labels, merchants=df_merchants)

In [28]:
class backend_models():
  def __init__(self, data, model, metrics):
    self.data = data
    self.model = model
    self.metrics = metrics

  def data_upload(self):
    self.X = self.data.drop("target", axis=1)
    self.y = self.data["target"]

  def data_split(self):
    self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(self.X, self.y, test_size=0.2, random_state=42)

  def model_train(self):
    model = self.model()
    model.fit(self.X_train, self.y_train)
    self.model = model

  def model_predict(self):
    self.prediction = self.model.predict(self.X_test)

  def model_evaluate(self):
    for metric in self.metrics:
      print(f"{metric}: {metric(self.y_test, self.prediction)}")

  def ask_tuning(self):
    print("1. Yes\n2.No\n")
    choice = dict(input("It's necessary tuning?: "))

    if choice == "1":
      params = input("Enter de params like a dict: ")
      grid = GridSearchCV(self.model, params, cv=5)
      grid.fit(self.X_train, self.y_train)
      best_model = grid.best_estimator_
      print(f"best params: {grid.best_params_}\n")
      print(f"best score: {grid.best_score_}\n")

    else:
      pass

  def predictions(self):
    X_new = input("Enter the new data: ")
    prediction = self.model.predict(X_new)
    return prediction

In [29]:
class models(backend_models):
  def __init__(self, *, data, model, metrics):
    super().__init__(data, model, metrics)

  def fraud_detect(self):
    self.data_upload()
    self.data_split()
    self.model_train()
    self.model_predict()
    self.model_evaluate()
    self.ask_tuning()

    return self.predictions()

  def comparision_trees_models():
    pass

  def neural_networks():
    pass

models = models(data=df_transactions, model=LogisticRegression, metrics=[precision_score, recall_score, f1_score])

In [30]:
models.fraud_detect()

KeyboardInterrupt: Interrupted by user

In [None]:
class fu():
  def __init__(self, lista1, lista2):
    self.lista1 = lista1
    self.lista2 = lista2
    self.resultado = []

  def tengan_valores_las_listas(self):
      return len(self.lista1) != 0 and len(self.lista2) != 0

  def buscar_numeros_mas_bajos(self):
      return min(self.lista1), min(self.lista2)

  def guardarlos_a_una_lista_nueva(self):
      bajo1, bajo2 = self.buscar_numeros_mas_bajos()
      self.resultado.append(bajo1)
      self.resultado.append(bajo2)

  def eliminar_esos_numeros_mas_bajos(self):
      bajo1, bajo2 = self.buscar_numeros_mas_bajos()
      self.lista1.remove(bajo1)
      self.lista2.remove(bajo2)

In [None]:
class solucion(fu):
  def __init__(self, lista1, lista2):
    super().__init__(lista1, lista2)

  def combinar_listas_menor_mayor(self):
    while self.tengan_valores_las_listas():
      self.buscar_numeros_mas_bajos()
      self.guardarlos_a_una_lista_nueva()
      self.eliminar_esos_numeros_mas_bajos()

    return self.resultado

solucion = solucion([1,2,4], [1,3,4]).combinar_listas_menor_mayor()

In [None]:
solucion

[1, 1, 2, 3, 4, 4]