<a href="https://colab.research.google.com/github/MatiasSiles/MastercardSalesOptimization/blob/main/Fraud_Detection_Transactions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Modelo de detección temprana de anomalías y fraudes financieros multivariado con series temporales, usando Deep Learning (LSTM/Transformer) y métodos bayesianos para estimar riesgo dinámico en transacciones Mastercard a nivel global.

Detección en tiempo real de fraudes invisibles de bajo monto, usando anomalías multivariada

In [145]:
import sqlite3
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression

import warnings
warnings.filterwarnings("ignore")

In [146]:
connection_db = sqlite3.connect("mastercard.db")

df_customers = pd.read_sql("SELECT * FROM customers", connection_db)
df_cards = pd.read_sql("SELECT * FROM cards", connection_db)
df_merchants = pd.read_sql("SELECT * FROM merchants", connection_db)
df_transactions = pd.read_sql("SELECT * FROM transactions", connection_db)
df_fraud_labels = pd.read_sql("SELECT * FROM fraud_labels", connection_db)

connection_db.close()

In [None]:
# @title
# data_for_predict_customers = df_customers.merge(df_cards, on="customer_id")[["customer_id","card_id","income", "age", "issue_year", "status"]]
# data_for_predict_customers = data_for_predict_customers.merge(df_transactions, on="card_id")[["customer_id","card_id","merchant_id","transaction_id","income", "age", "issue_year", "status","amount", "timestamp"]]
# data_for_predict_customers = data_for_predict_customers.merge(df_merchants, on="merchant_id")[["customer_id","card_id","merchant_id","transaction_id","income", "age", "issue_year", "status","amount", "timestamp","risk_score"]]
# data_for_predict_customers = data_for_predict_customers.merge(df_fraud_labels, on="transaction_id")[["customer_id","card_id","merchant_id","transaction_id","income", "age", "issue_year", "status","amount", "timestamp","risk_score","is_fraud"]]
# data_for_predict_customers["timestamp"] = pd.to_datetime(df_transactions["timestamp"]).dt.time
# data_for_predict_customers.rename(columns={"is_fraud":"target"}, inplace=True)
# data_for_predict_customers

In [147]:
customers_summary = pd.read_csv("/content/data_for_predict_customers.csv").groupby(
    ["customer_id",	"card_id", "income", "age", "issue_year", "status", "target"])[["merchant_id", "transaction_id", "amount", "timestamp",	"risk_score"]].count()

customers_summary # it's for know the customers data with frauds, useful after to do predicts

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,merchant_id,transaction_id,amount,timestamp,risk_score
customer_id,card_id,income,age,issue_year,status,target,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,1389,59954,18,2017,Blocked,0,6,6,6,6,6
2,1585,70523,21,2016,Active,0,3,3,3,3,3
3,1762,49907,65,2019,Expired,0,6,6,6,6,6
4,76,76241,49,2016,Active,0,3,3,3,3,3
4,79,76241,49,2015,Active,0,2,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...
1994,965,67012,33,2021,Blocked,0,7,7,7,7,7
1996,1573,71604,52,2021,Active,0,4,4,4,4,4
1997,567,61221,70,2021,Active,0,4,4,4,4,4
1998,940,39509,62,2017,Active,0,6,6,6,6,6


In [177]:
data_for_predict_customers = pd.read_csv("/content/data_for_predict_customers.csv")
data_for_predict_customers = data_for_predict_customers.drop(["customer_id", "card_id", "merchant_id", "transaction_id"], axis=1)
status_new = pd.get_dummies(data_for_predict_customers["status"])
data_for_predict_customers = pd.concat([data_for_predict_customers, status_new], axis=1).drop("status", axis=1)
data_for_predict_customers[["Active","Blocked","Expired"]] = data_for_predict_customers[["Active","Blocked","Expired"]].astype(int)
data_for_predict_customers["timestamp"] = data_for_predict_customers["timestamp"].as_type(float)

data_for_predict_customers

KeyError: "['customer_id', 'card_id', 'merchant_id', 'transaction_id'] not found in axis"

In [176]:
a = data_for_predict_customers.to_csv("data_for_predict_customers.csv", index=False)

In [None]:
a["timestamp"] = a["timestamp"].as_type(float)

In [None]:
# @title
class fraud_analyzer():
  def __init__(self, * ,customers=None, transactions=None, cards=None, fraud_labels=None, merchants=None):
    self.customers = customers
    self.transactions = transactions
    self.cards = cards
    self.fraud_labels = fraud_labels
    self.merchants = merchants

  def Customers(self):
    fig, axes = plt.subplots(3,2, figsize=(16,8))
    fig.suptitle("Customers Analysis")

    # plot1
    axes[0,0].scatter(self.customers["age"], self.customers["income"], alpha=0.4)
    axes[0,0].set_xlabel("Age")
    axes[0,0].set_ylabel("Income")

    # plot2
    idcustomer_idtransaction_amount = pd.merge(self.cards, self.transactions, on="card_id")[["customer_id", "transaction_id","amount"]]
    income_vs_amount = pd.merge(idcustomer_idtransaction_amount, self.customers, on="customer_id")[["income", "amount"]]

    axes[0,1].scatter(income_vs_amount["income"], income_vs_amount["amount"], alpha=0.2)
    axes[0,1].set_xlabel("Income")
    axes[0,1].set_ylabel("Transaction Amount")

    # plot3
    axes[1,0].hist(idcustomer_idtransaction_amount["customer_id"], bins=50)
    axes[1,0].set_xlabel("Customer ID")
    axes[1,0].set_ylabel("Transaction Frequency")

    # plot4
    data = pd.merge(self.cards, self.transactions, on="card_id")[["customer_id","amount"]]
    data = data.groupby("customer_id").mean()

    axes[1,1].scatter(data.index, data["amount"], alpha=0.3)
    axes[1,1].set_xlabel("Customer ID")
    axes[1,1].set_ylabel("Average Transaction Amount")

    # plot5
    self.transactions["timestamp"] = pd.to_datetime(self.transactions["timestamp"])
    frequency_transaction_hour = self.transactions["timestamp"].dt.hour.sort_values()

    axes[2,0].hist(frequency_transaction_hour)
    axes[2,0].set_xlabel("Hour")
    axes[2,0].set_ylabel("Transaction Frequency")

    # plot6
    frequency_transaction_date = self.transactions["timestamp"].sort_values()
    frequency_transaction_date = self.transactions["timestamp"].dt.date
    frequency_transaction_date = pd.to_datetime(self.transactions["timestamp"])

    axes[2,1].hist(frequency_transaction_date, bins=80)
    axes[2,1].set_xlabel("Date")
    axes[2,1].set_ylabel("Transaction Frequency")

    fig.tight_layout()

  def transaction_customer_map(self):
    world_map = gpd.read_file("/content/ne_110m_admin_0_countries.shp")

    countries = self.customers["country"].value_counts().reset_index().rename(columns={"country":"ISO_A2"})
    world_map = pd.merge(world_map, countries, on="ISO_A2")

    world_map.plot(column="count", cmap="OrRd", legend=True, color="lightblue", figsize=(20,8))

  def fraud_distribution(self):
    data = self.fraud_labels["is_fraud"].value_counts()
    data.plot(kind="bar")
    frauds_total = data.iloc[data.index == 1].item()
    fraud_porcent = (frauds_total * 100) / len(self.fraud_labels)

    print(data)
    print(f"\nFraud Porcent: {fraud_porcent}")

  class check_customers_cards():
    def __init__(self, cards):
      fraud_analyzer.cards

    def status():
      data = fraud_analyzer.cards["status"].value_counts().plot(kind="bar")
      plt.xlabel("")
      plt.ylabel("Number of Customers")

    def card_types():
      data = fraud_analyzer.cards["card_type"].value_counts().plot(kind="bar")
      plt.ylabel("Number of Customers")

    def seeker():

      while True:

        prompt = int(input("Enter the client id or 0 to exit: "))
        print()

        if prompt == 0:
          break

        elif prompt in fraud_analyzer.cards["customer_id"].unique():
          print(fraud_analyzer.cards[fraud_analyzer.cards["customer_id"] == prompt])
          print("\n\n")

        else:
          print("Customer not found")
          print("\n\n")

  class Merchants():
    def __init__(self, merchants):
      fraud_analyzer.merchants

    def sold_categories_merchants():
      fraud_analyzer.merchants["category"].value_counts().plot(kind="bar")

    def most_dangerous_merchants():
      fraud_analyzer.merchants["risk_score"].plot(kind="hist") # risk_score indicate how many historical frauds has every merchant
      plt.xlabel("Risk Score")
      plt.title("Distribution Risks Scores by Merchants")

      print("\nTop 10 most dangerous merchants:")
      print(fraud_analyzer.merchants.sort_values(by="risk_score", ascending=False).head(10))

fraud_analyzer = fraud_analyzer(customers=df_customers, transactions=df_transactions, cards=df_cards, fraud_labels=df_fraud_labels, merchants=df_merchants)

In [149]:
class backend_models():
  def __init__(self, data, model, metrics):
    self.data = data
    self.model = model
    self.metrics = metrics

  def data_upload(self):
    # self.data = pd.read_csv(self.data)
    self.X = self.data.drop("target", axis=1)
    self.y = self.data["target"]

  def data_split(self):
    self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(self.X, self.y, test_size=0.2, random_state=42)

  def model_train(self):
    model = self.model()
    model.fit(self.X_train, self.y_train)
    self.model = model

  def model_predict(self):
    self.prediction = self.model.predict(self.X_test)

  def model_evaluate(self):
    for metric in self.metrics:
      print(f"{metric}: {metric(self.y_test, self.prediction)}")

  def ask_tuning(self):
    print("1. Yes\n2.No\n")
    choice = dict(input("It's necessary tuning?: "))

    if choice == "1":
      params = input("Enter de params like a dict: ")
      grid = GridSearchCV(self.model, params, cv=5)
      grid.fit(self.X_train, self.y_train)
      best_model = grid.best_estimator_
      print(f"best params: {grid.best_params_}\n")
      print(f"best score: {grid.best_score_}\n")

    else:
      pass

  def predictions(self):
    X_new = input("Enter the new data: ")
    prediction = self.model.predict(X_new)
    return prediction

In [173]:
class models(backend_models):
  def __init__(self, *, data, model, metrics):
    super().__init__(data, model, metrics)

  def fraud_detect(self):
    self.data_upload()
    self.data_split()
    self.model_train()
    self.model_predict()
    self.model_evaluate()
    self.ask_tuning()

    return self.predictions()

  def comparision_trees_models():
    pass

  def neural_networks():
    pass

models = models(data="/content/data_for_predict_customers.csv", model=LogisticRegression, metrics=[precision_score, recall_score, f1_score])

In [174]:
models.fraud_detect()

ValueError: could not convert string to float: '03:13:57'