<a href="https://colab.research.google.com/github/MaxiPerrone/fraud-detection-ml/blob/main/Deteccion_fraude_regresion_logistica.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

import pandas as pd
import kagglehub
import os

In [None]:
# Fetch the "dhanushnarayananr/credit-card-fraud" dataset through kagglehub.
# The value returned by dataset_download is used below as the directory that
# holds the dataset's files.
dataset_path = kagglehub.dataset_download("dhanushnarayananr/credit-card-fraud")
# Path of the transactions CSV inside that directory; read later with pandas.
csv_file = os.path.join(dataset_path, "card_transdata.csv")

Using Colab cache for faster access to the 'credit-card-fraud' dataset.


In [None]:
def train_val_test_split(df, rstate=42, shuffle=True, stratify=None, val_size=0.2, test_size=0.2):
  """Split a DataFrame into train, validation and test subsets.

  The split is done with two calls to ``train_test_split``: the first sets
  aside ``val_size + test_size`` of the rows, the second divides that held-out
  part between validation and test. With the default sizes the calls use
  ``test_size=0.4`` and then ``test_size=0.5`` (a 60% / 20% / 20% split).

  Args:
    df: DataFrame to split.
    rstate: Seed forwarded as ``random_state`` to both calls.
    shuffle: Forwarded unchanged to both calls.
    stratify: Label of the column in ``df`` to stratify on. ``None`` (or
      ``False``) disables stratification.
    val_size: Fraction of ``df`` that goes to the validation subset.
    test_size: Fraction of ``df`` that goes to the test subset.

  Returns:
    The tuple ``(train_set, val_set, test_set)``.

  Raises:
    ValueError: If a size is not positive or the two sizes sum to 1 or more.
  """
  holdout_size = val_size + test_size
  if val_size <= 0 or test_size <= 0 or holdout_size >= 1:
    raise ValueError(
        "val_size and test_size must be positive and sum to less than 1, "
        f"got val_size={val_size!r}, test_size={test_size!r}"
    )

  def _strat_labels(frame):
    # Explicit None/False check: a plain truthiness test would silently skip
    # stratification for a falsy column label such as 0.
    if stratify is None or stratify is False:
      return None
    return frame[stratify]

  # First pass: training rows vs. everything that is held out.
  train_set, holdout_set = train_test_split(
      df,
      test_size=holdout_size,
      random_state=rstate,
      shuffle=shuffle,
      stratify=_strat_labels(df),
  )
  # Second pass: share the held-out rows between validation and test,
  # re-deriving the stratification labels from the held-out rows only.
  val_set, test_set = train_test_split(
      holdout_set,
      test_size=test_size / holdout_size,
      random_state=rstate,
      shuffle=shuffle,
      stratify=_strat_labels(holdout_set),
  )
  return (train_set, val_set, test_set)

In [None]:
# Read the transactions CSV into a DataFrame and preview its first five rows
# (five is the default row count of head()).
df = pd.read_csv(csv_file)
df.head()

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud
0,57.877857,0.31114,1.94594,1.0,1.0,0.0,0.0,0.0
1,10.829943,0.175592,1.294219,1.0,0.0,0.0,0.0,0.0
2,5.091079,0.805153,0.427715,1.0,0.0,0.0,1.0,0.0
3,2.247564,5.600044,0.362663,1.0,1.0,0.0,1.0,0.0
4,44.190936,0.566486,2.222767,1.0,1.0,0.0,1.0,0.0


In [None]:
# Target vector: the "fraud" column. Feature matrix: every other column.
y = df["fraud"]
X = df.drop(columns="fraud")

In [None]:
# Stratify on the target so that train, validation and test all keep the same
# proportion of "fraud" labels instead of leaving it to the random draw.
train_set, val_set, test_set = train_val_test_split(df, stratify="fraud")

# Separate features from the target within each split.
X_train, y_train = train_set.drop(columns="fraud"), train_set["fraud"]
X_val, y_val = val_set.drop(columns="fraud"), val_set["fraud"]
X_test, y_test = test_set.drop(columns="fraud"), test_set["fraud"]

In [None]:
# Fit the scaler on the training features only, then apply that same fitted
# transform to the train, validation and test features.
scaler = StandardScaler().fit(X_train)

X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler.transform(features) for features in (X_train, X_val, X_test)
)

In [None]:
# Logistic regression with balanced class weights, fitted on the scaled
# training features (fit() returns the fitted estimator itself).
lr = LogisticRegression(
    class_weight="balanced",
    random_state=42,
    max_iter=1000,
).fit(X_train_scaled, y_train)

# Print each label on its own line followed by the fitted values:
# first the coefficient array, then the intercept.
print("Coheficientes: ", lr.coef_, sep="\n")
print("Termino independiente: ", lr.intercept_, sep="\n")

Coheficientes: 
[[ 1.89248094  1.16254941  3.42410502 -0.46990862 -0.56588303 -3.1223267
   2.42791447]]
Termino independiente: 
[-3.9051979]


In [None]:
# Draw the spot-check row from the held-out test features rather than from the
# full dataset, so the example prediction is made on a row the model was not
# fitted on. The row keeps its original index label, so it can still be looked
# up in the full target series.
sample = X_test.sample(n=1, random_state=7)
# Scale it with the scaler that was fitted on the training features.
sample_scaled = scaler.transform(sample)


In [None]:
# The sample holds exactly one row, so predict() yields a single label.
(sample_prediction,) = lr.predict(sample_scaled)
# Ground-truth label of that same row, looked up by its index label.
y_real = y.loc[sample.index].iloc[0]

print("Real result:", y_real)
print("Prediction:", sample_prediction)

Real result: 0.0
Prediction: 0.0


In [None]:
def evaluate(name, X_scaled, y_true, model=None):
  """Print classification metrics for one data split.

  Args:
    name: Label printed in the ``== name ==`` header line.
    X_scaled: Feature matrix passed to the model's ``predict`` and
      ``predict_proba``.
    y_true: True labels the predictions are compared against.
    model: Fitted classifier exposing ``predict`` and ``predict_proba``.
      When ``None``, the notebook-level ``lr`` model is used.

  Returns:
    None. The metrics are written to standard output.
  """
  # Fall back to the notebook's global model only when none is supplied, so
  # the function can score any classifier without relying on hidden state.
  clf = lr if model is None else model

  y_pred = clf.predict(X_scaled)
  # Column 1 of predict_proba is used as the score for ROC-AUC.
  y_proba = clf.predict_proba(X_scaled)[:, 1]

  print(f"\n== {name} ==")
  print("Accuracy :", accuracy_score(y_true, y_pred))
  # zero_division=0 on every ratio metric: an undefined ratio is reported as 0
  # consistently, instead of only for precision.
  print("Precision:", precision_score(y_true, y_pred, zero_division=0))
  print("Recall   :", recall_score(y_true, y_pred, zero_division=0))
  print("F1       :", f1_score(y_true, y_pred, zero_division=0))
  print("ROC-AUC  :", roc_auc_score(y_true, y_proba))

In [None]:
# Report the same set of metrics for the validation split, then the test split.
for split_name, split_features, split_labels in (
    ("VALIDATION", X_val_scaled, y_val),
    ("TEST", X_test_scaled, y_test),
):
  evaluate(split_name, split_features, split_labels)


== VALIDATION ==
Accuracy : 0.934235
Precision: 0.5746640808976312
Recall   : 0.9501288290867449
F1       : 0.7161692669558275
ROC-AUC  : 0.9788470479183914

== TEST ==
Accuracy : 0.934235
Precision: 0.5719306792873051
Recall   : 0.9507144096720078
F1       : 0.7142081133346371
ROC-AUC  : 0.9791346842550714
