# INICIO:
Este proyecto se centra en la detección de fraudes en transacciones con tarjetas de crédito utilizando un modelo de regresión logística. Dado el desafío de un conjunto de datos desbalanceado, aplicaremos técnicas de preprocesamiento para mejorar la precisión del modelo en la identificación de transacciones fraudulentas.


In [None]:
# procesamiento
import pandas as pd
# escalan todos los datos
from sklearn.preprocessing import StandardScaler
#modelo
from sklearn.linear_model import LogisticRegression
# Separa los datos
from sklearn.model_selection import train_test_split
# metricas para evaluar el modelo
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix
# visualizacion
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the credit-card transactions dataset from the working directory.
df = pd.read_csv('creditcard.csv')

In [None]:
# Preview the first three rows.
df.head(3)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0


In [None]:
# Check the dimensionality (rows, columns).
df.shape

(284807, 31)

In [None]:
# Summarize column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

In [None]:
# Total number of missing values across the whole DataFrame.
df.isnull().sum().sum()

0

In [None]:
# 'Class' is the target column; count how many rows fall in each class.
df['Class'].value_counts()

Class
0    284315
1       492
Name: count, dtype: int64

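Podemos ver que los datos están muy desbalanceados:
hay muchas más transacciones de clase 0 (284 315, no fraude) que de clase 1 (492, fraude).
Más adelante probaremos dos formas de abordarlo: ponderar las clases (`class_weight="balanced"`) y submuestrear la clase mayoritaria.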

In [None]:
# Class distribution again (same call as the previous code cell).
df['Class'].value_counts()

Class
0    284315
1       492
Name: count, dtype: int64

In [None]:
# Split the frame into the target vector and the feature matrix.
y = df['Class']
X = df.drop(columns='Class')

In [None]:
# Hold out 30% of the rows for testing. Stratify on the target so the rare
# positive class keeps the same proportion in both splits, and fix the seed
# so the split (and every metric computed from it) is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, random_state=1, stratify=y
)

In [None]:
# Standardize the features before the logistic regression. The raw columns are
# on very different scales (Time and Amount vs. V1..V28), and fitting on them
# unscaled stops at the solver's iteration limit without converging.
from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))

In [None]:
# Fit the baseline model on the training split.
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
# Predict class labels for the held-out test set.
y_pred = model.predict(X_test)

In [None]:
# Confusion matrix for the baseline model (rows: true class, columns: predicted class).
matriz_confusion = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(matriz_confusion)

[[85258    38]
 [   68    79]]


In [None]:
# Precision of the baseline model on the test set.
precision = precision_score(y_test, y_pred)
print('Precisión del modelo:', precision, sep='\n')

Precisión del modelo:
0.6752136752136753


### Vemos que el modelo es poco preciso (precisión ≈ 0,68).
### Una posible causa es que los datos están desbalanceados.

In [None]:
# Second model: handle the class imbalance with class_weight="balanced"
# and switch to the "newton-cg" solver.
model_2 = LogisticRegression(
    penalty='l2',
    C=1.0,
    solver="newton-cg",
    class_weight="balanced",
    random_state=1,
)

In [None]:
# Train the class-weighted model on the same training split.
model_2.fit(X_train, y_train)



In [None]:
# Test-set predictions from the class-weighted model.
# NOTE(review): y2_pred is not used by any later cell shown here — confirm whether it is still needed.
y2_pred = model_2.predict(X_test)

In [None]:
def run_model_balanced(X_train, X_test, y_train, y_test):
    """Fit a class-weighted logistic regression on the training data.

    Args:
        X_train: Training feature matrix.
        X_test: Unused; kept so existing calls keep working.
        y_train: Training labels.
        y_test: Unused; kept so existing calls keep working.

    Returns:
        The fitted ``LogisticRegression`` estimator.
    """
    clf = LogisticRegression(
        C=1.0,
        penalty='l2',
        random_state=1,
        solver="newton-cg",
        class_weight="balanced",
    )
    clf.fit(X_train, y_train)
    return clf

model_3 = run_model_balanced(X_train, X_test, y_train, y_test)
# Predict with the model that was just trained (model_3), not the baseline `model`.
y3_pred = model_3.predict(X_test)



In [None]:
# Precision and accuracy of the third model's predictions on the test set.
precision_3 = precision_score(y_test, y3_pred)
exactitud_3 = accuracy_score(y_test, y3_pred)
print(
    'Precisión del modelo:',
    precision_3,
    'Exactitud del modelo:',
    exactitud_3,
    sep='\n',
)

Precisión del modelo:
0.07170224411603722
Exactitud del modelo:
0.9799632503540372


In [None]:
# Fourth model: work on a copy of the DataFrame.
df_copy = df.copy()
# For this one we draw a sample of the Class == 0 (i.e. non-fraud) rows.

In [None]:
# Separate the rows by label: Class 0 is normal, Class 1 is fraud.
Normal_transaction = df_copy.loc[df_copy['Class'] == 0]
Fraud_transaction = df_copy.loc[df_copy['Class'] == 1]

In [None]:
# Undersample the majority class down to the size of the fraud class. Taking
# the size from Fraud_transaction avoids a hard-coded count, and the fixed seed
# makes the sample (and the metrics computed from it) reproducible.
Normal_transaction = Normal_transaction.sample(n=len(Fraud_transaction), random_state=1)
print(Normal_transaction.shape)

(492, 31)


In [None]:
# Stack the sampled normal rows and the fraud rows into one balanced sample.
muestra = pd.concat([Normal_transaction, Fraud_transaction])

In [None]:
# Check the size of the combined sample.
muestra.shape

(984, 31)

In [None]:
# Target and features for the balanced sample.
# NOTE: this rebinds X and y, which previously held the full dataset.
y = muestra['Class']
X = muestra.drop(columns='Class')

In [None]:
# Stratified 90/10 split of the balanced sample with a fixed seed.
X_train_4, X_test_4, y_train_4, y_test_4 = train_test_split(
    X,
    y,
    test_size=0.1,
    stratify=y,
    random_state=1,
)

In [None]:
# Fourth model: default logistic regression trained on the balanced sample.
model_4 = LogisticRegression()
model_4.fit(X_train_4, y_train_4)


In [None]:
# model_4 was already fitted in the previous cell; predict directly instead of refitting.
y4_pred = model_4.predict(X_test_4)

In [None]:
# Score against y_test_4, the labels that belong to X_test_4 / y4_pred.
# y_test comes from the earlier full-dataset split and does not match y4_pred.
precision_4 = precision_score(y_test_4, y4_pred)
print('Precisión del modelo:')
print(precision_4)
exactitud_4 = accuracy_score(y_test_4, y4_pred)
print('Exactitud del modelo:')
print(exactitud_4)

Precisión del modelo:
0.9361702127659575
Exactitud del modelo:
0.9090909090909091


# Conclusión:
En resumen, la regresión logística, combinada con técnicas de manejo de datos desbalanceados, ha mostrado ser eficaz en la detección de fraudes.