In [2]:
import pandas as pd
#import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import sklearn.metrics as metricas
from sklearn.model_selection import cross_val_score

## Datos bien desbalanceados

In [3]:
### Obtenemos información de una base de datos de transacciones. 
### La variable Class determina si la transacción es fraudulenta 
### (1) o no (0) 

datos = pd.read_csv('creditcard.csv')
datos

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.108300,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.50,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,69.99,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284802,172786.0,-11.881118,10.071785,-9.834783,-2.066656,-5.364473,-2.606837,-4.918215,7.305334,1.914428,...,0.213454,0.111864,1.014480,-0.509348,1.436807,0.250034,0.943651,0.823731,0.77,0
284803,172787.0,-0.732789,-0.055080,2.035030,-0.738589,0.868229,1.058415,0.024330,0.294869,0.584800,...,0.214205,0.924384,0.012463,-1.016226,-0.606624,-0.395255,0.068472,-0.053527,24.79,0
284804,172788.0,1.919565,-0.301254,-3.249640,-0.557828,2.630515,3.031260,-0.296827,0.708417,0.432454,...,0.232045,0.578229,-0.037501,0.640134,0.265745,-0.087371,0.004455,-0.026561,67.88,0
284805,172788.0,-0.240440,0.530483,0.702510,0.689799,-0.377961,0.623708,-0.686180,0.679145,0.392087,...,0.265245,0.800049,-0.163298,0.123205,-0.569159,0.546668,0.108821,0.104533,10.00,0


In [4]:
### Contamos cuantos datos hay de cada clase. 

counts = datos['Class'].value_counts()
print(counts)
print("pocentaje de fraudes: " + str(counts[1]/(counts[1]+counts[0])))

Class
0    284315
1       492
Name: count, dtype: int64
pocentaje de fraudes: 0.001727485630620034


In [7]:
tain_attributes = [col for col in datos.columns if col != 'Class']

X_train, X_test, y_train, y_test = train_test_split(datos[tain_attributes], datos["Class"], test_size=0.25, random_state=42)

In [5]:
rf = RandomForestClassifier()
rf = rf.fit(X_train,y_train)

pred = rf.predict(X_test)
print(metricas.accuracy_score(y_test, pred ))
print(metricas.precision_score(y_test, pred ))
print(metricas.recall_score(y_test, pred ))

0.9995786635206876
0.9368421052631579
0.7876106194690266


In [6]:
### Para despeja posibles dudas acerca de un 
### set de test desbalanceado, usamos cross validation. 

score_rf = cross_val_score(RandomForestClassifier(n_estimators=100, random_state=13), datos[tain_attributes], datos["Class"], scoring='recall')
print("Cross Validation Recall: {}".format(score_rf))
print("Average Cross Validation Recall: {}".format(score_rf.mean()))


Cross Validation Recall: [0.98989899 0.78787879 0.68367347 0.83673469 0.66326531]
Average Cross Validation Recall: 0.7922902494331066


Vamos a intentar usar SMOTE (Synthetic Minority Over-sampling Technique). Como una forma de balancear un poco mejor. 
Usamos la librería Imbalanced Learn como un complemento a sklearn. 

In [5]:
from imblearn.pipeline import Pipeline, make_pipeline
from imblearn.over_sampling import SMOTE, NearMiss


smote_pipeline = make_pipeline(SMOTE(random_state=42), 
                              RandomForestClassifier(n_estimators=100, random_state=13))

In [8]:
score_smote = cross_val_score(smote_pipeline, datos[tain_attributes], datos["Class"], scoring='recall')
print("Cross Validation Recall Scores: {}".format(score_smote))
print("Average Cross Validation Recall: {}".format(score_smote.mean()))

Cross Validation Recall Scores: [0.86868687 0.83838384 0.73469388 0.78571429 0.64285714]
Average Cross Validation Recall: 0.7740672026386312


In [None]:
NM_pipeline = make_pipeline(NearMiss(version=3), 
                              RandomForestClassifier(n_estimators=100, random_state=13))

In [None]:
score_NM = cross_val_score(NM_pipeline, datos[tain_attributes], datos["Class"], scoring='recall')
print("Cross Validation Recall Scores: {}".format(score_NM))
print("Average Cross Validation Recall: {}".format(score_NM.mean()))

## Actividad sugerida

Abajo vas a ver tabajo con otro set de datos, que no está tan desbalanceado. Pero lo que puedes hacer es borrar algunos ejemplos con fraude, para desbalancear más el set de datos y hacer que sea importante alguna acción correctiva. 

Realiza esa acción, y comprueba para distintos conjuntos de datos borrados (digamos, borrando el 90% de los fraudes, y el 99% de los fraudes) si SMOTE logra un mejor rendimiento, o no.

In [8]:
### nuevos datos. La clase relevante ahora es 'fraud' 

datos = pd.read_csv('card_transdata.csv')

In [9]:
### como se ve, esto no está tan desbalanceado
counts = datos['fraud'].value_counts()
print(counts)
print("pocentaje de fraudes: " + str(counts[1.0]/(counts[1.0]+counts[0.0])))

fraud
0.0    912597
1.0     87403
Name: count, dtype: int64
pocentaje de fraudes: 0.087403


In [None]:
### Código que remueve el 90% de los datos clasificados como fraude
fraudes = datos[datos['fraud'] == 1.0]
filas_a_remover = fraudes.sample(frac=0.9, random_state=42)
datos_reduced = datos.drop(filas_a_remover.index)

In [None]:
#### sigue tu actividad a partir de acá. 