# Design Pattern 10: Rebalancing

> Fornece abordagens para lidar com conjuntos de dados desbalanceados.

### Bibliotecas

In [1]:
import pandas as pd
import numpy as np
import warnings
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

warnings.filterwarnings('ignore')

### Carregando base de dados sintéticos sem classe neutra

In [2]:
df = pd.read_csv('data/creditcard.csv')
print(df.shape)
df.head()

(5492, 31)


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,406.0,-2.312227,1.951992,-1.609851,3.997906,-0.522188,-1.426545,-2.537387,1.391657,-2.770089,...,0.517232,-0.035049,-0.465211,0.320198,0.044519,0.17784,0.261145,-0.143276,0.0,fraud
1,472.0,-3.043541,-3.157307,1.088463,2.288644,1.359805,-1.064823,0.325574,-0.067794,-0.270953,...,0.661696,0.435477,1.375966,-0.293803,0.279798,-0.145362,-0.252773,0.035764,529.0,fraud
2,4462.0,-2.30335,1.759247,-0.359745,2.330243,-0.821628,-0.075788,0.56232,-0.399147,-0.238253,...,-0.294166,-0.932391,0.172726,-0.08733,-0.156114,-0.542628,0.039566,-0.153029,239.93,fraud
3,6986.0,-4.397974,1.358367,-2.592844,2.679787,-1.128131,-1.706536,-3.496197,-0.248778,-0.247768,...,0.573574,0.176968,-0.436207,-0.053502,0.252405,-0.657488,-0.827136,0.849573,59.0,fraud
4,7519.0,1.234235,3.01974,-4.304597,4.732795,3.624201,-1.357746,1.713445,-0.496358,-1.282858,...,-0.379068,-0.704181,-0.656805,-1.632653,1.488901,0.566797,-0.010016,0.146793,1.0,fraud


### Balanceamento

In [3]:
df['Class'].value_counts()

not fraud    5000
fraud         492
Name: Class, dtype: int64

### DOWNSAMPLING

In [4]:
majority = df[df['Class'] == 'not fraud']
minority = df[df['Class'] == 'fraud']

majority_downsampled = resample(majority, replace=False, n_samples=len(minority), random_state=42)

new_df = pd.concat([minority, majority_downsampled])
new_df['Class'].value_counts()

fraud        492
not fraud    492
Name: Class, dtype: int64

### WEIGHTED CLASSES

In [5]:
X = df.drop(['Class'], axis=1)
y = df['Class'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Definir os pesos das classes (inversamente proporcionais à frequência)
weights = {'not fraud': 1, 'fraud': 9}


clf = RandomForestClassifier(class_weight=weights, random_state=42)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       fraud       0.99      0.85      0.91       118
   not fraud       0.98      1.00      0.99       981

    accuracy                           0.98      1099
   macro avg       0.99      0.92      0.95      1099
weighted avg       0.98      0.98      0.98      1099



### UPSAMPLING

In [6]:
majority = df[df['Class'] == 'not fraud']
minority = df[df['Class'] == 'fraud']

minority_downsampled = resample(minority, replace=True, n_samples=len(majority), random_state=42)

new_df = pd.concat([majority, minority_downsampled])
new_df['Class'].value_counts()

not fraud    5000
fraud        5000
Name: Class, dtype: int64