# Clasificación Voting Classifier

In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt

In [15]:
data_path = './data/'

train_data = pd.read_csv(f'{data_path}train_data.csv')
test_data = pd.read_csv(f'{data_path}test_data.csv')

df_reduce_mrmr = pd.read_csv(f'{data_path}X_train_reduce_mrmr.csv')
df_reduce_mrmr_instances = pd.read_csv(f'{data_path}df_reduce_mrmr_instances.csv')
df_reduce_mrmr_instances_hard = pd.read_csv(f'{data_path}df_reduce_mrmr_instances_hard.csv')
df_reduce_mrmr_instances_GLVQ = pd.read_csv(f'{data_path}df_reduce_mrmr_instances_GLVQ.csv')

df_X_train_reduce_RFC = pd.read_csv(f'{data_path}df_X_train_reduce_RFC.csv')
df_reduce_RFC_instances = pd.read_csv(f'{data_path}df_reduce_RFC_instances.csv')
df_reduce_RFC_instances_hard = pd.read_csv(f'{data_path}df_reduce_RFC_instances_hard.csv')
df_reduce_RFC_instances_GLVQ = pd.read_csv(f'{data_path}df_reduce_RFC_instances_GLVQ.csv')

print("Datos cargados exitosamente:")
print(f"train_data: {train_data.shape}")
print(f"df_reduce_mrmr: {df_reduce_mrmr.shape}")
print(f"df_reduce_mrmr_instances: {df_reduce_mrmr_instances.shape}")
print(f"df_reduce_mrmr_instances hard: {df_reduce_mrmr_instances_hard.shape}")
print(f"df_reduce_mrmr_instances_GLVQ: {df_reduce_mrmr_instances_GLVQ.shape}")
print(f"df_X_train_reduce_RFC: {df_X_train_reduce_RFC.shape}")
print(f"df_reduce_RFC_instances: {df_reduce_RFC_instances.shape}")
print(f"df_reduce_RFC_instances hard: {df_reduce_RFC_instances_hard.shape}")
print(f"df_reduce_RFC_instances_GLVQ: {df_reduce_RFC_instances_GLVQ.shape}")

Datos cargados exitosamente:
train_data: (256326, 31)
df_reduce_mrmr: (256326, 11)
df_reduce_mrmr_instances: (886, 11)
df_reduce_mrmr_instances hard: (886, 11)
df_reduce_mrmr_instances_GLVQ: (2, 11)
df_X_train_reduce_RFC: (256326, 11)
df_reduce_RFC_instances: (886, 11)
df_reduce_RFC_instances hard: (886, 11)
df_reduce_RFC_instances_GLVQ: (2, 11)


In [16]:
def train_and_evaluate_voting(X, y, test_data, columns_to_keep):
    # División de los datos en conjunto de entrenamiento y prueba
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    print("Dimensiones de los conjuntos:")
    print(f"Conjunto de entrenamiento: {X_train.shape}, {y_train.shape}")
    print(f"Conjunto de prueba: {X_test.shape}, {y_test.shape}")

    # Creación de los modelos base para el ensemble
    clf1 = GaussianNB()
    clf2 = RandomForestClassifier(random_state=42)
    clf3 = SVC(probability=True, random_state=42)

    # Creación del clasificador Voting
    voting_clf = VotingClassifier(estimators=[('nb', clf1), ('rf', clf2), ('svc', clf3)], voting='soft')

    # Ajuste de hiperparámetros con GridSearchCV
    param_grid = {
        'rf__n_estimators': [50, 100, 200],
        'svc__C': [0.1, 1, 10],
        'svc__kernel': ['linear', 'rbf'],
    }

    grid_search = GridSearchCV(estimator=voting_clf, param_grid=param_grid, cv=5, verbose=1, n_jobs=-1)
    grid_search.fit(X_train, y_train)

    # Mejores parámetros encontrados
    print("Mejores parámetros encontrados: ", grid_search.best_params_)

    # Evaluación en el conjunto de prueba
    best_model = grid_search.best_estimator_
    accuracy = best_model.score(X_test, y_test)
    print(f"Precisión en el conjunto de prueba: {accuracy:.2f}")

    # Preparar los datos de test final
    X_test_final = test_data[columns_to_keep]
    y_test_final = test_data['Class']

    scaler = MinMaxScaler()
    for col in ['Amount', 'Time']:
        if col in X_test_final.columns:
            X_test_final[col] = scaler.fit_transform(X_test_final[[col]])

    # Predicciones en el conjunto de test final
    y_pred = best_model.predict(X_test_final)

    # Matriz de confusión y reporte de clasificación
    conf_matrix = confusion_matrix(y_test_final, y_pred)
    report = classification_report(y_test_final, y_pred, target_names=['Correctas', 'Fraudulentas'])

    print("Matriz de confusión:")
    print(conf_matrix)
    print("\nReporte de Clasificación:")
    print(report)

In [17]:
X = df_reduce_mrmr_instances.drop(columns=['Class'])
y = df_reduce_mrmr_instances['Class']
columns_to_keep_mrmr = ['V17', 'Time', 'Amount', 'V25', 'V20', 'V7', 'V13', 'V22', 'V19', 'V23']

print("\n--- Evaluación con mRMR ClusterCentroids_soft (Voting Ensemble) ---")
train_and_evaluate_voting(X, y, test_data, columns_to_keep_mrmr)


--- Evaluación con mRMR ClusterCentroids_soft (Voting Ensemble) ---
Dimensiones de los conjuntos:
Conjunto de entrenamiento: (708, 10), (708,)
Conjunto de prueba: (178, 10), (178,)
Fitting 5 folds for each of 18 candidates, totalling 90 fits
Mejores parámetros encontrados:  {'rf__n_estimators': 50, 'svc__C': 0.1, 'svc__kernel': 'linear'}
Precisión en el conjunto de prueba: 0.94


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_final[col] = scaler.fit_transform(X_test_final[[col]])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_final[col] = scaler.fit_transform(X_test_final[[col]])


Matriz de confusión:
[[13227 15205]
 [    2    47]]

Reporte de Clasificación:
              precision    recall  f1-score   support

   Correctas       1.00      0.47      0.63     28432
Fraudulentas       0.00      0.96      0.01        49

    accuracy                           0.47     28481
   macro avg       0.50      0.71      0.32     28481
weighted avg       1.00      0.47      0.63     28481



In [18]:
X = df_reduce_RFC_instances.drop(columns=['Class'])
y = df_reduce_RFC_instances['Class']
columns_to_keep_RFC = ['V17', 'V16', 'V12', 'V14', 'V11', 'V10', 'V9', 'V4', 'V18', 'V7']

print("\n--- Evaluación con RFC ClusterCentroids_soft (Voting Ensemble) ---")
train_and_evaluate_voting(X, y, test_data, columns_to_keep_RFC)


--- Evaluación con RFC ClusterCentroids_soft (Voting Ensemble) ---
Dimensiones de los conjuntos:
Conjunto de entrenamiento: (708, 10), (708,)
Conjunto de prueba: (178, 10), (178,)
Fitting 5 folds for each of 18 candidates, totalling 90 fits
Mejores parámetros encontrados:  {'rf__n_estimators': 50, 'svc__C': 0.1, 'svc__kernel': 'linear'}
Precisión en el conjunto de prueba: 0.90
Matriz de confusión:
[[28394    38]
 [    6    43]]

Reporte de Clasificación:
              precision    recall  f1-score   support

   Correctas       1.00      1.00      1.00     28432
Fraudulentas       0.53      0.88      0.66        49

    accuracy                           1.00     28481
   macro avg       0.77      0.94      0.83     28481
weighted avg       1.00      1.00      1.00     28481



In [20]:
X = df_reduce_mrmr_instances_GLVQ.drop(columns=['Class'])
y = df_reduce_mrmr_instances_GLVQ['Class']
columns_to_keep_mrmr = ['V17', 'Time', 'Amount', 'V25', 'V20', 'V7', 'V13', 'V22', 'V19', 'V23']

print("\n--- Evaluación con mRMR GLVQ (Voting Ensemble) ---")
train_and_evaluate_voting(X, y, test_data, columns_to_keep_mrmr)


--- Evaluación con mRMR GLVQ (Voting Ensemble) ---
Dimensiones de los conjuntos:
Conjunto de entrenamiento: (1, 10), (1,)
Conjunto de prueba: (1, 10), (1,)
Fitting 5 folds for each of 18 candidates, totalling 90 fits


ValueError: Cannot have number of splits n_splits=5 greater than the number of samples: n_samples=1.

In [7]:
X = df_reduce_RFC_instances_GLVQ.drop(columns=['Class'])
y = df_reduce_RFC_instances_GLVQ['Class']

print("\n--- Evaluación con RFC GLVQ (Voting Ensemble) ---")
train_and_evaluate_voting(X, y, test_data, columns_to_keep_RFC)

KeyError: "['Class'] not found in axis"

In [21]:
def evaluate_all_datasets():
    datasets = [
        (df_reduce_mrmr_instances, ['V17', 'Time', 'Amount', 'V25', 'V20', 'V7', 'V13', 'V22', 'V19', 'V23'], "mRMR ClusterCentroids_soft"),
        (df_reduce_RFC_instances, ['V17', 'V16', 'V12', 'V14', 'V11', 'V10', 'V9', 'V4', 'V18', 'V7'], "RFC ClusterCentroids_soft"),
        (df_reduce_mrmr_instances_hard, ['V17', 'Time', 'Amount', 'V25', 'V20', 'V7', 'V13', 'V22', 'V19', 'V23'], "mRMR ClusterCentroids_hard"),
        (df_reduce_RFC_instances_hard, ['V17', 'V16', 'V12', 'V14', 'V11', 'V10', 'V9', 'V4', 'V18', 'V7'], "RFC ClusterCentroids_hard")#,
        #(df_reduce_mrmr_instances_GLVQ, ['V17', 'Time', 'Amount', 'V25', 'V20', 'V7', 'V13', 'V22', 'V19', 'V23'], "mRMR GLVQ"),
        #(df_reduce_RFC_instances_GLVQ, ['V17', 'V16', 'V12', 'V14', 'V11', 'V10', 'V9', 'V4', 'V18', 'V7'], "RFC GLVQ")
    ]

    for dataset, columns_to_keep, name in datasets:
        print(f"\n--- Evaluación con {name} ---")
        X = dataset.drop(columns=['Class'])
        y = dataset['Class']
        train_and_evaluate_voting(X, y, test_data, columns_to_keep)

# Ejecutar la evaluación
evaluate_all_datasets()


--- Evaluación con mRMR ClusterCentroids_soft ---
Dimensiones de los conjuntos:
Conjunto de entrenamiento: (708, 10), (708,)
Conjunto de prueba: (178, 10), (178,)
Fitting 5 folds for each of 18 candidates, totalling 90 fits
Mejores parámetros encontrados:  {'rf__n_estimators': 50, 'svc__C': 0.1, 'svc__kernel': 'linear'}
Precisión en el conjunto de prueba: 0.94


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_final[col] = scaler.fit_transform(X_test_final[[col]])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_final[col] = scaler.fit_transform(X_test_final[[col]])


Matriz de confusión:
[[13227 15205]
 [    2    47]]

Reporte de Clasificación:
              precision    recall  f1-score   support

   Correctas       1.00      0.47      0.63     28432
Fraudulentas       0.00      0.96      0.01        49

    accuracy                           0.47     28481
   macro avg       0.50      0.71      0.32     28481
weighted avg       1.00      0.47      0.63     28481


--- Evaluación con RFC ClusterCentroids_soft ---
Dimensiones de los conjuntos:
Conjunto de entrenamiento: (708, 10), (708,)
Conjunto de prueba: (178, 10), (178,)
Fitting 5 folds for each of 18 candidates, totalling 90 fits
Mejores parámetros encontrados:  {'rf__n_estimators': 50, 'svc__C': 0.1, 'svc__kernel': 'linear'}
Precisión en el conjunto de prueba: 0.90
Matriz de confusión:
[[28394    38]
 [    6    43]]

Reporte de Clasificación:
              precision    recall  f1-score   support

   Correctas       1.00      1.00      1.00     28432
Fraudulentas       0.53      0.88      0.66

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_final[col] = scaler.fit_transform(X_test_final[[col]])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_final[col] = scaler.fit_transform(X_test_final[[col]])


Matriz de confusión:
[[26711  1721]
 [    5    44]]

Reporte de Clasificación:
              precision    recall  f1-score   support

   Correctas       1.00      0.94      0.97     28432
Fraudulentas       0.02      0.90      0.05        49

    accuracy                           0.94     28481
   macro avg       0.51      0.92      0.51     28481
weighted avg       1.00      0.94      0.97     28481


--- Evaluación con RFC ClusterCentroids_hard ---
Dimensiones de los conjuntos:
Conjunto de entrenamiento: (708, 10), (708,)
Conjunto de prueba: (178, 10), (178,)
Fitting 5 folds for each of 18 candidates, totalling 90 fits
Mejores parámetros encontrados:  {'rf__n_estimators': 50, 'svc__C': 0.1, 'svc__kernel': 'linear'}
Precisión en el conjunto de prueba: 0.90
Matriz de confusión:
[[28387    45]
 [    6    43]]

Reporte de Clasificación:
              precision    recall  f1-score   support

   Correctas       1.00      1.00      1.00     28432
Fraudulentas       0.49      0.88      0.63