# Importing libraries

In [68]:
import pandas as pd
import numpy as np

## Plot
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt

## Stats
import scipy.stats as stats
from scipy.stats import pearsonr, spearmanr

## Feature selection
from sklearn.feature_selection import mutual_info_classif

## Preprocessing
from sklearn.preprocessing import scale

## Selection Models
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, RandomizedSearchCV, cross_val_score, cross_validate

## Models
from sklearn.ensemble import RandomForestClassifier

## Metrics
from sklearn.metrics import mean_squared_error, r2_score, confusion_matrix, accuracy_score
from sklearn.metrics import classification_report, balanced_accuracy_score, make_scorer
from sklearn.metrics import precision_score, recall_score, f1_score, matthews_corrcoef

## Others
import warnings
# Hide warnings
warnings.filterwarnings('ignore')

from collections import Counter

## My functions
import sys
sys.path.append("../0. Scripts")
import data_analysing_functions as daf
import model_metrics_functions as mmf

# Loading Data

In [71]:
## Imputed dataset
imputed_train_df = pd.read_csv('../0. Data/3. Imputed/mean_median_imputed_train_df.csv')

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# Configuración para evitar la notación científica
pd.set_option('display.float_format', lambda x: '%.8f' % x)

In [73]:
target = imputed_train_df.liver_cancer
target.info()

<class 'pandas.core.series.Series'>
RangeIndex: 119495 entries, 0 to 119494
Series name: liver_cancer
Non-Null Count   Dtype
--------------   -----
119495 non-null  int64
dtypes: int64(1)
memory usage: 933.7 KB


# Getting most inportant variables

In [76]:
def calculate_feature_importance(df, target, method='mutual_info', remove_below_mean=False):
    # Crear una copia del dataset
    df_copy = df.copy()

    # Agregar dos variables de control (simulando las variables de control)
    df_copy['aleatorio'] = np.random.uniform(0, 1, size=df_copy.shape[0])
    df_copy['aleatorio2'] = np.random.uniform(0, 1, size=df_copy.shape[0])

    # Seleccionar variables predictoras
    X = df_copy.drop(columns=[target.name])

    # Dependiendo del método seleccionado, se calcula la importancia de las características
    if method == 'mutual_info':
        mi = mutual_info_classif(X, target)
        feature_importance = pd.Series(mi, index=X.columns).sort_values(ascending=False)
    
    elif method == 'pearson':
        corr = X.corrwith(target, method='pearson')
        feature_importance = corr.abs().sort_values(ascending=False)
    
    elif method == 'spearman':
        corr, _ = spearmanr(X, target)
        feature_importance = pd.Series(corr, index=X.columns).abs().sort_values(ascending=False)

    # Ordenar las variables por importancia
    sorted_importance = feature_importance.sort_values(ascending=False)

    # Encontrar la primera aparición de 'aleatorio' o 'aleatorio2'
    for idx, feature in enumerate(sorted_importance.index):
        if feature in ['aleatorio', 'aleatorio2']:
            # Eliminar todas las variables a partir de la primera aparición de 'aleatorio' o 'aleatorio2'
            sorted_importance = sorted_importance.iloc[:idx]
            break

    # Si se pide eliminar las variables por debajo de la media de importancia
    if remove_below_mean:
        # Calcular la media de la importancia
        mean_importance = sorted_importance.mean()
        # Filtrar las variables cuyo valor de importancia esté por debajo de la media
        sorted_importance = sorted_importance[sorted_importance >= mean_importance]
    
    # Crear un DataFrame con las variables y sus importancias
    importance_df = pd.DataFrame({
        'Feature': sorted_importance.index,
        'Importance': sorted_importance.values
    })

    return importance_df

In [78]:
# Obtén las variables más importantes según la métrica seleccionada (por defecto mutual_info)
importance_removing_below_mean_train_df = calculate_feature_importance(imputed_train_df, target=target, method='mutual_info', remove_below_mean=True)
importance_removing_below_mean_train_df

Unnamed: 0,Feature,Importance
0,liver_exitstat,0.03111009
1,race7,0.01872231
2,filtered_f,0.01843302
3,bmi_20c,0.01716336
4,in_TGWAS_population,0.01548829
5,sex,0.01493367
6,arm,0.01445193
7,mortality_exitstat,0.01235229
8,smokea_f,0.01119537
9,cig_stat,0.01114946


In [79]:
importance_train_df = calculate_feature_importance(imputed_train_df, target=target, method='mutual_info', remove_below_mean=False)
# Muestra las variables más importantes
importance_train_df

Unnamed: 0,Feature,Importance
0,liver_exitstat,0.0317977
1,filtered_f,0.01870666
2,race7,0.01857425
3,bmi_20c,0.01708894
4,in_TGWAS_population,0.01572653
5,arm,0.01415223
6,sex,0.01397268
7,mortality_exitstat,0.01213029
8,smokea_f,0.01101322
9,cig_stat,0.0110107


# Transforming variables

In [81]:
# creacion de 2 dataset uno eliminando del original las variable mas importantes encontradas en importance_removing_below_mean_train_df
# Crear un DataFrame donde se eliminen las variables encontradas en importance_removing_below_mean_train_df
df_copy1 = imputed_train_df.copy()

columns_to_remove_1 = importance_removing_below_mean_train_df['Feature'].tolist()
imputed_removing_below_mean_train_df = df_copy1[columns_to_remove_1]

# y otro eliminando del original las encontradas en importance_df
# Crear un DataFrame donde se eliminen las variables encontradas en importance_df
df_copy2 = imputed_train_df.copy()

columns_to_remove_2 = importance_train_df['Feature'].tolist()
imputed_removing_not_important_variables_train_df = df_copy2[columns_to_remove_2]

In [82]:
# Función para encontrar las mejores transformaciones para el conjunto de entrenamiento
def mejorTransf(df, target, tipo='mutual_info'):
    # Diccionario para almacenar las mejores transformaciones para cada columna
    transformations = {}

    # Recorrer todas las columnas del DataFrame
    for column in df.columns:
        if column != target.name:  # Excluir la columna objetivo (target)
            vv = df[column]  # Asignar la columna actual a 'vv'
            
            # Escalado de datos (evitar fallos de tamaño de float64 al hacer exp de número grande)
            vv = pd.Series(scale(vv), name=vv.name)
            # Traslación a valores positivos de la variable (sino falla log y las raíces!)
            vv = vv + abs(min(vv)) + 0.0001
            
            # Definir y calcular las transformaciones típicas  
            transf = pd.DataFrame({
                vv.name + '_ident': vv, 
                vv.name + '_log': np.log(vv), 
                vv.name + '_exp': np.exp(vv), 
                vv.name + '_sqrt': np.sqrt(vv), 
                vv.name + '_sqr': np.square(vv), 
                vv.name + '_cuarta': vv**4, 
                vv.name + '_raiz4': vv**(1/4)
            })
            
            # Evaluar la transformación según el tipo especificado
            if tipo == 'pearson':
                # Aplicar correlación de Pearson a cada transformación frente a la respuesta
                tablaCorr = pd.DataFrame(transf.apply(lambda x: pearsonr(x, target)[0]), columns=['Pearson'])
                best = tablaCorr.query('Pearson.abs() == Pearson.abs().max()').index[0]
                transformations[column] = best.split('_')[-1]  # Guardar solo la transformación (sin "_ident" ni demás)

            elif tipo == 'spearman':
                # Aplicar correlación de Spearman a cada transformación frente a la respuesta
                tablaCorr = pd.DataFrame(transf.apply(lambda x: spearmanr(x, target)[0]), columns=['Spearman'])
                best = tablaCorr.query('Spearman.abs() == Spearman.abs().max()').index[0]
                transformations[column] = best.split('_')[-1]  # Guardar solo la transformación (sin "_ident" ni demás)

            elif tipo == 'mutual_info':
                # Aplicar información mutua a cada transformación frente a la respuesta
                tablaMI = pd.DataFrame(transf.apply(lambda x: mutual_info_classif(x.values.reshape(-1, 1), target)[0]), columns=['Mutual Information'])
                best = tablaMI.query('`Mutual Information` == `Mutual Information`.max()').index[0]
                transformations[column] = best.split('_')[-1]  # Guardar solo la transformación (sin "_ident" ni demás)

    return transformations  # Devuelve el diccionario con las mejores transformaciones para cada variable


# Función para aplicar las transformaciones guardadas al conjunto de prueba
def apply_best_transformations(df, transformations, target):
    transformed_df = pd.DataFrame()  # DataFrame vacío para las transformaciones aplicadas

    # Recorrer todas las columnas del DataFrame
    for column in df.columns:
        if column != target.name:  # Excluir la columna objetivo (target)
            vv = df[column]  # Asignar la columna actual a 'vv'
            
            # Recuperar la transformación almacenada
            transformation = transformations[column]
            
            # Aplicar la transformación correspondiente
            if transformation == 'log':
                transformed_df[column + '_log'] = np.log(vv + 0.0001)
            elif transformation == 'sqrt':
                transformed_df[column + '_sqrt'] = np.sqrt(vv)
            elif transformation == 'exp':
                transformed_df[column + '_exp'] = np.exp(vv)
            elif transformation == 'sqr':
                transformed_df[column + '_sqr'] = np.square(vv)
            elif transformation == 'cuarta':
                transformed_df[column + '_cuarta'] = vv ** 4
            elif transformation == 'raiz4':
                transformed_df[column + '_raiz4'] = vv ** (1/4)

    return transformed_df

In [83]:
# Encontrar las mejores transformaciones para el conjunto de entrenamiento
transformations = mejorTransf(imputed_removing_below_mean_train_df, target, tipo='mutual_info')
transformations

{'liver_exitstat': 'sqrt',
 'race7': 'log',
 'filtered_f': 'log',
 'bmi_20c': 'raiz4',
 'in_TGWAS_population': 'ident',
 'sex': 'cuarta',
 'arm': 'sqr',
 'mortality_exitstat': 'log',
 'smokea_f': 'raiz4',
 'cig_stat': 'exp',
 'bmi_50c': 'raiz4',
 'ssmokea_f': 'sqrt',
 'bmi_curc': 'raiz4',
 'fh_cancer': 'ident',
 'preg_f': 'log',
 'agelevel': 'log'}

In [84]:
# Aplicar las transformaciones al conjunto de entrenamiento
transformed_removing_below_mean_train_df = apply_best_transformations(imputed_removing_below_mean_train_df, transformations, target)

In [85]:
transformed_removing_below_mean_train_df.head()

Unnamed: 0,liver_exitstat_sqrt,race7_log,filtered_f_log,bmi_20c_raiz4,sex_cuarta,arm_sqr,mortality_exitstat_log,smokea_f_raiz4,cig_stat_exp,bmi_50c_raiz4,ssmokea_f_sqrt,bmi_curc_raiz4,preg_f_log,agelevel_log
0,2.82842712,0.0001,0.0001,1.0,1,4,0.69319718,2.05976714,1.0,1.18920712,6.52143406,1.18920712,-9.21034037,0.0001
1,2.44948974,0.0001,0.0001,1.18920712,16,4,0.0001,2.21336384,2.71828183,1.31607401,6.52143406,1.41421356,0.0001,-9.21034037
2,2.82842712,0.0001,0.0001,1.18920712,16,1,1.09864562,2.05976714,1.0,1.41421356,6.52143406,1.41421356,0.0001,-9.21034037
3,2.23606798,0.0001,0.0001,1.18920712,1,1,0.0001,2.05976714,1.0,1.31607401,6.52143406,1.18920712,-9.21034037,0.69319718
4,2.82842712,0.0001,0.0001,1.18920712,16,1,0.0001,2.11474253,7.3890561,1.18920712,6.32455532,1.18920712,0.0001,1.09864562


In [86]:
# Encontrar las mejores transformaciones para el conjunto de entrenamiento
transformations2 = mejorTransf(imputed_removing_not_important_variables_train_df, target, tipo='mutual_info')
transformations2

{'liver_exitstat': 'sqrt',
 'filtered_f': 'log',
 'race7': 'log',
 'bmi_20c': 'raiz4',
 'in_TGWAS_population': 'ident',
 'arm': 'cuarta',
 'sex': 'cuarta',
 'mortality_exitstat': 'log',
 'smokea_f': 'raiz4',
 'cig_stat': 'log',
 'ssmokea_f': 'sqrt',
 'bmi_50c': 'raiz4',
 'bmi_curc': 'raiz4',
 'fh_cancer': 'ident',
 'preg_f': 'log',
 'agelevel': 'log',
 'menstrs': 'log',
 'fmenstr': 'log',
 'sisters': 'log',
 'brothers': 'log',
 'arthrit_f': 'log',
 'center': 'raiz4',
 'horm_f': 'log',
 'urinate_f': 'log',
 'hyperten_f': 'log',
 'rndyear': 'raiz4',
 'bcontr_f': 'log',
 'height_f': 'sqrt',
 'mortality_exitage': 'log',
 'asppd': 'log',
 'liver_exitage': 'log',
 'hyster_f': 'log',
 'ibuppd': 'log',
 'pipe': 'log',
 'cigar': 'log',
 'miscar': 'log',
 'vasect_f': 'log',
 'bbd': 'log',
 'hearta_f': 'log',
 'enlpros_f': 'log',
 'gallblad_f': 'log',
 'tuballig': 'log',
 'uterine_fib': 'log',
 'diabetes_f': 'log',
 'liver_fh': 'log',
 'emphys_f': 'log',
 'liver_fh_cnt': 'log',
 'endometriosis': 

In [87]:
# Aplicar las transformaciones al conjunto de entrenamiento
transformed_removing_not_important_variables_train_df = apply_best_transformations(imputed_removing_not_important_variables_train_df, transformations2, target)

In [88]:
transformed_removing_not_important_variables_train_df.head()

Unnamed: 0,liver_exitstat_sqrt,filtered_f_log,race7_log,bmi_20c_raiz4,arm_cuarta,sex_cuarta,mortality_exitstat_log,smokea_f_raiz4,cig_stat_log,ssmokea_f_sqrt,bmi_50c_raiz4,bmi_curc_raiz4,preg_f_log,agelevel_log,menstrs_log,fmenstr_log,sisters_log,brothers_log,arthrit_f_log,center_raiz4,horm_f_log,urinate_f_log,hyperten_f_log,rndyear_raiz4,bcontr_f_log,height_f_sqrt,mortality_exitage_log,asppd_log,liver_exitage_log,hyster_f_log,ibuppd_log,pipe_log,cigar_log,miscar_log,vasect_f_log,bbd_log,hearta_f_log,enlpros_f_log,gallblad_f_log,tuballig_log,uterine_fib_log,diabetes_f_log,liver_fh_log,emphys_f_log,liver_fh_cnt_log,endometriosis_log,benign_ovcyst_log,ph_any_trial_log,colon_comorbidity_log,surg_any_log,liver_comorbidity_log,divertic_f_log,osteopor_f_log,tubal_log,surg_prostatectomy_log,polyps_f_log,ph_liver_trial_exp,stroke_f_log,infpros_f_log
0,2.82842712,0.0001,0.0001,1.0,16,1,0.69319718,2.05976714,-9.21034037,6.52143406,1.18920712,1.18920712,-9.21034037,0.0001,-9.21034037,-9.21034037,0.0001,-9.21034037,-9.21034037,1.68179283,-9.21034037,0.0001,-9.21034037,6.68573057,-9.21034037,8.30662386,4.41884181,1.79177614,4.30406644,-9.21034037,1.79177614,-9.21034037,-9.21034037,-9.21034037,-9.21034037,-9.21034037,-9.21034037,-9.21034037,-9.21034037,-9.21034037,-9.21034037,-9.21034037,-9.21034037,-9.21034037,-9.21034037,-9.21034037,-9.21034037,-9.21034037,-9.21034037,-9.21034037,-9.21034037,-9.21034037,-9.21034037,-9.21034037,-9.21034037,-9.21034037,1.0,-9.21034037,-9.21034037
1,2.44948974,0.0001,0.0001,1.18920712,16,16,0.0001,2.21336384,0.0001,6.52143406,1.31607401,1.41421356,0.0001,-9.21034037,0.0001,0.69319718,0.0001,1.38631936,-9.21034037,1.41421356,0.0001,-9.21034037,0.0001,6.68573057,0.0001,7.74596669,4.26268129,1.60945791,4.14313631,-9.21034037,1.79177614,-9.21034037,-9.21034037,-9.21034037,-9.21034037,-9.21034037,-9.21034037,-9.21034037,-9.21034037,0.0001,-9.21034037,-9.21034037,-9.21034037,-9.21034037,-9.21034037,-9.21034037,-9.21034037,-9.21034037,-9.21034037,-9.21034037,-9.21034037,-9.21034037,-9.21034037,-9.21034037,-9.21034037,-9.21034037,1.0,-9.21034037,-9.21034037
2,2.82842712,0.0001,0.0001,1.18920712,1,16,1.09864562,2.05976714,-9.21034037,6.52143406,1.41421356,1.41421356,0.0001,-9.21034037,0.0001,0.0001,0.69319718,0.0001,0.0001,1.56508458,0.0001,-9.21034037,0.0001,6.68405684,0.0001,8.0,4.27666751,1.60945791,4.23410795,-9.21034037,0.69319718,-9.21034037,-9.21034037,0.69319718,-9.21034037,0.0001,-9.21034037,-9.21034037,-9.21034037,-9.21034037,0.0001,-9.21034037,-9.21034037,-9.21034037,-9.21034037,-9.21034037,-9.21034037,-9.21034037,-9.21034037,-9.21034037,-9.21034037,-9.21034037,-9.21034037,-9.21034037,-9.21034037,-9.21034037,1.0,-9.21034037,-9.21034037
3,2.23606798,0.0001,0.0001,1.18920712,1,1,0.0001,2.05976714,-9.21034037,6.52143406,1.31607401,1.18920712,-9.21034037,0.69319718,-9.21034037,-9.21034037,0.69319718,0.0001,-9.21034037,1.73205081,-9.21034037,0.69319718,0.0001,6.68405684,-9.21034037,7.93725393,4.36944912,-9.21034037,4.36944912,-9.21034037,-9.21034037,-9.21034037,-9.21034037,-9.21034037,0.0001,-9.21034037,-9.21034037,-9.21034037,-9.21034037,-9.21034037,-9.21034037,0.0001,2.19723569,-9.21034037,-9.21034037,-9.21034037,-9.21034037,-9.21034037,-9.21034037,-9.21034037,-9.21034037,-9.21034037,-9.21034037,-9.21034037,-9.21034037,-9.21034037,1.0,-9.21034037,-9.21034037
4,2.82842712,0.0001,0.0001,1.18920712,1,16,0.0001,2.11474253,0.69319718,6.32455532,1.18920712,1.18920712,0.0001,1.09864562,0.69319718,0.69319718,-9.21034037,0.69319718,0.0001,1.56508458,0.0001,-9.21034037,-9.21034037,6.68740305,-9.21034037,8.0,4.48863749,-9.21034037,4.38202788,0.0001,-9.21034037,-9.21034037,-9.21034037,-9.21034037,-9.21034037,-9.21034037,-9.21034037,-9.21034037,-9.21034037,-9.21034037,-9.21034037,-9.21034037,-9.21034037,-9.21034037,-9.21034037,-9.21034037,-9.21034037,-9.21034037,-9.21034037,-9.21034037,-9.21034037,-9.21034037,-9.21034037,-9.21034037,-9.21034037,-9.21034037,1.0,-9.21034037,-9.21034037


# Transformation comparison

In [90]:
y = target

# Dividir en conjuntos de entrenamiento y prueba
X1_train, X1_test, y1_train, y1_test = train_test_split(transformed_removing_below_mean_train_df, y, test_size=0.2, random_state=42)
X2_train, X2_test, y2_train, y2_test = train_test_split(transformed_removing_not_important_variables_train_df, y, test_size=0.2, random_state=42)

In [91]:
daf.nulls_Percentage(transformed_removing_below_mean_train_df)

liver_exitstat_sqrt , 0.0% nulls , 7 unique values, float64
race7_log , 0.0% nulls , 7 unique values, float64
filtered_f_log , 0.0% nulls , 3 unique values, float64
bmi_20c_raiz4 , 0.0% nulls , 4 unique values, float64
sex_cuarta , 0.0% nulls , 2 unique values, int64
arm_sqr , 0.0% nulls , 2 unique values, int64
mortality_exitstat_log , 0.0% nulls , 4 unique values, float64
smokea_f_raiz4 , 0.0% nulls , 63 unique values, float64
cig_stat_exp , 0.0% nulls , 3 unique values, float64
bmi_50c_raiz4 , 0.0% nulls , 4 unique values, float64
ssmokea_f_sqrt , 0.0% nulls , 68 unique values, float64
bmi_curc_raiz4 , 0.0% nulls , 4 unique values, float64
preg_f_log , 0.0% nulls , 3 unique values, float64
agelevel_log , 0.0% nulls , 4 unique values, float64


In [93]:
daf.nulls_Percentage(transformed_removing_not_important_variables_train_df)

liver_exitstat_sqrt , 0.0% nulls , 7 unique values, float64
filtered_f_log , 0.0% nulls , 3 unique values, float64
race7_log , 0.0% nulls , 7 unique values, float64
bmi_20c_raiz4 , 0.0% nulls , 4 unique values, float64
arm_cuarta , 0.0% nulls , 2 unique values, int64
sex_cuarta , 0.0% nulls , 2 unique values, int64
mortality_exitstat_log , 0.0% nulls , 4 unique values, float64
smokea_f_raiz4 , 0.0% nulls , 63 unique values, float64
cig_stat_log , 0.0% nulls , 3 unique values, float64
ssmokea_f_sqrt , 0.0% nulls , 68 unique values, float64
bmi_50c_raiz4 , 0.0% nulls , 4 unique values, float64
bmi_curc_raiz4 , 0.0% nulls , 4 unique values, float64
preg_f_log , 0.0% nulls , 3 unique values, float64
agelevel_log , 0.0% nulls , 4 unique values, float64
menstrs_log , 0.0% nulls , 5 unique values, float64
fmenstr_log , 0.0% nulls , 6 unique values, float64
sisters_log , 0.0% nulls , 8 unique values, float64
brothers_log , 0.0% nulls , 8 unique values, float64
arthrit_f_log , 0.0% nulls , 2 un

In [94]:
# Modelo 1: Random Forest usando df1 (con las transformaciones de df1)
model1 = RandomForestClassifier(random_state=42)
model1.fit(X1_train, y1_train)
y1_pred = model1.predict(X1_test)

# Modelo 2: Random Forest usando df2 (con las transformaciones de df2)
model2 = RandomForestClassifier(random_state=42)
model2.fit(X2_train, y2_train)
y2_pred = model2.predict(X2_test)

In [95]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Evaluación del Modelo 1
accuracy1 = accuracy_score(y1_test, y1_pred)
f1_1 = f1_score(y1_test, y1_pred)
precision1 = precision_score(y1_test, y1_pred)
recall1 = recall_score(y1_test, y1_pred)
auc_roc1 = roc_auc_score(y1_test, y1_pred)

# Evaluación del Modelo 2
accuracy2 = accuracy_score(y2_test, y2_pred)
f1_2 = f1_score(y2_test, y2_pred)
precision2 = precision_score(y2_test, y2_pred)
recall2 = recall_score(y2_test, y2_pred)
auc_roc2 = roc_auc_score(y2_test, y2_pred)

# Mostrar los resultados
print("Modelo 1 (df1) - Accuracy:", accuracy1)
print("Modelo 1 (df1) - F1-Score:", f1_1)
print("Modelo 1 (df1) - Recall:", recall1)
print("Modelo 1 (df1) - AUC-ROC:", auc_roc1)
print("Modelo 1 (df1) - Matriz de Confusión:\n", confusion_matrix(y1_test, y1_pred))

print("\nModelo 2 (df2) - Accuracy:", accuracy2)
print("Modelo 2 (df2) - F1-Score:", f1_2)
print("Modelo 2 (df2) - Recall:", recall2)
print("Modelo 2 (df2) - AUC-ROC:", auc_roc2)
print("Modelo 2 (df2) - Matriz de Confusión:\n", confusion_matrix(y2_test, y2_pred))

Modelo 1 (df1) - Accuracy: 1.0
Modelo 1 (df1) - F1-Score: 1.0
Modelo 1 (df1) - Recall: 1.0
Modelo 1 (df1) - AUC-ROC: 1.0
Modelo 1 (df1) - Matriz de Confusión:
 [[23861     0]
 [    0    38]]

Modelo 2 (df2) - Accuracy: 1.0
Modelo 2 (df2) - F1-Score: 1.0
Modelo 2 (df2) - Recall: 1.0
Modelo 2 (df2) - AUC-ROC: 1.0
Modelo 2 (df2) - Matriz de Confusión:
 [[23861     0]
 [    0    38]]


In [112]:
## Si se obtiene el mismo resultado pues seria mejor pillar el que menos variables tiene no? Aun asi voy a probar con los dos

In [114]:
transformed_removing_below_mean_train_df.to_csv("../0. Data/4. Transformed/transformed_removing_below_mean_train_df.csv", index=False)
transformed_removing_not_important_variables_train_df.to_csv("../0. Data/4. Transformed/transformed_removing_not_important_variables_train_df.csv", index=False)

# Applying same transformation to test_df

In [117]:
## Imputed dataset
imputed_test_df = pd.read_csv('../0. Data/3. Imputed/mean_median_imputed_test_df.csv')
imputed_test_copy1_df = imputed_test_df.copy()
imputed_test_copy2_df = imputed_test_df.copy()

In [119]:
imputed_removing_below_mean_test_df = imputed_test_copy1_df[columns_to_remove_1]
imputed_removing_not_important_variables_test_df = imputed_test_copy2_df[columns_to_remove_2]

In [121]:
# Aplicar las transformaciones al conjunto de entrenamiento
transformed_removing_below_mean_test_df = apply_best_transformations(imputed_removing_below_mean_test_df, transformations, target)

In [123]:
# Aplicar las transformaciones al conjunto de entrenamiento
transformed_removing_not_important_variables_test_df = apply_best_transformations(imputed_removing_not_important_variables_test_df, transformations2, target)

In [125]:
imputed_removing_below_mean_test_df.to_csv("../0. Data/4. Transformed/transformed_removing_below_mean_test_df.csv", index=False)
imputed_removing_not_important_variables_test_df.to_csv("../0. Data/4. Transformed/transformed_removing_not_important_variables_test_df.csv", index=False)