<a href="https://colab.research.google.com/github/MiguelAngeloTr/Datathon/blob/main/Exploring_Models_GridSearch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Libraries

In [None]:
# === 1. IMPORTS ===
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Random seed reused further down by train_test_split and SMOTE.
seed = 45

In [None]:
# Display and Print Full Functions
def display_full(x, max_row=None):
    """Rich-display the head of ``x`` with pandas' truncation limits lifted.

    Prints ``x.shape`` and then calls ``display(x.head(max_row))`` while the
    row, column, width and column-width limits are relaxed and floats are
    formatted as ``'{:5,.2f}'``.

    The options are applied through ``pd.option_context``, so whatever display
    settings were active before the call are restored afterwards -- also when
    rendering raises -- instead of being reset to pandas' defaults.

    Parameters
    ----------
    x : object exposing ``.shape`` and ``.head()`` (e.g. a DataFrame or Series)
    max_row : int or None, default None
        Used both as ``display.max_rows`` and as the argument to ``x.head``.
        ``None`` removes the display row cap.
    """
    with pd.option_context(
        'display.max_rows', max_row,
        'display.max_columns', None,
        'display.width', 2000,
        'display.float_format', '{:5,.2f}'.format,
        'display.max_colwidth', None,
    ):
        print(f'Shape: {x.shape}')
        # `display` is the IPython/Jupyter builtin; it renders immediately,
        # so the temporary options are still in effect.
        display(x.head(max_row))

def print_full(x, max_row=None):
    """Plain-text print ``x`` with pandas' truncation limits lifted.

    Prints ``x.shape`` followed by ``x`` itself while the row, column, width
    and column-width limits are relaxed and floats are formatted as
    ``'{:5,.2f}'``.

    The options are applied through ``pd.option_context``, so whatever display
    settings were active before the call are restored afterwards -- also when
    printing raises -- instead of being reset to pandas' defaults.

    Parameters
    ----------
    x : object exposing ``.shape`` (e.g. a DataFrame or Series)
    max_row : int or None, default None
        Value for ``display.max_rows``; ``None`` removes the row cap.
    """
    with pd.option_context(
        'display.max_rows', max_row,
        'display.max_columns', None,
        'display.width', 2000,
        'display.float_format', '{:5,.2f}'.format,
        'display.max_colwidth', None,
    ):
        print(f'Shape: {x.shape}')
        print(x)


In [None]:
# Read the raw CSV (relative path, resolved against the working directory)
file_dir = "data_Tipo_Tejido_adjusted.csv"
df_raw = pd.read_csv(filepath_or_buffer=file_dir)
# Cell output: number of columns in the raw table
df_raw.shape[1]

106

### Creating Datasets

In [None]:
# Preprocess the dataset and split it into training and test sets

# Drop redundant columns (chosen from the correlation analysis)
cols_to_drop = [
    'original_glrlm_ShortRunEmphasis',
    'original_glszm_ZonePercentage',
    'original_glrlm_RunPercentage',
    'original_glszm_SizeZoneNonUniformityNormalized',
    'original_glrlm_RunLengthNonUniformityNormalized',
    'original_gldm_LargeDependenceEmphasis',
    'original_gldm_SmallDependenceEmphasis',
    'original_glcm_Idm',
    'original_glcm_Id',
    'original_glcm_InverseVariance',
    'original_gldm_DependenceNonUniformityNormalized',
    'original_gldm_DependenceVariance',
    'original_glcm_DifferenceEntropy',
    'original_firstorder_RobustMeanAbsoluteDeviation',
    'original_glrlm_HighGrayLevelRunEmphasis',
    'original_gldm_HighGrayLevelEmphasis',
    'original_glcm_Autocorrelation',
    'original_gldm_SmallDependenceHighGrayLevelEmphasis',
    'original_glrlm_GrayLevelVariance'
]
df = df_raw.drop(columns=cols_to_drop)  # 87 columns remain: 86 features and 1 target.

# Encode the categorical columns as integers.
# Each column is mapped explicitly and cast to int64 rather than relying on
# `DataFrame.replace` to downcast the result (deprecated in pandas). Values
# that already are one of the integer codes pass through unchanged; anything
# else is reported instead of silently surviving as a string.
category_codes = {
    'Mammography_Equipment': {"SIEMENS_Mammomat Inspiration": 0, "SIEMENS_MAMMOMAT Revelation": 1},
    'Tipo_Tejido': {'A': 0, 'B': 1, 'C': 2, 'D': 3},
}
for column, codes in category_codes.items():
    encoded = df[column].map(lambda value, codes=codes: codes.get(value, value))
    unexpected = encoded[~encoded.isin(list(codes.values()))]
    if not unexpected.empty:
        raise ValueError(
            f"Column {column!r} has values without an integer code: "
            f"{sorted(map(str, unexpected.unique()))}"
        )
    df[column] = encoded.astype('int64')

# Separate the features (X) from the target (y)
X = df.drop(columns=['Tipo_Tejido'])
y = df['Tipo_Tejido']

# Stratified 80/20 split keeps the class proportions of y in both partitions
X_train_raw, X_test, y_train_raw, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=seed)

# Oversample only the training partition; X_test / y_test are left untouched
smote = SMOTE(random_state=seed)
X_train, y_train = smote.fit_resample(X_train_raw, y_train_raw)

  df.replace({'Mammography_Equipment':{"SIEMENS_Mammomat Inspiration":0,"SIEMENS_MAMMOMAT Revelation":1}, "Tipo_Tejido":{'A':0, 'B':1,'C':2, 'D': 3}}, inplace=True)


### Training

In [None]:
# Candidate estimators and their hyper-parameter grids.
# Grid keys carry the `model__` prefix because each estimator is tuned as the
# pipeline step named 'model'.
# Estimators with internal randomness (tree building, bootstrap sampling, the
# internal cross-validation SVC runs when probability=True) receive the
# notebook seed so repeated runs give the same scores.
models = {
    'LogisticRegression': {
        'model': LogisticRegression(max_iter=1000),
        'params': {
            'model__C': [0.1, 1, 10],
            'model__penalty': ['l2'],
            'model__solver': ['lbfgs']
        }
    },
    'KNN': {
        'model': KNeighborsClassifier(),
        'params': {
            'model__n_neighbors': [3, 5, 7],
            'model__weights': ['uniform', 'distance']
        }
    },
    'DecisionTree': {
        'model': DecisionTreeClassifier(random_state=seed),
        'params': {
            'model__max_depth': [5, 10, 15],
            'model__criterion': ['gini', 'entropy']
        }
    },
    'RandomForest': {
        'model': RandomForestClassifier(random_state=seed),
        'params': {
            'model__n_estimators': [100, 200],
            'model__max_depth': [None, 10, 20],
            'model__min_samples_split': [2, 5]
        }
    },
    'SVM': {
        # probability=True is required for predict_proba (used for ROC AUC)
        'model': SVC(probability=True, random_state=seed),
        'params': {
            'model__C': [0.1, 1, 10],
            'model__kernel': ['linear', 'rbf']
        }
    }
}


In [None]:
# === 3. CROSS VALIDATION SETUP ===
# Fold assignment uses the notebook-wide seed rather than a second hard-coded value.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

# === 4. TRAINING AND COMPARISON ===
# results[name] -> best params, best CV score (macro F1) and held-out test metrics
results = {}

for name, cfg in models.items():
    print(f"🔍 Entrenando y buscando hiperparámetros para {name}...")

    # SMOTE is a pipeline step so that it is re-fitted on the training part of
    # every CV fold. Oversampling once before the search lets synthetic points
    # built from validation-fold samples into the training folds, which
    # inflates the CV score relative to the untouched test set.
    # The imblearn pipeline applies samplers during fit only, never at predict time.
    pipe = ImbPipeline([
        ('smote', SMOTE(random_state=seed)),
        ('scaler', StandardScaler()),
        ('model', cfg['model'])
    ])

    grid = GridSearchCV(
        estimator=pipe,
        param_grid=cfg['params'],
        scoring='f1_macro',  # alternatives: 'accuracy', 'roc_auc_ovr'
        cv=cv,
        n_jobs=-1,
        verbose=0
    )

    # Fit on the un-resampled training split; oversampling happens inside the pipeline.
    grid.fit(X_train_raw, y_train_raw)

    # best_estimator_ is the pipeline refitted on the whole training split
    best_model = grid.best_estimator_
    y_pred = best_model.predict(X_test)
    y_proba = best_model.predict_proba(X_test)

    results[name] = {
        'best_params': grid.best_params_,
        'cv_best_score': grid.best_score_,
        'test_accuracy': accuracy_score(y_test, y_pred),
        'test_f1_macro': f1_score(y_test, y_pred, average='macro'),
        'test_roc_auc_ovr': roc_auc_score(y_test, y_proba, multi_class='ovr')
    }

print("📊 COMPARACIÓN FINAL DE MODELOS:\n")
for name, res in results.items():
    print(f"Modelo: {name}")
    # The search optimises macro F1, so the CV figure is labelled as such.
    print(f"  - Mejor CV F1 Macro:     {res['cv_best_score']:.4f}")
    print(f"  - Test Accuracy:         {res['test_accuracy']:.4f}")
    print(f"  - Test F1 Macro:         {res['test_f1_macro']:.4f}")
    print(f"  - Test ROC AUC (OVR):    {res['test_roc_auc_ovr']:.4f}")
    print(f"  - Mejores Hiperparámetros: {res['best_params']}")
    print("--------------------------------------------------")


🔍 Entrenando y buscando hiperparámetros para LogisticRegression...
🔍 Entrenando y buscando hiperparámetros para KNN...
🔍 Entrenando y buscando hiperparámetros para DecisionTree...
🔍 Entrenando y buscando hiperparámetros para RandomForest...
🔍 Entrenando y buscando hiperparámetros para SVM...
📊 COMPARACIÓN FINAL DE MODELOS:

Modelo: LogisticRegression
  - Mejor CV Accuracy:     0.7786
  - Test Accuracy:         0.6611
  - Test F1 Macro:         0.6521
  - Test ROC AUC (OVR):    0.8851
  - Mejores Hiperparámetros: {'model__C': 10, 'model__penalty': 'l2', 'model__solver': 'lbfgs'}
--------------------------------------------------
Modelo: KNN
  - Mejor CV Accuracy:     0.8308
  - Test Accuracy:         0.6421
  - Test F1 Macro:         0.6102
  - Test ROC AUC (OVR):    0.8388
  - Mejores Hiperparámetros: {'model__n_neighbors': 3, 'model__weights': 'distance'}
--------------------------------------------------
Modelo: DecisionTree
  - Mejor CV Accuracy:     0.7652
  - Test Accuracy:       

In [None]:
# `y_pred` is whatever the training loop assigned last, i.e. the predictions of
# the final entry in `models`; print that name so the report is attributable.
reported_model = list(models)[-1]
print(f"Classification report ({reported_model})\n")

# `labels` pins each row to its encoded class (A=0 ... D=3) so `target_names`
# cannot drift out of alignment if a class is absent from y_test / y_pred.
print(classification_report(y_test, y_pred, labels=[0, 1, 2, 3], target_names=['A', 'B', 'C', 'D']))


              precision    recall  f1-score   support

           A       0.39      0.57      0.46        47
           B       0.78      0.69      0.73       262
           C       0.67      0.70      0.68       129
           D       0.68      0.70      0.69        37

    accuracy                           0.68       475
   macro avg       0.63      0.67      0.64       475
weighted avg       0.70      0.68      0.69       475



### Testing