### Seminario Estadística para Radiología
#### Parte 2

---


* Héctor Henríquez MD, MS

In [None]:
### Libraries
import numpy as np
import pandas as pd
# Passing None removes pandas' row/column display limit, so frames are
# rendered in full instead of being truncated.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
import requests
from collections import Counter

# Visualization
from plotnine import*
import matplotlib.pyplot as plt
import seaborn as sns

# Data modeling
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

## System
import warnings
# Hide FutureWarning messages so they do not clutter the cell outputs.
warnings.filterwarnings("ignore", category = FutureWarning)

In [3]:

def plot_confusion_matrix(y_true, y_pred, class_names=None, normalize=None, title='Confusion Matrix', cmap='Blues'):
    """Draw the confusion matrix of a classifier as an annotated heatmap.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels.
    class_names : sequence, optional
        Forwarded to ``confusion_matrix`` as ``labels`` and used as the tick
        labels of both axes. When omitted, the tick labels are the sorted
        unique values found in ``y_true`` and ``y_pred``.
    normalize : optional
        Forwarded unchanged to ``confusion_matrix``. Any truthy value also
        switches the cell annotations from integers to two decimals.
    title : str
        Figure title.
    cmap : str
        Colormap passed to ``seaborn.heatmap``.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    # The matrix is computed with the caller-supplied labels (possibly None).
    matrix = confusion_matrix(y_true, y_pred, labels=class_names, normalize=normalize)

    # Fall back to the classes actually present when no names were given.
    tick_labels = class_names
    if tick_labels is None:
        tick_labels = np.unique(np.concatenate((y_true, y_pred)))

    # Normalized matrices hold fractions; raw matrices hold integer counts.
    if normalize:
        cell_format = '.2f'
    else:
        cell_format = 'd'

    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(
        matrix,
        annot=True,
        fmt=cell_format,
        cmap=cmap,
        xticklabels=tick_labels,
        yticklabels=tick_labels,
        ax=ax,
    )
    ax.set_xlabel('Predicted Labels', fontsize=12)
    ax.set_ylabel('True Labels', fontsize=12)
    ax.set_title(title, fontsize=14)
    fig.tight_layout()
    plt.show()

In [None]:
# Read the Excel workbook hosted in the course's GitHub repository.
data_url = 'https://raw.githubusercontent.com/HectorHenriquez/Curso_Investigacion/main/datasets/COLORECTAL_LIVER_METS.xlsx'
data = pd.read_excel(io=data_url)
print("Dimensiones set de datos:", data.shape)
# Preview the first rows (head() defaults to 5).
data.head()

In [None]:
## How many patients presented hepatic progression or recurrence?
Counter(data['progression_or_recurrence_liveronly'])

In [None]:
# Display the column index of the loaded frame; the feature list defined
# below is chosen from these names.
data.columns

In [None]:
## Variable selection for the model

# Predictor columns, one per line, in the order they are fed to the model.
variables_X = [
    'age',
    'sex',
    'major_comorbidity',
    'body_mass_index',
    'node_positive_primary',
    'synchronous_crlm',
    'multiple_metastases',
    'clinrisk_score',
    'clinrisk_stratified',
    'carcinoembryonic_antigen',
    'max_tumor_size',
    'bilobar_disease',
    'extrahep_disease',
    'chemo_before_liver_resection',
    'preoperative_pve',
    'steatosis_yesno',
    'presence_sinusoidal_dilata',
    'NASH_score',
    'total_response_percent',
    'necrosis_percent',
    'fibrosis_percent',
]

# Outcome column the model is trained to predict.
target = 'progression_or_recurrence_liveronly'

In [None]:
# Predictor frame and outcome series, taken from the selected columns.
X = data[variables_X]
y = data[target]
X.head(5)

In [None]:
## Train/test split
# stratify=y keeps the outcome class ratio the same in the train and test
# partitions. Without it, a random split of a modest-sized cohort can leave
# the two partitions with noticeably different event rates, which distorts
# the metrics computed on the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42, stratify=y
)

# Sanity check: row counts of X and y must match within each partition.
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

In [None]:
### Feature scaling
# The scaler is fitted on the training partition only; the same fitted
# scaler is then applied to both partitions.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:
### Fit the model

# Logistic regression with scikit-learn's default settings, trained on the
# scaled training features.
model = LogisticRegression()
model.fit(X=X_train_scaled, y=y_train)


In [None]:
# Per-class predicted probabilities for the test partition (one column per class).
y_pred = model.predict_proba(X=X_test_scaled)
# Peek at the first ten rows.
y_pred[:10]

In [None]:
# Decision threshold applied to the second probability column.
umbral = 0.45
# Rows strictly above the threshold become 1, all others 0.
y_pred_bin = (y_pred[:, 1] > umbral).astype(int)

In [None]:
# Text report of the thresholded predictions against the test labels.
print(classification_report(y_true=y_test, y_pred=y_pred_bin))

In [None]:
# Heatmap of the confusion matrix for the thresholded test predictions.
plot_confusion_matrix(y_true=y_test, y_pred=y_pred_bin)