# Actividad de práctica Barin Tumor

**Gonzalo Cano Padilla**

Pasos para resolverlo:
* Primero cargar los datos
* Luego separar X y y
* Transformar las variables categoricas
* Definir el modelo SVC
* Hacer el pipeline
* Optimización bayesiana
*

In [1]:
# Librerias
import pandas as pd
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score, make_scorer
from sklearn.preprocessing import StandardScaler

In [2]:
# Cargar datos
datos = pd.read_csv('brain_tumor_dataset.csv', index_col=0)
datos.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20000 entries, 1 to 20000
Data columns (total 18 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Age                  20000 non-null  int64  
 1   Gender               20000 non-null  object 
 2   Tumor_Type           20000 non-null  object 
 3   Tumor_Size           20000 non-null  float64
 4   Location             20000 non-null  object 
 5   Histology            20000 non-null  object 
 6   Stage                20000 non-null  object 
 7   Symptom_1            20000 non-null  object 
 8   Symptom_2            20000 non-null  object 
 9   Symptom_3            20000 non-null  object 
 10  Radiation_Treatment  20000 non-null  object 
 11  Surgery_Performed    20000 non-null  object 
 12  Chemotherapy         20000 non-null  object 
 13  Survival_Rate        20000 non-null  float64
 14  Tumor_Growth_Rate    20000 non-null  float64
 15  Family_History       20000 non-null  obje

In [3]:
datos.head()

Unnamed: 0_level_0,Age,Gender,Tumor_Type,Tumor_Size,Location,Histology,Stage,Symptom_1,Symptom_2,Symptom_3,Radiation_Treatment,Surgery_Performed,Chemotherapy,Survival_Rate,Tumor_Growth_Rate,Family_History,MRI_Result,Follow_Up_Required
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1,73,Male,Malignant,5.375612,Temporal,Astrocytoma,III,Vision Issues,Seizures,Seizures,No,No,No,51.312579,0.111876,No,Positive,Yes
2,26,Male,Benign,4.847098,Parietal,Glioblastoma,II,Headache,Headache,Nausea,Yes,Yes,Yes,46.373273,2.165736,Yes,Positive,Yes
3,31,Male,Benign,5.588391,Parietal,Meningioma,I,Vision Issues,Headache,Seizures,No,No,No,47.072221,1.884228,No,Negative,No
4,29,Male,Malignant,1.4366,Temporal,Medulloblastoma,IV,Vision Issues,Seizures,Headache,Yes,No,Yes,51.853634,1.283342,Yes,Negative,No
5,54,Female,Benign,2.417506,Parietal,Glioblastoma,I,Headache,Headache,Seizures,No,No,Yes,54.708987,2.069477,No,Positive,Yes


In [4]:
# Separar X y y
y = datos['Tumor_Type']
X = datos.drop(columns=['Tumor_Type'])

In [5]:
#Sacar el tipo de columnas
num_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
cat_features = X.select_dtypes(include=['object']).columns.tolist()

# Preprocesamiento
numerical_transformer = StandardScaler()
preprocess = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features),
        ('num', numerical_transformer, num_features)
    ]
)

In [6]:
# Definir el SVC y el pipeline
svc = SVC(kernel='linear', probability=True, random_state=42)

pipeline = Pipeline(steps=[
    ('preprocessor', preprocess),
    ('SVC', svc)
])

In [8]:
# Cross validation y AUC
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

auc = make_scorer(roc_auc_score, needs_proba=True, greater_is_better=True)

# Probar
scores = cross_val_score(pipeline, X, y, scoring=auc, cv=cv, n_jobs=-1)

print("AUC por fold:", scores)
print("AUC promedio:", scores.mean())



AUC por fold: [0.5        0.5        0.5004955  0.50361153 0.50981659 0.5
 0.50555055 0.49288394 0.49633297 0.5       ]
AUC promedio: 0.5008691078219705


In [9]:
# Definir función objetivo
def f_obj(logC):
    valor_C = 10**logC
    svc = SVC(kernel='linear', probability=True, random_state=42, C=valor_C)
    pipeline = Pipeline(steps=[
    ('preprocessor', preprocess),
    ('SVC', svc)
])
    scores = cross_val_score(pipeline, X, y, scoring=auc, cv=cv, n_jobs=-1)

    return scores.mean()


In [10]:
%pip install bayesian_optimization

Collecting bayesian_optimization
  Downloading bayesian_optimization-3.1.0-py3-none-any.whl.metadata (11 kB)
Downloading bayesian_optimization-3.1.0-py3-none-any.whl (36 kB)
Installing collected packages: bayesian_optimization
Successfully installed bayesian_optimization-3.1.0
Note: you may need to restart the kernel to use updated packages.


In [None]:
from bayes_opt import BayesianOptimization
# Optimización bayesiana
optimizer = BayesianOptimization(
    f=f_obj,           # función objetivo
    pbounds={'logC': (-4, 2)},  # rango de búsqueda
    random_state=42,
    verbose=2
)

optimizer.maximize(init_points=3, n_iter=7)

print("Mejor resultado encontrado:")
print(optimizer.max)

|   iter    |  target   |   logC    |
-------------------------------------
