### ANÁLISIS PREDICTIVO QUIMIO

In [259]:
import pandas as pd 
import numpy as np
import tensorflow as tf 
from tensorflow.keras import layers
from tensorflow.keras import models
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score

En este notebook, se abordará un análisis predictivo que se centra en predecir si el tratamiento de quimioterapia dado tiene efecto positivo sobre el paciente o no. Para ello, emplearemos un modelo de regresión logística y una red neuronal

In [260]:
df = pd.read_csv('df_final.csv')
df = df.drop('Unnamed: 0',axis=1)
df

Unnamed: 0,ehr,birth_date,diagnosis_date,death_date,age,er,her2,ki67,pr,pregnancy,...,m_category,t_category_after_neoadj,n_category_after_neoadj,m_category_after_neoadj,stage_diagnosis,stage_after_neo,grade,ductal,lobular,neoadjuvant
0,10011773,05-07-1959,02-04-2015,,65,0.0,0.0,19.0,1.0,0.0,...,0.0,SIN TRATAMIENTO,SIN TRATAMIENTO,SIN TRATAMIENTO,0,SIN TRATAMIENTO,1.0,0.0,0.0,0.0
1,10020495,02-10-1953,04-12-2017,,71,1.0,0.0,9.0,0.0,0.0,...,0.0,SIN TRATAMIENTO,SIN TRATAMIENTO,SIN TRATAMIENTO,IA,SIN TRATAMIENTO,2.0,1.0,0.0,0.0
2,10030299,16-08-1966,27-06-2019,,58,1.0,0.0,18.0,1.0,3.0,...,0.0,SIN TRATAMIENTO,SIN TRATAMIENTO,SIN TRATAMIENTO,IA,SIN TRATAMIENTO,1.0,1.0,0.0,0.0
3,10030824,03-03-1953,07-09-2018,,71,1.0,0.0,18.0,1.0,0.0,...,0.0,2,2.0,0.0,IIIA,IIIA,3.0,0.0,1.0,1.0
4,10041592,06-07-1959,19-11-2018,,65,0.0,0.0,65.0,0.0,0.0,...,0.0,4,0.0,0.0,IB,IA,2.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11162,99343880,23-08-1944,28-05-2018,,80,1.0,0.0,68.0,1.0,2.0,...,1.0,SIN TRATAMIENTO,SIN TRATAMIENTO,SIN TRATAMIENTO,IA,SIN TRATAMIENTO,2.0,0.0,0.0,0.0
11163,99443402,17-06-1959,21-01-2018,,65,1.0,0.0,22.0,0.0,0.0,...,1.0,SIN TRATAMIENTO,SIN TRATAMIENTO,SIN TRATAMIENTO,IIB,SIN TRATAMIENTO,3.0,0.0,1.0,0.0
11164,99656792,26-03-1976,23-07-2017,,48,1.0,1.0,49.0,0.0,0.0,...,0.0,4,0.0,0,IA,SIN TRATAMIENTO,1.0,0.0,0.0,1.0
11165,99690760,21-07-1961,28-10-2013,,63,1.0,0.0,20.0,1.0,1.0,...,0.0,SIN TRATAMIENTO,SIN TRATAMIENTO,SIN TRATAMIENTO,IA,SIN TRATAMIENTO,1.0,1.0,0.0,0.0


Para este análisis predictivo, las variables de fecha de nacimiento,ehr,fecha de muerte no son relevantes para la predicción, así como las variables de t,n,m, ya que vienen dadas conjuntamente en la variable stage diagnosis, por lo que las eliminamos:

In [263]:
datos_modelo = df.copy()
datos_modelo = datos_modelo.drop(['ehr','death_date','diagnosis_date','t_category','n_category','m_category','t_category_after_neoadj',
       'n_category_after_neoadj', 'm_category_after_neoadj'],axis=1)
datos_modelo.columns

Index(['birth_date', 'age', 'er', 'her2', 'ki67', 'pr', 'pregnancy', 'birth',
       'caesarean', 'abort', 'menarche_age', 'menopause_age', 'n_tumor',
       'stage_diagnosis', 'stage_after_neo', 'grade', 'ductal', 'lobular',
       'neoadjuvant'],
      dtype='object')

In [264]:
# fechas = datos_modelo['birth_date'].values
# edades = fechas.copy()
# for i in range(len(fechas)):
#     edades[i] = int(2024 - int(datos_modelo['birth_date'][i][6:]))
# datos_modelo['age'] = edades
# datos_modelo['age'].astype('int')
# datos_modelo = datos_modelo.drop('birth_date',axis=1)
# datos_modelo.dtypes

age                 object
er                 float64
her2               float64
ki67               float64
pr                 float64
pregnancy          float64
birth              float64
caesarean          float64
abort              float64
menarche_age       float64
menopause_age      float64
n_tumor            float64
stage_diagnosis     object
stage_after_neo     object
grade              float64
ductal             float64
lobular            float64
neoadjuvant        float64
dtype: object

A continuación, vamos a realizar una serie de transformaciones a nuestros datos paara que sean adecuados para la entrada de nuestros modelos. Estas transformaciones abarcan desde transformación de tipos hasta mapeados de strings a números mediante técnicas de One-Hot encoding:

In [265]:
datos_modelo['menarche_age'] = datos_modelo['menarche_age'].astype('int')
datos_modelo.dtypes

age                 object
er                 float64
her2               float64
ki67               float64
pr                 float64
pregnancy          float64
birth              float64
caesarean          float64
abort              float64
menarche_age         int32
menopause_age      float64
n_tumor            float64
stage_diagnosis     object
stage_after_neo     object
grade              float64
ductal             float64
lobular            float64
neoadjuvant        float64
dtype: object

In [266]:
pacientes_no_quimio = datos_modelo[datos_modelo['neoadjuvant']==0]
pacientes_no_quimio

Unnamed: 0,age,er,her2,ki67,pr,pregnancy,birth,caesarean,abort,menarche_age,menopause_age,n_tumor,stage_diagnosis,stage_after_neo,grade,ductal,lobular,neoadjuvant
0,65,0.0,0.0,19.0,1.0,0.0,0.0,0.0,0.0,13,49.0,1.0,0,SIN TRATAMIENTO,1.0,0.0,0.0,0.0
1,71,1.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0,13,49.0,1.0,IA,SIN TRATAMIENTO,2.0,1.0,0.0,0.0
2,58,1.0,0.0,18.0,1.0,3.0,3.0,0.0,0.0,13,49.0,1.0,IA,SIN TRATAMIENTO,1.0,1.0,0.0,0.0
5,67,1.0,0.0,18.0,1.0,2.0,1.0,0.0,1.0,13,49.0,1.0,IV,SIN TRATAMIENTO,3.0,1.0,0.0,0.0
11,67,1.0,0.0,14.0,0.0,2.0,0.0,0.0,2.0,18,44.0,1.0,IA,SIN TRATAMIENTO,3.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11158,67,1.0,0.0,29.0,0.0,1.0,1.0,0.0,0.0,13,49.0,2.0,IA,SIN TRATAMIENTO,3.0,1.0,0.0,0.0
11160,66,1.0,0.0,52.0,1.0,0.0,0.0,0.0,0.0,13,49.0,2.0,IIA,SIN TRATAMIENTO,1.0,0.0,0.0,0.0
11162,80,1.0,0.0,68.0,1.0,2.0,2.0,0.0,0.0,16,47.0,2.0,IA,SIN TRATAMIENTO,2.0,0.0,0.0,0.0
11163,65,1.0,0.0,22.0,0.0,0.0,0.0,0.0,0.0,13,49.0,2.0,IIB,SIN TRATAMIENTO,3.0,0.0,1.0,0.0


In [267]:
datos_modelo = datos_modelo.replace("SIN TRATAMIENTO",np.nan)
datos_modelo = datos_modelo.dropna()
datos_modelo.dtypes

age                  int64
er                 float64
her2               float64
ki67               float64
pr                 float64
pregnancy          float64
birth              float64
caesarean          float64
abort              float64
menarche_age         int32
menopause_age      float64
n_tumor            float64
stage_diagnosis     object
stage_after_neo     object
grade              float64
ductal             float64
lobular            float64
neoadjuvant        float64
dtype: object

In [268]:
pacientes_quimio = datos_modelo[datos_modelo['neoadjuvant']==1]
pacientes_quimio.iloc[1,:].values

array([65, 0.0, 0.0, 65.0, 0.0, 0.0, 0.0, 0.0, 0.0, 13, 49.0, 1.0, 'IB',
       'IA', 2.0, 1.0, 0.0, 1.0], dtype=object)

In [269]:
dic_estadios = {'0':0,'IA':1,'IB':2,'IIA':3,'IIB':4,'IIIA':5,'IIIB':6,'IIIC':7,'IV':8}
pacientes_quimio = pacientes_quimio.replace(dic_estadios)
target = pacientes_quimio['stage_after_neo'].values
pacientes_quimio = pacientes_quimio.drop('stage_after_neo',axis=1)
x_train = pacientes_quimio.values[:5000]
x_test = pacientes_quimio.values[5000:]

In [283]:
y_train = np.zeros((5000,9))
for i in range(5000):
    y_train[i][target[i]] = 1
y_test = target[5000:]
y_train

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       ...,
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 1., 0., ..., 0., 0., 0.]])

Una vez que nuestros datos están listos, procedemos a la normalización de los datos y la generación de los modelos:

In [271]:
scaler = StandardScaler()
datos_normalizados = scaler.fit_transform(x_train)

In [272]:
regresion = LogisticRegression()
red_neuronal = models.Sequential([
    layers.Input(shape=(17)),
    layers.Dense(32,activation='relu'),
    layers.Dropout(rate=0.25),
    layers.Dense(128,activation='relu'),
    layers.Dense(9,activation='sigmoid')
])

In [279]:
regresion.fit(x_train,target[:5000])
predicciones_reg = regresion.predict(x_test)
precision = accuracy_score(y_test, predicciones_reg)
print(f'Precision regresion: {precision}')

Precision regresion: 0.34229390681003585


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [284]:
red_neuronal.compile(optimizer="rmsprop", loss="categorical_crossentropy", metrics=["accuracy"])
red_neuronal.fit(x_train, y_train, epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x1688fb57fd0>

In [285]:
preds = red_neuronal.predict(x_test)



In [286]:
y_pred = np.zeros(len(preds))
for i in range(len(preds)):
    y_pred[i] = np.argmax(preds[i])

array([1., 1., 5., 1., 5., 1., 1., 5., 1., 1., 1., 1., 1., 1., 1., 1., 8.,
       1., 1., 1., 1., 1., 1., 1., 1., 5., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 5., 8., 8., 1., 1., 1., 1., 1., 1., 5., 1., 8., 1., 1., 1., 1.,
       1., 1., 5., 1., 1., 1., 1., 1., 5., 1., 1., 5., 1., 1., 5., 1., 1.,
       1., 1., 1., 1., 1., 5., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 5., 1., 1., 5., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 5.,
       1., 1., 5., 1., 1., 1., 1., 1., 8., 1., 1., 5., 5., 1., 1., 1., 5.,
       5., 5., 1., 5., 1., 1., 1., 5., 1., 1., 1., 5., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 5., 1., 5., 5., 5., 1., 1., 1., 8., 5.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 5., 5., 1., 1., 8., 1., 1., 5., 1., 1., 1., 1., 5., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 5., 1., 1., 1., 5., 1., 5., 1.,
       1., 8., 1., 1., 1., 1., 1., 1., 1., 5., 5., 1., 8., 5., 1., 5., 1.,
       5., 1., 1., 1., 1.

In [288]:
aciertos = 0
for i in range(len(preds)):
    if y_test[i] == y_pred[i]:
        aciertos += 1
accuracy = (aciertos/(len(y_test)))*100
print(f'Accuracy del modelo: {accuracy}%')

Accuracy del modelo: 37.27598566308244%


Tras realizar varias pruebas, no obtenemos muy buenos resultados con ninguno de los modelos. Esto puede deberse a una baja calidad del dato o a la poca cantidad de instancias que tenemos (5558), ya que únicamente podemos utilizar aquellos pacientes que se hayan administrado la quimioterapia y de los que tengamos datos de su evolución para entrenar el modelo, por lo que nuestra muestar se reduce a la mitad. En la red neuronal, este problema causa un overfitting.