In [212]:
from sklearn.datasets import fetch_openml
import pandas as pd
from sklearn.model_selection import train_test_split
import math
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE

In [213]:
X, y = fetch_openml(
    data_id=1104,          # ID de Leukemia
    return_X_y=True,
    as_frame=True
)

print("shape Características :", X.shape)
print("shape Etiquetas       :", y.shape)

print("Tipo X :", type(X))
print("Tipo y :", type(y))

shape Características : (72, 7129)
shape Etiquetas       : (72,)
Tipo X : <class 'pandas.core.frame.DataFrame'>
Tipo y : <class 'pandas.core.series.Series'>


In [214]:
X.head()

Unnamed: 0,AFFX-BioB-5_at,AFFX-BioB-M_at,AFFX-BioB-3_at,AFFX-BioC-5_at,AFFX-BioC-3_at,AFFX-BioDn-5_at,AFFX-BioDn-3_at,AFFX-CreX-5_at,AFFX-CreX-3_at,AFFX-BioB-5_st,...,U48730_at,U58516_at,U73738_at,X06956_at,X16699_at,X83863_at,Z17240_at,L49218_f_at,M71243_f_at,Z78285_f_at
0,-214,-153,-58,88,-295,-558,199,-176,252,206,...,185,511,-125,389,-37,793,329,36,191,-37
1,-139,-73,-1,283,-264,-400,-330,-168,101,74,...,169,837,-36,442,-17,782,295,11,76,-14
2,-76,-49,-307,309,-376,-650,33,-367,206,-215,...,315,1199,33,168,52,1138,777,41,228,-41
3,-135,-114,265,12,-419,-585,158,-253,49,31,...,240,835,218,174,-110,627,170,-50,126,-91
4,-106,-125,-76,168,-230,-284,4,-122,70,252,...,156,649,57,504,-26,250,314,14,56,-25


### Mapear etiquetas

In [215]:
y = y.map({'AML': 0, 'ALL': 1})
y.head()

0    1
1    1
2    1
3    1
4    1
Name: CLASS, dtype: category
Categories (2, int64): [1, 0]

### Normalizar las Características

In [216]:
escaler = MinMaxScaler()
X_scaled = pd.DataFrame(escaler.fit_transform(X), columns=X.columns)

In [217]:
X_scaled.head()

Unnamed: 0,AFFX-BioB-5_at,AFFX-BioB-M_at,AFFX-BioB-3_at,AFFX-BioC-5_at,AFFX-BioC-3_at,AFFX-BioDn-5_at,AFFX-BioDn-3_at,AFFX-CreX-5_at,AFFX-CreX-3_at,AFFX-BioB-5_st,...,U48730_at,U58516_at,U73738_at,X06956_at,X16699_at,X83863_at,Z17240_at,L49218_f_at,M71243_f_at,Z78285_f_at
0,0.466192,0.72973,0.487535,0.265525,0.375573,0.366279,0.708071,0.634956,0.928074,0.500743,...,0.515924,0.268886,0.331384,0.161897,0.672794,0.322954,0.322328,0.754658,0.079423,0.570896
1,0.599644,0.88417,0.566482,0.683084,0.422901,0.59593,0.405266,0.652655,0.577726,0.402675,...,0.481953,0.477593,0.504873,0.182673,0.746324,0.31806,0.292789,0.677019,0.044158,0.656716
2,0.711744,0.930502,0.142659,0.738758,0.251908,0.232558,0.613051,0.212389,0.821346,0.187964,...,0.791932,0.709347,0.639376,0.075265,1.0,0.476423,0.711555,0.770186,0.09077,0.55597
3,0.606762,0.805019,0.934903,0.102784,0.18626,0.327035,0.684602,0.464602,0.457077,0.370728,...,0.632696,0.476312,1.0,0.077617,0.404412,0.24911,0.184188,0.487578,0.059491,0.369403
4,0.658363,0.783784,0.462604,0.436831,0.474809,0.764535,0.596451,0.754425,0.5058,0.534918,...,0.454352,0.357234,0.68616,0.206978,0.713235,0.081406,0.309296,0.686335,0.038025,0.615672


### Partición fija 80 20 para el df

In [218]:
df = pd.concat([X_scaled, y], axis = 1)
df = df.sample(frac = 1, random_state=42).reset_index(drop = True)
df.head()   

Unnamed: 0,AFFX-BioB-5_at,AFFX-BioB-M_at,AFFX-BioB-3_at,AFFX-BioC-5_at,AFFX-BioC-3_at,AFFX-BioDn-5_at,AFFX-BioDn-3_at,AFFX-CreX-5_at,AFFX-CreX-3_at,AFFX-BioB-5_st,...,U58516_at,U73738_at,X06956_at,X16699_at,X83863_at,Z17240_at,L49218_f_at,M71243_f_at,Z78285_f_at,CLASS
0,0.658363,0.783784,0.462604,0.436831,0.474809,0.764535,0.596451,0.754425,0.5058,0.534918,...,0.357234,0.68616,0.206978,0.713235,0.081406,0.309296,0.686335,0.038025,0.615672,1
1,0.560498,0.610039,0.504155,0.389722,0.563359,0.311047,0.524327,0.269912,0.740139,0.238484,...,0.350832,0.522417,0.224226,0.665441,0.330071,0.42311,0.636646,0.08525,0.768657,0
2,0.768683,0.926641,0.706371,0.520343,0.603053,0.856105,0.641671,0.586283,0.422274,0.332838,...,0.195903,0.481481,0.769502,0.742647,0.20952,0.114683,0.487578,0.122968,0.619403,1
3,0.466192,0.72973,0.487535,0.265525,0.375573,0.366279,0.708071,0.634956,0.928074,0.500743,...,0.268886,0.331384,0.161897,0.672794,0.322954,0.322328,0.754658,0.079423,0.570896,1
4,0.873665,0.805019,0.570637,0.490364,0.748092,0.952035,0.610761,0.792035,0.440835,0.736999,...,0.256722,0.680312,0.117993,0.761029,0.094306,0.080799,0.661491,0.782582,0.701493,0


In [201]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42)

print("shape X_train :", X_train.shape)
print("shape y_train :", y_train.shape)    
print("shape X_test  :", X_test.shape)
print("shape y_test  :", y_test.shape)    


shape X_train : (57, 7129)
shape y_train : (57,)
shape X_test  : (15, 7129)
shape y_test  : (15,)


In [219]:
print("shape y_train :", y_train.value_counts())
print('')
print("shape y_test  :", y_test.value_counts())

shape y_train : CLASS
1    39
0    39
Name: count, dtype: int64

shape y_test  : CLASS
1    10
0     5
Name: count, dtype: int64


In [222]:
print(f'IR entrenamiento : {y_train.value_counts()[0] / y_train.value_counts()[1]:.2f}')
print(f'IR prueba        : {y_test.value_counts()[0] / y_test.value_counts()[1]:.2f}')

IR entrenamiento : 1.00
IR prueba        : 0.50


### Crear df_train y df_test

In [224]:
df_train = pd.concat([X_train, y_train], axis=1)
df_test = pd.concat([X_test, y_test], axis=1)

### Balancear el df de entrenamiento

* Este dataset no es necesario

In [225]:
df_train['CLASS'].value_counts()

CLASS
1    39
0    39
Name: count, dtype: int64

In [226]:
X_train = df_train.drop(columns = ['CLASS'])
y_train = df_train['CLASS']

smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

print(f'shape antes de SMOTE: {X_train.shape}')
print(f'shape despueés de SMOTE: {X_train_smote.shape}')

y_train_smote.value_counts()

shape antes de SMOTE: (78, 7129)
shape despueés de SMOTE: (78, 7129)


CLASS
1    39
0    39
Name: count, dtype: int64

In [227]:
df_train = pd.concat([X_train_smote, y_train_smote], axis = 1).reset_index(drop = True)

* Ya se tiene un conjunto de entrenamiento y otro de pruebas
* Ambos están normalizados con MaxMin
* Para el conjunto de pruebas se balancearon las clases con un SMOTE

- Ya se puede entrenar modelos con el conjunto de entrenamiento
- El conjunto desbalanceado de pruebas ya se puede usar para evaluación al final

In [228]:
df_train.to_csv('../data/df_train.csv', index=False)
df_test.to_csv('../data/df_test.csv', index=False)