# Replicar el resultado de PyCaret usando numpy como funciones personalizadas para el preprocesamiento
Para mejorar las métricas de producción y optimizar las pruebas de estrés a la API, se realizó el mismo procesamiento de datos que se mencionó anteriormente, pero en lugar de utilizar las librerías Pandas, Scikit-learn y Pycaret, se utilizó la librería NumPy debido a su mejor rendimiento en producción.

NumPy es una librería de Python que se utiliza para realizar cálculos numéricos en grandes conjuntos de datos y que está optimizada para una mayor velocidad de procesamiento. Debido a su alta velocidad y eficiencia en el procesamiento de grandes conjuntos de datos, se utiliza comúnmente en aplicaciones de producción y para el procesamiento de datos en tiempo real.

Al utilizar NumPy en lugar de Pandas, Scikit-learn, se logró mejorar el rendimiento y la velocidad de procesamiento del código, lo que es especialmente útil en la aplicación de producción donde el tiempo de respuesta y la eficiencia son factores críticos.

In [244]:
import pandas as pd
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import GradientBoostingClassifier
import numpy as np

In [224]:
X_train= pd.read_csv('../data/processed/checkpoints/X_train_check1.csv', index_col=[0])
X_test = pd.read_csv('../data/processed/checkpoints/X_test_check1.csv', index_col=[0])
y_train = pd.read_csv('../data/processed/checkpoints/y_train_check1.csv', index_col=[0])
y_test = pd.read_csv('../data/processed/checkpoints/y_testcheck1.csv', index_col=[0])

In [225]:
train = pd.concat([X_train, y_train],1)
test = pd.concat([X_test, y_test],1)

  train = pd.concat([X_train, y_train],1)
  test = pd.concat([X_test, y_test],1)


In [226]:
train.shape, test.shape

((47744, 16), (20462, 16))

In [227]:
# esto tiene que estar en la seccion del enrichment - Cambiarlo desp
train.rename(columns={'km-ovsd':'km_ovsd'}, inplace=True)
test.rename(columns={'km-ovsd':'km_ovsd'}, inplace=True)

In [228]:
train.select_dtypes(exclude='object').columns

Index(['dia', 'mes', 'km_ovsd', 't_media', 'v_media_viento', 'presion_media',
       'cantidad_de_lluvia_mm', 'nubosidad_perc', 'temporada_alta',
       'atraso_15'],
      dtype='object')

In [229]:
# X_train['km_ovsd'] = X_train['km_ovsd'].astype('int')
# X_test['km_ovsd'] = X_test['km_ovsd'].astype('int')

In [230]:
vars_ok = ['dia', 'mes', 'km_ovsd', 't_media', 'v_media_viento', 'presion_media',
       'cantidad_de_lluvia_mm', 'nubosidad_perc', 'temporada_alta', 'atraso_15'] 
train = train[vars_ok].copy()
test = test[vars_ok].copy()

In [231]:
train.head(2)

Unnamed: 0,dia,mes,km_ovsd,t_media,v_media_viento,presion_media,cantidad_de_lluvia_mm,nubosidad_perc,temporada_alta,atraso_15
3762,16,9,1086.84,21.0,6.1,1015.5,13.7,53.5,1,0
64209,9,12,2595.13,,,,1.5,27.5,0,0


In [232]:
type(train)

pandas.core.frame.DataFrame

In [233]:
# pasamos todo a float en train
for col in train.columns:
    train[col] = train[col].astype('float')
train.dtypes

dia                      float64
mes                      float64
km_ovsd                  float64
t_media                  float64
v_media_viento           float64
presion_media            float64
cantidad_de_lluvia_mm    float64
nubosidad_perc           float64
temporada_alta           float64
atraso_15                float64
dtype: object

In [234]:
# pasamos todo a float en test
for col in test.columns:
    test[col] = test[col].astype('float')
test.dtypes

dia                      float64
mes                      float64
km_ovsd                  float64
t_media                  float64
v_media_viento           float64
presion_media            float64
cantidad_de_lluvia_mm    float64
nubosidad_perc           float64
temporada_alta           float64
atraso_15                float64
dtype: object

In [235]:
test

Unnamed: 0,dia,mes,km_ovsd,t_media,v_media_viento,presion_media,cantidad_de_lluvia_mm,nubosidad_perc,temporada_alta,atraso_15
27320,22.0,5.0,440.52,11.0,3.0,1016.2,38.2,52.4,0.0,0.0
57460,3.0,5.0,2194.86,18.0,4.3,1016.7,38.2,52.4,0.0,1.0
54916,10.0,8.0,918.82,10.0,5.8,1016.6,21.8,57.6,0.0,0.0
34252,13.0,1.0,1468.94,30.0,9.5,1014.2,0.0,17.6,1.0,1.0
13524,26.0,4.0,1124.93,15.0,4.5,1020.4,3.9,27.4,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
52356,21.0,1.0,918.82,28.0,7.7,1013.9,0.0,17.6,1.0,0.0
47513,31.0,1.0,172.47,27.0,8.5,1014.1,0.0,17.6,1.0,0.0
21109,21.0,8.0,1229.56,18.0,5.9,1014.8,21.8,57.6,0.0,0.0
45425,24.0,11.0,2466.62,,,,2.0,29.7,0.0,0.0


In [236]:
# X & y
X_train = train.drop('atraso_15',1).copy()
y_train = train[['atraso_15']].copy()

X_test = test.drop('atraso_15',1).copy()
y_test = test[['atraso_15']].copy()

  X_train = train.drop('atraso_15',1).copy()
  X_test = test.drop('atraso_15',1).copy()


In [237]:
# Define the under-sampling strategy
undersample = RandomUnderSampler(sampling_strategy='auto', random_state=42)

# Apply the under-sampling strategy to the training data
X_train_resampled, y_train_resampled = undersample.fit_resample(X_train, y_train)

In [238]:
X_train_resampled.shape, y_train_resampled.shape

((17662, 9), (17662, 1))

In [264]:
X_train

Unnamed: 0,dia,mes,km_ovsd,t_media,v_media_viento,presion_media,cantidad_de_lluvia_mm,nubosidad_perc,temporada_alta
3762,16.0,9.0,1086.84,21.0,6.1,1015.5,13.7,53.5,1.0
64209,9.0,12.0,2595.13,,,,1.5,27.5,0.0
15502,26.0,8.0,1124.93,11.0,5.3,1019.7,21.8,57.6,0.0
26174,10.0,1.0,440.52,25.0,10.1,1014.7,0.0,17.6,1.0
10325,3.0,3.0,4252.23,22.0,6.6,1014.2,1.6,16.4,0.0
...,...,...,...,...,...,...,...,...,...
37194,1.0,11.0,1468.94,14.0,3.6,1016.6,2.0,29.7,0.0
6265,22.0,3.0,1662.19,22.0,6.6,1015.6,1.6,16.4,0.0
54886,27.0,8.0,918.82,12.0,7.7,1017.8,21.8,57.6,0.0
860,24.0,2.0,1086.84,27.0,8.2,1014.3,0.0,9.7,1.0


In [265]:
X_train_resampled

Unnamed: 0,dia,mes,km_ovsd,t_media,v_media_viento,presion_media,cantidad_de_lluvia_mm,nubosidad_perc,temporada_alta
0,4.0,7.0,2466.62,8.0,9.1,1022.0,56.8,59.9,0.0
1,25.0,9.0,1086.84,18.0,6.0,1019.4,13.7,53.5,1.0
2,16.0,1.0,918.82,26.0,9.1,1016.0,0.0,17.6,1.0
3,13.0,1.0,1229.56,30.0,9.5,1014.2,0.0,17.6,1.0
4,26.0,8.0,172.47,11.0,5.3,1019.7,21.8,57.6,0.0
...,...,...,...,...,...,...,...,...,...
17657,19.0,9.0,172.47,11.0,5.6,1020.8,13.7,53.5,1.0
17658,8.0,5.0,2466.62,15.0,3.7,1017.9,38.2,52.4,0.0
17659,1.0,11.0,4252.23,14.0,3.6,1016.6,2.0,29.7,0.0
17660,4.0,12.0,895.71,,,,1.5,27.5,0.0


In [266]:
X_test

Unnamed: 0,dia,mes,km_ovsd,t_media,v_media_viento,presion_media,cantidad_de_lluvia_mm,nubosidad_perc,temporada_alta
27320,22.0,5.0,440.52,11.0,3.0,1016.2,38.2,52.4,0.0
57460,3.0,5.0,2194.86,18.0,4.3,1016.7,38.2,52.4,0.0
54916,10.0,8.0,918.82,10.0,5.8,1016.6,21.8,57.6,0.0
34252,13.0,1.0,1468.94,30.0,9.5,1014.2,0.0,17.6,1.0
13524,26.0,4.0,1124.93,15.0,4.5,1020.4,3.9,27.4,0.0
...,...,...,...,...,...,...,...,...,...
52356,21.0,1.0,918.82,28.0,7.7,1013.9,0.0,17.6,1.0
47513,31.0,1.0,172.47,27.0,8.5,1014.1,0.0,17.6,1.0
21109,21.0,8.0,1229.56,18.0,5.9,1014.8,21.8,57.6,0.0
45425,24.0,11.0,2466.62,,,,2.0,29.7,0.0


In [267]:
X_test.to_numpy()

array([[2.20000e+01, 5.00000e+00, 4.40520e+02, ..., 3.82000e+01,
        5.24000e+01, 0.00000e+00],
       [3.00000e+00, 5.00000e+00, 2.19486e+03, ..., 3.82000e+01,
        5.24000e+01, 0.00000e+00],
       [1.00000e+01, 8.00000e+00, 9.18820e+02, ..., 2.18000e+01,
        5.76000e+01, 0.00000e+00],
       ...,
       [2.10000e+01, 8.00000e+00, 1.22956e+03, ..., 2.18000e+01,
        5.76000e+01, 0.00000e+00],
       [2.40000e+01, 1.10000e+01, 2.46662e+03, ..., 2.00000e+00,
        2.97000e+01, 0.00000e+00],
       [7.00000e+00, 7.00000e+00, 7.49470e+02, ..., 5.68000e+01,
        5.99000e+01, 0.00000e+00]])

In [268]:
X_train_nump = X_train_resampled.to_numpy().copy()
X_test_nump = X_test.to_numpy().copy()
display(X_train_nump)
display(X_train_nump.shape, X_train_nump.ndim)

array([[4.00000e+00, 7.00000e+00, 2.46662e+03, ..., 5.68000e+01,
        5.99000e+01, 0.00000e+00],
       [2.50000e+01, 9.00000e+00, 1.08684e+03, ..., 1.37000e+01,
        5.35000e+01, 1.00000e+00],
       [1.60000e+01, 1.00000e+00, 9.18820e+02, ..., 0.00000e+00,
        1.76000e+01, 1.00000e+00],
       ...,
       [1.00000e+00, 1.10000e+01, 4.25223e+03, ..., 2.00000e+00,
        2.97000e+01, 0.00000e+00],
       [4.00000e+00, 1.20000e+01, 8.95710e+02, ..., 1.50000e+00,
        2.75000e+01, 0.00000e+00],
       [2.90000e+01, 1.20000e+01, 1.08684e+03, ..., 1.50000e+00,
        2.75000e+01, 1.00000e+00]])

(17662, 9)

2

In [270]:
X_test_nump

array([[2.20000e+01, 5.00000e+00, 4.40520e+02, ..., 3.82000e+01,
        5.24000e+01, 0.00000e+00],
       [3.00000e+00, 5.00000e+00, 2.19486e+03, ..., 3.82000e+01,
        5.24000e+01, 0.00000e+00],
       [1.00000e+01, 8.00000e+00, 9.18820e+02, ..., 2.18000e+01,
        5.76000e+01, 0.00000e+00],
       ...,
       [2.10000e+01, 8.00000e+00, 1.22956e+03, ..., 2.18000e+01,
        5.76000e+01, 0.00000e+00],
       [2.40000e+01, 1.10000e+01, 2.46662e+03, ..., 2.00000e+00,
        2.97000e+01, 0.00000e+00],
       [7.00000e+00, 7.00000e+00, 7.49470e+02, ..., 5.68000e+01,
        5.99000e+01, 0.00000e+00]])

In [271]:
# Iterate through each feature and compute its median, min and max values from the training set

def preprocess(data, lst_vars):
    
    # Define a dictionary to store the median, min and max values for each feature
    stats_dict = {}
    
    scaler = lambda x, f_min, f_max: (x - f_min) / (f_max - f_min)
    
    for i, feature in enumerate(lst_vars):
        # Compute median of the feature from the training set
        median = np.nanmedian(data[:, i])
        
        # Compute min and max of the feature from the training set
        f_min = np.nanmin(data[:, i])
        f_max = np.nanmax(data[:, i])
        
        # Add the computed values to the stats dictionary
        stats_dict[feature] = {"median": median, "min": f_min, "max": f_max}
        
        # # Replace missing values with the computed median in both train and test sets
        data[:, i] = np.where(np.isnan(data[:, i]), median, data[:, i])
        data[:, i] = scaler(data[:, i], f_min, f_max)
        # test_data[:, i] = np.where(np.isnan(test_data[:, i]), median, test_data[:, i])

    return data, stats_dict

In [272]:
X_train_nump

array([[4.00000e+00, 7.00000e+00, 2.46662e+03, ..., 5.68000e+01,
        5.99000e+01, 0.00000e+00],
       [2.50000e+01, 9.00000e+00, 1.08684e+03, ..., 1.37000e+01,
        5.35000e+01, 1.00000e+00],
       [1.60000e+01, 1.00000e+00, 9.18820e+02, ..., 0.00000e+00,
        1.76000e+01, 1.00000e+00],
       ...,
       [1.00000e+00, 1.10000e+01, 4.25223e+03, ..., 2.00000e+00,
        2.97000e+01, 0.00000e+00],
       [4.00000e+00, 1.20000e+01, 8.95710e+02, ..., 1.50000e+00,
        2.75000e+01, 0.00000e+00],
       [2.90000e+01, 1.20000e+01, 1.08684e+03, ..., 1.50000e+00,
        2.75000e+01, 1.00000e+00]])

In [273]:
lst_vars = ['dia', 'mes', 'km_ovsd', 't_media', 'v_media_viento', 'presion_media',
       'cantidad_de_lluvia_mm', 'nubosidad_perc', 'temporada_alta']

X_train_nump_proc, stats_dict = preprocess(X_train_nump, lst_vars)

In [274]:
X_train_nump_proc

array([[0.1       , 0.54545455, 0.19559322, ..., 0.68932039, 1.        ,
        0.        ],
       [0.8       , 0.72727273, 0.07795679, ..., 0.16626214, 0.87250996,
        1.        ],
       [0.5       , 0.        , 0.06363184, ..., 0.        , 0.15737052,
        1.        ],
       ...,
       [0.        , 0.90909091, 0.34782965, ..., 0.02427184, 0.39840637,
        0.        ],
       [0.1       , 1.        , 0.06166155, ..., 0.01820388, 0.35458167,
        0.        ],
       [0.93333333, 1.        , 0.07795679, ..., 0.01820388, 0.35458167,
        1.        ]])

In [275]:
stats_dict

{'dia': {'median': 15.0, 'min': 1.0, 'max': 31.0},
 'mes': {'median': 7.0, 'min': 1.0, 'max': 12.0},
 'km_ovsd': {'median': 1229.56, 'min': 172.47, 'max': 11901.66},
 't_media': {'median': 16.0, 'min': 4.0, 'max': 30.0},
 'v_media_viento': {'median': 6.2, 'min': 0.9, 'max': 13.9},
 'presion_media': {'median': 1017.0, 'min': 1009.6, 'max': 1029.6},
 'cantidad_de_lluvia_mm': {'median': 2.0, 'min': 0.0, 'max': 82.4},
 'nubosidad_perc': {'median': 38.2, 'min': 9.7, 'max': 59.9},
 'temporada_alta': {'median': 0.0, 'min': 0.0, 'max': 1.0}}

In [276]:
# sanity check contra el stats_dict
X_train_resampled.describe()

Unnamed: 0,dia,mes,km_ovsd,t_media,v_media_viento,presion_media,cantidad_de_lluvia_mm,nubosidad_perc,temporada_alta
count,17662.0,17662.0,17662.0,14749.0,14749.0,14749.0,17662.0,17662.0,17662.0
mean,15.655362,6.87827,2168.108333,17.017561,6.383294,1017.486657,18.471566,37.743608,0.330597
std,8.64352,3.5142,2367.441235,6.113554,2.163886,3.292857,25.535611,17.190177,0.470441
min,1.0,1.0,172.47,4.0,0.9,1009.6,0.0,9.7,0.0
25%,8.0,4.0,918.82,12.0,4.7,1015.0,1.5,27.4,0.0
50%,15.0,7.0,1229.56,16.0,6.2,1017.0,2.0,38.2,0.0
75%,23.0,10.0,2466.62,22.0,8.0,1019.5,38.2,57.2,1.0
max,31.0,12.0,11901.66,30.0,13.9,1029.6,82.4,59.9,1.0


In [277]:
X_train_nump_proc.shape

(17662, 9)

In [278]:
y_train_resampled.value_counts()

atraso_15
0.0          8831
1.0          8831
dtype: int64

In [279]:
train_labels = y_train_resampled.to_numpy().flatten().copy()
train_labels

array([0., 0., 0., ..., 1., 1., 1.])

In [280]:
train_labels[:10]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [281]:
train_labels.sum()

8831.0

In [282]:
X_train_nump_proc.shape, train_labels.shape

((17662, 9), (17662,))

In [283]:
gbc = GradientBoostingClassifier(random_state=123)
gbc.fit(X_train_nump_proc, train_labels)

In [284]:
train_preds = gbc.predict(X_train_nump_proc)

In [285]:
train_preds.sum()

9205.0

In [286]:
train_preds.ndim

1

In [287]:
train_preds[:10]

array([1., 0., 0., 0., 1., 0., 0., 0., 1., 1.])

In [288]:
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score, accuracy_score

# assume y_true and y_pred are your true labels and predicted labels, respectively
# y_pred should be the predicted probability of class 1 for each sample

acu = accuracy_score(train_labels, train_preds)
# calculate AUC
auc = roc_auc_score(train_labels, train_preds)
# calculate precision
precision = precision_score(train_labels, train_preds)
# calculate recall
recall = recall_score(train_labels, train_preds)
# calculate F1-score
f1 = f1_score(train_labels, train_preds)

print(f"Accuracy: {acu:.3f}")
print(f"AUC: {auc:.3f}")
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1-score: {f1:.3f}")

Accuracy: 0.627
AUC: 0.627
Precision: 0.622
Recall: 0.649
F1-score: 0.635


HACER ESTA EVAL SOBRE TRAIN USANDO CROSSVALIDATION PARA QUE SEA COMPARABLE CON EL RESULTADO EN PYCARET

In [289]:
stats_dict

{'dia': {'median': 15.0, 'min': 1.0, 'max': 31.0},
 'mes': {'median': 7.0, 'min': 1.0, 'max': 12.0},
 'km_ovsd': {'median': 1229.56, 'min': 172.47, 'max': 11901.66},
 't_media': {'median': 16.0, 'min': 4.0, 'max': 30.0},
 'v_media_viento': {'median': 6.2, 'min': 0.9, 'max': 13.9},
 'presion_media': {'median': 1017.0, 'min': 1009.6, 'max': 1029.6},
 'cantidad_de_lluvia_mm': {'median': 2.0, 'min': 0.0, 'max': 82.4},
 'nubosidad_perc': {'median': 38.2, 'min': 9.7, 'max': 59.9},
 'temporada_alta': {'median': 0.0, 'min': 0.0, 'max': 1.0}}

In [290]:
list(range(X_train_nump_proc.shape[1]))

[0, 1, 2, 3, 4, 5, 6, 7, 8]

In [291]:
lst_vars

['dia',
 'mes',
 'km_ovsd',
 't_media',
 'v_media_viento',
 'presion_media',
 'cantidad_de_lluvia_mm',
 'nubosidad_perc',
 'temporada_alta']

In [292]:
X_test_nump

array([[2.20000e+01, 5.00000e+00, 4.40520e+02, ..., 3.82000e+01,
        5.24000e+01, 0.00000e+00],
       [3.00000e+00, 5.00000e+00, 2.19486e+03, ..., 3.82000e+01,
        5.24000e+01, 0.00000e+00],
       [1.00000e+01, 8.00000e+00, 9.18820e+02, ..., 2.18000e+01,
        5.76000e+01, 0.00000e+00],
       ...,
       [2.10000e+01, 8.00000e+00, 1.22956e+03, ..., 2.18000e+01,
        5.76000e+01, 0.00000e+00],
       [2.40000e+01, 1.10000e+01, 2.46662e+03, ..., 2.00000e+00,
        2.97000e+01, 0.00000e+00],
       [7.00000e+00, 7.00000e+00, 7.49470e+02, ..., 5.68000e+01,
        5.99000e+01, 0.00000e+00]])

In [293]:
stats_dict

{'dia': {'median': 15.0, 'min': 1.0, 'max': 31.0},
 'mes': {'median': 7.0, 'min': 1.0, 'max': 12.0},
 'km_ovsd': {'median': 1229.56, 'min': 172.47, 'max': 11901.66},
 't_media': {'median': 16.0, 'min': 4.0, 'max': 30.0},
 'v_media_viento': {'median': 6.2, 'min': 0.9, 'max': 13.9},
 'presion_media': {'median': 1017.0, 'min': 1009.6, 'max': 1029.6},
 'cantidad_de_lluvia_mm': {'median': 2.0, 'min': 0.0, 'max': 82.4},
 'nubosidad_perc': {'median': 38.2, 'min': 9.7, 'max': 59.9},
 'temporada_alta': {'median': 0.0, 'min': 0.0, 'max': 1.0}}

In [294]:
# Iterate through each feature and compute its median, min and max values from the training set

def preprocess_test(data, lst_vars, stats_dict):
    
    scaler = lambda x, f_min, f_max: (x - f_min) / (f_max - f_min)
    
    for i, feature in enumerate(lst_vars):
        
        print(f'Iterating feature: {feature}')
    
        #for j in range(train_data.shape[1]):
        median = stats_dict[feature]["median"]
        f_min = stats_dict[feature]["min"]
        f_max = stats_dict[feature]["max"]
        
        print(f'median:{median}, f_min:{f_min}, f_max:{f_max}')
    
        # # Replace missing values with the computed median in both train and test sets
        data[:, i] = np.where(np.isnan(data[:, i]), median, data[:, i])
        data[:, i] = scaler(data[:, i], f_min, f_max)
        # test_data[:, i] = np.where(np.isnan(test_data[:, i]), median, test_data[:, i])

    return data

In [295]:
X_test_nump_proc = preprocess_test(X_test_nump, lst_vars, stats_dict)

Iterating feature: dia
median:15.0, f_min:1.0, f_max:31.0
Iterating feature: mes
median:7.0, f_min:1.0, f_max:12.0
Iterating feature: km_ovsd
median:1229.56, f_min:172.47, f_max:11901.66
Iterating feature: t_media
median:16.0, f_min:4.0, f_max:30.0
Iterating feature: v_media_viento
median:6.2, f_min:0.9, f_max:13.9
Iterating feature: presion_media
median:1017.0, f_min:1009.6, f_max:1029.6
Iterating feature: cantidad_de_lluvia_mm
median:2.0, f_min:0.0, f_max:82.4
Iterating feature: nubosidad_perc
median:38.2, f_min:9.7, f_max:59.9
Iterating feature: temporada_alta
median:0.0, f_min:0.0, f_max:1.0


In [299]:
X_test_nump

array([[0.7       , 0.36363636, 0.02285324, ..., 0.46359223, 0.85059761,
        0.        ],
       [0.06666667, 0.36363636, 0.17242367, ..., 0.46359223, 0.85059761,
        0.        ],
       [0.3       , 0.63636364, 0.06363184, ..., 0.26456311, 0.95418327,
        0.        ],
       ...,
       [0.66666667, 0.63636364, 0.09012472, ..., 0.26456311, 0.95418327,
        0.        ],
       [0.76666667, 0.90909091, 0.19559322, ..., 0.02427184, 0.39840637,
        0.        ],
       [0.2       , 0.54545455, 0.04919351, ..., 0.68932039, 1.        ,
        0.        ]])

In [301]:
X_test_nump_proc

array([[0.7       , 0.36363636, 0.02285324, ..., 0.46359223, 0.85059761,
        0.        ],
       [0.06666667, 0.36363636, 0.17242367, ..., 0.46359223, 0.85059761,
        0.        ],
       [0.3       , 0.63636364, 0.06363184, ..., 0.26456311, 0.95418327,
        0.        ],
       ...,
       [0.66666667, 0.63636364, 0.09012472, ..., 0.26456311, 0.95418327,
        0.        ],
       [0.76666667, 0.90909091, 0.19559322, ..., 0.02427184, 0.39840637,
        0.        ],
       [0.2       , 0.54545455, 0.04919351, ..., 0.68932039, 1.        ,
        0.        ]])

In [302]:
test_preds = gbc.predict(X_test_nump_proc)

In [303]:
X_test_nump_proc.shape

(20462, 9)

In [304]:
test_labels = y_test.to_numpy().flatten().copy()
test_labels

array([0., 1., 0., ..., 0., 0., 0.])

In [305]:
test_labels.sum()

3783.0

In [306]:
test_labels.shape

(20462,)

In [307]:
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score, accuracy_score

# assume y_true and y_pred are your true labels and predicted labels, respectively
# y_pred should be the predicted probability of class 1 for each sample

acu = accuracy_score(test_labels, test_preds)
# calculate AUC
auc = roc_auc_score(test_labels, test_preds)
# calculate precision
precision = precision_score(test_labels, test_preds)
# calculate recall
recall = recall_score(test_labels, test_preds)
# calculate F1-score
f1 = f1_score(test_labels, test_preds)

print(f"Accuracy: {acu:.3f}")
print(f"AUC: {auc:.3f}")
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1-score: {f1:.3f}")

Accuracy: 0.605
AUC: 0.617
Precision: 0.264
Recall: 0.636
F1-score: 0.373


In [309]:
# # save a sample to test the api later

import json

# write the dictionary to a JSON file
with open('../ML-FASTAPI-DOCKER-HEROKU/stats_dict.json', "w") as f:
    json.dump(stats_dict, f)

In [310]:
import pickle

In [315]:
with open('../models/gbc_numpy_processing.pkl', 'wb') as file:
    pickle.dump(gbc, file)

In [316]:
with open('../models/gbc_numpy_processing.pkl', 'rb') as f:
    model = pickle.load(f)

In [317]:
model

## Prueba prod
En esta seccion hicimos pruebas manuales para asegurarnos de que el codigo que ibamos a usar para el modulo de la api era correcto e iba a correr sin errores

In [312]:
X_train_resampled

Unnamed: 0,dia,mes,km_ovsd,t_media,v_media_viento,presion_media,cantidad_de_lluvia_mm,nubosidad_perc,temporada_alta
0,4.0,7.0,2466.62,8.0,9.1,1022.0,56.8,59.9,0.0
1,25.0,9.0,1086.84,18.0,6.0,1019.4,13.7,53.5,1.0
2,16.0,1.0,918.82,26.0,9.1,1016.0,0.0,17.6,1.0
3,13.0,1.0,1229.56,30.0,9.5,1014.2,0.0,17.6,1.0
4,26.0,8.0,172.47,11.0,5.3,1019.7,21.8,57.6,0.0
...,...,...,...,...,...,...,...,...,...
17657,19.0,9.0,172.47,11.0,5.6,1020.8,13.7,53.5,1.0
17658,8.0,5.0,2466.62,15.0,3.7,1017.9,38.2,52.4,0.0
17659,1.0,11.0,4252.23,14.0,3.6,1016.6,2.0,29.7,0.0
17660,4.0,12.0,895.71,,,,1.5,27.5,0.0


In [139]:
X_train_resampled.iloc[17660,:].to_numpy().ndim

1

In [140]:
sample_item = X_train_resampled.iloc[17660,:].to_numpy()
sample_item

array([  4.  ,  12.  , 895.71,    nan,    nan,    nan,   1.5 ,  27.5 ,
         0.  ])

In [216]:
sample_item = {
    "dia":np.nan
    ,"mes":9.00
    ,"km_ovsd":1086.84
    ,"t_media":21.00
    ,"v_media_viento":6.10
    ,"presion_media":1015.50
    ,"cantidad_de_lluvia_mm":13.70
    ,"nubosidad_perc":np.nan
    ,"temporada_alta":1.00         
}

In [217]:
sample_item_array = np.array(list(sample_item.values())).reshape(1, -1)
sample_item_array

array([[        nan, 9.00000e+00, 1.08684e+03, 2.10000e+01, 6.10000e+00,
        1.01550e+03, 1.37000e+01,         nan, 1.00000e+00]])

In [191]:
sample_item_array[0][0]

nan

In [192]:
# (16.0 - 1.0) / (31.0 - 1.0)

In [193]:
sample_item_array.ndim

2

In [195]:
lst_vars

['dia',
 'mes',
 'km_ovsd',
 't_media',
 'v_media_viento',
 'presion_media',
 'cantidad_de_lluvia_mm',
 'nubosidad_perc',
 'temporada_alta']

In [196]:
stats_dict

{'dia': {'median': 15.0, 'min': 1.0, 'max': 31.0},
 'mes': {'median': 7.0, 'min': 1.0, 'max': 12.0},
 'km_ovsd': {'median': 1229.56, 'min': 172.47, 'max': 11901.66},
 't_media': {'median': 16.0, 'min': 4.0, 'max': 30.0},
 'v_media_viento': {'median': 6.2, 'min': 0.9, 'max': 13.9},
 'presion_media': {'median': 1017.0, 'min': 1009.6, 'max': 1029.6},
 'cantidad_de_lluvia_mm': {'median': 2.0, 'min': 0.0, 'max': 82.4},
 'nubosidad_perc': {'median': 38.2, 'min': 9.7, 'max': 59.9},
 'temporada_alta': {'median': 0.0, 'min': 0.0, 'max': 1.0}}

In [197]:
sample_item_array

array([[        nan, 9.00000e+00, 1.08684e+03, 2.10000e+01, 6.10000e+00,
        1.01550e+03, 1.37000e+01, 5.35000e+01, 1.00000e+00]])

In [198]:
# sample_item_array_processed = preprocess_test(sample_item_array, lst_vars, stats_dict)

In [178]:
median = stats_dict['dia']["median"]
f_min = stats_dict['dia']["min"]
f_max = stats_dict['dia']["max"]

In [179]:
median, f_min, f_max

(15.0, 1.0, 31.0)

In [181]:
imputer = lambda x, median: np.where(np.isnan(x), median, x)
scaler = lambda x, f_min, f_max: (x - f_min) / (f_max - f_min)

In [182]:
sample_item_array[:, 0]

array([nan])

In [183]:
sample_item_array[:, 0] = imputer(sample_item_array[:, 0], median)
sample_item_array[:, 0]

array([15.])

In [186]:
sample_item_array[:, 0] = scaler(sample_item_array[:, 0], f_min, f_max)
sample_item_array[:, 0]

array([0.46666667])

In [187]:
(15 - 1) / (31- 1)

0.4666666666666667

In [199]:
def processing_payload(payload, lst_vars):
    
    for i, feature in enumerate(lst_vars):
        
        print(f'Iterating feature: {feature}')

        #for j in range(train_data.shape[1]):
        median = stats_dict[feature]["median"]
        f_min = stats_dict[feature]["min"]
        f_max = stats_dict[feature]["max"]
        
        print(f'median:{median}, f_min:{f_min}, f_max:{f_max}')
        
        payload[:, i] = imputer(payload[:, i], median)
        print(payload[:, i])
        
        payload[:, i] = scaler(payload[:, i], f_min, f_max)
        print(payload[:, i])
        
    return payload

In [200]:
sample_item_array

array([[        nan, 9.00000e+00, 1.08684e+03, 2.10000e+01, 6.10000e+00,
        1.01550e+03, 1.37000e+01, 5.35000e+01, 1.00000e+00]])

In [201]:
lst_vars

['dia',
 'mes',
 'km_ovsd',
 't_media',
 'v_media_viento',
 'presion_media',
 'cantidad_de_lluvia_mm',
 'nubosidad_perc',
 'temporada_alta']

In [202]:
sample_item_array_processed = processing_payload(payload=sample_item_array, lst_vars=lst_vars)
sample_item_array_processed

Iterating feature: dia
median:15.0, f_min:1.0, f_max:31.0
[15.]
[0.46666667]
Iterating feature: mes
median:7.0, f_min:1.0, f_max:12.0
[9.]
[0.72727273]
Iterating feature: km_ovsd
median:1229.56, f_min:172.47, f_max:11901.66
[1086.84]
[0.07795679]
Iterating feature: t_media
median:16.0, f_min:4.0, f_max:30.0
[21.]
[0.65384615]
Iterating feature: v_media_viento
median:6.2, f_min:0.9, f_max:13.9
[6.1]
[0.4]
Iterating feature: presion_media
median:1017.0, f_min:1009.6, f_max:1029.6
[1015.5]
[0.295]
Iterating feature: cantidad_de_lluvia_mm
median:2.0, f_min:0.0, f_max:82.4
[13.7]
[0.16626214]
Iterating feature: nubosidad_perc
median:38.2, f_min:9.7, f_max:59.9
[53.5]
[0.87250996]
Iterating feature: temporada_alta
median:0.0, f_min:0.0, f_max:1.0
[1.]
[1.]


array([[0.46666667, 0.72727273, 0.07795679, 0.65384615, 0.4       ,
        0.295     , 0.16626214, 0.87250996, 1.        ]])

In [203]:
sample_item_array_processed

array([[0.46666667, 0.72727273, 0.07795679, 0.65384615, 0.4       ,
        0.295     , 0.16626214, 0.87250996, 1.        ]])

In [205]:
lr.predict(sample_item_array_processed)

array([1.])

In [None]:
def processing_payload(payload, lst_vars):
    
    payload_processed = payload.copy()

    medians = np.array([stats_dict[feature]["median"] for feature in lst_vars])
    f_mins = np.array([stats_dict[feature]["min"] for feature in lst_vars])
    f_maxs = np.array([stats_dict[feature]["max"] for feature in lst_vars])

    payload_processed = imputer(payload_processed, medians)
    payload_processed = scaler(payload_processed, f_mins, f_maxs)

    return payload_processed

In [206]:
medians = np.array([stats_dict[feature]["median"] for feature in lst_vars])
f_mins = np.array([stats_dict[feature]["min"] for feature in lst_vars])
f_maxs = np.array([stats_dict[feature]["max"] for feature in lst_vars])

In [207]:
medians

array([  15.  ,    7.  , 1229.56,   16.  ,    6.2 , 1017.  ,    2.  ,
         38.2 ,    0.  ])

In [208]:
f_mins

array([1.0000e+00, 1.0000e+00, 1.7247e+02, 4.0000e+00, 9.0000e-01,
       1.0096e+03, 0.0000e+00, 9.7000e+00, 0.0000e+00])

In [209]:
f_maxs

array([3.100000e+01, 1.200000e+01, 1.190166e+04, 3.000000e+01,
       1.390000e+01, 1.029600e+03, 8.240000e+01, 5.990000e+01,
       1.000000e+00])

In [220]:
sample_item_array

array([[        nan, 9.00000e+00, 1.08684e+03, 2.10000e+01, 6.10000e+00,
        1.01550e+03, 1.37000e+01,         nan, 1.00000e+00]])

In [219]:
sample_item_array_processed = imputer(sample_item_array, medians).copy()
sample_item_array_processed

array([[1.50000e+01, 9.00000e+00, 1.08684e+03, 2.10000e+01, 6.10000e+00,
        1.01550e+03, 1.37000e+01, 3.82000e+01, 1.00000e+00]])

In [222]:
sample_item_array_processed_norm = scaler(sample_item_array_processed, f_mins, f_maxs).copy()
sample_item_array_processed_norm

array([[0.46666667, 0.72727273, 0.07795679, 0.65384615, 0.4       ,
        0.295     , 0.16626214, 0.56772908, 1.        ]])

In [221]:
(15 -1)/(31-1)

0.4666666666666667

---

Fin