In [1]:
# Import libraries
import pandas as pd
import numpy as np
import pickle
from sklearn.preprocessing import RobustScaler

from src.utils.split_data import train_val_test_split, split_features, remove_label

In [2]:
# Load the pre-processed dataset.
# The path is relative, so it is resolved against the notebook's working directory.
DATASET_PATH = '../../../data/pre-processed/data.csv'
data = pd.read_csv(DATASET_PATH)

In [3]:
# Preview the frame exactly as loaded from the CSV (all categorical columns still hold text).
data

Unnamed: 0,FE_OCURRENCIA,FRECUENCIA,HORA_OCURRENCIA,CO_PROVINCIA,EXTRANJERO,CAT_VEHICULO,MES,DIA_SEMANA,ESTACION_A,FERIADO,NIVEL
0,2014-07-05,1,16:00,1,0,3.0,Julio,Sábado,Verano,False,bajo
1,2014-07-08,1,10:30,1,0,2.0,Julio,Martes,Verano,False,bajo
2,2014-07-11,1,10:00,1,0,4.0,Julio,Viernes,Verano,False,bajo
3,2014-07-12,1,22:00,1,0,2.0,Julio,Sábado,Verano,False,bajo
4,2014-07-14,1,10:00,1,0,5.0,Julio,Lunes,Verano,False,bajo
...,...,...,...,...,...,...,...,...,...,...,...
3113,2023-06-30,2,12:54,1,0,2.0,Junio,Viernes,Verano,False,bajo
3114,2023-07-01,3,12:54,1,0,2.0,Julio,Sábado,Verano,False,moderado
3115,2023-07-02,2,12:54,1,0,2.0,Julio,Domingo,Verano,False,bajo
3116,2023-07-03,4,12:54,1,0,2.0,Julio,Lunes,Verano,False,moderado


In [4]:
# Encode the NIVEL labels as integer codes: bajo=1, moderado=2, alto=3.
# Series.map yields NaN for any value that is not a key of the dict, so running
# this cell a second time (on the already-encoded column) would blank the column.
data['NIVEL'] = data['NIVEL'].map({'bajo': 1, 'moderado': 2, 'alto': 3})

In [5]:
# Check the class balance of the encoded NIVEL column (rows per code).
data['NIVEL'].value_counts()

3    1671
2    1036
1     411
Name: NIVEL, dtype: int64

In [6]:
# Convert the boolean FERIADO flag to 0/1 integers.
# As with any Series.map call, values missing from the dict become NaN.
data['FERIADO'] = data['FERIADO'].map({False: 0, True: 1})

In [7]:
# Count rows flagged FERIADO=1 versus FERIADO=0.
data['FERIADO'].value_counts()

0    3009
1     109
Name: FERIADO, dtype: int64

In [8]:
# List the distinct ESTACION_A labels so the mapping in the next cell can cover all of them.
data['ESTACION_A'].unique()

array(['Verano', 'Otoño', 'Invierno', 'Primavera'], dtype=object)

In [9]:
# Replace the ESTACION_A names with integer codes:
# Primavera=1, Verano=2, Otoño=3, Invierno=4.
# Series.map yields NaN for labels that are not in the dict, so this cell must
# run exactly once on the raw text column.
data['ESTACION_A'] = data['ESTACION_A'].map({'Primavera': 1, 'Verano': 2, "Otoño" : 3, "Invierno": 4})

In [10]:
# Row counts per ESTACION_A code after the mapping.
data['ESTACION_A'].value_counts()

1    792
4    785
2    775
3    766
Name: ESTACION_A, dtype: int64

In [11]:
# List the distinct CAT_VEHICULO values; the column is already numeric and is left as-is.
data['CAT_VEHICULO'].unique()

array([3., 2., 4., 5.])

In [12]:
# One-hot encode DIA_SEMANA into DIA_SEMANA_<value> indicator columns.
# get_dummies returns a new DataFrame; `data` is not modified by this call.
week_day_en = pd.get_dummies(data['DIA_SEMANA'], prefix='DIA_SEMANA')

In [13]:
# Inspect the indicator columns generated for DIA_SEMANA.
week_day_en

Unnamed: 0,DIA_SEMANA_Domingo,DIA_SEMANA_Jueves,DIA_SEMANA_Lunes,DIA_SEMANA_Martes,DIA_SEMANA_Miércoles,DIA_SEMANA_Sábado,DIA_SEMANA_Viernes
0,0,0,0,0,0,1,0
1,0,0,0,1,0,0,0
2,0,0,0,0,0,0,1
3,0,0,0,0,0,1,0
4,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...
3113,0,0,0,0,0,0,1
3114,0,0,0,0,0,1,0
3115,1,0,0,0,0,0,0
3116,0,0,1,0,0,0,0


In [14]:
# get_dummies produced a separate frame, so `data` has no DIA_SEMANA_* columns yet;
# NIVEL, FERIADO and ESTACION_A now show their integer codes.
data

Unnamed: 0,FE_OCURRENCIA,FRECUENCIA,HORA_OCURRENCIA,CO_PROVINCIA,EXTRANJERO,CAT_VEHICULO,MES,DIA_SEMANA,ESTACION_A,FERIADO,NIVEL
0,2014-07-05,1,16:00,1,0,3.0,Julio,Sábado,2,0,1
1,2014-07-08,1,10:30,1,0,2.0,Julio,Martes,2,0,1
2,2014-07-11,1,10:00,1,0,4.0,Julio,Viernes,2,0,1
3,2014-07-12,1,22:00,1,0,2.0,Julio,Sábado,2,0,1
4,2014-07-14,1,10:00,1,0,5.0,Julio,Lunes,2,0,1
...,...,...,...,...,...,...,...,...,...,...,...
3113,2023-06-30,2,12:54,1,0,2.0,Junio,Viernes,2,0,1
3114,2023-07-01,3,12:54,1,0,2.0,Julio,Sábado,2,0,2
3115,2023-07-02,2,12:54,1,0,2.0,Julio,Domingo,2,0,1
3116,2023-07-03,4,12:54,1,0,2.0,Julio,Lunes,2,0,2


In [15]:
# Append the DIA_SEMANA_* indicator columns side by side (axis=1); rows are aligned on the index.
data = pd.concat([data, week_day_en], axis=1)

In [16]:
# Verify that the DIA_SEMANA_* columns were appended.
data

Unnamed: 0,FE_OCURRENCIA,FRECUENCIA,HORA_OCURRENCIA,CO_PROVINCIA,EXTRANJERO,CAT_VEHICULO,MES,DIA_SEMANA,ESTACION_A,FERIADO,NIVEL,DIA_SEMANA_Domingo,DIA_SEMANA_Jueves,DIA_SEMANA_Lunes,DIA_SEMANA_Martes,DIA_SEMANA_Miércoles,DIA_SEMANA_Sábado,DIA_SEMANA_Viernes
0,2014-07-05,1,16:00,1,0,3.0,Julio,Sábado,2,0,1,0,0,0,0,0,1,0
1,2014-07-08,1,10:30,1,0,2.0,Julio,Martes,2,0,1,0,0,0,1,0,0,0
2,2014-07-11,1,10:00,1,0,4.0,Julio,Viernes,2,0,1,0,0,0,0,0,0,1
3,2014-07-12,1,22:00,1,0,2.0,Julio,Sábado,2,0,1,0,0,0,0,0,1,0
4,2014-07-14,1,10:00,1,0,5.0,Julio,Lunes,2,0,1,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3113,2023-06-30,2,12:54,1,0,2.0,Junio,Viernes,2,0,1,0,0,0,0,0,0,1
3114,2023-07-01,3,12:54,1,0,2.0,Julio,Sábado,2,0,2,0,0,0,0,0,1,0
3115,2023-07-02,2,12:54,1,0,2.0,Julio,Domingo,2,0,1,1,0,0,0,0,0,0
3116,2023-07-03,4,12:54,1,0,2.0,Julio,Lunes,2,0,2,0,0,1,0,0,0,0


In [17]:
# One-hot encode MES into MES_<value> indicator columns (returned as a new DataFrame).
month_en = pd.get_dummies(data['MES'], prefix='MES')

In [18]:
# Append the MES_* indicator columns side by side (axis=1); rows are aligned on the index.
data = pd.concat([data, month_en], axis=1)

In [19]:
# Verify that the MES_* columns were appended.
data

Unnamed: 0,FE_OCURRENCIA,FRECUENCIA,HORA_OCURRENCIA,CO_PROVINCIA,EXTRANJERO,CAT_VEHICULO,MES,DIA_SEMANA,ESTACION_A,FERIADO,...,MES_Diciembre,MES_Enero,MES_Febrero,MES_Julio,MES_Junio,MES_Marzo,MES_Mayo,MES_Noviembre,MES_Octubre,MES_Septiembre
0,2014-07-05,1,16:00,1,0,3.0,Julio,Sábado,2,0,...,0,0,0,1,0,0,0,0,0,0
1,2014-07-08,1,10:30,1,0,2.0,Julio,Martes,2,0,...,0,0,0,1,0,0,0,0,0,0
2,2014-07-11,1,10:00,1,0,4.0,Julio,Viernes,2,0,...,0,0,0,1,0,0,0,0,0,0
3,2014-07-12,1,22:00,1,0,2.0,Julio,Sábado,2,0,...,0,0,0,1,0,0,0,0,0,0
4,2014-07-14,1,10:00,1,0,5.0,Julio,Lunes,2,0,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3113,2023-06-30,2,12:54,1,0,2.0,Junio,Viernes,2,0,...,0,0,0,0,1,0,0,0,0,0
3114,2023-07-01,3,12:54,1,0,2.0,Julio,Sábado,2,0,...,0,0,0,1,0,0,0,0,0,0
3115,2023-07-02,2,12:54,1,0,2.0,Julio,Domingo,2,0,...,0,0,0,1,0,0,0,0,0,0
3116,2023-07-03,4,12:54,1,0,2.0,Julio,Lunes,2,0,...,0,0,0,1,0,0,0,0,0,0


In [20]:
# Parse the 'HH:MM' strings into datetimes. The format carries no date, so pandas
# fills in 1900-01-01 (visible in the preview below); only the time part is used afterwards.
data['HORA_OCURRENCIA'] = pd.to_datetime(data['HORA_OCURRENCIA'], format='%H:%M')

In [21]:
# Split the parsed time into two integer features: hour of day and minute of hour.
data['HORA'] = data['HORA_OCURRENCIA'].dt.hour
data['MINUTO'] = data['HORA_OCURRENCIA'].dt.minute

In [22]:
# Verify the new HORA and MINUTO columns against the parsed HORA_OCURRENCIA values.
data

Unnamed: 0,FE_OCURRENCIA,FRECUENCIA,HORA_OCURRENCIA,CO_PROVINCIA,EXTRANJERO,CAT_VEHICULO,MES,DIA_SEMANA,ESTACION_A,FERIADO,...,MES_Febrero,MES_Julio,MES_Junio,MES_Marzo,MES_Mayo,MES_Noviembre,MES_Octubre,MES_Septiembre,HORA,MINUTO
0,2014-07-05,1,1900-01-01 16:00:00,1,0,3.0,Julio,Sábado,2,0,...,0,1,0,0,0,0,0,0,16,0
1,2014-07-08,1,1900-01-01 10:30:00,1,0,2.0,Julio,Martes,2,0,...,0,1,0,0,0,0,0,0,10,30
2,2014-07-11,1,1900-01-01 10:00:00,1,0,4.0,Julio,Viernes,2,0,...,0,1,0,0,0,0,0,0,10,0
3,2014-07-12,1,1900-01-01 22:00:00,1,0,2.0,Julio,Sábado,2,0,...,0,1,0,0,0,0,0,0,22,0
4,2014-07-14,1,1900-01-01 10:00:00,1,0,5.0,Julio,Lunes,2,0,...,0,1,0,0,0,0,0,0,10,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3113,2023-06-30,2,1900-01-01 12:54:00,1,0,2.0,Junio,Viernes,2,0,...,0,0,1,0,0,0,0,0,12,54
3114,2023-07-01,3,1900-01-01 12:54:00,1,0,2.0,Julio,Sábado,2,0,...,0,1,0,0,0,0,0,0,12,54
3115,2023-07-02,2,1900-01-01 12:54:00,1,0,2.0,Julio,Domingo,2,0,...,0,1,0,0,0,0,0,0,12,54
3116,2023-07-03,4,1900-01-01 12:54:00,1,0,2.0,Julio,Lunes,2,0,...,0,1,0,0,0,0,0,0,12,54


In [23]:
# Drop the raw FE_OCURRENCIA, HORA_OCURRENCIA, MES and DIA_SEMANA columns in a
# single call now that the derived features have been added to the frame.
data = data.drop(columns=["FE_OCURRENCIA", "HORA_OCURRENCIA", "MES", "DIA_SEMANA"])

In [24]:
# Confirm the four source columns were dropped.
data

Unnamed: 0,FRECUENCIA,CO_PROVINCIA,EXTRANJERO,CAT_VEHICULO,ESTACION_A,FERIADO,NIVEL,DIA_SEMANA_Domingo,DIA_SEMANA_Jueves,DIA_SEMANA_Lunes,...,MES_Febrero,MES_Julio,MES_Junio,MES_Marzo,MES_Mayo,MES_Noviembre,MES_Octubre,MES_Septiembre,HORA,MINUTO
0,1,1,0,3.0,2,0,1,0,0,0,...,0,1,0,0,0,0,0,0,16,0
1,1,1,0,2.0,2,0,1,0,0,0,...,0,1,0,0,0,0,0,0,10,30
2,1,1,0,4.0,2,0,1,0,0,0,...,0,1,0,0,0,0,0,0,10,0
3,1,1,0,2.0,2,0,1,0,0,0,...,0,1,0,0,0,0,0,0,22,0
4,1,1,0,5.0,2,0,1,0,0,1,...,0,1,0,0,0,0,0,0,10,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3113,2,1,0,2.0,2,0,1,0,0,0,...,0,0,1,0,0,0,0,0,12,54
3114,3,1,0,2.0,2,0,2,0,0,0,...,0,1,0,0,0,0,0,0,12,54
3115,2,1,0,2.0,2,0,1,1,0,0,...,0,1,0,0,0,0,0,0,12,54
3116,4,1,0,2.0,2,0,2,0,0,1,...,0,1,0,0,0,0,0,0,12,54


In [25]:
# Split the full frame into train, validation and test sets.
# NOTE(review): split ratios, shuffling and seeding are decided inside
# train_val_test_split (src.utils.split_data) and are not visible here — confirm
# the split is reproducible.
train_set, val_set, test_set = train_val_test_split(data)

In [26]:
# Separate inputs from the FRECUENCIA target for each split of the regression
# dataset. split_features is called once per split, in train/val/test order,
# and each (inputs, target) pair is unpacked into its own pair of names.
(X_train_reg, y_train_reg), (X_val_reg, y_val_reg), (X_test_reg, y_test_reg) = [
    split_features(subset, "FRECUENCIA")
    for subset in (train_set, val_set, test_set)
]

In [27]:
# Split the inputs and labels for the classification dataset (target: NIVEL)
X_train_cla, y_train_cla = split_features(train_set, "NIVEL")
X_val_cla, y_val_cla = split_features(val_set, "NIVEL")
X_test_cla, y_test_cla = split_features(test_set, "NIVEL")

In [28]:
# Strip the classification label (NIVEL) from every regression input split so
# the other task's target is not passed along as a feature.
X_train_reg, X_val_reg, X_test_reg = [
    remove_label(features, 'NIVEL')
    for features in (X_train_reg, X_val_reg, X_test_reg)
]

In [29]:
# Strip the regression target (FRECUENCIA) from every classification input split
# so the other task's target is not passed along as a feature.
X_train_cla, X_val_cla, X_test_cla = [
    remove_label(features, 'FRECUENCIA')
    for features in (X_train_cla, X_val_cla, X_test_cla)
]

In [35]:
# Sanity check: list the class codes present in the test targets.
y_test_cla.unique()

array([3, 2, 1])

In [30]:
# Scale the regression features.
# The scaler's statistics are learned from the training split ONLY; the
# validation and test splits are then transformed with those same statistics.
# Calling fit_transform on every split would refit the scaler each time, which
# (a) leaks information from held-out data, (b) puts the three splits on
# different scales, and (c) leaves `scaler_reg` fitted on the test split, so the
# pickled scaler would not match the data the model was trained on.
scaler_reg = RobustScaler()
X_train_reg_scaled = scaler_reg.fit_transform(X_train_reg.copy())
X_val_reg_scaled = scaler_reg.transform(X_val_reg.copy())
X_test_reg_scaled = scaler_reg.transform(X_test_reg.copy())

# The scaler returns plain NumPy arrays; wrap them back into DataFrames,
# restoring the feature names and reusing the row labels of the matching targets.
X_train_reg_scaled = pd.DataFrame(X_train_reg_scaled, columns=X_train_reg.columns, index=y_train_reg.index)
X_val_reg_scaled = pd.DataFrame(X_val_reg_scaled, columns=X_val_reg.columns, index=y_val_reg.index)
X_test_reg_scaled = pd.DataFrame(X_test_reg_scaled, columns=X_test_reg.columns, index=y_test_reg.index)

In [31]:
# Scale the classification features.
# The scaler's statistics are learned from the training split ONLY; the
# validation and test splits are then transformed with those same statistics.
# Calling fit_transform on every split would refit the scaler each time, which
# (a) leaks information from held-out data, (b) puts the three splits on
# different scales, and (c) leaves `scaler_cla` fitted on the test split, so the
# pickled scaler would not match the data the model was trained on.
scaler_cla = RobustScaler()
X_train_cla_scaled = scaler_cla.fit_transform(X_train_cla.copy())
X_val_cla_scaled = scaler_cla.transform(X_val_cla.copy())
X_test_cla_scaled = scaler_cla.transform(X_test_cla.copy())

# The scaler returns plain NumPy arrays; wrap them back into DataFrames,
# restoring the feature names and reusing the row labels of the matching targets.
X_train_cla_scaled = pd.DataFrame(X_train_cla_scaled, columns=X_train_cla.columns, index=y_train_cla.index)
X_val_cla_scaled = pd.DataFrame(X_val_cla_scaled, columns=X_val_cla.columns, index=y_val_cla.index)
X_test_cla_scaled = pd.DataFrame(X_test_cla_scaled, columns=X_test_cla.columns, index=y_test_cla.index)

In [36]:
# Save the data
# Each np.savez call writes one .npz archive (the extension is appended
# automatically) holding two arrays named `inputs` and `targets`. The DataFrames
# and Series are stored as plain arrays, so column names and index labels are
# not persisted.
# NOTE(review): the processed/reg and processed/cla directories are assumed to
# exist already — confirm, since nothing here creates them.
SAVE_DATA_PATH = "../../../data"

# Regression data (unscaled inputs)
np.savez(SAVE_DATA_PATH + '/processed/reg/train_data_reg', inputs=X_train_reg, targets=y_train_reg)
np.savez(SAVE_DATA_PATH + '/processed/reg/val_data_reg', inputs=X_val_reg, targets=y_val_reg)
np.savez(SAVE_DATA_PATH + '/processed/reg/test_data_reg', inputs=X_test_reg, targets=y_test_reg)

# Regression data (scaled inputs, same targets)
np.savez(SAVE_DATA_PATH + '/processed/reg/train_data_reg_scaled', inputs=X_train_reg_scaled, targets=y_train_reg)
np.savez(SAVE_DATA_PATH + '/processed/reg/val_data_reg_scaled', inputs=X_val_reg_scaled, targets=y_val_reg)
np.savez(SAVE_DATA_PATH + '/processed/reg/test_data_reg_scaled', inputs=X_test_reg_scaled, targets=y_test_reg)

# Classification data (unscaled inputs)
np.savez(SAVE_DATA_PATH + '/processed/cla/train_data_cla', inputs=X_train_cla, targets=y_train_cla)
np.savez(SAVE_DATA_PATH + '/processed/cla/val_data_cla', inputs=X_val_cla, targets=y_val_cla)
np.savez(SAVE_DATA_PATH + '/processed/cla/test_data_cla', inputs=X_test_cla, targets=y_test_cla)

# Classification data (scaled inputs, same targets)
np.savez(SAVE_DATA_PATH + '/processed/cla/train_data_cla_scaled', inputs=X_train_cla_scaled, targets=y_train_cla)
np.savez(SAVE_DATA_PATH + '/processed/cla/val_data_cla_scaled', inputs=X_val_cla_scaled, targets=y_val_cla)
np.savez(SAVE_DATA_PATH + '/processed/cla/test_data_cla_scaled', inputs=X_test_cla_scaled, targets=y_test_cla)

In [37]:
EXPORT_SAVE_PATH = '../../../exports'

# Save the one-hot encoding outputs.
# NOTE(review): week_day_en and month_en are the DataFrames returned by
# pd.get_dummies for this dataset (one row per sample), not fitted encoder
# objects — they cannot transform new inputs by themselves. Confirm that the
# consumers of these pickles expect a DataFrame.
with open(EXPORT_SAVE_PATH + '/encoders/week_day_en.pkl', 'wb') as f:
    pickle.dump(week_day_en, f)
with open(EXPORT_SAVE_PATH + '/encoders/month_en.pkl', 'wb') as f:
    pickle.dump(month_en, f)

# Save the fitted scalers so the same scaling can be re-applied at inference time.
with open(EXPORT_SAVE_PATH + '/scaler/scaler_cla.pkl', 'wb') as f:
    pickle.dump(scaler_cla, f)
with open(EXPORT_SAVE_PATH + '/scaler/scaler_reg.pkl', 'wb') as f:
    pickle.dump(scaler_reg, f)