In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

from joblib import dump

In [2]:
# Load the unified dataset and preview its first three rows.
raw_csv_path = "../2.3 csv_unique/df_unified.csv"
df_raw = pd.read_csv(raw_csv_path)
df_raw.head(n=3)

Unnamed: 0,names,comunity,scores,dates,co,humidity,no2,o3,p,pm10,pm25,so2,temp,wind,fecha_convertida,Dia,Hora,periodo,calidad_aire
0,"3 De Marzo, Vitoria-Gasteiz, País Vasco, Spain",Pais_Vasco,55,2024-06-26 05:29:00+09:00,0.1,85.0,8.3,,1011.5,17.0,55.0,0.6,18.5,2.5,2024-06-25 22:29:00+09:00,2024-06-25,22,noche,Moderada
1,"3 De Marzo, Vitoria-Gasteiz, País Vasco, Spain",Pais_Vasco,53,2024-06-26 15:42:37+09:00,0.1,100.0,7.4,,1012.0,20.0,53.0,0.6,17.5,1.0,2024-06-26 08:42:37+09:00,2024-06-26,8,mañana,Moderada
2,"3 De Marzo, Vitoria-Gasteiz, País Vasco, Spain",Pais_Vasco,53,2024-06-26 18:45:19+09:00,0.1,77.5,12.4,,1012.0,26.0,53.0,0.6,22.0,0.7,2024-06-26 11:45:19+09:00,2024-06-26,11,mañana,Moderada


In [3]:
# Print column names, non-null counts and dtypes to spot missing values
# and columns that were read as generic `object`.
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2028 entries, 0 to 2027
Data columns (total 19 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   names             2028 non-null   object 
 1   comunity          1930 non-null   object 
 2   scores            2028 non-null   object 
 3   dates             2028 non-null   object 
 4   co                920 non-null    float64
 5   humidity          2028 non-null   float64
 6   no2               2016 non-null   float64
 7   o3                1624 non-null   float64
 8   p                 2010 non-null   float64
 9   pm10              1898 non-null   float64
 10  pm25              1706 non-null   float64
 11  so2               1477 non-null   float64
 12  temp              2020 non-null   float64
 13  wind              1974 non-null   float64
 14  fecha_convertida  2028 non-null   object 
 15  Dia               2028 non-null   object 
 16  Hora              2028 non-null   int64  


In [4]:
# Parse the timestamp columns and make `scores` numeric in a single step.
# errors='coerce' turns unparseable entries into NaT / NaN instead of raising;
# `Dia` is parsed strictly, so a malformed day value raises.
df_raw = df_raw.assign(
    dates=pd.to_datetime(df_raw['dates'], errors='coerce'),
    Dia=pd.to_datetime(df_raw['Dia']),
    scores=pd.to_numeric(df_raw['scores'], errors='coerce'),
)

In [5]:
# Define the categorical and numeric columns.
# Numeric columns are detected by dtype: whatever is float64/int64 in df_raw
# at the moment this cell runs.
numeric_frame = df_raw.select_dtypes(include=['float64', 'int64'])
numeric_features = list(numeric_frame.columns)
categorical_features = ['names', 'comunity', 'periodo', 'calidad_aire']

In [6]:
# Summary statistics of the columns selected by describe()'s defaults,
# used to inspect value ranges and per-column non-null counts.
df_raw.describe()

Unnamed: 0,scores,co,humidity,no2,o3,p,pm10,pm25,so2,temp,wind,Dia,Hora
count,2012.0,920.0,2028.0,2016.0,1624.0,2010.0,1898.0,1706.0,1477.0,2020.0,1974.0,2028,2028.0
mean,34.19831,0.895217,65.766371,5.913591,26.787869,1013.130249,15.623815,30.849941,1.639472,22.201683,4.033891,2024-06-25 18:19:52.899408384,12.660256
min,1.0,0.1,21.5,0.3,0.5,981.8,1.0,5.0,0.6,3.8,0.1,2024-02-19 00:00:00,1.0
25%,24.0,0.1,52.6,2.3,20.1,1011.5,10.0,17.0,1.0,18.875,1.5,2024-06-26 00:00:00,8.75
50%,33.0,0.1,64.6,4.2,26.5,1013.4,14.0,25.0,1.6,22.2,3.1,2024-06-27 00:00:00,12.0
75%,42.0,1.1,81.6,7.7,33.8,1015.1,19.0,42.0,2.1,25.3,6.0,2024-06-29 00:00:00,16.0
max,197.0,4.7,100.0,38.0,63.6,1033.5,69.0,197.0,20.3,35.0,21.3,2024-06-30 00:00:00,22.0
std,15.267055,1.282959,18.413848,5.478718,9.507489,3.999736,10.043275,18.228295,1.101203,4.697284,3.324867,,5.670744


In [7]:
# !!!! TODO: decide how NaNs should ultimately be handled.
# For now every numeric NaN is replaced with its column mean.
# NOTE(review): after this fill no numeric NaN is left, so the IterativeImputer
# fitted later in this notebook never sees a missing value — confirm which of
# the two strategies (mean fill vs. iterative imputation) is the intended one.
numeric_column_means = df_raw[numeric_features].mean()
df_raw[numeric_features] = df_raw[numeric_features].fillna(numeric_column_means)

In [8]:
# Reduce memory usage by downcasting each numeric column to a narrower dtype.
# NOTE(review): `scores` is cast to an integer type after NaNs were replaced
# with the (fractional) column mean, so those filled values lose their
# decimals — confirm this is acceptable.
downcast_targets = {
    'scores': np.int16,
    'co': np.float32,
    'humidity': np.float32,
    'no2': np.float32,
    'o3': np.float32,
    'p': np.float32,
    'pm10': np.float32,
    'pm25': np.float32,
    'so2': np.float32,
    'temp': np.float32,
    'wind': np.float32,
    'Hora': np.int8,
}
for column_name, narrow_dtype in downcast_targets.items():
    df_raw[column_name] = df_raw[column_name].astype(narrow_dtype)

In [9]:
# Store every categorical feature with pandas' `category` dtype.
category_dtypes = dict.fromkeys(categorical_features, 'category')
df_raw = df_raw.astype(category_dtypes)

In [10]:
# Fit a dense one-hot encoder on the categorical columns.
# handle_unknown='ignore' lets transform() accept categories not seen here.
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
encoder.fit(df_raw[categorical_features])
# fit() returns the estimator itself, so `encoded_cats` is just another name
# for the fitted encoder (it does not hold encoded data).
encoded_cats = encoder

In [11]:
# Fit the iterative imputer on the numeric columns.
imputer = IterativeImputer()
imputer.fit(df_raw[numeric_features])
# fit() returns the estimator itself, so `imputed_nums` is the fitted imputer
# (not imputed data).
imputed_nums = imputer

In [12]:
# Persist the fitted imputer so it can be reloaded outside this notebook.
imputer_path = '../4.2 models_prepro/imputer.joblib'
dump(imputer, imputer_path)
print("Imputer guardados en '4.2 models_prepro'")

Imputer guardados en '4.2 models_prepro'


In [13]:
# Run the numeric columns through the fitted imputer and wrap the resulting
# array back into a DataFrame with the original column names.
imputed_values = imputed_nums.transform(df_raw[numeric_features])
imputer_nums_df = pd.DataFrame(imputed_values, columns=numeric_features)

In [18]:
# Fit the standard scaler on the imputed numeric frame.
scaler = StandardScaler()
scaler.fit(imputer_nums_df)
# fit() returns the estimator itself, so `scaled_nums` is the fitted scaler
# (not scaled data).
scaled_nums = scaler

In [19]:
# Persist the fitted encoder and scaler, one joblib file per estimator.
for fitted_step, target_path in (
    (encoder, '../4.2 models_prepro/encoder.joblib'),
    (scaler, '../4.2 models_prepro/scaler.joblib'),
):
    dump(fitted_step, target_path)
print("Encoder, Scaler e Imputer guardados en '4.2 models_prepro'")

Encoder, Scaler e Imputer guardados en '4.2 models_prepro'


In [20]:
# One-hot encode the categorical columns and label each output column with
# the feature names generated by the encoder.
encoded_values = encoded_cats.transform(df_raw[categorical_features])
encoded_column_names = encoder.get_feature_names_out(categorical_features)
encoded_cats_df = pd.DataFrame(encoded_values, columns=encoded_column_names)

In [21]:
# Scale the imputed numeric frame — the same data the scaler was fitted on.
# Transforming df_raw directly would bypass the imputation step, so any NaN
# still present in df_raw would flow straight into the scaled output.
scaled_nums_df = pd.DataFrame(
    scaled_nums.transform(imputer_nums_df),
    columns=numeric_features,
)

In [22]:
# Join the scaled numeric block and the one-hot block side by side.
preprocessed_parts = [scaled_nums_df, encoded_cats_df]
df_preprocessed = pd.concat(preprocessed_parts, axis='columns')

In [23]:
# Display the final preprocessed frame (scaled numeric columns followed by
# the one-hot encoded categorical columns).
df_preprocessed

Unnamed: 0,scores,co,humidity,no2,o3,p,pm10,pm25,so2,temp,...,periodo_mañana,periodo_mediodia,periodo_noche,periodo_otro,periodo_tarde,calidad_aire_-,calidad_aire_Buena,calidad_aire_Dañina a la Salud,calidad_aire_Dañina a la Salud de los Grupos Sensitivos,calidad_aire_Moderada
0,1.368370,-9.207661e-01,1.044777,0.436981,9.195301e-08,-0.409512,1.416778e-01,1.444920,-1.106461e+00,-0.789802,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1.236817,-9.207661e-01,1.859583,0.272180,9.195301e-08,-0.283914,4.505268e-01,1.325259,-1.106461e+00,-1.003165,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,1.236817,-9.207661e-01,0.637375,1.187741,9.195301e-08,-0.283914,1.068225e+00,1.325259,-1.106461e+00,-0.043032,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.513273,-9.207661e-01,-1.290998,-0.405336,9.195301e-08,-0.660708,-2.701209e-01,0.667118,-1.106461e+00,1.663872,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
4,1.236817,-9.207661e-01,1.207739,-0.148979,9.195301e-08,-0.032718,-6.422150e-02,1.325259,-1.106461e+00,-0.469758,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023,-0.341825,1.408886e-08,0.642807,-0.570137,2.953595e-01,1.147906,3.969821e-09,-1.307302,-1.970952e-09,-0.683121,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
2024,-0.933816,1.408886e-08,0.919840,-0.313780,-8.568592e-01,0.871581,3.969821e-09,-1.307302,-1.970952e-09,-1.152519,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2025,0.052835,1.408886e-08,-0.411008,-0.661693,1.012557e+00,0.997179,3.969821e-09,0.188471,-1.970952e-09,0.405031,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2026,0.052835,1.408886e-08,-0.095950,-0.570137,1.012557e+00,1.097664,3.969821e-09,-0.828655,-1.970952e-09,-0.128377,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
