In [24]:
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score, make_scorer


weather = pd.read_csv('./dataset/clean-dataset.csv')
rows, columns = weather.shape
print(f"Rows: {rows}, Columns: {columns}")

Rows: 142146, Columns: 25


### Conversion de variables

In [25]:
weather.head(5)

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow,DailyTempRange,YearMonth
0,2008-07-02,Adelaide,12.7,15.8,0.8,1.4,7.8,SW,35.0,SSW,...,1022.4,1022.6,,,13.7,15.5,0.0,0.0,3.1,2008-07
1,2008-07-03,Adelaide,6.2,15.1,0.0,1.8,2.1,W,20.0,NNE,...,1027.8,1026.5,,,9.3,13.9,0.0,0.0,8.9,2008-07
2,2008-07-04,Adelaide,5.3,15.9,0.0,1.4,8.0,NNE,30.0,NNE,...,1028.7,1025.6,,,10.2,15.3,0.0,0.0,10.6,2008-07
3,2008-07-06,Adelaide,11.3,15.7,0.0,1.4,1.5,NNW,52.0,NNE,...,1019.5,1016.2,,,13.0,14.4,0.0,1.0,4.4,2008-07
4,2008-07-07,Adelaide,7.6,11.2,16.2,4.6,1.1,WSW,46.0,WNW,...,1015.9,1017.9,,,9.8,9.3,1.0,1.0,3.6,2008-07


In [26]:
weather.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 142146 entries, 0 to 142145
Data columns (total 25 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   Date            142146 non-null  object 
 1   Location        142146 non-null  object 
 2   MinTemp         142146 non-null  float64
 3   MaxTemp         142146 non-null  float64
 4   Rainfall        142146 non-null  float64
 5   Evaporation     142146 non-null  float64
 6   Sunshine        142146 non-null  float64
 7   WindGustDir     132820 non-null  object 
 8   WindGustSpeed   142146 non-null  float64
 9   WindDir9am      132135 non-null  object 
 10  WindDir3pm      138370 non-null  object 
 11  WindSpeed9am    142146 non-null  float64
 12  WindSpeed3pm    142146 non-null  float64
 13  Humidity9am     142146 non-null  float64
 14  Humidity3pm     142146 non-null  float64
 15  Pressure9am     142146 non-null  float64
 16  Pressure3pm     142146 non-null  float64
 17  Cloud9am  

#### Conversion de Date

In [27]:
# weather["Date"] = pd.to_datetime(weather["Date"])

#### Conversión de RainToday y RainTomorrow a Boolean

In [28]:
weather["RainToday"] = weather["RainToday"].apply(lambda x: True if x == 1.0 else False)
weather["RainTomorrow"] = weather["RainTomorrow"].apply(lambda x: True if x == 1.0 else False)


#### Conversion de variables categoricas

In [29]:
variables_categoricas = ["Location", "WindGustDir", "WindDir3pm", "WindDir9am"]
weather_with_dummies = pd.get_dummies(weather, columns=variables_categoricas, drop_first=True)

### Resultados post conversion de variables

In [30]:
weather_with_dummies.head(5)

Unnamed: 0,Date,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,...,WindDir9am_NNW,WindDir9am_NW,WindDir9am_S,WindDir9am_SE,WindDir9am_SSE,WindDir9am_SSW,WindDir9am_SW,WindDir9am_W,WindDir9am_WNW,WindDir9am_WSW
0,2008-07-02,12.7,15.8,0.8,1.4,7.8,35.0,13.0,15.0,75.0,...,False,False,False,False,False,True,False,False,False,False
1,2008-07-03,6.2,15.1,0.0,1.8,2.1,20.0,2.0,11.0,81.0,...,False,False,False,False,False,False,False,False,False,False
2,2008-07-04,5.3,15.9,0.0,1.4,8.0,30.0,6.0,13.0,71.0,...,False,False,False,False,False,False,False,False,False,False
3,2008-07-06,11.3,15.7,0.0,1.4,1.5,52.0,15.0,22.0,62.0,...,False,False,False,False,False,False,False,False,False,False
4,2008-07-07,7.6,11.2,16.2,4.6,1.1,46.0,17.0,13.0,83.0,...,False,False,False,False,False,False,False,False,True,False


In [31]:
weather.shape

(142146, 25)

In [32]:
weather_with_dummies.shape

(142146, 114)

In [33]:
weather_with_dummies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 142146 entries, 0 to 142145
Columns: 114 entries, Date to WindDir9am_WSW
dtypes: bool(95), float64(17), object(2)
memory usage: 33.5+ MB


### Separacion de Dataset

In [34]:
weather_with_dummies.drop(columns="Date", inplace=True)
weather_with_dummies.drop(columns="YearMonth", inplace=True)

X = weather_with_dummies.drop(columns="RainTomorrow")
y = weather_with_dummies["RainTomorrow"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

#### Escalado de variables numericas

In [35]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Aplicar la misma transformación al dataset de prueba
X_test_scaled = scaler.transform(X_test)

### Modelado y Evaluacion

#### Determine dos métricas de evaluación que considere importante medir para este problema. Justifique su elección.

Para predecir si va a llover mañana, las métricas de evaluación más relevantes podrían ser las siguientes:

Exactitud (Accuracy):
La exactitud es una métrica básica que mide la proporción de predicciones correctas sobre el total de predicciones realizadas. Es importante en este contexto porque proporciona una visión general de qué tan bien el modelo está prediciendo si va a llover o no. Sin embargo, en problemas de clasificación binaria con clases desbalanceadas, la exactitud por sí sola puede no ser suficiente.

F1-Score:
El F1-Score es la media armónica de la precisión y la sensibilidad (recall). Es especialmente útil cuando se trata de datos desbalanceados, como podría ser el caso de predecir si va a llover, donde puede haber más días sin lluvia que con lluvia. El F1-Score ofrece un balance entre la precisión y la capacidad del modelo de identificar correctamente los días que realmente van a llover.

Estas métricas juntas te permitirán evaluar no solo la capacidad general del modelo, sino también cómo maneja los falsos positivos y negativos, que son críticos en la toma de decisiones basadas en predicciones de lluvia.

#### Implemente un modelo de base usando como predicción que determine que llueve mañana si hoy llueve, y si hoy no llueve mañana no va a llover.


In [36]:
#TODO: Chequera que este bien hacer esto con weather dummys
# Aplicar la lógica de predicción: Si llueve hoy, predice que lloverá mañana
weather_with_dummies['BasePrediction'] = weather_with_dummies['RainToday']

# Calcular la exactitud del modelo de base
accuracy = (weather_with_dummies['BasePrediction'] == weather_with_dummies['RainTomorrow']).mean()

print(f"Exactitud del modelo de base: {accuracy:.2f}")


Exactitud del modelo de base: 0.76


#### Implemente y entrene un modelo de k Nearest Neighbors (kNN) para predecir si va a llover mañana. Utilice la distancia euclidiana como medida de distancia entre vecinos.Ajuste el parámetro k utilizando técnicas como la validación cruzada (5 folds) para optimizar el rendimiento del modelo (utilice como función de optimización una de las métricas definidas en el punto anterior).

In [39]:

# Checkear dimensiones
print(f"Dimensiones de X_train_scaled: {X_train_scaled.shape}")
print(f"Longitud de y_train: {len(y_train)}")
# Definir el clasificador kNN
knn = KNeighborsClassifier(metric='euclidean')

# Definir los parámetros de la búsqueda en grilla
param_grid = {'n_neighbors': list(range(1, 21))}  # Probamos k entre 1 y 20

# Definir el F1-Score como la métrica a optimizar
f1_scorer = make_scorer(f1_score)

# Configurar la validación cruzada con GridSearchCV
grid_search = GridSearchCV(knn, param_grid, cv=5, scoring=f1_scorer)
grid_search.fit(X_train_scaled, y_train)

# Obtener los mejores parámetros y el mejor score
best_k = grid_search.best_params_['n_neighbors']
best_f1_score = grid_search.best_score_

print(f"Mejor valor de k: {best_k}")
print(f"Mejor F1-Score obtenido en validación cruzada: {best_f1_score:.2f}")

# Entrenar el modelo final con el mejor valor de k
best_knn = KNeighborsClassifier(n_neighbors=best_k, metric='euclidean')
best_knn.fit(X_train_scaled, y_train)

# Predecir usando el modelo entrenado
predictions = best_knn.predict(X_train_scaled)

# Calcular y mostrar la F1-Score en los datos de entrenamiento
final_f1_score = f1_score(y, predictions)
print(f"F1-Score en los datos de entrenamiento: {final_f1_score:.2f}")

Dimensiones de X_train_scaled: (99502, 111)
Longitud de y_train: 99502


ValueError: 
All the 100 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
100 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/joaquingc/Desktop/IOT/ML/CEIOT-ML-TP1/.venv/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/joaquingc/Desktop/IOT/ML/CEIOT-ML-TP1/.venv/lib/python3.9/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/joaquingc/Desktop/IOT/ML/CEIOT-ML-TP1/.venv/lib/python3.9/site-packages/sklearn/neighbors/_classification.py", line 238, in fit
    return self._fit(X, y)
  File "/Users/joaquingc/Desktop/IOT/ML/CEIOT-ML-TP1/.venv/lib/python3.9/site-packages/sklearn/neighbors/_base.py", line 475, in _fit
    X, y = self._validate_data(
  File "/Users/joaquingc/Desktop/IOT/ML/CEIOT-ML-TP1/.venv/lib/python3.9/site-packages/sklearn/base.py", line 650, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/Users/joaquingc/Desktop/IOT/ML/CEIOT-ML-TP1/.venv/lib/python3.9/site-packages/sklearn/utils/validation.py", line 1301, in check_X_y
    X = check_array(
  File "/Users/joaquingc/Desktop/IOT/ML/CEIOT-ML-TP1/.venv/lib/python3.9/site-packages/sklearn/utils/validation.py", line 1064, in check_array
    _assert_all_finite(
  File "/Users/joaquingc/Desktop/IOT/ML/CEIOT-ML-TP1/.venv/lib/python3.9/site-packages/sklearn/utils/validation.py", line 123, in _assert_all_finite
    _assert_all_finite_element_wise(
  File "/Users/joaquingc/Desktop/IOT/ML/CEIOT-ML-TP1/.venv/lib/python3.9/site-packages/sklearn/utils/validation.py", line 172, in _assert_all_finite_element_wise
    raise ValueError(msg_err)
ValueError: Input X contains NaN.
KNeighborsClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values
