## Librerías

In [2]:
# Data
import pandas as pd
import numpy as np
from datetime import datetime

# Preprocessing data
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# ML metrics
from sklearn.metrics import mean_squared_error, r2_score, confusion_matrix, accuracy_score, roc_curve, roc_auc_score
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score, auc

# Machine Learning Pipeline & process
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
import joblib

# Warnings
import warnings
warnings.simplefilter('ignore')

In [3]:
import tensorflow as tf
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from tensorflow.keras.models import Sequential, load_model, save_model
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical
from keras.models import load_model

In [3]:
# Mount Google Drive at /content/drive (Colab-only); the dataset is read from
# that mount point in a later cell.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Intro

In [4]:
# Load the raw weather table from the mounted Drive folder.
df_original = pd.read_csv(
    '/content/drive/MyDrive/weatherAUS.csv',
    sep=',',
    engine='python',
)

In [5]:
# Work on a copy of the raw frame restricted to the south-east coast stations.
ciudades_a_conservar = ['Sydney', 'SydneyAirport', 'Canberra', 'Melbourne', 'MelbourneAirport']
es_costa_sureste = df_original['Location'].isin(ciudades_a_conservar)
df = df_original.loc[es_costa_sureste].copy()
df['Location'] = df['Location'].astype('category')
# Count how many rows each retained station contributes.
Location_frec = df['Location'].value_counts()
Location_frec

Canberra            3435
Sydney              3343
Melbourne           3192
MelbourneAirport    3008
SydneyAirport       3008
Name: Location, dtype: int64

In [6]:
# Remove the 'Unnamed: 0' column (presumably the index written when the CSV
# was exported — verify against the source file).
df = df.drop(columns=['Unnamed: 0'])

In [7]:
# Discard rows where either rain indicator is missing.
df = df.dropna(subset=['RainToday', 'RainTomorrow'])

In [8]:
# Per-city mode imputation of the three wind-direction columns.
def _fill_with_mode(serie):
    # Replace the missing entries of one city's values with that city's first mode.
    return serie.fillna(serie.mode()[0])


for columna in ['WindGustDir', 'WindDir9am', 'WindDir3pm']:
    df[columna] = df.groupby('Location')[columna].transform(_fill_with_mode)

In [9]:
# 'Location' was only needed for the per-city imputation above; drop it now.
df = df.drop(columns=['Location'])

In [10]:
# Names of the numeric (float64) columns, in frame order.
num_cols = df.select_dtypes(include='float64').columns.tolist()
# Replace missing values with the median of the column that contains them.
# The filled block is assigned back to df explicitly: calling
# fillna(inplace=True) on a df[col] selection is chained assignment, which
# warns on recent pandas and leaves df untouched under Copy-on-Write.
# NOTE(review): the medians are computed over every row of df, and the
# train/test split happens in a later cell — consider deriving them from the
# training partition only.
df[num_cols] = df[num_cols].fillna(df[num_cols].median())

In [11]:
# Take an independent copy, then renumber the rows 0..n-1 (the old index is discarded).
df = df.copy()
df = df.reset_index(drop=True)

In [12]:
# Display (rows, columns) of the cleaned frame.
df.shape

(15036, 23)

In [13]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
pipe_train, pipe_test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

In [14]:
# Display the shapes of the train and test partitions.
pipe_train.shape, pipe_test.shape

((12028, 23), (3008, 23))

In [15]:
# Split each partition into features (X) and the two target columns (y).
columnas_objetivo = ['RainfallTomorrow', 'RainTomorrow']

X_pipe_train = pipe_train.drop(columns=columnas_objetivo).copy()
y_pipe_train = pipe_train[columnas_objetivo].copy()

X_pipe_test = pipe_test.drop(columns=columnas_objetivo).copy()
y_pipe_test = pipe_test[columnas_objetivo].copy()

In [16]:
# Display the shapes of the feature and target frames for both partitions.
X_pipe_train.shape, y_pipe_train.shape, X_pipe_test.shape, y_pipe_test.shape

((12028, 21), (12028, 2), (3008, 21), (3008, 2))

In [17]:
# One Series per task and partition: 'RainTomorrow' feeds the *_clf targets,
# 'RainfallTomorrow' feeds the *_reg targets.
y_pipe_train_clf, y_pipe_train_reg = (
    y_pipe_train[objetivo].copy() for objetivo in ('RainTomorrow', 'RainfallTomorrow')
)

y_pipe_test_clf, y_pipe_test_reg = (
    y_pipe_test[objetivo].copy() for objetivo in ('RainTomorrow', 'RainfallTomorrow')
)

In [18]:
# Display the shapes of the four target Series.
y_pipe_train_clf.shape, y_pipe_train_reg.shape, y_pipe_test_clf.shape, y_pipe_test_reg.shape

((12028,), (12028,), (3008,), (3008,))

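Extracción de fechas (+1): la columna `Date` se reemplaza por `Month_sin` y `Month_cos` (una columna neta más).

Recategorización de coordenadas (+3): cada dirección de viento (`WindGustDir`, `WindDir9am`, `WindDir3pm`) se reemplaza por su seno y su coseno (tres columnas netas más).

Recategorización Yes/No: `RainToday` pasa de 'No'/'Yes' a 0/1.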

## Clases

In [15]:
class DateExtraction(BaseEstimator, TransformerMixin):
    """Replace the 'Date' column with a cyclical encoding of its month.

    ``transform`` adds ``Month_sin`` and ``Month_cos`` (sine and cosine of
    ``2*pi*month/12``, rounded to 5 decimals) and removes ``Date``.
    The input frame is never modified; a new DataFrame is returned.
    """

    def fit(self, X, y=None):
        """No state is learned; returns ``self`` for pipeline compatibility."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with 'Date' replaced by Month_sin / Month_cos.

        Parameters
        ----------
        X : pandas.DataFrame or array-like
            Must contain a 'Date' column. Non-DataFrame input is wrapped in a
            DataFrame whose single column is named 'Date'.

        Returns
        -------
        pandas.DataFrame
            The remaining columns followed by 'Month_sin' and 'Month_cos'.
        """
        if isinstance(X, pd.DataFrame):
            # Work on a copy so the caller's frame keeps its original 'Date' column.
            X = X.copy()
        else:
            X = pd.DataFrame(X, columns=['Date'])

        # Month number (1-12) mapped onto the unit circle.
        month = pd.to_datetime(X['Date']).dt.month
        angle = 2 * np.pi * month / 12
        X['Month_sin'] = np.sin(angle).round(5)
        X['Month_cos'] = np.cos(angle).round(5)

        # The raw date is no longer needed once the month has been encoded.
        return X.drop(columns=['Date'])

In [4]:
class CoordRecat(BaseEstimator, TransformerMixin):
    """Encode the three wind-direction columns as sine/cosine pairs.

    Each compass label in 'WindGustDir', 'WindDir9am' and 'WindDir3pm' is
    mapped to an angle in degrees (E = 0, N = 90, W = 180, S = 270) and
    replaced by ``<col>_sin`` and ``<col>_cos`` rounded to 5 decimals.
    Labels that are not in the mapping become NaN. The input frame is never
    modified; a new DataFrame is returned.
    """

    def fit(self, X, y=None):
        """No state is learned; returns ``self`` for pipeline compatibility."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the wind-direction columns replaced.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain 'WindGustDir', 'WindDir9am' and 'WindDir3pm'.

        Returns
        -------
        pandas.DataFrame
            The remaining columns followed by the six ``_sin`` / ``_cos`` columns.
        """
        mapeo_coord = {
            'E': 0, 'ENE': 22.5, 'NE': 45, 'NNE': 67.5,
            'N': 90, 'NNW': 112.5, 'NW': 135, 'WNW': 157.5,
            'W': 180, 'WSW': 202.5, 'SW': 225, 'SSW': 247.5,
            'S': 270, 'SSE': 292.5, 'SE': 315, 'ESE': 337.5,
        }
        wind_columns = ['WindGustDir', 'WindDir9am', 'WindDir3pm']

        # Work on a copy so the caller's frame keeps its original columns.
        X = X.copy()
        for col in wind_columns:
            # astype(float) keeps the angle numeric even when the source column
            # uses a categorical dtype.
            radians = np.deg2rad(X[col].map(mapeo_coord).astype(float))
            X[f'{col}_sin'] = np.sin(radians).round(5)
            X[f'{col}_cos'] = np.cos(radians).round(5)

        # Only the sin/cos encodings are kept.
        return X.drop(columns=wind_columns)

In [5]:
class YesNoRecat(BaseEstimator, TransformerMixin):
    """Recode 'RainToday' from 'No'/'Yes' to 0.0/1.0.

    Any other value becomes NaN. The input frame is never modified; a new
    DataFrame is returned.
    """

    def fit(self, X, y=None):
        """No state is learned; returns ``self`` for pipeline compatibility."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` whose 'RainToday' column is float 0/1."""
        # Work on a copy so the caller's frame keeps its original labels.
        X = X.copy()
        X['RainToday'] = X['RainToday'].map({'No': 0, 'Yes': 1}).astype(float)
        return X

In [6]:
class StandardScalerTransformer(BaseEstimator, TransformerMixin):
    """Standardise every column of a DataFrame while keeping it a DataFrame.

    ``fit`` records the column labels of the training frame and fits a
    ``StandardScaler`` on all of them; ``transform`` applies that scaler to
    the same columns. The input frame is never modified.
    """

    def __init__(self):
        self.columns_to_scale = None  # column labels seen during fit
        self.scaler = StandardScaler()

    def fit(self, X, y=None):
        """Fit the wrapped scaler on all columns of ``X`` and return ``self``."""
        self.columns_to_scale = X.columns
        self.scaler.fit(X)
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the fitted columns standardised."""
        # Work on a copy so the caller's frame keeps its unscaled values.
        X = X.copy()
        X[self.columns_to_scale] = self.scaler.transform(X[self.columns_to_scale])

        return X

In [88]:
class ManualStandardScaler(BaseEstimator, TransformerMixin):
    """Standardise a DataFrame with the statistics of a previously saved scaler.

    ``fit`` does not learn from the data: it loads the scaler stored at
    ``scaler_path`` and keeps its ``mean_`` and ``scale_``. ``transform``
    applies ``(x - mean_) / scale_`` to every column except those listed in
    ``exclude_columns``, which are appended unchanged at the end of the result.

    The statistics are applied by position, so the columns to scale must come
    in the same order the saved scaler was fitted with.
    """

    def __init__(self, scaler_path='scaler_model_25.pkl'):
        self.scaler_path = scaler_path
        self.mean_ = None
        self.scale_ = None
        self.exclude_columns = ['RainToday']

    def fit(self, X, y=None):
        """Load the saved scaler's statistics; ``X`` and ``y`` are not used."""
        # NOTE: joblib.load unpickles the file, so scaler_path must point to a
        # trusted artefact.
        scaler = joblib.load(self.scaler_path)
        self.mean_ = scaler.mean_
        self.scale_ = scaler.scale_
        return self

    def transform(self, X):
        """Return a new DataFrame with the non-excluded columns standardised.

        Raises
        ------
        ValueError
            If ``fit`` has not been called, or if the number of columns to
            scale differs from the number of statistics in the saved scaler.
        """
        if self.mean_ is None or self.scale_ is None:
            raise ValueError("ManualStandardScaler must be fitted before calling transform.")

        # Every column of X except the excluded ones gets scaled.
        columns_to_scale = [col for col in X.columns if col not in self.exclude_columns]

        # The saved statistics are positional: a length mismatch means X does
        # not have the layout the scaler was fitted on.
        if len(columns_to_scale) != len(self.mean_):
            raise ValueError(
                f"Expected {len(self.mean_)} columns to scale, got {len(columns_to_scale)}."
            )

        # Manual standardisation; this builds a new frame, X is left untouched.
        X_scaled = (X[columns_to_scale] - self.mean_) / self.scale_

        # Excluded columns are carried over unchanged.
        X_scaled[self.exclude_columns] = X[self.exclude_columns]

        return X_scaled

In [25]:
# class ClassifierModel(BaseEstimator, TransformerMixin):
#     def __init__(self, model=None):
#         self.model = model

#     def fit(self, X, y):
#         # Obtener las dimensiones de entrada
#         input_shape = X.shape[1]

#         # Definir la arquitectura del modelo
#         self.model = tf.keras.Sequential([
#             tf.keras.layers.Dense(128, activation='relu', input_shape=(input_shape,)),
#             tf.keras.layers.Dropout(0.5),
#             tf.keras.layers.Dense(64, activation='relu'),
#             tf.keras.layers.Dropout(0.3),
#             tf.keras.layers.Dense(1, activation='sigmoid')
#         ])

#         # Compilar el modelo
#         self.model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['mean_squared_error'])

#         # Convertir las etiquetas a un formato adecuado para el modelo
#         y = np.array(y).reshape(-1, 1)

#         # Entrenar el modelo
#         self.model.fit(X, y, epochs=10, batch_size=32, validation_split=0.2)

#         return self

#     def transform(self, X):
#         # Convertir a float32
#         X = X.astype('float32')

#         # Devolver las predicciones del modelo
#         return self.model.predict(X)

In [8]:
class ClassifierModel(BaseEstimator, TransformerMixin):
    """Pipeline step that serves predictions from a saved model file.

    The model at ``model_path`` is loaded with ``load_model`` the first time
    a prediction is requested and then reused.
    """

    def __init__(self, model_path='nn_clf_model.h5'):
        self.model = None
        self.model_path = model_path

    def fit(self, X, y=None):
        """Nothing is trained here; returns ``self``."""
        return self

    def transform(self, X):
        """Return ``model.predict(X)``, loading the model on first use."""
        modelo = self.model
        if modelo is None:
            modelo = self.model = load_model(self.model_path)
        return modelo.predict(X)

    def predict(self, X):
        """Alias of ``transform``."""
        return self.transform(X)

In [9]:
class RegressorModel(BaseEstimator, TransformerMixin):
    """Pipeline step that serves predictions from a saved model file.

    The model at ``model_path`` is loaded with ``load_model`` the first time
    a prediction is requested and then reused.
    """

    def __init__(self, model_path='nn_reg_model.h5'):
        self.model = None
        self.model_path = model_path

    def fit(self, X, y=None):
        """Nothing is trained here; returns ``self``."""
        return self

    def transform(self, X):
        """Return ``model.predict(X)``, loading the model on first use."""
        modelo = self.model
        if modelo is None:
            modelo = self.model = load_model(self.model_path)
        reg_pred = modelo.predict(X)
        return reg_pred

    def predict(self, X):
        """Alias of ``transform``."""
        return self.transform(X)


In [89]:
# Stand-alone instances of the preprocessing steps, applied one by one to the
# evaluation sample further down.
date_extraction = DateExtraction()
coord_recat = CoordRecat()
yes_no_recat = YesNoRecat()
# standard_scaler_transf = StandardScalerTransformer()
# Scaler that reuses the statistics stored in its default scaler_path file.
manual_scaler = ManualStandardScaler()

## Pipeline

In [162]:
# Inference pipeline: feature engineering -> scaling -> saved classifier.
# The scaling step uses ManualStandardScaler, the same scaler the step-by-step
# evaluation below applies: it reuses the persisted statistics and leaves
# 'RainToday' unscaled as the last column. StandardScalerTransformer would
# instead have to be re-fitted on whatever data the pipeline receives and
# would scale 'RainToday' in place, giving the saved model a different layout.
# fit() must still be called once so ManualStandardScaler loads its statistics.
pipe_clf = Pipeline([
    ('date_extraction', DateExtraction()),
    ('coord_recat', CoordRecat()),
    ('yesno_recat', YesNoRecat()),
    ('standard_scaler', ManualStandardScaler()),
    ('classifier_model', ClassifierModel()),
])
pipe_clf

## Fit Transform - Train

In [127]:
# Aplica clase al conjunto de entrenamiento
# X_pipe_train_transformed = date_extraction.fit_transform(X_pipe_train)
# X_pipe_train_transformed = coord_recat.fit_transform(X_pipe_train_transformed)
# X_pipe_train_transformed = yes_no_recat.fit_transform(X_pipe_train_transformed)
# X_pipe_train_transformed = standard_scaler_transf.fit_transform(X_pipe_train_transformed)
# X_pipe_train_transformed = manual_scaler.fit_transform(X_pipe_train_transformed)

# Imprime el DataFrame resultante
# X_pipe_train_transformed

## Test

In [90]:
# Input sample to score: 21 features, one value each.
observacion = {
    'Date': '2023-01-17',
    'MinTemp': 20.00,
    'MaxTemp': 25.00,
    'Rainfall': 20.00,
    'Evaporation': 2.20,
    'Sunshine': 1.00,
    'WindGustSpeed': 45.00,
    'WindSpeed9am': 8.00,
    'WindSpeed3pm': 5.00,
    'Humidity9am': 35.00,
    'Humidity3pm': 95.00,
    'Pressure9am': 1015.00,
    'Pressure3pm': 1000.00,
    'Cloud9am': 3.00,
    'Cloud3pm': 9.00,
    'Temp9am': 20.00,
    'Temp3pm': 25.00,
    'WindGustDir': 'NW',
    'WindDir9am': 'NNW',
    'WindDir3pm': 'NW',
    'RainToday': 'Yes',
}
# Wrap each value in a one-element list so the dict describes a one-row table.
data = {nombre: [valor] for nombre, valor in observacion.items()}

In [108]:
# One-row frame holding the evaluation sample (dict keys become columns).
df_eval = pd.DataFrame.from_dict(data)

In [109]:
# Display (rows, columns) of the evaluation frame.
df_eval.shape

(1, 21)

In [110]:
# Run the evaluation sample through the preprocessing steps, in order.
# Start from a copy: DateExtraction.transform writes into the frame it is
# given, so passing df_eval directly would alter it and make the shape shown
# above stale.
# fit_transform learns nothing from this sample: the first three steps have a
# no-op fit, and manual_scaler.fit only loads the statistics saved on disk.
X_eval_transformed = df_eval.copy()
for paso in (date_extraction, coord_recat, yes_no_recat, manual_scaler):
    X_eval_transformed = paso.fit_transform(X_eval_transformed)
# X_pipe_train_transformed = standard_scaler_transf.fit_transform(X_pipe_train_transformed)

# Display the resulting DataFrame
X_eval_transformed

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,...,Temp3pm,Month_sin,Month_cos,WindGustDir_sin,WindGustDir_cos,WindDir9am_sin,WindDir9am_cos,WindDir3pm_sin,WindDir3pm_cos,RainToday
0,1.425319,0.527723,2.390534,-0.901967,-1.670494,0.058533,-0.827234,-1.767735,-2.170778,2.468048,...,0.806024,0.705769,1.24297,0.906032,-0.837132,1.088993,-0.178166,0.976239,-1.070635,1.0


## Predicciones

### Predicción de clasificación

In [111]:
# Classification prediction: wrapper around the model stored at 'nn_clf_model.h5'
# (the file is only read when the first prediction is requested).
classifier_model = ClassifierModel(model_path='nn_clf_model.h5')

In [112]:
# Score the preprocessed evaluation sample with the classifier.
X_eval = X_eval_transformed
clf_predictions = classifier_model.transform(X_eval)



In [113]:
# Display the raw classifier output.
clf_predictions

array([[0.9993719]], dtype=float32)

In [116]:
# Decision threshold: outputs strictly above it are labelled 1, the rest 0.
threshold = 0.5
supera_umbral = np.greater(clf_predictions, threshold)
binary_predictions = supera_umbral.astype(int)

In [117]:
# Display the 0/1 label of the first (and only) evaluation row.
binary_predictions[0][0]

1

In [118]:
# Human-readable label for the first prediction.
if binary_predictions[0][0] == 1:
    weather_label = "llueve"
else:
    weather_label = "no llueve"

In [120]:
# Report the label together with the classifier output expressed as a percentage.
print("Mañana", weather_label, "- Probabilidad:", clf_predictions[0][0]*100)

Mañana llueve - Probabilidad: 99.9371886253357


### Predicción de regresión

In [122]:
# Regression prediction: wrapper around the model stored at 'nn_reg_model.h5'
# (the file is only read when the first prediction is requested).
regressor_model = RegressorModel(model_path='nn_reg_model.h5')

In [123]:
# Score the preprocessed evaluation sample with the regressor.
X_eval = X_eval_transformed
reg_predictions = regressor_model.transform(X_eval)



In [124]:
# Display the regressor output for the first (and only) evaluation row; it is
# converted back to the target's original scale in the next cell.
reg_predictions[0][0]

0.83832026

In [125]:
# Statistics used to undo the standardisation of the regression target.
# NOTE(review): these are hard-coded placeholders — replace them with the
# actual mean and standard deviation of the target the regressor was trained on.
rainfall_tomorrow_mean = 2.361242
rainfall_tomorrow_std = 8.479084

# Undo the standardisation manually: value = z * std + mean.
# The result is floored at 0 because de-standardising a low model output can
# produce a negative amount, which is not a valid rainfall. max(value, 0.0)
# keeps a NaN prediction visible instead of turning it into 0.
original_prediction = max(
    (reg_predictions[0][0] * rainfall_tomorrow_std) + rainfall_tomorrow_mean,
    0.0,
)

In [126]:
# Display the rainfall prediction on the target's original scale.
original_prediction

9.469429863416671