# MODELOS

## FEATURES

precipitation,temp_max,temp_min, wind, humidity, pressure, solar_radiation, visibility & coludiness_id

IMPORTACION DE LIBRERIAS

In [9]:
# Bibliotecas estándar
import os
import joblib
import concurrent.futures
import re
import random
from datetime import datetime

# Bibliotecas científicas
import numpy as np

# Bibliotecas de análisis y modelado de datos
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.metrics import mean_absolute_error, mean_squared_error, classification_report, confusion_matrix, roc_auc_score, accuracy_score

# Librerias de modelos
from statsmodels.tsa.statespace.sarimax import SARIMAX
from xgboost import XGBRegressor
from prophet import Prophet
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

# Bibliotecas de visualización
from plotly import graph_objs as go
from prophet.plot import plot_plotly


ModuleNotFoundError: No module named 'statsmodels'

CARGA DE DATOS

In [None]:
df = pd.read_csv(r'.\Datos_Proyecto\observations_full.csv')

df['date'] = pd.to_datetime(df['date'])

df.set_index('date',inplace=True)

df = df.asfreq('D')

SARIMA

In [None]:
# Lista de variables para las que queremos hacer predicciones
variables = ['precipitation', 'temp_max', 'temp_min', 'wind', 'humidity', 'pressure', 'solar_radiation', 'visibility', 'weather_id', 'cloudiness_id']

# Ruta donde se guardarán los modelos entrenados
models_path = "sarima_models"
os.makedirs(models_path, exist_ok=True)  # Crear el directorio si no existe

# Función para ajustar y guardar el modelo si no existe, o cargarlo si ya está guardado
def fit_or_load_model(variable_name):
    model_filename = os.path.join(models_path, f"{variable_name}_sarima.pkl")
    
    if os.path.exists(model_filename):
        # Si el modelo ya existe, cargarlo
        sarima_model = joblib.load(model_filename)
        print(f"Modelo SARIMA para {variable_name} cargado desde {model_filename}")
    else:
        # Si el modelo no existe, ajustarlo y guardarlo
        model = SARIMAX(df[variable_name],
                        order=(1, 1, 1),
                        seasonal_order=(1, 1, 1, 12),
                        enforce_stationarity=False,
                        enforce_invertibility=False)
        
        sarima_model = model.fit(disp=False, method='powell')
        joblib.dump(sarima_model, model_filename, compress=4)
        print(f"Modelo SARIMA para {variable_name} ajustado y guardado en {model_filename}")
    
    return sarima_model

# Función para hacer predicciones usando un modelo ajustado o cargado
def forecast_with_model(variable_name):
    sarima_model = fit_or_load_model(variable_name)
    forecast = sarima_model.get_forecast(steps=31)
    forecast_mean = forecast.predicted_mean
    return forecast_mean

# Ejecutar las predicciones en paralelo usando ThreadPoolExecutor
with concurrent.futures.ThreadPoolExecutor() as executor:
    forecasts = list(executor.map(forecast_with_model, variables))

# Almacenar los resultados en un DataFrame para facilidad de visualización
predictions_df = pd.DataFrame({variables[i]: forecasts[i] for i in range(len(variables))})

predictions_df.to_csv(r'C:\Users\joant\OneDrive\Stucom\MasterIA\BigData\Projecte3_Meteorologia\Entregables\Code\predictions.csv')


RANDOM FOREST REGRESSOR

In [None]:
# Extract time-based features
df['date'] = pd.to_datetime(df['date'])
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day'] = df['date'].dt.day

# Drop the original date column
df = df.drop(columns=['date'])

# Select features and targets
features = ['precipitation', 'temp_max','temp_min', 'wind', 'humidity', 'pressure', 
            'solar_radiation', 'visibility', 'weather_id', 'estacion_id', 'cloudiness_id', 
            'year', 'month', 'day']

targets = ['precipitation', 'temp_max','temp_min', 'wind', 'humidity', 'pressure']

for target in targets:

    features_clean = [feature for feature in features if feature != target]

    # Separate features and target
    X = df[features_clean]
    y = df[target]

    # Split data into train and test sets (using time-based split for time series)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

    # Train a Random Forest model for prediction
    model = RandomForestRegressor(n_estimators=300, max_depth=15, min_samples_split=5, min_samples_leaf=4, random_state=42)
    model.fit(X_train, y_train)

    # Evaluate the model
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    print(f"\nModelo {target}")
    print(f"Mean Absolute Error (MAE): {mae}")
    print(f"Root Mean Squared Error (RMSE): {rmse}")

    # Save the model using joblib
    model_path = fr'D:\STUCOM\Master_IABD\Projecte3_Meteorologia\Entregables\Code\SVM\features_prediction\{target}_rfr_model.pkl'
    joblib.dump(model, model_path)

    print(f"Modelo {target} guardado en: {model_path} (intro)")

PROPHET

In [None]:
df = pd.read_csv(r'D:\STUCOM\Master_IABD\Projecte3_Meteorologia\Datos_Proyecto\observations_full.csv')

# fig = go.Figure()
# fig.add_trace(go.Scatter(x=df['date'], y=df['humidity'], name='humidity',line_color='red'))
# fig.layout.update(title_text='Time Series data with Rangeslider',xaxis_rangeslider_visible=True)
# fig.show()

targets = ['precipitation','temp_max','temp_min','wind','humidity','pressure','solar_radiation','visibility','cloudiness_id']

for target in targets:

    X = df[['date',target]]
    y = df[target]

    train_df = pd.DataFrame()
    train_df['ds'] = pd.to_datetime(X['date'])
    train_df['y']=y
    train_df.head(2)

    model = Prophet()
    model.fit(train_df)
    joblib.dump(model,fr"D:\STUCOM\Master_IABD\Projecte3_Meteorologia\Entregables\Code\Prophet\models\prophet_{target}.pkl")


    future = model.make_future_dataframe(periods=365)
    forecast = model.predict(future)
    forecast.to_csv(fr"D:\STUCOM\Master_IABD\Projecte3_Meteorologia\Entregables\Code\Prophet\Predictions_2025\predictions_{target}.csv")

    # fig1 = plot_plotly(model, forecast)
    # fig1.show()

    # #plot component wise forecast
    # fig2 = model.plot_components(forecast)
    # fig2.show()


In [None]:
# juntar las predicciones de todos los targets en un solo data frame
df_predictions = pd.DataFrame()

path = r'D:\STUCOM\Master_IABD\Projecte3_Meteorologia\Entregables\Code\Prophet\Predictions_2025'

_, _, files = next(os.walk(path))
    
for file in files:

    df = pd.read_csv(os.path.join(path,file))

    if 'cloudiness' in file:
        df['yhat'] = df['yhat'].round(0).astype(int)

    
    df_predictions[re.search(r'predictions_(.*?).csv', file).group(1)] = df['yhat']

df_predictions.insert(0,'date',df['ds'])

df_predictions = df_predictions.drop(df_predictions.index[:25000]).reset_index(drop=True)


df_predictions.to_csv(r'D:\STUCOM\Master_IABD\Projecte3_Meteorologia\Datos_Proyecto\predictions\predictions_Prophet.csv')

XGBOOSTING

In [None]:
# Load data
df = pd.read_csv(r"D:\STUCOM\Master_IABD\Projecte3_Meteorologia\Datos_Proyecto\observations_full.csv")

# Extract time-based features
df['date'] = pd.to_datetime(df['date'])
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day'] = df['date'].dt.day

# Drop the original date column
df = df.drop(columns=['date'])

# Select features and targets
features = ['year', 'month', 'day', 'estacion_id']
X = df[features]

targets = ['precipitation', 'temp_max','temp_min', 'wind', 'humidity', 'pressure','solar_radiation','visibility','cloudiness_id']

# Hyperparameter grid
param_grid = {
    'n_estimators': [500, 1000],
    'max_depth': [10, 20, 50],
    'min_child_weight': [5, 8],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.7, 0.8],
    'colsample_bytree': [0.9, 1.0]
}

for target in targets:
    # Separate features and target
    y = df[target]

    # Split data into train and test sets (using time-based split for time series)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

    # Initialize XGBoost model
    model_xgb = XGBRegressor(random_state=42)

    # Perform GridSearchCV to find the best hyperparameters
    grid_search = GridSearchCV(estimator=model_xgb, param_grid=param_grid, cv=3, scoring='neg_mean_absolute_error', verbose=1, n_jobs=-1)

    # Fit the grid search model
    grid_search.fit(X_train, y_train)

    # Best parameters from the grid search
    best_params = grid_search.best_params_
    print(f"\nMejores parámetros para el modelo {target}: {best_params}")

    # Refit the best model
    best_model = grid_search.best_estimator_

    # Evaluate the best model on the test data
    y_pred_xgb = best_model.predict(X_test)
    mae_xgb = mean_absolute_error(y_test, y_pred_xgb)
    rmse_xgb = np.sqrt(mean_squared_error(y_test, y_pred_xgb))

    print(f"\nModelo {target} - XGBoost (Con mejores parámetros)")
    print(f"Mean Absolute Error (MAE): {mae_xgb}")
    print(f"Root Mean Squared Error (RMSE): {rmse_xgb}")

    # Save the best XGBoost model
    model_xgb_path = fr'D:\STUCOM\Master_IABD\Projecte3_Meteorologia\Entregables\Code\XGBoost\{target}_xgb_model_best.pkl'
    joblib.dump(best_model, model_xgb_path)
    print(f"Modelo {target} guardado en: {model_xgb_path} (XGB - Mejores parámetros)")


## WEATHER_ID

SVM

In [None]:
# Select features and target
features = ['year', 'month', 'day', 'precipitation', 'temp_max', 'temp_min', 'wind', 'humidity', 'pressure', 'solar_radiation', 'visibility', 'cloudiness_id']
target = 'weather_id'

# Split data into train and test sets
X = df[features]
y = df[target]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Feature scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Handle class imbalance with SMOTE
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train_scaled, y_train)

# model = SVC(kernel='rbf', C=0.1, gamma='auto', class_weight='balanced', probability=True) Model 3
model = SVC(kernel='rbf', C=0.1, gamma='auto', probability=True) # Model 4


model.fit(X_train_res, y_train_res)

# Make predictions
y_pred = model.predict(X_test_scaled)

# Evaluate the model

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

# Classification Report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Confusion Matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# ROC AUC (for multi-class)
roc_auc = roc_auc_score(y_test, model.predict_proba(X_test_scaled), multi_class='ovr')
print(f"ROC AUC: {roc_auc}")

# Save the model using joblib
joblib.dump(model, r'D:\STUCOM\Master_IABD\Projecte3_Meteorologia\Entregables\Code\SVM\weather_id_svm_model.pkl')

RANDOM FOREST CLASSIFIER

In [None]:
# Select features and target
features = ['year', 'month', 'day', 'precipitation', 'temp_max', 'temp_min', 'wind', 'humidity', 'pressure', 'solar_radiation', 'visibility', 'cloudiness_id']
target = 'weather_id'

# Split data into train and test sets
X = df[features]
y = df[target]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Feature scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Handle class imbalance with SMOTE
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train_scaled, y_train)

# RANDOM FOREST

model_rf = RandomForestClassifier(class_weight='balanced', random_state=42)
model_rf.fit(X_train_res, y_train_res)
y_pred_rf = model_rf.predict(X_test_scaled)

print("Random Forest Classifier - Accuracy:", accuracy_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))

joblib.dump(model_rf,r'D:\STUCOM\Master_IABD\Projecte3_Meteorologia\Entregables\Code\RandomForest\weather_id_RF.pkl')

# PREDICCIONES

En las predicciones usare el modelo que mejores resultados ha dado respecto a sus predicciones.

Mejor modelo para features: **XGBoosting**

In [12]:
# Path to the models
path = r'.\..\Modelos\XGBoost'
modelos_xgb = os.listdir(path)

# Date range for predictions
initial_date = '2024-12-01'
final_date = '2025-12-31'

# Create a DataFrame for the date range
dates = pd.date_range(start=initial_date, end=final_date)
dates_df = pd.DataFrame({
    'year': dates.year,
    'month': dates.month,
    'day': dates.day
})

thresholds = {
    'precipitation':[13.94,1,-13.94],
    'temp_max':[9.21,1,-9.21],
    'temp_min':[5.62,1,-5.62],
    'wind':[2.24,1,-2.24],
    'humidity':[18.00,1,-18.00],
    'pressure':[17.67,1,-17.67],
    'solar_radiation':[305.49,1,-305.49],
    'visibility':[5.07,1,-5.07],
    'cloudiness_id':[0.52,1,-0.52]
}

# Initialize an empty DataFrame to store predictions
predictions_XGBoosting = pd.DataFrame(columns=[
    'date', 'precipitation', 'temp_max', 'temp_min', 'wind', 'humidity',
    'pressure', 'solar_radiation', 'visibility', 'cloudiness_id'
])

# Loop through each date
for _, row in dates_df.iterrows():

    year, month, day = row['year'], row['month'], row['day']

    data = {
        'date': datetime.strptime(f'{year}-{month}-{day}', "%Y-%m-%d"),
        'precipitation': None,
        'temp_max': None,
        'temp_min': None,
        'wind': None,
        'humidity': None,
        'pressure': None,
        'solar_radiation': None,
        'visibility': None,
        'cloudiness_id': None
    }

    # Loop through each model
    for modelo_xgb in modelos_xgb:

        # Load the model
        modelo = joblib.load(os.path.join(path, modelo_xgb))
        
        # Extract target name from the model filename
        target_name = modelo_xgb.replace("_xgb_model_best.pkl", "")

        # Create a 2D array with the input features
        feature_row = pd.DataFrame([[year, month, day]], columns=['year', 'month', 'day'])
        
        # Predict using the model
        prediccion = (modelo.predict(feature_row)[0]) - thresholds[target_name][random.randint(0,2)]

        if "cloudiness" in modelos_xgb:
            # Update the corresponding field in the data dictionary
            data[target_name] = prediccion.round(0).astype(int)
        else:
            data[target_name] = prediccion

    # Append the data dictionary to the DataFrame
    row_df = pd.DataFrame([data])
    predictions_XGBoosting = pd.concat([predictions_XGBoosting, row_df], ignore_index=True)


# Save predictions to a CSV file
output_path = r'.\..\weather_predictions.csv'
predictions_XGBoosting.to_csv(output_path, index=False)
print(f"Predictions saved to {output_path}")


  predictions_XGBoosting = pd.concat([predictions_XGBoosting, row_df], ignore_index=True)


Predictions saved to .\..\weather_predictions.csv


Mejor modelo para weather_id: **Random Forest Classifier**

In [8]:
# Cargar el DataFrame y convertir la columna de fecha
predictions = pd.read_csv(r'.\..\weather_predictions.csv')
predictions['date'] = pd.to_datetime(predictions['date'])

# Extraer año, mes y día de la fecha
predictions['year'] = predictions['date'].dt.year
predictions['month'] = predictions['date'].dt.month
predictions['day'] = predictions['date'].dt.day

# Cargar el modelo
modelo_RFC = joblib.load(r'.\..\Modelos\RandomForest\Classifier\weather_id_RFC.pkl')

# Seleccionar las características requeridas para el modelo
features = [
    'year', 'month', 'day', 'precipitation', 'temp_max', 'temp_min',
    'wind', 'humidity', 'pressure', 'solar_radiation', 'visibility', 'cloudiness_id'
]

# Crear la matriz de características para el modelo
X = predictions[features]

# Generar predicciones con el modelo
predictions['weather_id'] = modelo_RFC.predict(X).round(0).astype(int)

# Guardar el DataFrame actualizado
predictions.to_csv(r'.\..\weather_predictions.csv', index=False)


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
