In [None]:
import os
import numpy as np
import pandas as pd
from google.cloud import bigquery
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix

# SQL for the NOAA GSOD public dataset: reads every yearly table whose suffix
# falls between '2015' and '2025' (wildcard table) and caps the result at
# 10,000,000 rows.
# NOTE(review): there is no ORDER BY, so which rows the LIMIT keeps is
# presumably not guaranteed to be the same between runs -- confirm if
# reproducibility of the sample matters.
QUERY = """
    SELECT 
        year, mo, da, 
        temp, dewp, slp, stp, wdsp, mxpsd, max, min, prcp, sndp, thunder
    FROM `bigquery-public-data.noaa_gsod.gsod*`
    WHERE _TABLE_SUFFIX BETWEEN '2015' AND '2025'
    LIMIT 10000000
"""

print("Descargando datos desde BigQuery...")

# Run the query under the "cc-proyect" project and materialise the full
# result set as a pandas DataFrame.
client = bigquery.Client(project="cc-proyect")
query_job = client.query(QUERY)
df = query_job.to_dataframe()

print("Limpiando datos...")

# The classification label is the raw `thunder` column, copied unchanged.
df['target'] = df['thunder']

# Per-column magic numbers that this script treats as "value missing".
# They are replaced by NaN below so they never reach the model as real
# measurements.
sentinels = {
    'temp': 9999.9,
    'dewp': 9999.9,
    'slp': 9999.9,
    'wdsp': 999.9,
    'sndp': 999.9,
    'prcp': 99.99
}

# Columns fed to the model.
features = ['temp', 'dewp', 'slp', 'wdsp', 'prcp','sndp']

for col, sentinel in sentinels.items():
    if col in df.columns:
        # Coerce to a numeric dtype first: if a column arrives as strings
        # (object dtype), `replace(<float>)` matches nothing and a value such
        # as '999.9' would later be cast to the number 999.9 and used as a
        # real measurement. `errors='coerce'` turns unparseable entries into
        # NaN; columns that are already float are left unchanged.
        # NOTE(review): believed to matter for `wdsp`, which appears to be
        # typed STRING in the public GSOD schema -- confirm against the table.
        numeric_col = pd.to_numeric(df[col], errors='coerce')
        df[col] = numeric_col.replace(sentinel, np.nan)

# For snow depth and precipitation a missing reading is treated as zero.
zero_fill_cols = ['sndp', 'prcp']
for col in zero_fill_cols:
    df[col] = df[col].fillna(0)

# Rows lacking any of these measurements are discarded. `prcp` can no longer
# be NaN at this point (it was zero-filled above), so in practice only
# temp / dewp / wdsp cause drops. `slp` is deliberately not listed: its gaps
# stay as NaN in X.
cols_to_drop_if_empty = ['temp', 'dewp','wdsp', 'prcp']

initial_rows = len(df)
df.dropna(subset=cols_to_drop_if_empty, inplace=True)
# Number of rows removed by the dropna above (kept for inspection).
dropped_rows = initial_rows - len(df)

# Design matrix and label vector.
X = df[features]
y = df['target']

# Hold out 20% of the rows for evaluation. The split is stratified on the
# label so both partitions keep the same class proportions, and seeded for
# repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Model stages, applied in order: fill remaining NaNs with the column mean,
# standardise each feature, then fit a class-weighted logistic regression.
mean_imputer = SimpleImputer(strategy='mean')
feature_scaler = StandardScaler()
classifier = LogisticRegression(
    class_weight='balanced',
    max_iter=1000,
    random_state=42,
)
pipeline = make_pipeline(mean_imputer, feature_scaler, classifier)

print("Entrenando Regresión Logística Offline...")
pipeline.fit(X_train, y_train)

# Report header.
banner = "=" * 40
print("\n" + banner)
print("RESULTADOS DEL MODELO OFFLINE")
print(banner)

# Hard class predictions for the report, and the probability assigned to the
# second class (column index 1) for the ROC AUC.
y_pred = pipeline.predict(X_test)
class_probabilities = pipeline.predict_proba(X_test)
y_prob = class_probabilities[:, 1]

roc_auc = roc_auc_score(y_test, y_prob)

print(f"ROC AUC Score: {roc_auc:.4f}")
print("-" * 40)
print("Reporte de Clasificación:")
report_text = classification_report(y_test, y_pred)
print(report_text)

Descargando datos desde BigQuery...
Limpiando datos...
Entrenando Regresión Logística Offline...

RESULTADOS DEL MODELO OFFLINE
ROC AUC Score: 0.8267
----------------------------------------
Reporte de Clasificación:
