# Notebook para análise exploratória dos dados climáticos

In [37]:
import pandas as pd
import numpy as np
from typing import Dict
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score
from sklearn.model_selection import cross_val_score
import pprint

In [29]:
# Clean the raw weather data
def clean_data(df: pd.DataFrame) -> pd.DataFrame:
    """Drop a fixed set of columns and shorten the names of the ones kept.

    The dropped columns are the ``_in`` / ``_fahrenheit`` / ``_mph`` /
    ``_miles`` variants listed below. The input frame is left untouched; a
    new DataFrame is returned.

    Raises:
        KeyError: if any column scheduled for removal is missing
            (pandas' default behaviour for ``DataFrame.drop``).
    """
    redundant_columns = [
        'pressure_in', 'temperature_fahrenheit', 'wind_mph',
        'precip_in', 'feels_like_fahrenheit', 'visibility_miles', 'gust_mph',
    ]
    short_names = {
        'temperature_celsius': 'temperature',
        'pressure_mb': 'pressure',
        'wind_kph': 'wind_speed',
        'precip_mm': 'precipitation',
        'last_updated': 'recorded_at',
    }
    return df.drop(columns=redundant_columns).rename(columns=short_names)

In [30]:

# Feature engineering
def feature_engineering(df: pd.DataFrame) -> pd.DataFrame:
    """Derive model features and the ``will_rain`` target from cleaned data.

    Expects the columns ``recorded_at``, ``temperature``, ``humidity``,
    ``pressure``, ``wind_speed`` and ``precipitation``.

    The work is done on a copy, so the caller's DataFrame is not modified
    (and no SettingWithCopyWarning is triggered when a slice is passed in).

    Returns:
        A new DataFrame with ``recorded_at`` parsed to datetime plus the
        added columns ``hour``, ``day_of_week``, ``month``, ``dew_point``,
        ``pressure_tendency``, ``temp_humidity_interaction``,
        ``wind_pressure_interaction`` and ``will_rain``.
    """
    df = df.copy()

    # Calendar features extracted from the observation timestamp.
    df['recorded_at'] = pd.to_datetime(df['recorded_at'])
    df['hour'] = df['recorded_at'].dt.hour
    df['day_of_week'] = df['recorded_at'].dt.dayofweek
    df['month'] = df['recorded_at'].dt.month

    df['dew_point'] = _calculate_dew_point(df['temperature'], df['humidity'])

    # Row-to-row pressure difference; the first row has no predecessor and
    # gets 0.
    # NOTE(review): the diff follows the current row order — if rows mix
    # locations or are not time-sorted this is not a per-station tendency;
    # confirm against the dataset layout.
    df['pressure_tendency'] = df['pressure'].diff().fillna(0)

    # Simple multiplicative interaction terms.
    df['temp_humidity_interaction'] = df['temperature'] * df['humidity']
    df['wind_pressure_interaction'] = df['wind_speed'] * df['pressure']

    # Binary target: 1 when any precipitation was recorded, else 0.
    df['will_rain'] = (df['precipitation'] > 0).astype(int)
    return df

# Função auxiliar para calcular o ponto de orvalho
def _calculate_dew_point(temp: pd.Series, humidity: pd.Series) -> pd.Series:
    """Calcula o ponto de orvalho."""
    a, b = 17.27, 237.7
    alpha = ((a * temp) / (b + temp)) + np.log(humidity / 100)
    return (b * alpha) / (a - alpha)

# Select the model input features
def prepare_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return only the model feature columns that exist in ``df``.

    Columns are returned in the fixed order listed below; any that are
    missing from ``df`` are silently skipped rather than raising.
    """
    candidate_columns = (
        'temperature', 'humidity', 'pressure', 'wind_speed',
        'dew_point', 'pressure_tendency',
        'temp_humidity_interaction', 'wind_pressure_interaction',
        'hour', 'day_of_week', 'month',
    )
    available = [name for name in candidate_columns if name in df.columns]
    return df[available]

In [31]:
def _evaluate_model(model, X_test: pd.DataFrame, y_test: pd.Series) -> Dict:
    """Evaluate a fitted binary classifier on held-out data.

    Args:
        model: fitted estimator exposing ``predict`` and ``predict_proba``.
        X_test: feature matrix to evaluate on.
        y_test: true labels for ``X_test``; the positive class must appear
            in the classification report under the key ``'1'``.

    Returns:
        Dict with ``auc``, ``cv_mean``, ``cv_std``, ``precision``,
        ``recall``, ``f1_score`` (all three for class ``'1'``), ``accuracy``
        and ``confusion_matrix`` (nested lists). Scalar metrics are plain
        Python floats so the dict prints and serializes uniformly.
    """
    # Predictions; column 1 of predict_proba is the second class's probability.
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]

    auc = roc_auc_score(y_test, y_pred_proba)

    # NOTE(review): this cross-validates on the *test* split only, i.e. the
    # estimator is re-fitted on folds of X_test rather than on training data.
    # Kept as-is because the function does not receive the training set —
    # confirm whether that is the intended protocol.
    cv_scores = cross_val_score(model, X_test, y_test, cv=5, scoring='roc_auc')

    class_report = classification_report(y_test, y_pred, output_dict=True)
    positive_report = class_report['1']

    conf_matrix = confusion_matrix(y_test, y_pred)

    return {
        # float(...) strips the numpy scalar wrapper (np.float64) so every
        # scalar in the result is a built-in float.
        'auc': float(auc),
        'cv_mean': float(cv_scores.mean()),
        'cv_std': float(cv_scores.std()),
        'precision': positive_report['precision'],
        'recall': positive_report['recall'],
        'f1_score': positive_report['f1-score'],
        'accuracy': class_report['accuracy'],
        'confusion_matrix': conf_matrix.tolist(),
    }

In [32]:

# Train the model
def train_model(df: pd.DataFrame, target_column: str = 'will_rain') -> Dict:
    """Fit a scaled random-forest classifier and return its evaluation metrics.

    Features come from ``prepare_features(df)`` and the target from
    ``df[target_column]``. The data is split 80/20 (stratified on the
    target, seed 42); the pipeline is fitted on the training part and
    scored on the held-out part by ``_evaluate_model``. Only the metrics
    dict is returned — the fitted pipeline itself is not.
    """
    features = prepare_features(df)
    target = df[target_column]

    split = train_test_split(
        features, target, test_size=0.2, random_state=42, stratify=target
    )
    X_train, X_test, y_train, y_test = split

    forest = RandomForestClassifier(
        n_estimators=100, max_depth=10, random_state=42, n_jobs=-1
    )
    steps = [('scaler', StandardScaler()), ('classifier', forest)]
    pipeline = Pipeline(steps)
    pipeline.fit(X_train, y_train)

    # Evaluate on the held-out split
    return _evaluate_model(pipeline, X_test, y_test)


In [None]:
# End-to-end run: load the raw CSV, clean it, build features, train, report.
# NOTE(review): the input path is absolute and machine-specific.
df = pd.read_csv('/home/nicholas/projects/IOT/data/raw/GlobalWeatherRepository.csv')
# Clean the data
df_clean = clean_data(df)

# Apply feature engineering
df_features = feature_engineering(df_clean)

# Train the model; `results` is the metrics dict returned by train_model
results = train_model(df_features)

# Pretty-print every metric in the dict (not only accuracy)
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(results)

{   'accuracy': 0.7760227547809248,
    'auc': 0.840698049704051,
    'confusion_matrix': [[9549, 1413], [2288, 3274]],
    'cv_mean': np.float64(0.8347517801209173),
    'cv_std': np.float64(0.009116744803299473),
    'f1_score': 0.6388915991804078,
    'precision': 0.6985278429699168,
    'recall': 0.5886371808701906}


In [34]:
 
def get_feature_importance(model, df) -> pd.DataFrame:
    """Return the classifier's feature importances, highest first.

    Args:
        model: fitted pipeline whose ``named_steps['classifier']`` exposes
            ``feature_importances_``.
        df: source of the feature names. If the object has a
            ``feature_columns`` attribute it is used; otherwise the names
            are taken from ``df.columns``, so ``df`` should then be the
            feature matrix the model was fitted on.

    Returns:
        DataFrame with columns ``feature`` and ``importance``, sorted by
        ``importance`` in descending order.

    Raises:
        ValueError: if the number of feature names differs from the number
            of importances.
    """
    # Importance vector of the random forest inside the pipeline
    importance = model.named_steps['classifier'].feature_importances_

    # A plain DataFrame has no `feature_columns` attribute, so fall back to
    # its column labels instead of failing with AttributeError.
    feature_names = getattr(df, 'feature_columns', None)
    if feature_names is None:
        feature_names = df.columns
    feature_names = list(feature_names)

    if len(feature_names) != len(importance):
        raise ValueError(
            f"Got {len(feature_names)} feature names but "
            f"{len(importance)} importances; pass the feature matrix "
            "the model was trained on."
        )

    importance_df = pd.DataFrame({
        'feature': feature_names,
        'importance': importance,
    }).sort_values('importance', ascending=False)

    return importance_df