In [89]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

In [90]:
class ClimateDataset(Dataset):
    def __init__(self, data, sequence_length=24):
        self.data = data
        self.sequence_length = sequence_length

        # Verificar se há mais de uma classe
        unique_classes = np.unique(data[:, -1])
        assert len(unique_classes) > 1, f"Dataset possui apenas uma classe: {unique_classes}"

    def __len__(self):
        return len(self.data) - self.sequence_length + 1

    def __getitem__(self, idx):
        x = self.data[idx : idx + self.sequence_length, :-1]  # Features
        y = self.data[idx + self.sequence_length - 1, -1]  # Target
        return torch.tensor(x, dtype=torch.float32), torch.tensor(y, dtype=torch.float32)


In [108]:
file_path = '../../dataset2001_2024.csv'
data = pd.read_csv(file_path)
data

Unnamed: 0,YEAR,MO,DY,HR,ALLSKY_SFC_UV_INDEX,ALLSKY_SRF_ALB,CLRSKY_KT,CLOUD_AMT,T2M,PS,PW,WD10M,WD50M,WS50M,WS10M,TOA_SW_DNI,QV2M,QV10M
0,2001,1,1,0,0.00,-999.00,-999.00,32.70,18.93,93.39,2.72,140.07,139.57,6.24,4.25,0.00,13.79,13.73
1,2001,1,1,1,0.00,-999.00,-999.00,24.88,18.54,93.40,2.66,138.43,137.82,6.18,4.06,0.00,13.55,13.49
2,2001,1,1,2,0.00,-999.00,-999.00,51.42,18.21,93.36,2.61,138.09,137.45,6.08,3.89,0.00,13.37,13.31
3,2001,1,1,3,0.00,-999.00,-999.00,66.36,17.94,93.31,2.55,137.70,136.99,5.89,3.63,0.00,13.18,13.18
4,2001,1,1,4,0.00,-999.00,-999.00,51.16,17.69,93.27,2.50,137.93,137.20,5.62,3.35,0.00,13.06,13.06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
201595,2023,12,31,19,2.92,0.15,0.67,31.79,28.00,93.33,2.72,155.45,154.80,6.44,5.66,1408.66,11.47,11.29
201596,2023,12,31,20,0.80,0.13,0.58,20.45,25.87,93.38,2.70,152.49,152.07,7.36,6.12,1408.61,11.72,11.66
201597,2023,12,31,21,0.05,0.16,0.38,31.40,23.62,93.45,2.70,148.69,148.45,7.88,6.21,1408.76,12.08,12.02
201598,2023,12,31,22,0.00,-999.00,-999.00,21.06,21.87,93.52,2.71,145.60,145.61,7.41,5.61,0.00,12.45,12.39


In [92]:
data.replace(-999.00, np.nan, inplace=True)
data.fillna(data.mean(), inplace=True)
data['extreme_event'] = (data['T2M'] > 36).astype(int)

In [93]:
scaler = MinMaxScaler()
normalized_data = scaler.fit_transform(data.iloc[:, 2:])

In [94]:
X = normalized_data[:, :-1]  # Todas as colunas menos o target
y = normalized_data[:, -1]   # Apenas o target

In [95]:
unique, counts = np.unique(y, return_counts=True)
print(f"Distribuição original das classes: {dict(zip(unique, counts))}")

# Reamostrar usando SMOTE para lidar com desbalanceamento
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

Distribuição original das classes: {np.float64(0.0): np.int64(201366), np.float64(1.0): np.int64(234)}


In [96]:
# Verificar a distribuição após SMOTE
unique_resampled, counts_resampled = np.unique(y_resampled, return_counts=True)
print(f"Distribuição após SMOTE: {dict(zip(unique_resampled, counts_resampled))}")


Distribuição após SMOTE: {np.float64(0.0): np.int64(201366), np.float64(1.0): np.int64(201366)}


In [97]:
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.2, stratify=y_resampled, random_state=42
)

# Verificar a distribuição após a divisão
print(f"Distribuição no conjunto de treino: {dict(zip(*np.unique(y_train, return_counts=True)))}")
print(f"Distribuição no conjunto de teste: {dict(zip(*np.unique(y_test, return_counts=True)))}")

# Criar datasets PyTorch
sequence_length = 24
train_dataset = ClimateDataset(np.hstack((X_train, y_train.reshape(-1, 1))), sequence_length=sequence_length)
test_dataset = ClimateDataset(np.hstack((X_test, y_test.reshape(-1, 1))), sequence_length=sequence_length)

# Criar DataLoaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

Distribuição no conjunto de treino: {np.float64(0.0): np.int64(161092), np.float64(1.0): np.int64(161093)}
Distribuição no conjunto de teste: {np.float64(0.0): np.int64(40274), np.float64(1.0): np.int64(40273)}


## Modelo do Transformer

In [98]:
class TransformerModel(nn.Module):
    def __init__(self, input_dim, embed_dim, num_heads, ff_dim, num_layers, seq_length):
        super(TransformerModel, self).__init__()
        self.embedding = nn.Linear(input_dim, embed_dim)
        encoder_layer = nn.TransformerEncoderLayer(d_model=embed_dim, nhead=num_heads, dim_feedforward=ff_dim)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.fc = nn.Linear(embed_dim * seq_length, 1)

    def forward(self, x):
        x = self.embedding(x)
        x = self.transformer(x)
        x = x.flatten(1)  # Flatten para conectar à camada final
        x = self.fc(x)
        return torch.sigmoid(x)


## Treinamento do Modelo

In [99]:
# Parâmetros do modelo
input_dim = X.shape[1]  # Número de features
embed_dim = 16
num_heads = 2
ff_dim = 64
num_layers = 2

# Instanciar o modelo
model = TransformerModel(input_dim=input_dim, embed_dim=embed_dim, num_heads=num_heads, 
                         ff_dim=ff_dim, num_layers=num_layers, seq_length=sequence_length)

# Função de perda com pesos para lidar com desbalanceamento
class_weights = torch.tensor([1.0, 10.0], dtype=torch.float32)  # Ajuste de pesos
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Função de treinamento
def train_model(model, train_loader, criterion, optimizer, epochs=10):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for x_batch, y_batch in train_loader:
            optimizer.zero_grad()
            outputs = model(x_batch).squeeze()
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(train_loader):.4f}")

train_model(model, train_loader, criterion, optimizer, epochs=10)




Epoch 1/10, Loss: 0.0194
Epoch 2/10, Loss: 0.0064
Epoch 3/10, Loss: 0.0055
Epoch 4/10, Loss: 0.0045
Epoch 5/10, Loss: 0.0039
Epoch 6/10, Loss: 0.0035
Epoch 7/10, Loss: 0.0035
Epoch 8/10, Loss: 0.0034
Epoch 9/10, Loss: 0.0033
Epoch 10/10, Loss: 0.0029


In [100]:
def evaluate_model(model, test_loader):
    model.eval()
    y_true, y_pred = [], []
    with torch.no_grad():
        for x_batch, y_batch in test_loader:
            outputs = model(x_batch).squeeze()
            y_pred.extend(outputs.numpy())
            y_true.extend(y_batch.numpy())
    
    y_pred_binary = (np.array(y_pred) > 0.5).astype(int)  # Threshold para classificação binária
    print("Confusion Matrix:")
    print(confusion_matrix(y_true, y_pred_binary))
    print("\nClassification Report:")
    print(classification_report(y_true, y_pred_binary))
    print(f"\nROC-AUC: {roc_auc_score(y_true, y_pred):.4f}")

evaluate_model(model, test_loader)


Confusion Matrix:
[[40226    40]
 [    2 40256]]

Classification Report:
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     40266
         1.0       1.00      1.00      1.00     40258

    accuracy                           1.00     80524
   macro avg       1.00      1.00      1.00     80524
weighted avg       1.00      1.00      1.00     80524


ROC-AUC: 1.0000


In [106]:
import matplotlib.pyplot as plt
import pandas as pd
import torch

# Passo 1: Preparar os dados históricos
historical_temperatures = data['T2M']  # Coluna do dataset original
timestamps = pd.date_range(start='2000-01-01', periods=len(historical_temperatures), freq='H')  # Exemplo de timestamps
historical_df = pd.DataFrame({
    'Timestamp': timestamps,
    'Historical': historical_temperatures
})

# Preparar as sequências históricas corretamente
sequence_length = 24  # Número de passos no tempo usados pelo modelo
num_features = 1  # Apenas a temperatura está sendo usada

# Criar janelas deslizantes para o modelo
X_historical = []
for i in range(len(historical_temperatures) - sequence_length):
    X_historical.append(historical_temperatures[i:i + sequence_length])

# Transformar em tensor PyTorch com as dimensões esperadas
X_historical = torch.tensor(X_historical, dtype=torch.float32).unsqueeze(-1)  # Shape: (n_samples, sequence_length, num_features)

print(X_historical.shape)  # Deve ser algo como (n_samples, sequence_length, num_features)

# Previsão com o modelo Transformer
model.eval()
with torch.no_grad():
    y_pred = model(X_historical).squeeze().numpy()

# Passo 3: Ajustar timestamps para as previsões
timestamps_pred = timestamps[sequence_length:]  # As previsões começam após o período inicial de entrada

# Criar DataFrame para as previsões
predicted_df = pd.DataFrame({
    'Timestamp': timestamps_pred,
    'Predicted': y_pred
})

# Passo 4: Combinar ambos os DataFrames para visualização
comparison_df = pd.merge(historical_df, predicted_df, on='Timestamp', how='inner')

# Passo 5: Plotar os dados
plt.figure(figsize=(14, 8))

# Linha para temperaturas históricas
plt.plot(comparison_df['Timestamp'], comparison_df['Historical'], label='Temperatura Histórica', color='blue', alpha=0.6)

# Linha para previsões
plt.plot(comparison_df['Timestamp'], comparison_df['Predicted'], label='Temperatura Prevista', color='orange', alpha=0.8)

# Personalizar o gráfico
plt.title('Comparação: Temperatura Prevista vs. Temperatura Histórica', fontsize=16)
plt.xlabel('Tempo', fontsize=14)
plt.ylabel('Temperatura (°C)', fontsize=14)
plt.legend(fontsize=12)
plt.grid(True)
plt.show()


  timestamps = pd.date_range(start='2000-01-01', periods=len(historical_temperatures), freq='H')  # Exemplo de timestamps


torch.Size([201576, 24, 1])


RuntimeError: mat1 and mat2 shapes cannot be multiplied (4837824x1 and 16x16)