In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import torch
import torch.nn as nn
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
import numpy as np
from imblearn.over_sampling import SMOTE
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler

# Tratamento dos dados

In [2]:
df = pd.read_csv("data/results/time_data.csv", sep=";")

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12960 entries, 0 to 12959
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   time         12960 non-null  object 
 1   812A         12960 non-null  float64
 2   833A         12960 non-null  float64
 3   812A_tweets  12960 non-null  float64
 4   833A_tweets  12960 non-null  float64
 5   812A_flood   12960 non-null  int64  
 6   833A_flood   12960 non-null  int64  
dtypes: float64(4), int64(2), object(1)
memory usage: 708.9+ KB


In [4]:
df["time"] = pd.to_datetime(df["time"])

  df["time"] = pd.to_datetime(df["time"])


In [5]:
df = df.drop(["812A_tweets", "833A_tweets"], axis=1)

In [6]:
def plot_rain_gauge(df):
    fig, axs = plt.subplots(2, 1, figsize=(12, 6), sharex=True)

    columns = ["812A", "833A"]

    for i, col in enumerate(columns):
        axs[i].plot(df["time"], df[col], label=col, color=f"C{i}")
        axs[i].set_title(f"Série Temporal: {col}")
        axs[i].set_ylabel("Valor")
        axs[i].legend()

    axs[-1].set_xlabel("Data")

    plt.tight_layout()
    plt.show()

In [7]:
# plot_rain_gauge(df)

In [8]:
df['833A'] = df['833A'].apply(lambda x: 2.5 if x > 2.5 else x)

In [9]:
df["flood"] = 0

In [10]:
df.loc[((df["812A_flood"] == 1) | (df["833A_flood"] == 1)), "flood"] = 1

In [11]:
X = df[["812A", "833A"]]
y = df["flood"]

# Setup

In [12]:
class FloodDataset(Dataset):
    def __init__(self, data, sequence_length, predict_ahead):
        self.data = data
        self.sequence_length = sequence_length
        self.predict_ahead = predict_ahead

    def __len__(self):
        return len(self.data) - self.sequence_length - self.predict_ahead + 1

    def __getitem__(self, idx):
        # Input sequence: rain_gauge_1 and rain_gauge_2 for the past `sequence_length` time steps
        x = self.data[idx:idx+self.sequence_length, :-1]  # Features: rain_gauge_1, rain_gauge_2
        
        # Target sequence: flood_boolean for the next `predict_ahead` time steps
        y = self.data[idx+self.sequence_length:idx+self.sequence_length+self.predict_ahead, -1]  # Labels: flood_boolean
        
        return torch.tensor(x, dtype=torch.float32), torch.tensor(y, dtype=torch.float32)

In [13]:
class FloodLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size, predict_ahead):
        super(FloodLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.predict_ahead = predict_ahead
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size * predict_ahead)  # Output `predict_ahead` values
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)

        out, _ = self.lstm(x, (h0, c0))
        out = self.fc(out[:, -1, :])  # Use the last time step's hidden state
        out = out.view(-1, self.predict_ahead)  # Reshape to (batch_size, predict_ahead)
        out = self.sigmoid(out)  # Apply sigmoid to each output
        return out

# Hyperparameters
input_size = 2  # rain_gauge_1 and rain_gauge_2
hidden_size = 50
num_layers = 2
output_size = 1  # flood_boolean (binary)
predict_ahead = 1

model = FloodLSTM(input_size, hidden_size, num_layers, output_size, predict_ahead)

In [14]:
df = df.set_index("time")
df = df[["812A", "833A", "flood"]]

In [15]:
split_date = pd.to_datetime("2019-03-15 00:00:00-03:00")
train_data = df.loc[df.index < split_date]
test_data = df.loc[df.index >= split_date]

In [16]:
train_data = train_data.reset_index(drop=True)
test_data = test_data.reset_index(drop=True)

In [31]:
train_data.values.shape

(10530, 3)

In [17]:
class_counts = np.bincount(train_data["flood"])
class_weights = 1. / class_counts
samples_weights = class_weights[train_data["flood"]]

In [18]:
sequence_length = 6
dataset = FloodDataset(train_data.values, sequence_length, 1)

In [19]:
sampler = WeightedRandomSampler(samples_weights, len(samples_weights))
dataloader = DataLoader(dataset, batch_size=32, sampler=sampler)

In [20]:
import torch.optim as optim
# Loss and optimizer
criterion = nn.BCELoss()  # Binary Cross-Entropy Loss
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    for i, (inputs, labels) in enumerate(dataloader):
        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, labels)  # Compare predictions with ground truth for 3 time steps

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if (i+1) % 10 == 0:
            print(f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{len(dataloader)}], Loss: {loss.item():.4f}')

Epoch [1/10], Step [10/330], Loss: 0.6798
Epoch [1/10], Step [20/330], Loss: 0.6609
Epoch [1/10], Step [30/330], Loss: 0.5111
Epoch [1/10], Step [40/330], Loss: 0.6558
Epoch [1/10], Step [50/330], Loss: 0.6368
Epoch [1/10], Step [60/330], Loss: 0.5831
Epoch [1/10], Step [70/330], Loss: 0.6396


RuntimeError: stack expects each tensor to be equal size, but got [6, 2] at entry 0 and [2, 2] at entry 14

In [71]:
# Assuming you have a test dataset
test_dataset = FloodDataset(test_data, sequence_length, predict_ahead)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)

model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for inputs, labels in test_dataloader:
        outputs = model(inputs)
        predicted = (outputs > 0.5).float()  # Convert probabilities to binary predictions
        total += labels.size(0) * labels.size(1)  # Total number of predictions
        correct += (predicted == labels).sum().item()  # Count correct predictions

    accuracy = 100 * correct / total
    print(f'Test Accuracy: {accuracy:.2f}%')

NameError: name 'test_data' is not defined