## Initial Steps for Working with GRIB Dataset in Python

### Step 1: Install Required Libraries
You will need cfgrib (the xarray backend selected below with `engine="cfgrib"`), pygrib, xarray, numpy, pandas, and matplotlib.
Use the following command to install them:
!pip install cfgrib pygrib xarray numpy pandas matplotlib

In [1]:
import os
import xarray as xr
import numpy as np
import matplotlib.pyplot as plt

### Step 2: Load and Explore the GRIB Data

In [None]:
# Open the GRIB file through xarray's cfgrib engine.
data = xr.open_dataset('./spain_may2sept.grib', engine="cfgrib")

# Overview of every coordinate attached to the dataset.
print("Coordinates of the dataset:", data.coords, sep="\n")

# Raw values of the two spatial axes.
print("\nLatitude values:", data["latitude"].values, sep="\n")
print("\nLongitude values:", data["longitude"].values, sep="\n")

# The time axis is only dumped when the dataset actually has one.
has_time_axis = "time" in data.coords
if has_time_axis:
    print("\nTime values:", data["time"].values, sep="\n")

# Finally, list the data variables stored in the file.
print(data.data_vars)

## Generate labels for hot days

In [3]:
# Build a boolean "hot day" label per location and day: a day is hot when its
# 12:00 temperature exceeds that location's 95th percentile of July–August
# 12:00 temperatures.
data_region = data.sel(
    latitude=slice(43, 38),  # From 43°N to 38°N (descending order)
    longitude=slice(-8, -1)  # From 8°W to 1°W (ascending order)
)
# Step 1: Restrict the dataset to May–September (months 5–9)
data_summer = data_region.sel(time=data["time"].dt.month.isin([5, 6, 7, 8, 9]))

# Step 2: Extract temperature data (variable "t2m") for the selected months
temperature_summer = data_summer["t2m"]
# Step 3: Keep only the measurements taken at 12:00
temp_summer_12 = temperature_summer.sel(time=temperature_summer["time"].dt.hour == 12)

# Convert the time to "day" in datetime format with time set to 00:00:00
temp_summer_12 = temp_summer_12.assign_coords(day=temp_summer_12["time"].dt.floor("D"))

# Use "day" as the main dimension
temp_summer_12 = temp_summer_12.swap_dims({"time": "day"}).reset_coords("time", drop=True)

# Step 4: Extract July–August temperatures for percentile calculation
temp_july_aug = temp_summer_12.sel(day=temp_summer_12["day"].dt.month.isin([7, 8]))


# Step 5: Compute the 95th percentile over the "day" dimension (July–August only)
percentile_95 = temp_july_aug.quantile(0.95, dim="day")

# Step 6: Label hot days across all selected months (May–September) against
# the July–August threshold
hot_days = temp_summer_12 > percentile_95

# Step 7: Add the "hot_day" label to the summer dataset
# NOTE(review): hot_days is indexed by "day" while data_summer is indexed by
# "time", so the labeled dataset presumably carries both dimensions — confirm.
data_summer_labeled = data_summer.assign(hot_day=hot_days)

# Optional: Save the labeled dataset for further analysis
# data_summer_labeled.to_netcdf("labeled_summer_dataset.nc")

# Step 8: Verify the results
# print("Summer Dataset with Hot Day Labels:")
# print(data_summer_labeled.where(data_summer_labeled["hot_day"], drop=True))



In [4]:
# Filter out anomalies and hot days
# hot_days is indexed by "day"; the dimension is renamed so the mask can be
# applied to the time-indexed summer dataset.
hot_days_aligned = hot_days.rename({"day": "time"})  # Align dimensions
normal_days = ~hot_days_aligned  # Invert hot days to get normal days
# NOTE(review): the renamed mask keeps the floored day values (00:00) as its
# "time" labels. If `where` aligns on matching labels, the 12:00 records of
# data_summer may not survive this step — confirm normal_data still contains
# the 12:00 timestamps that later cells select.
normal_data = data_summer.where(normal_days, drop=True)

In [None]:
# Sanity check: array shapes and the number of True entries in each mask
# (sums run over every dimension of the mask, not only over days).
print(f"Hot days shape: {hot_days.shape}")
print(f"Normal days shape: {normal_days.shape}")
print(f"Total hot days: {hot_days.sum().item()}")
print(f"Total normal days: {normal_days.sum().item()}")

## Aggregate all measurements of one day under one timestamp

In [8]:
def _snapshot_at_hour(dataset, hour, suffix):
    """Return the records of `dataset` taken at `hour`, indexed by calendar day.

    The result swaps the "time" dimension for a "day" dimension (time floored
    to the day), drops the original "time" coordinate, appends `_<suffix>` to
    every data variable name, and removes "valid_time" when it is present.
    """
    at_hour = dataset.sel(time=dataset["time"].dt.hour == hour)
    at_hour = at_hour.assign_coords(day=at_hour["time"].dt.floor("D"))
    by_day = at_hour.swap_dims({"time": "day"}).reset_coords("time", drop=True)
    renamed = by_day.rename({var: f"{var}_{suffix}" for var in by_day.data_vars})
    return renamed.drop_vars("valid_time", errors="ignore")


# One day-indexed dataset per measurement hour, with suffixed variable names
# so both can live side by side after merging.
normal_data_00 = _snapshot_at_hour(normal_data, 0, "00")
normal_data_12 = _snapshot_at_hour(normal_data, 12, "12")

# Combine the 00:00 and 12:00 variables under a single "day" timestamp.
normal_data_merged = xr.merge([normal_data_00, normal_data_12])


In [None]:
# Count the True entries of the "hot_day" field, summed over all of its dimensions
num_hot_days = data_summer_labeled["hot_day"].sum().item()

print(f"Total number of hot days: {num_hot_days}")

# Length of the "day" dimension. `.sizes` is the dimension-name -> length
# mapping, the same accessor the feature-extraction code relies on.
num_days = data_summer_labeled.sizes["day"]
print(f"Total number of days: {num_days}")

# Count the non-NaN values of "t2m" across all of its dimensions
num_data_points = data_summer_labeled["t2m"].notnull().sum().item()
print(f"Total number of data points: {num_data_points}")



In [None]:
# Debug output: shapes of the 12:00 temperature series, the per-location
# threshold and the resulting boolean mask, plus the dataset's variable names.
print("Temp Summer Dimensions:", temp_summer_12.shape)
print("Percentile 95 Shape:", percentile_95.shape)
print("Hot Days Shape:", hot_days.shape)
variable_names = list(data.data_vars)
print("Variable names:", variable_names)


# Extract features and labels for model training

In [None]:
import torch

# One feature tensor per year is written into this directory.
processed_dir = "temp_normal_features_by_year"
os.makedirs(processed_dir, exist_ok=True)
years = np.unique(normal_data_merged["day"].dt.year)
feature_window_size = 30  # consecutive days stacked into a single record

for year in years:
    # Keep only this year's May–August days.
    yearly_data = normal_data_merged.sel(day=(normal_data_merged["day"].dt.year == year) &
                                             normal_data_merged["day"].dt.month.isin([5, 6, 7, 8]))

    print(f"Year: {year}, Data Shape: {yearly_data.sizes}")

    # Ensure no NaN values in yearly data. `isnull().any()` on a Dataset yields
    # one flag per variable, and the truth value of that Dataset does not
    # depend on the flags, so the flags are reduced explicitly here.
    has_nan = any(bool(flag) for flag in yearly_data.isnull().any().data_vars.values())
    if has_nan:
        print(f"Year {year}: Data contains NaN values. Filling NaNs with 0.")
        yearly_data = yearly_data.fillna(0)

    # Sliding windows of `feature_window_size` days; days whose window is
    # incomplete contain NaN padding and are dropped.
    rolling_chunk = (
        yearly_data.rolling(day=feature_window_size, center=False)
        .construct("feature_dim")
        .dropna("day")
    )

    print(f"Rolling chunk dimensions for year {year}: {rolling_chunk.dims}")
    print(f"Rolling chunk sizes for year {year}: {rolling_chunk.sizes}")

    stacked_features = rolling_chunk.stack(location=("latitude", "longitude"))
    flattened_features = stacked_features.to_array(dim="variables").stack(
        features=("variables", "feature_dim")).transpose("day", "location", "features")

    # Collapse (day, location) into one row axis. Rows are day-major:
    # row index = day_index * n_locations + location_index.
    n_days = len(flattened_features["day"])
    X = flattened_features.values.reshape(flattened_features.shape[0] * flattened_features.shape[1], -1)

    # Extract latitude, longitude, and percentile_95 (one value per location)
    latitudes = stacked_features["latitude"].values
    longitudes = stacked_features["longitude"].values
    # NOTE(review): assumes percentile_95 covers the same latitude/longitude
    # grid, in the same stacking order, as stacked_features — TODO confirm.
    percentile_95_values = percentile_95.stack(location=("latitude", "longitude"))

    # Every per-location vector must be tiled (loc0..locN repeated once per
    # day) to follow the day-major row order of X. Using np.repeat for the
    # percentile would emit loc0 n_days times first and attach thresholds to
    # the wrong locations.
    repeated_latitudes = np.tile(latitudes, n_days)
    repeated_longitudes = np.tile(longitudes, n_days)
    repeated_percentile_95 = np.tile(percentile_95_values.values, n_days)

    lat_long_percentile = np.column_stack([repeated_latitudes, repeated_longitudes, repeated_percentile_95])
    X = np.hstack((X, lat_long_percentile))

    torch.save(torch.tensor(X, dtype=torch.float32), f"{processed_dir}/features_{year}.pt")

## Creating Target labels for each day (if the next 7 days contain at least 3 hot days)

In [None]:
# Creating labels for 7-day windows with at least 3 hot days
# Number of hot days inside each 7-day rolling window along "day".
hot_days_rolling = (
    hot_days.rolling(day=7, center=False)
    .construct("window_dim")
    .reduce(np.sum, dim="window_dim")
)

# 1 where the window holds at least 3 hot days, 0 otherwise.
window_is_hot = (hot_days_rolling >= 3).astype(int)

# Move each window's verdict six steps back along "day" and drop the days
# left without a value by the shift.
labels_next_7_days = window_is_hot.shift(day=-6).dropna("day")

# Debug: Inspect the rolling sum
print("Rolling sum of hot days over 7-day window:")
print(hot_days_rolling)

# Debug: Inspect the labels before and after shifting
print("Labels before shifting (aligned with future interval):")
print(window_is_hot)

print("Labels after shifting (aligned with current day):")
print(labels_next_7_days)

# Release the daily mask (and the temporary threshold array) from the namespace.
del hot_days, window_is_hot

In [None]:
from pympler import muppy, summary

# Type names of the array-like containers worth listing.
tracked_type_names = ("ndarray", "Dataset", "DataArray")

# Map each global variable holding one of those containers to its type name.
# (Kept as a comprehension: a plain loop would add names to globals() while
# it is being iterated.)
arrays_and_datasets = {
    name: type(value).__name__
    for name, value in globals().items()
    if type(value).__name__ in tracked_type_names
}

# Two-column table: variable name, type name.
print("Variable".ljust(20) + "Type".ljust(20))
print("=" * 40)
for name, var_type in arrays_and_datasets.items():
    print(name.ljust(20) + var_type.ljust(20))

# Snapshot of all live objects, summarized by type.
all_objects = muppy.get_objects()
memory_by_type = summary.summarize(all_objects)
summary.print_(memory_by_type)

## Stacking data of 30 days onto each other and adding labels

In [14]:
#print(data_summer_merged.coords)
#print("Rolling Chunk Days:")
#print(rolling_chunk["day"].values)

#print("\nLabels Next 7 Days:")
#print(labels_next_7_days["day"].values)


In [15]:
# import torch
# # Extract unique years from the 'day' coordinate
# years = np.unique(data_summer_merged["day"].dt.year)
# feature_window_size = 30
# percentile_95_stacked = percentile_95.stack(location=("latitude", "longitude"))

# # Directory for saving temporary results
# processed_dir = "temp_features_by_year"
# os.makedirs(processed_dir, exist_ok=True)

# for year in years:
#     # Select data for the current year
#     yearly_data = data_summer_merged.sel(
#             day=(data_summer_merged["day"].dt.year == year) &
#                  (data_summer_merged["day"].dt.month.isin([5, 6, 7, 8]))
#     )
#     # Create rolling windows
#     rolling_chunk = (
#         yearly_data.rolling(day=feature_window_size, center=False)
#         .construct("feature_dim")
#         .dropna("day")
#     )
    
#     # Align features with labels
#     yearly_labels = labels_next_7_days.sel(day=rolling_chunk["day"])
#     rolling_chunk, yearly_labels = xr.align(rolling_chunk, yearly_labels, join="inner")

    
#     stacked_features = rolling_chunk.stack(location=("latitude", "longitude"))
#     flattened_features = stacked_features.to_array(dim="variables").stack(features=("variables", "feature_dim")).transpose("day", "location", "features")
#     X = flattened_features.values.reshape(flattened_features.shape[0] * flattened_features.shape[1], -1)

#     # Extract latitude, longitude, and add percentile_95
#     latitudes = stacked_features["latitude"].values
#     longitudes = stacked_features["longitude"].values

#     # Add `percentile_95` to each location
#     percentile_95_values = percentile_95_stacked.sel(location=stacked_features["location"]).values
#     repeated_percentile_95 = np.repeat(percentile_95_values, len(flattened_features["day"]), axis=0)

#     # Repeat latitude and longitude for all records (days × locations)
#     repeated_latitudes = np.tile(latitudes, len(flattened_features["day"]))
#     repeated_longitudes = np.tile(longitudes, len(flattened_features["day"]))

#     # Concatenate latitude, longitude, and percentile_95 to the features
#     lat_long_percentile = np.column_stack([repeated_latitudes, repeated_longitudes, repeated_percentile_95])
#     X = np.hstack((X, lat_long_percentile))  # Add these features to the rolling features


#     # Check the resulting shape
#     #stacked_labels = yearly_labels.stack(location=("latitude", "longitude"))
#     #flattened_labels = stacked_labels.to_array(dim="variables").stack(features=("variables", "feature_dim")).transpose("day", "location", "features")
#     #aligned_labels = flattened_labels.sel(day=stacked_features["day"])
#     #y = aligned_labels.values.flatten()  # Flatten into a single column
    
#     stacked_labels = yearly_labels.stack(location=("latitude", "longitude"))
#     y = stacked_labels.values.flatten()  # Flatten into a single column

#     # Save to disk
#     torch.save(torch.tensor(X, dtype=torch.float32), f"{processed_dir}/features_{year}.pt")
#     torch.save(torch.tensor(y, dtype=torch.float32), f"{processed_dir}/labels_{year}.pt")
    
# print("Shape of flattened features (X_train):", X.shape)
# print("Shape of flattened labels (y_train):", y.shape)

# del data_summer_merged, hot_days_rolling

## Train/Test split and flattening the arrays so they can be processed by NN
The final array will be 2D. Each record corresponds to one location and one day, and contains the data of the previous 30 days.

In [16]:
# Directory holding the per-year feature/label files.
processed_dir = "temp_features_by_year"


def load_data_for_year(year):
    """Load the stored features and labels of a single year.

    Args:
        year: Year identifier used in the file names
            (`features_<year>.pt` and `labels_<year>.pt` inside `processed_dir`).

    Returns:
        tuple: `(X, y)` as returned by `torch.load` for the two files.
    """
    features_path = f"{processed_dir}/features_{year}.pt"
    labels_path = f"{processed_dir}/labels_{year}.pt"
    return torch.load(features_path), torch.load(labels_path)


# Mini-batch size for incremental training.
batch_size = 32

In [None]:
# Quick sanity check on one stored year: number of feature columns and the
# shape of the label vector.
X_train, y_train = load_data_for_year(2020)
print(X_train.shape[1])
print(y_train.shape)


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
import torch
import torch.nn as nn
from torch.optim import Adam

# Chronological split: 2006–2019 for training, 2020–2024 for testing.
# np.arange yields exact integers directly; linspace + astype(int) truncates
# floats and would silently shift a year on any rounding error.
years = np.arange(2006, 2025)
train_years = years[years < 2020]
test_years = years[years >= 2020]

# Collect the per-year arrays of the whole training period
X_train, y_train = [], []

for year in train_years:
    X_train_year, y_train_year = load_data_for_year(year)
    print(f"Year {year}: Features shape: {X_train_year.shape}")

    X_train.append(X_train_year.numpy())  # Convert to numpy for sklearn
    y_train.append(y_train_year.numpy())

# Concatenate the yearly chunks: features row-wise, labels into one vector
X_train = np.vstack(X_train)
y_train = np.hstack(y_train)
X_train = X_train.astype(np.float32)
y_train = y_train.astype(np.int32)

# Per-feature mean and standard deviation of the raw (unscaled) training set
global_mean = np.mean(X_train, axis=0)
global_std = np.std(X_train, axis=0)

# Standardize the training features; the fitted scaler is reused on test data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)

In [None]:
# Train the logistic-regression baseline on the standardized training set
lr_model = LogisticRegression(
    penalty="l2",  # Regularization
    C=1.0,         # Inverse of regularization strength (1.0 is default)
    solver="lbfgs", # Solver choice; "saga" is kept below as an alternative
    #solver="saga"
    max_iter=500,  # Increase iterations for convergence
    random_state=42,
    n_jobs=-1
)


lr_model.fit(X_train, y_train)

In [None]:
# Evaluate the logistic-regression model on the held-out test years
X_test, y_test = [], []

for year in test_years:
    X_val_year, y_val_year = load_data_for_year(year)
    X_test.append(X_val_year.numpy())
    y_test.append(y_val_year.numpy())

# Concatenate test data; labels get the same integer dtype as the training labels
X_test = np.vstack(X_test)
y_test = np.hstack(y_test).astype(np.int32)

# Reuse the scaler fitted on the training set (no refitting on test data)
X_test = scaler.transform(X_test)

# Predictions
lr_predictions = lr_model.predict(X_test)

# Score the predictions
accuracy = accuracy_score(y_test, lr_predictions)
precision = precision_score(y_test, lr_predictions)
recall = recall_score(y_test, lr_predictions)
f1 = f1_score(y_test, lr_predictions)

# The evaluated model is `lr_model`, so the report is labeled accordingly.
print("\nLogistic Regression Performance:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")


In [30]:
def normalize_features(features, mean, std):
    """
    Standardize features with a precomputed mean and standard deviation.

    Computes ``(features - mean) / std`` element-wise, relying on the operands'
    own broadcasting. Entries of ``std`` that are exactly zero are treated as
    one, so a constant feature maps to ``features - mean`` instead of inf/NaN.

    Args:
        features (torch.Tensor, np.ndarray or float): The feature data to normalize.
        mean (torch.Tensor, np.ndarray or float): The global mean for each feature.
        std (torch.Tensor, np.ndarray or float): The global standard deviation
            for each feature.

    Returns:
        The normalized features, of whatever type the arithmetic on the given
        operands produces (tensor in, tensor out; array in, array out).
    """
    # `std == 0` is a boolean mask (a plain bool for scalars); adding it lifts
    # exactly the zero entries to 1 and leaves all others unchanged. This works
    # the same way for Python floats, NumPy arrays and torch tensors.
    safe_std = std + (std == 0)
    return (features - mean) / safe_std


In [None]:
# Fit a second logistic-regression model with default regularization settings.
# Note that this rebinds the shared name `model`.
model = LogisticRegression(max_iter=500, random_state=42)
model.fit(X_train, y_train)

In [None]:
# Class balance of the training labels. The labels live in `y_train`; the
# previously referenced `y_train_rf` is not defined anywhere in this notebook.
unique, counts = np.unique(y_train, return_counts=True)
# .tolist() turns NumPy scalars into plain Python numbers for a clean printout.
print(f"Class distribution in training data: {dict(zip(unique.tolist(), counts.tolist()))}")


In [None]:
import matplotlib.pyplot as plt


def _draw_loss_curve(axis, losses, label, color):
    """Plot `losses` against 1-based epoch numbers and color the y-axis to match."""
    epochs = range(1, len(losses) + 1)
    axis.plot(epochs, losses, label=label, color=color, marker="o")
    axis.set_ylabel(label, color=color)
    axis.tick_params(axis='y', labelcolor=color)


# Training loss on the left y-axis.
fig, ax1 = plt.subplots(figsize=(15, 7))
_draw_loss_curve(ax1, train_losses, "Training Loss", "blue")
ax1.set_xlabel("Epochs")

# Validation loss on a second y-axis sharing the same epoch axis.
ax2 = ax1.twinx()
_draw_loss_curve(ax2, val_losses, "Validation Loss", "red")

# Title goes on the most recently created axes.
ax2.set_title("Training and Validation Loss with Separate Y-Axes")

# Write a 300 DPI PNG before showing the figure.
fig.savefig("training_validation_loss_Iteration4.png", dpi=300, bbox_inches="tight")

plt.show()

### Using different NN and other models to train and test

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Evaluate on test data, one year at a time
model.eval()
all_predictions = []
all_labels = []

for year in test_years:
    X_test, y_test = load_data_for_year(year)

    # The loaded objects come from torch.load; as_tensor only enforces float32
    # and avoids the copy-construct warning torch.tensor() emits for tensors.
    X_test = torch.as_tensor(X_test, dtype=torch.float32)
    y_test = torch.as_tensor(y_test, dtype=torch.float32)

    # Normalize features
    # NOTE(review): this standardizes each test year with its own statistics;
    # confirm it matches how the model's training features were scaled. A
    # feature that is constant within a year has std 0 and becomes NaN here.
    X_test = (X_test - X_test.mean(dim=0)) / X_test.std(dim=0)

    # Predict
    with torch.no_grad():
        # squeeze(-1) removes only the trailing output axis, so a year with a
        # single record still yields a 1-D array that can be concatenated.
        predictions = model(X_test).squeeze(-1)
        all_predictions.append((predictions > 0.5).float().numpy())
        all_labels.append(y_test.numpy())

# Combine results and evaluate metrics
all_predictions = np.concatenate(all_predictions)
all_labels = np.concatenate(all_labels)

accuracy = accuracy_score(all_labels, all_predictions)
precision = precision_score(all_labels, all_predictions)
recall = recall_score(all_labels, all_predictions)
f1 = f1_score(all_labels, all_predictions)

print("Evaluation Metrics:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

In [None]:
# Fraction of training labels equal to 1.
# NOTE(review): uses tensor methods (.size(0)), so y_train presumably must be
# a torch tensor at this point — confirm against the cell that defines it.
positive_rate = torch.sum(y_train == 1).item() / y_train.size(0)
print(f"Positive rate: {positive_rate}")

In [None]:
# Define a dataset and data loader: shuffled mini-batches of 32 records
# NOTE(review): TensorDataset expects tensors — confirm X_train / y_train are
# torch tensors (not NumPy arrays) when this cell runs.
dataset = TensorDataset(X_train, y_train)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# Define the model
class HotDayPredictor(nn.Module):
    """Fully connected binary classifier: input -> 128 -> 64 -> 1 with a sigmoid output."""

    def __init__(self, input_size):
        """Build the layer stack for records with `input_size` features."""
        super().__init__()
        hidden_widths = (input_size, 128, 64)
        layers = []
        # One Linear + ReLU pair per hidden layer.
        for n_in, n_out in zip(hidden_widths, hidden_widths[1:]):
            layers.extend([nn.Linear(n_in, n_out), nn.ReLU()])
        # Single-unit head squashed into (0, 1).
        layers.extend([nn.Linear(64, 1), nn.Sigmoid()])
        self.fc = nn.Sequential(*layers)

    def forward(self, x):
        """Return the sigmoid output of shape (batch, 1) for input `x`."""
        return self.fc(x)

# Initialize the model
input_size = X_train.shape[1]
model = HotDayPredictor(input_size)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
loss_fn = nn.BCELoss()

# Train the model
for epoch in range(30):
    for features, labels in dataloader:
        predictions = model(features)
        # squeeze(-1) drops only the trailing size-1 output axis. A bare
        # squeeze() would also drop the batch axis of a one-sample batch and
        # BCELoss would reject the resulting shape mismatch with `labels`.
        loss = loss_fn(predictions.squeeze(-1), labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Reports the loss of the last mini-batch of the epoch, not an average.
    print(f"Epoch {epoch + 1}, Loss: {loss.item()}")

import torch
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Flatten the test features to (records, features) and the labels to 1-D.
# NOTE(review): flattened_features_test / aligned_labels_test are not defined
# in this section; presumably prepared like the training arrays — confirm.
X_test = flattened_features_test.values.reshape(flattened_features_test.shape[0] * flattened_features_test.shape[1], -1)
y_test = aligned_labels_test.values.flatten()  # Flatten into a single column
X_test = torch.tensor(X_test, dtype=torch.float32)
y_test = torch.tensor(y_test, dtype=torch.float32)

# Put the model in evaluation mode
model.eval()

# Make predictions on the test set
with torch.no_grad():
    predictions = model(X_test).squeeze(-1)
    predicted_classes = (predictions > 0.5).float()  # Convert probabilities to binary classes

# Calculate the evaluation metrics
accuracy = accuracy_score(y_test.numpy(), predicted_classes.numpy())
precision = precision_score(y_test.numpy(), predicted_classes.numpy())
recall = recall_score(y_test.numpy(), predicted_classes.numpy())
f1 = f1_score(y_test.numpy(), predicted_classes.numpy())
print("Initial NN")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")


In [None]:
# Define the model
class DeeperHotDayPredictor(nn.Module):
    """Fully connected binary classifier: input -> 256 -> 128 -> 64 -> 1 with a sigmoid output."""

    def __init__(self, input_size):
        """Build the layer stack for records with `input_size` features."""
        super().__init__()
        hidden_widths = (input_size, 256, 128, 64)
        layers = []
        # One Linear + ReLU pair per hidden layer.
        for n_in, n_out in zip(hidden_widths, hidden_widths[1:]):
            layers.extend([nn.Linear(n_in, n_out), nn.ReLU()])
        # Single-unit head squashed into (0, 1).
        layers.extend([nn.Linear(64, 1), nn.Sigmoid()])
        self.fc = nn.Sequential(*layers)

    def forward(self, x):
        """Return the sigmoid output of shape (batch, 1) for input `x`."""
        return self.fc(x)


# Initialize the model
input_size = X_train.shape[1]
model = DeeperHotDayPredictor(input_size)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
loss_fn = nn.BCELoss()

# Train the model
for epoch in range(30):
    for features, labels in dataloader:
        predictions = model(features)
        # squeeze(-1) drops only the trailing size-1 output axis. A bare
        # squeeze() would also drop the batch axis of a one-sample batch and
        # BCELoss would reject the resulting shape mismatch with `labels`.
        loss = loss_fn(predictions.squeeze(-1), labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Reports the loss of the last mini-batch of the epoch, not an average.
    print(f"Epoch {epoch + 1}, Loss: {loss.item()}")

import torch
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Flatten the test features to (records, features) and the labels to 1-D.
# NOTE(review): flattened_features_test / aligned_labels_test are not defined
# in this section; presumably prepared like the training arrays — confirm.
X_test = flattened_features_test.values.reshape(flattened_features_test.shape[0] * flattened_features_test.shape[1], -1)
y_test = aligned_labels_test.values.flatten()  # Flatten into a single column
X_test = torch.tensor(X_test, dtype=torch.float32)
y_test = torch.tensor(y_test, dtype=torch.float32)

# Put the model in evaluation mode
model.eval()

# Make predictions on the test set
with torch.no_grad():
    predictions = model(X_test).squeeze(-1)
    predicted_classes = (predictions > 0.5).float()  # Convert probabilities to binary classes

# Calculate the evaluation metrics
accuracy = accuracy_score(y_test.numpy(), predicted_classes.numpy())
precision = precision_score(y_test.numpy(), predicted_classes.numpy())
recall = recall_score(y_test.numpy(), predicted_classes.numpy())
f1 = f1_score(y_test.numpy(), predicted_classes.numpy())
print("Deeper NN")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")


In [None]:
# Define the model
class DropoutHotDayPredictor(nn.Module):
    """Binary classifier input -> 256 -> 128 -> 64 -> 1 with dropout after the first two hidden layers."""

    def __init__(self, input_size):
        """Build the layer stack for records with `input_size` features."""
        super().__init__()
        layers = [
            # Two wide hidden layers, each followed by 30% dropout.
            nn.Linear(input_size, 256), nn.ReLU(), nn.Dropout(0.3),
            nn.Linear(256, 128), nn.ReLU(), nn.Dropout(0.3),
            # Narrow hidden layer without dropout.
            nn.Linear(128, 64), nn.ReLU(),
            # Single-unit head squashed into (0, 1).
            nn.Linear(64, 1), nn.Sigmoid(),
        ]
        self.fc = nn.Sequential(*layers)

    def forward(self, x):
        """Return the sigmoid output of shape (batch, 1) for input `x`."""
        return self.fc(x)


# Initialize the model
input_size = X_train.shape[1]
model = DropoutHotDayPredictor(input_size)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
loss_fn = nn.BCELoss()

# Train the model
for epoch in range(30):
    for features, labels in dataloader:
        predictions = model(features)
        # squeeze(-1) drops only the trailing size-1 output axis. A bare
        # squeeze() would also drop the batch axis of a one-sample batch and
        # BCELoss would reject the resulting shape mismatch with `labels`.
        loss = loss_fn(predictions.squeeze(-1), labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Reports the loss of the last mini-batch of the epoch, not an average.
    print(f"Epoch {epoch + 1}, Loss: {loss.item()}")

import torch
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Flatten the test features to (records, features) and the labels to 1-D.
# NOTE(review): flattened_features_test / aligned_labels_test are not defined
# in this section; presumably prepared like the training arrays — confirm.
X_test = flattened_features_test.values.reshape(flattened_features_test.shape[0] * flattened_features_test.shape[1], -1)
y_test = aligned_labels_test.values.flatten()  # Flatten into a single column
X_test = torch.tensor(X_test, dtype=torch.float32)
y_test = torch.tensor(y_test, dtype=torch.float32)

# Put the model in evaluation mode (this also disables the dropout layers)
model.eval()

# Make predictions on the test set
with torch.no_grad():
    predictions = model(X_test).squeeze(-1)
    predicted_classes = (predictions > 0.5).float()  # Convert probabilities to binary classes

# Calculate the evaluation metrics
accuracy = accuracy_score(y_test.numpy(), predicted_classes.numpy())
precision = precision_score(y_test.numpy(), predicted_classes.numpy())
recall = recall_score(y_test.numpy(), predicted_classes.numpy())
f1 = f1_score(y_test.numpy(), predicted_classes.numpy())
print("Dropout NN")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Flatten the test features to (records, features) and the labels to 1-D.
n_test_days, n_test_locations = flattened_features_test.shape[:2]
X_test = flattened_features_test.values.reshape(n_test_days * n_test_locations, -1)
y_test = aligned_labels_test.values.flatten()

# Fit a 100-tree forest with depth capped at 10 on the training set.
model = RandomForestClassifier(n_estimators=100, max_depth=10).fit(X_train, y_train)

# Predict the test set and score the predictions.
predicted_classes = model.predict(X_test)
accuracy, precision, recall, f1 = (
    metric(y_test, predicted_classes)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print("Random Forest")
for label, score in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
):
    print(f"{label}: {score:.4f}")


In [None]:
# List the data variables of the originally loaded GRIB dataset.
print(data.data_vars)