In [5]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from scipy.io import loadmat
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error, r2_score

# Pick the compute device: CUDA when PyTorch reports a usable GPU, else CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

# --- Data Loading ---
data_path = "/mnt/data/Cp_ts_ROH06_deg023.mat"
mat_data = loadmat(data_path)

# Read the individual fields of the .mat dictionary.
# Metadata arrays are flattened to 1-D; the pressure coefficients keep their shape.
roof_pitch = mat_data['Roof_pitch'].flatten()
sample_frequency = mat_data['Sample_frequency'].flatten()
building_depth = mat_data['Building_depth'].flatten()
building_breadth = mat_data['Building_breadth'].flatten()
building_height = mat_data['Building_height'].flatten()
wind_azimuth = mat_data['Wind_azimuth'].flatten()
wind_pressure_coefficients = mat_data['Wind_pressure_coefficients']

# Regression target: average over axis 1, giving one value per row of the array
# NOTE(review): assumes the array is 2-D with the time series along axis 1 — confirm
mean_pressure_coefficients = np.mean(wind_pressure_coefficients, axis=1)

# Data Quality Check: histogram of the target values
plt.hist(mean_pressure_coefficients, bins=30, color='blue', alpha=0.7)
plt.title("Distribution of Target Variable")
plt.xlabel("Mean Pressure Coefficient")
plt.ylabel("Frequency")
plt.grid(True)
plt.show()

# Repeat every metadata array so that it has one entry per target row.
# The integer division only yields the right length when num_samples is a
# multiple of each array's length.
num_samples = len(mean_pressure_coefficients)
metadata_columns = {
    "Roof_pitch": roof_pitch,
    "Sample_frequency": sample_frequency,
    "Building_depth": building_depth,
    "Building_breadth": building_breadth,
    "Building_height": building_height,
    "Wind_azimuth": wind_azimuth,
}
features = pd.DataFrame({
    column: np.tile(values, num_samples // len(values))
    for column, values in metadata_columns.items()
})
features["Mean_pressure_coefficient"] = mean_pressure_coefficients

# --- Physics-Informed Features ---
# Fixed air properties and flow speed used by the Reynolds-number columns
air_density = 1.225  # kg/m^3
dynamic_viscosity = 1.81e-5  # Pa·s
wind_speed = 10.0  # m/s (assume normalized constant for wind tunnel)

# One Reynolds number per building dimension:
# density * speed * characteristic length / viscosity
for re_column, length_column in (
    ("Re_Breadth", "Building_breadth"),
    ("Re_Height", "Building_height"),
    ("Re_Depth", "Building_depth"),
):
    features[re_column] = (
        air_density * wind_speed * features[length_column] / dynamic_viscosity
    )

# Eaves descriptors: the same constant is written to every row
features["Eaves_angle"] = 26.7  # degrees, from provided data
features["Eaves_curvature"] = 0.0  # Assume flat eaves; modify if curvature data available

# Degree-2 polynomial expansion (no bias column) of every predictor;
# the target column is excluded from the expansion and re-attached afterwards
predictors = features.drop(columns=["Mean_pressure_coefficient"])
poly = PolynomialFeatures(degree=2, include_bias=False)
poly_features = poly.fit_transform(predictors)
feature_names = poly.get_feature_names_out(predictors.columns)
poly_df = pd.DataFrame(poly_features, columns=feature_names)
poly_df["Mean_pressure_coefficient"] = mean_pressure_coefficients

# --- Train-Test Split ---
# Separate the target from the expanded predictors, then hold out 20% for testing
target_column = "Mean_pressure_coefficient"
y = poly_df[target_column]
X = poly_df.drop(columns=[target_column])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Scale features: statistics are learned from the training split only and
# then applied to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- XGBoost Model ---
# Candidate hyperparameter values sampled by the randomized search
params = {
    'max_depth': [3, 5, 7, 9],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'n_estimators': [100, 200, 300, 500],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
}
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)
random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=params,
    n_iter=50,
    cv=3,
    scoring='r2',
    verbose=1,
    random_state=42,
)
random_search.fit(X_train_scaled, y_train)

# Score the best estimator found by the search on the held-out split
best_xgb = random_search.best_estimator_
y_pred_xgb = best_xgb.predict(X_test_scaled)

r2_xgb = r2_score(y_test, y_pred_xgb)
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
mape_xgb = mean_absolute_percentage_error(y_test, y_pred_xgb)
mae_xgb = mean_absolute_error(y_test, y_pred_xgb)

print(f"XGBoost - Best Params: {random_search.best_params_}")
print(f"XGBoost - MAE: {mae_xgb:.4f}, MAPE: {mape_xgb:.4f}, MSE: {mse_xgb:.4f}, R²: {r2_xgb:.4f}")

# --- GAN for Data Augmentation ---
class Generator(nn.Module):
    """Fully connected generator network.

    Maps an ``input_dim``-wide vector through hidden layers of width 128 and
    256 (each followed by ReLU) to a linear ``output_dim``-wide output.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        stack = []
        width_in = input_dim
        for width_out in (128, 256):
            stack.append(nn.Linear(width_in, width_out))
            stack.append(nn.ReLU())
            width_in = width_out
        # Final projection has no activation, so outputs are unbounded
        stack.append(nn.Linear(width_in, output_dim))
        self.net = nn.Sequential(*stack)

    def forward(self, x):
        """Return the network output for the batch ``x``."""
        return self.net(x)

class Discriminator(nn.Module):
    """Fully connected discriminator network.

    Maps an ``input_dim``-wide vector through hidden layers of width 256 and
    128 (each followed by ReLU) to a single sigmoid output in (0, 1).
    """

    def __init__(self, input_dim):
        super().__init__()
        stack = []
        width_in = input_dim
        for width_out in (256, 128):
            stack.append(nn.Linear(width_in, width_out))
            stack.append(nn.ReLU())
            width_in = width_out
        # Single logit squashed by a sigmoid so it can be fed to BCELoss
        stack.append(nn.Linear(width_in, 1))
        stack.append(nn.Sigmoid())
        self.net = nn.Sequential(*stack)

    def forward(self, x):
        """Return the per-row score for the batch ``x``."""
        return self.net(x)

# GAN Training Loop
def train_gan(generator, discriminator, data_loader, num_epochs=500, lr=0.0002):
    """Train ``generator`` and ``discriminator`` adversarially with BCE loss.

    Each batch performs one discriminator update (real batch labelled 1,
    generated batch labelled 0) followed by one generator update (generated
    batch labelled 1). Both networks are optimised with Adam.

    Args:
        generator: Module mapping a noise batch to a generated batch. Its
            input width must equal the number of columns of the real data,
            because the noise is drawn with that width.
        discriminator: Module mapping a batch to values in [0, 1] of shape
            ``(batch, 1)``, as required by ``nn.BCELoss``.
        data_loader: Iterable yielding 2-D tensors of real samples.
        num_epochs: Number of passes over ``data_loader``.
        lr: Learning rate used for both Adam optimisers.

    Returns:
        None. Both modules are updated in place; the last batch's losses are
        printed every 50 epochs.
    """
    criterion = nn.BCELoss()
    optim_g = torch.optim.Adam(generator.parameters(), lr=lr)
    optim_d = torch.optim.Adam(discriminator.parameters(), lr=lr)

    # Work on the device the generator already lives on instead of relying on
    # a module-level global, so the function is usable on its own and cannot
    # disagree with where the models actually are.
    device = next(generator.parameters()).device

    for epoch in range(num_epochs):
        # Reset per epoch so the progress print never reads a stale or
        # undefined loss when the loader yields no batches.
        d_loss = None
        g_loss = None

        for real_data in data_loader:
            real_data = real_data.to(device)
            batch_size = real_data.size(0)

            # Allocate labels directly on the target device
            real_labels = torch.ones(batch_size, 1, device=device)
            fake_labels = torch.zeros(batch_size, 1, device=device)

            # Train Discriminator
            optim_d.zero_grad()
            d_loss_real = criterion(discriminator(real_data), real_labels)

            noise = torch.randn(batch_size, real_data.size(1), device=device)
            fake_data = generator(noise)
            # detach(): the discriminator step must not backpropagate into
            # the generator
            d_loss_fake = criterion(discriminator(fake_data.detach()), fake_labels)

            d_loss = d_loss_real + d_loss_fake
            d_loss.backward()
            optim_d.step()

            # Train Generator: reward it when the discriminator calls the
            # generated batch real
            optim_g.zero_grad()
            g_loss = criterion(discriminator(fake_data), real_labels)
            g_loss.backward()
            optim_g.step()

        if (epoch + 1) % 50 == 0 and d_loss is not None:
            print(f"Epoch [{epoch+1}/{num_epochs}], D Loss: {d_loss.item():.4f}, G Loss: {g_loss.item():.4f}")

# Train GAN
# The GAN is trained on joint rows of (scaled features, scaled target) so that
# every synthetic sample is generated together with its own target value.
# Pairing generated features with targets drawn at random from y_train would
# give labels unrelated to the features and only add noise to the regressor.
input_dim = X_train_scaled.shape[1]
gan_dim = input_dim + 1  # all feature columns plus one target column

# Standardise the target so it is on the same scale as the scaled features
target_scaler = StandardScaler()
y_train_scaled = target_scaler.fit_transform(
    np.asarray(y_train, dtype=float).reshape(-1, 1)
)
joint_train = np.hstack([X_train_scaled, y_train_scaled])

generator = Generator(gan_dim, gan_dim).to(device)
discriminator = Discriminator(gan_dim).to(device)

train_dataset = torch.tensor(joint_train, dtype=torch.float32)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

train_gan(generator, discriminator, train_loader)

# Generate synthetic data
num_synthetic = 1000
noise = torch.randn(num_synthetic, gan_dim).to(device)
with torch.no_grad():
    synthetic_joint = generator(noise).cpu().numpy()

# Split generated rows back into features and (un-scaled) targets
synthetic_data = synthetic_joint[:, :input_dim]
synthetic_targets = target_scaler.inverse_transform(
    synthetic_joint[:, input_dim:]
).ravel()

# Combine real and synthetic data
X_train_augmented = np.vstack([X_train_scaled, synthetic_data])
y_train_augmented = np.hstack([y_train, synthetic_targets])

# Retrain XGBoost on augmented data (refits best_xgb in place)
best_xgb.fit(X_train_augmented, y_train_augmented)
y_pred_augmented = best_xgb.predict(X_test_scaled)

mae_augmented = mean_absolute_error(y_test, y_pred_augmented)
mape_augmented = mean_absolute_percentage_error(y_test, y_pred_augmented)
mse_augmented = mean_squared_error(y_test, y_pred_augmented)
r2_augmented = r2_score(y_test, y_pred_augmented)

print(f"Augmented XGBoost - MAE: {mae_augmented:.4f}, MAPE: {mape_augmented:.4f}, MSE: {mse_augmented:.4f}, R²: {r2_augmented:.4f}")


Using device: cuda


KeyError: 'Wind_speed'

Using device: cuda
Empty DataFrame
Columns: []
Index: []
