ChatGPT

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from skopt import gp_minimize
from skopt.space import Integer, Real
from skopt.utils import use_named_args

# ===============================
# 1. Load Dataset
# ===============================
# Every column other than the target is treated as an input feature.
df = pd.read_csv("your_dataset.csv")  # <<< PUT YOUR FILE HERE

target_col = "your_target_column"  # <<< PUT YOUR TARGET COLUMN NAME
feature_cols = [name for name in df.columns if name != target_col]

X, y = df[feature_cols].values, df[target_col].values

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Standardize: statistics come from the training rows only and are then
# reused unchanged for the held-out rows.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# Features stay 2-D; targets are reshaped to column vectors (N, 1).
X_train, X_test = (
    torch.tensor(arr, dtype=torch.float32) for arr in (X_train, X_test)
)
y_train, y_test = (
    torch.tensor(arr, dtype=torch.float32).view(-1, 1) for arr in (y_train, y_test)
)

# ===============================
# 2. Define Neural Net Class
# ===============================
class DynamicNet(nn.Module):
    """Feed-forward network whose depth and width are set at construction.

    The network is ``hidden_layers`` repetitions of
    Linear -> ReLU -> Dropout (each ``hidden_units`` wide) followed by a
    single Linear layer that produces ``output_dim`` values.
    """

    def __init__(self, input_dim, hidden_layers, hidden_units, dropout, output_dim=1):
        super().__init__()
        stack = []
        fan_in = input_dim
        for _ in range(hidden_layers):
            stack.extend(
                (nn.Linear(fan_in, hidden_units), nn.ReLU(), nn.Dropout(dropout))
            )
            fan_in = hidden_units
        # Output head; with zero hidden layers it maps the input directly.
        stack.append(nn.Linear(fan_in, output_dim))
        self.net = nn.Sequential(*stack)

    def forward(self, x):
        """Run ``x`` through the sequential stack and return its output."""
        return self.net(x)

# ===============================
# 3. Training & Evaluation
# ===============================
def train_and_eval(hidden_layers, hidden_units, dropout, lr, epochs=40):
    """Train a ``DynamicNet`` and report its loss on the held-out split.

    The model is fitted on the module-level ``X_train`` / ``y_train`` tensors
    with Adam and MSE, taking one full-batch optimizer step per epoch, and is
    then scored on ``X_test`` / ``y_test`` with gradients disabled.

    Returns:
        tuple: ``(val_loss, model)`` - the held-out MSE as a Python float and
        the trained model (left in eval mode).
    """
    model = DynamicNet(X_train.shape[1], hidden_layers, hidden_units, dropout)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    loss_fn = nn.MSELoss()  # Change to BCEWithLogitsLoss if classification

    # Full-batch training: the whole training tensor is one batch.
    model.train()
    for _ in range(epochs):
        optimizer.zero_grad()
        loss_fn(model(X_train), y_train).backward()
        optimizer.step()

    model.eval()
    with torch.no_grad():
        val_loss = loss_fn(model(X_test), y_test).item()
    return val_loss, model

# ===============================
# 4. Search Space for NAS
# ===============================
# Dimensions are listed in the order the objective receives them.
space = [
    Integer(1, 4, name="hidden_layers"),  # number of hidden layers
    Integer(16, 256, name="hidden_units"),  # neurons per layer
    Real(0.0, 0.5, name="dropout"),  # dropout rate
    Real(1e-4, 1e-2, name="lr", prior="log-uniform"),  # learning rate
]


@use_named_args(space)
def objective(hidden_layers, hidden_units, dropout, lr):
    """Return the held-out loss for one sampled point of ``space``.

    The trained model produced by ``train_and_eval`` is discarded; only the
    scalar loss is handed back to the optimizer.
    """
    return train_and_eval(hidden_layers, hidden_units, dropout, lr)[0]

# ===============================
# 5. Run Bayesian Optimization
# ===============================
print("🔎 Running Neural Architecture Search...")
result = gp_minimize(objective, space, n_calls=20, random_state=42)

# result.x holds the best point, ordered like the entries of `space`.
best_hl, best_hu, best_do, best_lr = result.x
print(f"\n✅ Best NAS config -> Layers: {best_hl}, Units: {best_hu}, Dropout: {best_do:.2f}, LR: {best_lr:.5f}")

# ===============================
# 6. Train Final Model
# ===============================
# Retrain from scratch with the winning settings and a longer schedule.
final_loss, best_model = train_and_eval(*result.x, epochs=100)
print(f"🔥 Final Model Validation Loss: {final_loss:.4f}")

# ===============================
# 7. Save the Model
# ===============================
# Only the weights (state_dict) are written, not the module object.
torch.save(best_model.state_dict(), "best_nas_model.pth")
print("📦 Best model saved as best_nas_model.pth")


Claude

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, TensorDataset
import numpy as np
import pandas as pd
import random
import time
from typing import Dict, List, Tuple, Optional
from dataclasses import dataclass
import json
import copy
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

# Set random seeds for reproducibility
def set_seed(seed=42):
    """Seed the Python, NumPy and PyTorch (CPU and CUDA) random generators.

    Also switches cuDNN to deterministic mode and turns off its benchmark
    auto-tuner.
    """
    seeders = (
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
        np.random.seed,
        random.seed,
    )
    for seed_fn in seeders:
        seed_fn(seed)

    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True


set_seed(42)

# Architecture Configuration
@dataclass
class ArchConfig:
    """Description of one candidate architecture.

    ``layer_types``, ``hidden_dims`` and ``dropout_rates`` are consumed
    position by position (zipped together) when the model is built, so entry
    ``i`` of each list describes the same layer.
    """
    # Number of time steps per input window (stored for bookkeeping/serialization).
    sequence_length: int
    # Size of the last axis of the model input.
    input_features: int
    # Output width for each entry of layer_types.
    hidden_dims: List[int]
    # One kind per layer: 'lstm', 'gru', 'conv1d', 'attention' or 'linear'.
    layer_types: List[str]  # ['lstm', 'gru', 'conv1d', 'linear']
    # Head count for 'attention' layers; None means no attention layer is built.
    attention_heads: Optional[int]
    # Dropout probability for each entry of layer_types.
    dropout_rates: List[float]
    # Whether projected residual connections are added between layers.
    use_skip_connections: bool
    # Non-linearity used in conv1d/linear blocks; unknown names fall back to ReLU.
    activation: str  # 'relu', 'tanh', 'gelu'
    # Width of the final output layer.
    output_dim: int

class AttentionLayer(nn.Module):
    """Multi-head scaled dot-product self-attention.

    Takes and returns tensors of shape (batch, seq_len, input_dim); queries,
    keys and values are all linear projections of the same input.
    """

    def __init__(self, input_dim, num_heads=8):
        super().__init__()
        self.num_heads = num_heads
        self.input_dim = input_dim
        self.head_dim = input_dim // num_heads

        assert input_dim % num_heads == 0, "input_dim must be divisible by num_heads"

        self.query = nn.Linear(input_dim, input_dim)
        self.key = nn.Linear(input_dim, input_dim)
        self.value = nn.Linear(input_dim, input_dim)
        self.output = nn.Linear(input_dim, input_dim)

    def _split_heads(self, projected, batch_size, seq_len):
        """Reshape (batch, seq, dim) into (batch, heads, seq, head_dim)."""
        per_head = projected.view(batch_size, seq_len, self.num_heads, self.head_dim)
        return per_head.transpose(1, 2)

    def forward(self, x):
        """Attend over the sequence axis of ``x`` and project the result."""
        batch_size, seq_len, _ = x.shape

        q, k, v = (
            self._split_heads(proj(x), batch_size, seq_len)
            for proj in (self.query, self.key, self.value)
        )

        # Scale by sqrt(head_dim) before the softmax over the key axis.
        weights = F.softmax((q @ k.transpose(-2, -1)) / np.sqrt(self.head_dim), dim=-1)

        # Merge the heads back into a single feature axis.
        merged = (weights @ v).transpose(1, 2).contiguous()
        merged = merged.view(batch_size, seq_len, self.input_dim)

        return self.output(merged)

class ArchitectureSearchSpace:
    """Catalogue of the candidate values the architecture search samples from."""

    @staticmethod
    def get_search_space():
        """Return a newly built dict mapping each option name to its candidates.

        ``hidden_dims``, ``layer_types`` and ``dropout_rates`` hold lists of
        per-layer lists; the remaining keys hold flat lists of choices.
        """
        width_options = [
            [64, 32],
            [128, 64],
            [256, 128],
            [128, 128, 64],
            [256, 128, 64],
            [512, 256, 128],
            [256, 256, 128, 64],
        ]
        stack_options = [
            ['lstm', 'linear'],
            ['gru', 'linear'],
            ['conv1d', 'lstm', 'linear'],
            ['lstm', 'lstm', 'linear'],
            ['conv1d', 'gru', 'linear'],
            ['lstm', 'attention', 'linear'],
            ['conv1d', 'lstm', 'attention', 'linear'],
        ]
        dropout_options = [
            [0.1, 0.1],
            [0.2, 0.2],
            [0.3, 0.1],
            [0.1, 0.2, 0.1],
            [0.2, 0.3, 0.1],
            [0.1, 0.1, 0.2, 0.1],
        ]
        return {
            'hidden_dims': width_options,
            'layer_types': stack_options,
            'attention_heads': [None, 4, 8, 16],
            'dropout_rates': dropout_options,
            'use_skip_connections': [True, False],
            'activation': ['relu', 'tanh', 'gelu'],
        }

class DynamicINSModel(nn.Module):
    """Sequence model assembled at construction time from an architecture config.

    ``config.layer_types``, ``config.hidden_dims`` and ``config.dropout_rates``
    are zipped together; each entry yields an LSTM, GRU, Conv1d block,
    attention layer or Linear block. A final Linear layer maps the last
    feature width to ``config.output_dim``.

    One config entry does not always produce exactly one module: an
    ``'attention'`` entry builds nothing when ``config.attention_heads`` is
    falsy, and builds an extra width-adjusting Linear when the current width
    is not divisible by the head count. ``self._layer_plan`` therefore records,
    for every module in ``self.layers``, its kind and the config index it came
    from, so that ``forward`` never has to guess the pairing.
    """

    def __init__(self, config: "ArchConfig"):
        super().__init__()
        self.config = config
        self.layers = nn.ModuleList()
        self.skip_connections = []  # never read by this class; kept as an attribute
        # Parallel to self.layers: (kind, index into config.layer_types).
        self._layer_plan = []

        current_dim = config.input_features
        final_type_idx = len(config.layer_types) - 1

        for i, (layer_type, hidden_dim, dropout_rate) in enumerate(
            zip(config.layer_types, config.hidden_dims, config.dropout_rates)
        ):
            if layer_type in ('lstm', 'gru'):
                rnn_cls = nn.LSTM if layer_type == 'lstm' else nn.GRU
                # NOTE(review): per the PyTorch docs, RNN dropout acts only
                # between stacked layers, so it presumably has no effect on
                # these single-layer modules - confirm before relying on it.
                layer = rnn_cls(
                    input_size=current_dim,
                    hidden_size=hidden_dim,
                    batch_first=True,
                    dropout=dropout_rate if i < final_type_idx else 0,
                )
                self._add_layer(layer, layer_type, i)
                current_dim = hidden_dim

            elif layer_type == 'conv1d':
                layer = nn.Sequential(
                    nn.Conv1d(current_dim, hidden_dim, kernel_size=3, padding=1),
                    self._get_activation(),
                    nn.Dropout(dropout_rate),
                )
                self._add_layer(layer, layer_type, i)
                current_dim = hidden_dim

            elif layer_type == 'attention':
                heads = config.attention_heads
                if not heads:
                    # No head count configured: this entry builds no module.
                    continue
                # Largest multiple of `heads` that fits in the current width.
                usable_dim = (current_dim // heads) * heads
                if usable_dim != current_dim:
                    self._add_layer(nn.Linear(current_dim, usable_dim), 'projection', i)
                    current_dim = usable_dim
                self._add_layer(AttentionLayer(current_dim, heads), layer_type, i)

            elif layer_type == 'linear':
                layer = nn.Sequential(
                    nn.Linear(current_dim, hidden_dim),
                    self._get_activation(),
                    nn.Dropout(dropout_rate),
                )
                self._add_layer(layer, layer_type, i)
                current_dim = hidden_dim

        # Output layer
        self.output_layer = nn.Linear(current_dim, config.output_dim)

        # One projection per hidden width except the last, chaining
        # input_features -> hidden_dims[0] -> hidden_dims[1] -> ...
        if config.use_skip_connections:
            self.skip_projections = nn.ModuleList()
            skip_dim = config.input_features
            for hidden_dim in config.hidden_dims[:-1]:
                if skip_dim != hidden_dim:
                    self.skip_projections.append(nn.Linear(skip_dim, hidden_dim))
                else:
                    self.skip_projections.append(nn.Identity())
                skip_dim = hidden_dim

    def _add_layer(self, module, kind, type_idx):
        """Append ``module`` and remember which config entry produced it."""
        self.layers.append(module)
        self._layer_plan.append((kind, type_idx))

    def _get_activation(self):
        """Return a new activation module for ``config.activation`` (ReLU if unknown)."""
        activations = {'relu': nn.ReLU, 'tanh': nn.Tanh, 'gelu': nn.GELU}
        return activations.get(self.config.activation, nn.ReLU)()

    def _apply_skip(self, type_idx, x, skip_x):
        """Add the projected residual to ``x`` when the shapes allow it.

        Returns the (possibly updated) activation and the tensor to use as the
        residual source for the next layer. When the projection cannot be
        applied, both inputs are returned unchanged.
        """
        if not self.config.use_skip_connections:
            return x, skip_x
        if type_idx >= len(self.skip_projections) or skip_x.dim() != x.dim():
            return x, skip_x
        try:
            projected = self.skip_projections[type_idx](skip_x)
        except RuntimeError:
            # Feature width does not match the projection: go on without it.
            return x, skip_x
        if projected.shape == x.shape:
            x = x + projected
        return x, x

    def forward(self, x):
        """Map (batch, sequence_length, input_features) to (batch, output_dim)."""
        skip_x = x
        layer_types = self.config.layer_types
        # Config index of the last entry that actually produced a module.
        last_built_idx = self._layer_plan[-1][1] if self._layer_plan else -1

        for layer, (kind, idx) in zip(self.layers, self._layer_plan):
            if kind == 'projection':
                # Width adjustment in front of attention; the skip step runs
                # once, after the attention layer itself.
                x = layer(x)
                continue

            if kind in ('lstm', 'gru'):
                x, _ = layer(x)
                # Keep only the last time step when nothing sequential follows.
                if idx == last_built_idx or layer_types[idx + 1] == 'linear':
                    x = x[:, -1, :]

            elif kind == 'conv1d':
                # Conv1d expects (batch_size, features, sequence_length)
                x = layer(x.transpose(1, 2)).transpose(1, 2)

            elif kind == 'attention':
                x = layer(x)

            elif kind == 'linear':
                # If x is still 3D, take the last time step
                if x.dim() == 3:
                    x = x[:, -1, :]
                x = layer(x)

            x, skip_x = self._apply_skip(idx, x, skip_x)

        # Final output
        if x.dim() == 3:
            x = x[:, -1, :]

        return self.output_layer(x)

class INSDataset(Dataset):
    """Sliding-window dataset built from a tabular frame.

    Features and target are scaled, then cut into overlapping windows of
    ``sequence_length`` consecutive rows. Each item is
    ``(window, target)`` where the target is the scaled target value of the
    window's last row.

    Args:
        data: frame indexable by column name (``data[cols].values`` is used).
        target_column: name of the column to predict.
        feature_columns: list of input column names.
        sequence_length: number of consecutive rows per window.
        scaler_X: already fitted feature scaler to reuse; when ``None`` a new
            ``StandardScaler`` is fitted on ``data``.
        scaler_y: same as ``scaler_X`` but for the target column.

    The scalers that were used are exposed as ``self.scaler_X`` and
    ``self.scaler_y`` so they can be passed on to other splits.
    """

    def __init__(self, data, target_column, feature_columns, sequence_length, scaler_X=None, scaler_y=None):
        self.data = data
        self.target_column = target_column
        self.feature_columns = feature_columns
        self.sequence_length = sequence_length

        # Prepare features and targets
        X = data[feature_columns].values
        y = data[target_column].values

        self.scaler_X, X = self._scale(X, scaler_X)
        # Scalers work on 2-D input, so the target takes a round trip
        # through a single-column matrix.
        self.scaler_y, y = self._scale(y.reshape(-1, 1), scaler_y)
        y = y.flatten()

        # Create sequences
        self.sequences, self.targets = self._create_sequences(X, y)

    @staticmethod
    def _scale(values, scaler):
        """Fit a new StandardScaler when none is given, else reuse ``scaler``."""
        if scaler is None:
            scaler = StandardScaler()
            return scaler, scaler.fit_transform(values)
        return scaler, scaler.transform(values)

    def _create_sequences(self, X, y):
        """Cut ``X`` into windows and pair each with the target of its last row.

        Returns two float32 tensors. When ``X`` has fewer rows than
        ``sequence_length`` both tensors are empty.
        """
        starts = range(len(X) - self.sequence_length + 1)
        windows = [X[i:i + self.sequence_length] for i in starts]
        targets = [y[i + self.sequence_length - 1] for i in starts]

        if not windows:
            return torch.FloatTensor([]), torch.FloatTensor([])

        # Stack into one ndarray first: building a tensor straight from a
        # list of ndarrays falls back to a slow element-by-element path.
        return (
            torch.tensor(np.stack(windows), dtype=torch.float32),
            torch.tensor(np.asarray(targets), dtype=torch.float32),
        )

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        return self.sequences[idx], self.targets[idx]

class NASController:
    """Random-search controller: samples architectures, trains and ranks them.

    Attributes:
        architecture_history: configs of the successfully evaluated trials,
            in trial order.
        performance_history: result dicts of those trials; ``search`` sorts
            this list in place by validation loss, so after a search its
            order no longer matches ``architecture_history``.
    """

    def __init__(self, input_features, sequence_length, output_dim=1, device='cuda'):
        self.input_features = input_features
        self.sequence_length = sequence_length
        self.output_dim = output_dim
        self.device = device
        self.search_space = ArchitectureSearchSpace.get_search_space()
        self.architecture_history = []
        self.performance_history = []

    def sample_architecture(self) -> "ArchConfig":
        """Draw one random architecture from the search space.

        Hidden widths and dropout rates are drawn only from the candidates
        whose length equals that of the chosen layer stack.
        """
        space = self.search_space

        layer_types = random.choice(space['layer_types'])
        depth = len(layer_types)
        hidden_dims = random.choice(
            [dims for dims in space['hidden_dims'] if len(dims) == depth]
        )
        dropout_rates = random.choice(
            [rates for rates in space['dropout_rates'] if len(rates) == depth]
        )

        # Sample other hyperparameters
        attention_heads = random.choice(space['attention_heads'])
        use_skip_connections = random.choice(space['use_skip_connections'])
        activation = random.choice(space['activation'])

        return ArchConfig(
            sequence_length=self.sequence_length,
            input_features=self.input_features,
            hidden_dims=hidden_dims,
            layer_types=layer_types,
            attention_heads=attention_heads,
            dropout_rates=dropout_rates,
            use_skip_connections=use_skip_connections,
            activation=activation,
            output_dim=self.output_dim
        )

    def _run_epoch(self, model, loader, criterion, optimizer=None):
        """Run one pass over ``loader`` and return the mean batch loss.

        With an ``optimizer`` the model is put in train mode and updated
        (gradients clipped to norm 1.0); without one it is put in eval mode
        and gradients are disabled.

        Raises:
            ValueError: if ``loader`` yields no batches.
        """
        training = optimizer is not None
        model.train(training)
        total_loss = 0.0
        num_batches = 0

        with torch.set_grad_enabled(training):
            for batch_x, batch_y in loader:
                batch_x, batch_y = batch_x.to(self.device), batch_y.to(self.device)

                if training:
                    optimizer.zero_grad()

                # squeeze(-1) drops only the trailing output axis; a bare
                # squeeze() would also drop the batch axis of a size-1 batch.
                loss = criterion(model(batch_x).squeeze(-1), batch_y)

                if training:
                    loss.backward()
                    # Gradient clipping
                    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                    optimizer.step()

                total_loss += loss.item()
                num_batches += 1

        if num_batches == 0:
            raise ValueError("data loader yielded no batches")
        return total_loss / num_batches

    def evaluate_architecture(self, config: "ArchConfig", train_loader, val_loader,
                            epochs=20, patience=5) -> Dict:
        """Train a fresh model for ``config`` and collect its metrics.

        Training uses Adam (lr 0.001), MSE loss and ReduceLROnPlateau, and
        stops early once the validation loss has not improved for
        ``patience`` consecutive epochs.

        Returns:
            dict with keys ``val_loss`` (best validation loss seen),
            ``train_loss`` (last epoch), ``training_time`` (seconds),
            ``inference_time_ms`` (mean over 10 single-sample forward passes),
            ``total_params``, ``trainable_params``, ``epochs_trained`` and
            ``config``.
        """
        model = DynamicINSModel(config).to(self.device)
        optimizer = optim.Adam(model.parameters(), lr=0.001)
        criterion = nn.MSELoss()
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=3)

        best_val_loss = float('inf')
        patience_counter = 0
        train_losses = []

        start_time = time.perf_counter()

        for _ in range(epochs):
            avg_train_loss = self._run_epoch(model, train_loader, criterion, optimizer)
            avg_val_loss = self._run_epoch(model, val_loader, criterion)
            train_losses.append(avg_train_loss)

            # Learning rate scheduling
            scheduler.step(avg_val_loss)

            # Early stopping
            if avg_val_loss < best_val_loss:
                best_val_loss = avg_val_loss
                patience_counter = 0
            else:
                patience_counter += 1

            if patience_counter >= patience:
                break

        training_time = time.perf_counter() - start_time

        # Calculate model parameters
        total_params = sum(p.numel() for p in model.parameters())
        trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

        # Time single-sample inference. The sample is fetched once, outside
        # the timed loop, so data loading never leaks into the measurement.
        model.eval()
        sample = next(iter(val_loader))[0][:1].to(self.device)
        inference_times = []
        with torch.no_grad():
            for _ in range(10):  # Average over 10 runs
                start_inf = time.perf_counter()
                _ = model(sample)
                inference_times.append(time.perf_counter() - start_inf)

        avg_inference_time = np.mean(inference_times) * 1000  # Convert to ms

        return {
            'val_loss': best_val_loss,
            'train_loss': train_losses[-1],
            'training_time': training_time,
            'inference_time_ms': avg_inference_time,
            'total_params': total_params,
            'trainable_params': trainable_params,
            'epochs_trained': len(train_losses),
            'config': config
        }

    def search(self, train_loader, val_loader, num_trials=50, epochs_per_trial=20):
        """Sample and evaluate ``num_trials`` architectures.

        A trial that raises is reported and skipped. Returns
        ``self.performance_history`` sorted by ascending validation loss.
        """
        print(f"Starting NAS with {num_trials} trials...")
        print(f"Device: {self.device}")
        print(f"Input features: {self.input_features}")
        print(f"Sequence length: {self.sequence_length}")

        for trial in range(num_trials):
            print(f"\n--- Trial {trial + 1}/{num_trials} ---")

            # Sample architecture
            config = self.sample_architecture()

            print(f"Architecture: {config.layer_types}")
            print(f"Hidden dims: {config.hidden_dims}")
            print(f"Attention heads: {config.attention_heads}")
            print(f"Skip connections: {config.use_skip_connections}")

            try:
                results = self.evaluate_architecture(
                    config, train_loader, val_loader,
                    epochs=epochs_per_trial
                )
            except Exception as e:
                # Trial boundary: one invalid architecture must not abort
                # the whole search, so report it and move on.
                print(f"Trial {trial + 1} failed: {str(e)}")
                continue

            self.architecture_history.append(config)
            self.performance_history.append(results)

            print(f"Val Loss: {results['val_loss']:.4f}")
            print(f"Params: {results['total_params']:,}")
            print(f"Inference time: {results['inference_time_ms']:.2f}ms")
            print(f"Training time: {results['training_time']:.1f}s")

        # Sort results
        self.performance_history.sort(key=lambda x: x['val_loss'])

        print(f"\n{'='*50}")
        print("SEARCH COMPLETED!")
        print(f"{'='*50}")

        # Display top 5 architectures
        for i, result in enumerate(self.performance_history[:5]):
            config = result['config']
            print(f"\nRank {i+1}:")
            print(f"  Architecture: {config.layer_types}")
            print(f"  Hidden dims: {config.hidden_dims}")
            print(f"  Validation Loss: {result['val_loss']:.4f}")
            print(f"  Parameters: {result['total_params']:,}")
            print(f"  Inference time: {result['inference_time_ms']:.2f}ms")

        return self.performance_history

    def get_best_architecture(self):
        """Return the first entry of ``performance_history``, or None if empty.

        The first entry is the best one only after ``search`` has sorted the
        history.
        """
        if not self.performance_history:
            return None
        return self.performance_history[0]

    def save_results(self, filename):
        """Write ``performance_history`` to ``filename`` as indented JSON.

        The config object of every result is expanded into a plain dict of
        its fields.
        """
        metric_keys = (
            'val_loss', 'train_loss', 'training_time', 'inference_time_ms',
            'total_params', 'trainable_params', 'epochs_trained',
        )
        config_fields = (
            'sequence_length', 'input_features', 'hidden_dims', 'layer_types',
            'attention_heads', 'dropout_rates', 'use_skip_connections',
            'activation', 'output_dim',
        )

        serializable_results = []
        for result in self.performance_history:
            entry = {key: result[key] for key in metric_keys}
            entry['config'] = {
                field: getattr(result['config'], field) for field in config_fields
            }
            serializable_results.append(entry)

        with open(filename, 'w') as f:
            json.dump(serializable_results, f, indent=2)

# ====================================================================
# USER CONFIGURATION SECTION - MODIFY THIS FOR YOUR DATA
# ====================================================================

def configure_your_data():
    """
    Load the dataset and name its feature and target columns.

    Edit the three settings at the top of the body to match your data.
    Alternative ways of obtaining ``data`` are sketched in the comments at
    the end of the function.

    Returns:
        data (pd.DataFrame): Your dataset
        feature_columns (list): List of column names to use as input features
        target_column (str): Name of the column you want to predict
    """

    # METHOD 1 (active): load from a CSV file.
    csv_file_path = "your_dataset.csv"  # Replace with your CSV file path
    target_column = "your_target_column"  # Replace with your target column name

    # Option A: name the feature columns explicitly.
    # Option B: set this to None to use every column except the target.
    selected_features = [
        "feature1", "feature2", "feature3",
        "feature4", "feature5", "feature6",
        "feature7", "feature8", "feature9"
    ]

    data = pd.read_csv(csv_file_path)

    feature_columns = (
        selected_features
        if selected_features is not None
        else [col for col in data.columns if col != target_column]
    )

    return data, feature_columns, target_column

    # METHOD 2: use a DataFrame that is already in memory. Replace the body
    # above with something like:
    #
    # data = your_existing_dataframe
    # TARGET_COLUMN = "your_target_column"
    # FEATURE_COLUMNS = ["col1", "col2", "col3", "col4", "col5"]  # Your feature columns
    # return data, FEATURE_COLUMNS, TARGET_COLUMN

    # METHOD 3: enter the data by hand (small datasets or testing):
    #
    # data_dict = {
    #     'feature1': [1, 2, 3, 4, 5, ...],  # Your actual data
    #     'feature2': [1.1, 2.2, 3.3, 4.4, 5.5, ...],
    #     'feature3': [0.1, 0.2, 0.3, 0.4, 0.5, ...],
    #     'target': [10, 20, 30, 40, 50, ...]  # Your target values
    # }
    # data = pd.DataFrame(data_dict)
    # feature_columns = ['feature1', 'feature2', 'feature3']
    # target_column = 'target'
    # return data, feature_columns, target_column


# ====================================================================
# HYPERPARAMETER CONFIGURATION
# ====================================================================

def get_training_config():
    """
    Build and return the dictionary of run settings.

    A new dict is created on every call. Adjust the values to suit your
    dataset and the compute you have available.
    """

    data_params = {
        'SEQUENCE_LENGTH': 50,  # How many time steps to look back for prediction
        'BATCH_SIZE': 32,       # Training batch size (reduce if memory issues)
    }

    nas_params = {
        'NUM_TRIALS': 25,        # Number of architectures to try (increase for better results)
        'EPOCHS_PER_TRIAL': 15,  # Training epochs per architecture (balance speed vs accuracy)
    }

    training_params = {
        'FINAL_TRAINING_EPOCHS': 100,  # Epochs for final best model training
        'LEARNING_RATE': 0.001,        # Initial learning rate
        'PATIENCE': 10,                # Early stopping patience
    }

    split_params = {
        'TRAIN_SPLIT': 0.7,       # Training data ratio
        'VALIDATION_SPLIT': 0.2,  # Validation data ratio
        'TEST_SPLIT': 0.1,        # Test data ratio (remaining)
    }

    advanced_params = {
        'USE_EARLY_STOPPING': True,
        'USE_GRADIENT_CLIPPING': True,
        'GRADIENT_CLIP_VALUE': 1.0,
        'USE_SCHEDULER': True,
        'SCHEDULER_PATIENCE': 5,
    }

    ensemble_params = {
        'CREATE_ENSEMBLE': True,
        'ENSEMBLE_TOP_K': 3,  # Number of best models to ensemble
    }

    user_params = {
        'YOUR_TARGET_COLUMN': 'your_target_column',  # Set this to your target column name
        'YOUR_FEATURE_COLUMNS': ['feature1', 'feature2', 'feature3'],  # Set this to your feature columns
    }

    # Merged in this order so iteration over the result lists the groups
    # in the sequence above.
    return {
        **data_params,
        **nas_params,
        **training_params,
        **split_params,
        **advanced_params,
        **ensemble_params,
        **user_params,
    }


# ====================================================================
# MAIN EXECUTION FUNCTION
# ====================================================================

def main():
    """Run the end-to-end NAS pipeline for INS data.

    Stages, in order:
      1. Read the training configuration and load the data through
         ``configure_your_data()``; column names from the configuration
         override the loaded ones when they differ from the placeholders.
      2. Split the rows chronologically into train / validation / test and
         wrap them in ``INSDataset`` objects that reuse the training scalers.
      3. Run ``NASController.search`` and save its results as JSON.
      4. Retrain the best architecture (optional gradient clipping, LR
         scheduling and early stopping), writing a checkpoint whenever the
         validation loss improves.
      5. Restore the best checkpoint and report MAE / RMSE / R² on the test
         set in the original target scale.
      6. Optionally build and evaluate an ensemble, save diagnostic plots and
         print a summary.

    Returns:
        None. The function returns early, after printing a message, when
        data loading, dataset creation or the architecture search fails.
    """

    print("="*60)
    print("NEURAL ARCHITECTURE SEARCH FOR INS DATA PREDICTION")
    print("="*60)

    # Set device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    # Get configuration
    config = get_training_config()
    print(f"\nTraining Configuration:")
    for key, value in config.items():
        print(f"  {key}: {value}")

    # Load your data
    print("\n" + "="*40)
    print("LOADING DATA")
    print("="*40)

    try:
        data, feature_columns, target_column = configure_your_data()

        # Override with your specific configuration if provided
        # (the placeholder values mean "not configured here").
        if 'YOUR_TARGET_COLUMN' in config and config['YOUR_TARGET_COLUMN'] != 'your_target_column':
            target_column = config['YOUR_TARGET_COLUMN']
        if 'YOUR_FEATURE_COLUMNS' in config and config['YOUR_FEATURE_COLUMNS'] != ['feature1', 'feature2', 'feature3']:
            feature_columns = config['YOUR_FEATURE_COLUMNS']

        print(f"✅ Data loaded successfully!")
        print(f"📊 Dataset shape: {data.shape}")
        print(f"🎯 Target column: '{target_column}'")
        print(f"📈 Feature columns ({len(feature_columns)}): {feature_columns}")

        # Display first few rows
        print(f"\n📋 First 5 rows of data:")
        print(data[feature_columns + [target_column]].head())

        # Check for missing values
        missing_values = data[feature_columns + [target_column]].isnull().sum()
        if missing_values.any():
            print(f"⚠️  Missing values detected:")
            print(missing_values[missing_values > 0])
            print("Filling missing values with forward fill...")
            # ffill()/bfill() replace the deprecated fillna(method=...) form;
            # the backward fill covers gaps at the very start of the data.
            data = data.ffill().bfill()

        # Basic statistics
        print(f"\n📊 Data Statistics:")
        print(data[feature_columns + [target_column]].describe())

    except Exception as e:
        print(f"❌ Error loading data: {e}")
        print("Please check your data configuration in the configure_your_data() function.")
        return

    # Split data temporally (important for time series)
    print(f"\n" + "="*40)
    print("PREPARING DATA")
    print("="*40)

    # Calculate split indices; the test set is whatever remains after
    # the train and validation fractions.
    n_samples = len(data)
    train_end = int(config['TRAIN_SPLIT'] * n_samples)
    val_end = int((config['TRAIN_SPLIT'] + config['VALIDATION_SPLIT']) * n_samples)

    train_data = data[:train_end]
    val_data = data[train_end:val_end]
    test_data = data[val_end:]

    print(f"📊 Data splits:")
    print(f"  Training: {len(train_data)} samples ({len(train_data)/n_samples*100:.1f}%)")
    print(f"  Validation: {len(val_data)} samples ({len(val_data)/n_samples*100:.1f}%)")
    print(f"  Test: {len(test_data)} samples ({len(test_data)/n_samples*100:.1f}%)")

    if len(test_data) == 0:
        print("⚠️  Warning: No test data available. Using validation data for final testing.")
        test_data = val_data

    # Create datasets
    print(f"🔧 Creating datasets with sequence length: {config['SEQUENCE_LENGTH']}")

    try:
        train_dataset = INSDataset(train_data, target_column, feature_columns, config['SEQUENCE_LENGTH'])

        # Use the same scalers for validation and test data
        val_dataset = INSDataset(val_data, target_column, feature_columns, config['SEQUENCE_LENGTH'],
                                scaler_X=train_dataset.scaler_X, scaler_y=train_dataset.scaler_y)

        test_dataset = INSDataset(test_data, target_column, feature_columns, config['SEQUENCE_LENGTH'],
                                 scaler_X=train_dataset.scaler_X, scaler_y=train_dataset.scaler_y)

        print(f"✅ Datasets created successfully!")
        print(f"  Train sequences: {len(train_dataset)}")
        print(f"  Validation sequences: {len(val_dataset)}")
        print(f"  Test sequences: {len(test_dataset)}")

    except Exception as e:
        print(f"❌ Error creating datasets: {e}")
        print("This might be due to insufficient data for the specified sequence length.")
        print(f"Try reducing SEQUENCE_LENGTH (currently {config['SEQUENCE_LENGTH']}) in get_training_config().")
        return

    # Create data loaders (only the training loader is shuffled)
    train_loader = DataLoader(train_dataset, batch_size=config['BATCH_SIZE'], shuffle=True, num_workers=0)
    val_loader = DataLoader(val_dataset, batch_size=config['BATCH_SIZE'], shuffle=False, num_workers=0)
    test_loader = DataLoader(test_dataset, batch_size=config['BATCH_SIZE'], shuffle=False, num_workers=0)

    print(f"🔧 Data loaders created with batch size: {config['BATCH_SIZE']}")

    # Initialize NAS controller
    print(f"\n" + "="*40)
    print("INITIALIZING NEURAL ARCHITECTURE SEARCH")
    print("="*40)

    nas_controller = NASController(
        input_features=len(feature_columns),
        sequence_length=config['SEQUENCE_LENGTH'],
        output_dim=1,
        device=device
    )

    print(f"🧠 NAS Controller initialized")
    print(f"  Input features: {len(feature_columns)}")
    print(f"  Sequence length: {config['SEQUENCE_LENGTH']}")
    print(f"  Search trials: {config['NUM_TRIALS']}")
    print(f"  Epochs per trial: {config['EPOCHS_PER_TRIAL']}")

    # Run search
    print(f"\n" + "="*40)
    print("RUNNING ARCHITECTURE SEARCH")
    print("="*40)

    results = nas_controller.search(
        train_loader=train_loader,
        val_loader=val_loader,
        num_trials=config['NUM_TRIALS'],
        epochs_per_trial=config['EPOCHS_PER_TRIAL']
    )

    # Save results; the timestamp is shared by every file this run writes.
    timestamp = time.strftime("%Y%m%d_%H%M%S")
    results_filename = f'nas_results_{timestamp}.json'
    nas_controller.save_results(results_filename)
    print(f"\n💾 Results saved to '{results_filename}'")

    # Get and train the best architecture
    best_result = nas_controller.get_best_architecture()
    if not best_result:
        print("❌ No valid architectures found. Please check your data and try again.")
        return

    print(f"\n" + "="*40)
    print("TRAINING BEST ARCHITECTURE")
    print("="*40)

    best_config = best_result['config']
    print(f"🏆 Best architecture found:")
    print(f"  Layer types: {best_config.layer_types}")
    print(f"  Hidden dims: {best_config.hidden_dims}")
    print(f"  Validation loss: {best_result['val_loss']:.4f}")
    print(f"  Parameters: {best_result['total_params']:,}")

    # Train the best model for more epochs
    print(f"\n🚀 Training for {config['FINAL_TRAINING_EPOCHS']} epochs...")

    best_model = DynamicINSModel(best_config).to(device)
    optimizer = optim.Adam(best_model.parameters(), lr=config['LEARNING_RATE'])
    criterion = nn.MSELoss()

    scheduler = None
    if config['USE_SCHEDULER']:
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, patience=config['SCHEDULER_PATIENCE']
        )

    # Extended training
    best_val_loss = float('inf')
    patience_counter = 0
    train_losses = []
    val_losses = []

    # Defined up front so the name exists even if no epoch ever improves on
    # the initial loss (e.g. NaN losses) or the epoch count is zero.
    model_filename = f'best_ins_model_{timestamp}.pth'
    checkpoint_saved = False

    for epoch in range(config['FINAL_TRAINING_EPOCHS']):
        # Training phase
        best_model.train()
        epoch_train_loss = 0.0
        train_batches = 0

        for batch_x, batch_y in train_loader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)

            optimizer.zero_grad()
            outputs = best_model(batch_x)
            loss = criterion(outputs.squeeze(), batch_y)
            loss.backward()

            if config['USE_GRADIENT_CLIPPING']:
                torch.nn.utils.clip_grad_norm_(best_model.parameters(),
                                             max_norm=config['GRADIENT_CLIP_VALUE'])

            optimizer.step()

            epoch_train_loss += loss.item()
            train_batches += 1

        avg_train_loss = epoch_train_loss / train_batches

        # Validation phase
        best_model.eval()
        epoch_val_loss = 0.0
        val_batches = 0

        with torch.no_grad():
            for batch_x, batch_y in val_loader:
                batch_x, batch_y = batch_x.to(device), batch_y.to(device)
                outputs = best_model(batch_x)
                loss = criterion(outputs.squeeze(), batch_y)
                epoch_val_loss += loss.item()
                val_batches += 1

        avg_val_loss = epoch_val_loss / val_batches

        train_losses.append(avg_train_loss)
        val_losses.append(avg_val_loss)

        if scheduler is not None:
            scheduler.step(avg_val_loss)

        # Early stopping and model saving
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            torch.save(best_model.state_dict(), model_filename)
            checkpoint_saved = True
            patience_counter = 0
        else:
            patience_counter += 1

        if (epoch + 1) % 10 == 0:
            print(f"Epoch [{epoch+1:3d}/{config['FINAL_TRAINING_EPOCHS']}] - "
                  f"Train: {avg_train_loss:.4f}, Val: {avg_val_loss:.4f}, "
                  f"Best Val: {best_val_loss:.4f}")

        # Early stopping
        if config['USE_EARLY_STOPPING'] and patience_counter >= config['PATIENCE']:
            print(f"\n⏹️  Early stopping triggered after {epoch + 1} epochs")
            break

    if checkpoint_saved:
        # The in-memory model holds the last epoch's weights; restore the
        # checkpoint with the best validation loss so the test metrics
        # describe the model that was actually saved.
        best_model.load_state_dict(torch.load(model_filename, map_location=device))
    else:
        print("⚠️  Validation loss never improved; saving final-epoch weights instead.")
        torch.save(best_model.state_dict(), model_filename)

    print(f"\n✅ Training completed!")
    print(f"💾 Best model saved as '{model_filename}'")
    print(f"🏆 Final best validation loss: {best_val_loss:.4f}")

    # Model evaluation on test set
    print(f"\n" + "="*40)
    print("FINAL MODEL EVALUATION")
    print("="*40)

    best_model.eval()
    test_predictions = []
    test_actuals = []

    with torch.no_grad():
        for batch_x, batch_y in test_loader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            outputs = best_model(batch_x)

            # Inverse transform predictions and actuals
            pred_np = outputs.cpu().numpy().reshape(-1, 1)
            actual_np = batch_y.cpu().numpy().reshape(-1, 1)

            pred_original = test_dataset.scaler_y.inverse_transform(pred_np).flatten()
            actual_original = test_dataset.scaler_y.inverse_transform(actual_np).flatten()

            test_predictions.extend(pred_original)
            test_actuals.extend(actual_original)

    test_predictions = np.array(test_predictions)
    test_actuals = np.array(test_actuals)

    # Calculate metrics
    mae = np.mean(np.abs(test_predictions - test_actuals))
    rmse = np.sqrt(np.mean((test_predictions - test_actuals) ** 2))
    r2 = 1 - (np.sum((test_actuals - test_predictions) ** 2) /
              np.sum((test_actuals - np.mean(test_actuals)) ** 2))

    print(f"📊 Final Test Performance:")
    print(f"  MAE:  {mae:.4f}")
    print(f"  RMSE: {rmse:.4f}")
    print(f"  R²:   {r2:.4f}")

    # Create ensemble if requested
    ensemble_filename = None
    if config['CREATE_ENSEMBLE'] and len(results) >= config['ENSEMBLE_TOP_K']:
        print(f"\n" + "="*40)
        print(f"CREATING ENSEMBLE MODEL (Top {config['ENSEMBLE_TOP_K']})")
        print("="*40)

        # NOTE(review): create_ensemble_model() builds new DynamicINSModel
        # instances from the configs and no trained weights are loaded into
        # them here, so these metrics appear to describe freshly initialised
        # sub-models — confirm whether per-model training/loading is missing.
        ensemble_model = create_ensemble_model(results, config['ENSEMBLE_TOP_K'])
        ensemble_model = ensemble_model.to(device)

        # Evaluate ensemble
        ensemble_model.eval()
        ensemble_predictions = []

        with torch.no_grad():
            for batch_x, batch_y in test_loader:
                batch_x = batch_x.to(device)
                outputs = ensemble_model(batch_x)
                pred_np = outputs.cpu().numpy().reshape(-1, 1)
                pred_original = test_dataset.scaler_y.inverse_transform(pred_np).flatten()
                ensemble_predictions.extend(pred_original)

        ensemble_predictions = np.array(ensemble_predictions)

        # Ensemble metrics (test_actuals is reused; the loader is unshuffled)
        ensemble_mae = np.mean(np.abs(ensemble_predictions - test_actuals))
        ensemble_rmse = np.sqrt(np.mean((ensemble_predictions - test_actuals) ** 2))
        ensemble_r2 = 1 - (np.sum((test_actuals - ensemble_predictions) ** 2) /
                          np.sum((test_actuals - np.mean(test_actuals)) ** 2))

        print(f"📊 Ensemble Performance:")
        print(f"  MAE:  {ensemble_mae:.4f}")
        print(f"  RMSE: {ensemble_rmse:.4f}")
        print(f"  R²:   {ensemble_r2:.4f}")

        # Save ensemble model
        ensemble_filename = f'ensemble_model_{timestamp}.pth'
        torch.save(ensemble_model.state_dict(), ensemble_filename)
        print(f"💾 Ensemble model saved as '{ensemble_filename}'")

    # Create visualization plots
    print(f"\n" + "="*40)
    print("CREATING VISUALIZATIONS")
    print("="*40)

    plot_filename = f'nas_results_visualization_{timestamp}.png'
    plot_saved = False  # only list the file in the summary if it was written

    try:
        plt.figure(figsize=(18, 6))

        # Plot 1: Actual vs Predicted
        plt.subplot(1, 3, 1)
        sample_size = min(500, len(test_actuals))
        plt.scatter(test_actuals[:sample_size], test_predictions[:sample_size], alpha=0.6, s=10)
        min_val, max_val = min(test_actuals.min(), test_predictions.min()), max(test_actuals.max(), test_predictions.max())
        plt.plot([min_val, max_val], [min_val, max_val], 'r--', lw=2, label='Perfect Prediction')
        plt.xlabel(f'Actual {target_column}')
        plt.ylabel(f'Predicted {target_column}')
        plt.title(f'Actual vs Predicted\n(R² = {r2:.3f})')
        plt.legend()
        plt.grid(True, alpha=0.3)

        # Plot 2: Time series comparison
        plt.subplot(1, 3, 2)
        comparison_length = min(200, len(test_actuals))
        time_steps = range(comparison_length)
        plt.plot(time_steps, test_actuals[:comparison_length], label='Actual', linewidth=2)
        plt.plot(time_steps, test_predictions[:comparison_length], label='Predicted', linewidth=2, alpha=0.8)
        plt.xlabel('Time Steps')
        plt.ylabel(f'{target_column} Value')
        plt.title(f'Time Series Comparison\n(MAE = {mae:.3f})')
        plt.legend()
        plt.grid(True, alpha=0.3)

        # Plot 3: Error distribution
        plt.subplot(1, 3, 3)
        errors = test_predictions - test_actuals
        plt.hist(errors, bins=50, alpha=0.7, edgecolor='black', color='skyblue')
        plt.xlabel('Prediction Error')
        plt.ylabel('Frequency')
        plt.title(f'Error Distribution\n(RMSE = {rmse:.3f})')
        plt.axvline(x=0, color='r', linestyle='--', linewidth=2, label='Zero Error')
        plt.axvline(x=np.mean(errors), color='orange', linestyle='-', linewidth=2, label=f'Mean Error = {np.mean(errors):.3f}')
        plt.legend()
        plt.grid(True, alpha=0.3)

        plt.tight_layout()
        plt.savefig(plot_filename, dpi=300, bbox_inches='tight')
        plot_saved = True
        print(f"📊 Visualizations saved as '{plot_filename}'")

        # Try to show the plot; a display failure must not lose the saved
        # file, but KeyboardInterrupt/SystemExit should still propagate.
        try:
            plt.show()
        except Exception:
            print("📊 Plot saved but cannot display in this environment")

    except Exception as e:
        print(f"⚠️  Could not create visualizations: {e}")

    # Final summary
    print(f"\n" + "="*60)
    print("FINAL SUMMARY")
    print("="*60)

    print(f"✅ Neural Architecture Search completed successfully!")
    print(f"📊 Dataset: {len(feature_columns)} features, {target_column} target")
    print(f"🔍 Searched {config['NUM_TRIALS']} architectures")
    print(f"🏆 Best architecture performance:")
    print(f"   • Validation Loss: {best_result['val_loss']:.4f}")
    print(f"   • Test MAE: {mae:.4f}")
    print(f"   • Test RMSE: {rmse:.4f}")
    print(f"   • Test R²: {r2:.4f}")
    print(f"   • Model Parameters: {best_result['total_params']:,}")
    print(f"   • Inference Time: {best_result['inference_time_ms']:.2f}ms")

    print(f"\n🏗️  Best Architecture:")
    print(f"   • Layers: {best_config.layer_types}")
    print(f"   • Hidden Dimensions: {best_config.hidden_dims}")
    print(f"   • Attention Heads: {best_config.attention_heads}")
    print(f"   • Skip Connections: {best_config.use_skip_connections}")
    print(f"   • Activation: {best_config.activation}")

    print(f"\n💾 Files created:")
    print(f"   • Model: {model_filename}")
    print(f"   • Results: {results_filename}")
    if plot_saved:
        print(f"   • Visualization: {plot_filename}")
    if ensemble_filename is not None:
        print(f"   • Ensemble: {ensemble_filename}")

    print(f"\n🚀 To use your trained model:")
    print(f"   model = DynamicINSModel(best_config)")
    print(f"   model.load_state_dict(torch.load('{model_filename}'))")

    print(f"\n" + "="*60)
    print("NAS COMPLETED SUCCESSFULLY! 🎉")
    print("="*60)


# ====================================================================
# UTILITY FUNCTIONS FOR LOADING SAVED MODELS
# ====================================================================

def load_trained_model(model_path: str, config_path: str):
    """
    Rebuild a model from a saved NAS results file and load its weights.

    The architecture is taken from the first entry of the JSON results,
    which is treated as the best-performing one.

    Args:
        model_path: Path to the saved weights (.pth file)
        config_path: Path to the NAS results (.json file)

    Returns:
        model: The reconstructed model, on CUDA when available (CPU
            otherwise) and switched to evaluation mode
        config: The ArchConfig the model was built from
    """

    with open(config_path, 'r') as results_file:
        nas_results = json.load(results_file)

    # Entry 0 is used as the winning architecture.
    saved_fields = nas_results[0]['config']

    # Copy exactly the fields ArchConfig is constructed from, by keyword.
    arch_field_names = (
        'sequence_length',
        'input_features',
        'hidden_dims',
        'layer_types',
        'attention_heads',
        'dropout_rates',
        'use_skip_connections',
        'activation',
        'output_dim',
    )
    arch_kwargs = {name: saved_fields[name] for name in arch_field_names}
    best_config = ArchConfig(**arch_kwargs)

    # Prefer the GPU when one is present.
    if torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')

    model = DynamicINSModel(best_config).to(device)
    state_dict = torch.load(model_path, map_location=device)
    model.load_state_dict(state_dict)
    model.eval()

    return model, best_config


def predict_with_trained_model(model, data, feature_columns, sequence_length, scaler_X, scaler_y):
    """
    Make predictions using a trained model.

    One prediction is produced for every full window of ``sequence_length``
    consecutive rows, so ``len(data) - sequence_length + 1`` windows are
    evaluated. The model is switched to evaluation mode as a side effect.

    Args:
        model: Trained PyTorch model (must have at least one parameter; its
            device is used for inference)
        data: Input data (DataFrame)
        feature_columns: List of feature column names
        sequence_length: Sequence length used during training
        scaler_X: Fitted feature scaler from training
        scaler_y: Fitted target scaler from training

    Returns:
        predictions: Predicted values (original scale), as a flat array

    Raises:
        ValueError: If ``sequence_length`` is not positive, or if ``data``
            has fewer rows than ``sequence_length``.
    """

    if sequence_length < 1:
        raise ValueError(f"sequence_length must be a positive integer, got {sequence_length}")

    device = next(model.parameters()).device

    # Prepare input data
    X_scaled = np.asarray(scaler_X.transform(data[feature_columns].values))

    # Create sequences
    num_windows = len(X_scaled) - sequence_length + 1
    if num_windows < 1:
        raise ValueError(f"Not enough data for sequence length {sequence_length}")

    # Stack into one contiguous array first: building a tensor straight from
    # a Python list of ndarrays is slow.
    windows = np.stack([X_scaled[i:i + sequence_length] for i in range(num_windows)])
    X_tensor = torch.as_tensor(windows, dtype=torch.float32)  # kept on the CPU

    # Make predictions
    model.eval()
    batch_size = 32  # Process in batches to avoid memory issues
    batch_outputs = []

    with torch.no_grad():
        for start in range(0, len(X_tensor), batch_size):
            # Only the current batch is moved to the model's device, so
            # device memory use is bounded by the batch size.
            batch = X_tensor[start:start + batch_size].to(device)
            batch_outputs.append(model(batch).cpu().numpy().reshape(-1))

    # Inverse transform predictions
    predictions = np.concatenate(batch_outputs).reshape(-1, 1)
    predictions_original = scaler_y.inverse_transform(predictions).flatten()

    return predictions_original


# Advanced ensemble method
class EnsembleINSModel(nn.Module):
    """Weighted-average ensemble of models built from several architectures.

    One ``DynamicINSModel`` is instantiated per config. The sub-models are
    created here and no weights are loaded into them by this class.

    The normalised weights are held in a non-persistent buffer named
    ``weights``: they move with the module on ``.to(device)`` but are not
    written to ``state_dict()``, so the checkpoint layout is the same as if
    they were a plain attribute.
    """

    def __init__(self, configs: List[ArchConfig], weights: List[float] = None):
        """
        Args:
            configs: Architecture configs, one per ensemble member.
            weights: Optional per-model weights (same length as ``configs``).
                ``None`` or an empty sequence means equal weights. Weights
                are normalised to sum to 1.

        Raises:
            ValueError: If ``configs`` is empty, the number of weights does
                not match the number of configs, or the weights sum to zero.
        """
        super().__init__()
        if len(configs) == 0:
            raise ValueError("EnsembleINSModel needs at least one config")

        self.models = nn.ModuleList([DynamicINSModel(config) for config in configs])

        if weights is None or len(weights) == 0:
            weights = [1.0] * len(configs)
        elif len(weights) != len(configs):
            raise ValueError(
                f"Expected {len(configs)} weights (one per config), got {len(weights)}"
            )

        weight_tensor = torch.as_tensor(weights, dtype=torch.float32).reshape(-1)
        total = weight_tensor.sum()
        if total == 0:
            # Normalising would divide by zero and turn every output into NaN.
            raise ValueError("Ensemble weights must not sum to zero")

        # Normalize weights
        self.register_buffer('weights', weight_tensor / total, persistent=False)

    def forward(self, x):
        """Return the weighted average of the sub-models' outputs for ``x``."""
        # (num_models, *per_model_output_shape)
        stacked_outputs = torch.stack([model(x) for model in self.models], dim=0)

        # Give the weights one trailing singleton axis per output dimension
        # so they broadcast over whatever shape the sub-models return.
        weights = self.weights.to(stacked_outputs.device)
        weights = weights.view(-1, *([1] * (stacked_outputs.dim() - 1)))

        return (stacked_outputs * weights).sum(dim=0)


def create_ensemble_model(nas_results: List[Dict], top_k: int = 3):
    """Create ensemble from top-k NAS results.

    The first ``top_k`` entries of ``nas_results`` are used as given; the
    list is not re-sorted here (presumably the caller passes it ordered
    best-first — TODO confirm against ``NASController.search``).

    Each selected model is weighted by the inverse of its validation loss,
    so a lower loss gives a higher weight while every selected model still
    contributes. (Weighting by ``max_loss - loss`` would always give the
    worst selected model a weight of exactly zero.) Entries whose loss is
    negative or not finite get weight 0; if no usable weight remains, all
    models are weighted equally.

    Args:
        nas_results: Result dicts, each with a ``'config'`` and a
            ``'val_loss'`` entry.
        top_k: Number of leading results to combine.

    Returns:
        An ``EnsembleINSModel`` built from the selected configs and weights.

    Raises:
        ValueError: If ``top_k`` is less than 1 or ``nas_results`` is empty.
    """
    import math  # local import: only needed for the finiteness check below

    if top_k < 1:
        raise ValueError(f"top_k must be at least 1, got {top_k}")

    selected = nas_results[:top_k]
    if not selected:
        raise ValueError("nas_results is empty; cannot build an ensemble")

    # Get top-k configurations
    top_configs = [result['config'] for result in selected]

    # Calculate ensemble weights based on validation performance
    val_losses = [float(result['val_loss']) for result in selected]

    # Convert losses to weights (lower loss = higher weight). The epsilon
    # keeps a loss of exactly zero from dividing by zero.
    epsilon = 1e-8
    weights = [
        1.0 / (loss + epsilon) if math.isfinite(loss) and loss >= 0 else 0.0
        for loss in val_losses
    ]

    # Handle the case where no loss yields a usable weight
    if sum(weights) == 0:
        weights = [1.0] * len(val_losses)

    return EnsembleINSModel(top_configs, weights)


class EarlyStopping:
    """Track validation loss across calls and signal when training should stop.

    Call the instance once per epoch with the current validation loss and
    the model. It returns True once ``patience`` calls have accumulated
    without the loss dropping more than ``min_delta`` below the best value
    seen; at that point the best weights are loaded back into the model if
    ``restore_best_weights`` is set.
    """
    def __init__(self, patience=7, min_delta=0, restore_best_weights=True):
        # Settings
        self.restore_best_weights = restore_best_weights
        self.min_delta = min_delta
        self.patience = patience
        # Running state
        self.best_weights = None
        self.best_loss = None
        self.counter = 0

    def __call__(self, val_loss, model):
        has_baseline = self.best_loss is not None

        if has_baseline and not (val_loss < self.best_loss - self.min_delta):
            # No sufficient improvement this time.
            self.counter += 1
        else:
            # Either the very first observation or a new best loss.
            self.best_loss = val_loss
            if has_baseline:
                self.counter = 0
            self.save_checkpoint(model)

        if not self.counter >= self.patience:
            return False

        if self.restore_best_weights:
            model.load_state_dict(self.best_weights)
        return True

    def save_checkpoint(self, model):
        # Deep copy so later training steps cannot modify the snapshot.
        snapshot = model.state_dict()
        self.best_weights = copy.deepcopy(snapshot)


# ====================================================================
# SIMPLE USAGE EXAMPLE
# ====================================================================

def simple_usage_example():
    """
    Print short instructions for pointing this script at your own data.
    """
    # Kept as one literal so the help text is emitted exactly as written.
    usage_text = """
SIMPLE USAGE INSTRUCTIONS:

1. Modify the 'configure_your_data()' function:
   - Set your CSV file path
   - Set your target column name
   - Set your feature column names

2. Or modify the 'get_training_config()' function:
   - Set YOUR_TARGET_COLUMN to your target column name
   - Set YOUR_FEATURE_COLUMNS to your feature column names

3. Run the script:
   python nas_ins_model.py

Example configuration for get_training_config():

def get_training_config():
    config = {
        # ... other settings ...
        'YOUR_TARGET_COLUMN': 'velocity',  # Your actual target column
        'YOUR_FEATURE_COLUMNS': ['accel_x', 'accel_y', 'accel_z', 'gyro_x', 'gyro_y']  # Your features
    }
    return config
    """
    print(usage_text)


if __name__ == "__main__":
    import sys

    # Print the usage text when the first argument asks for help;
    # otherwise run the full pipeline.
    cli_args = sys.argv[1:]
    if cli_args and cli_args[0] in ['-h', '--help', 'help']:
        simple_usage_example()
    else:
        main()