In [1]:
import numpy as np
import pandas as pd
from robust_models.IRM_model.IRMClassifier import IRMClassifier
from robust_models.DRO_model.DROClassifier import GroupDROClassifier
from robust_models.DRO_model.AdversarialDRO import AdversarialLabelDRO
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import log_loss, f1_score
import torch.nn as nn
import torch

In [2]:
def nll_loss(true_labels, predicted_probs):
    """Return the mean negative log-likelihood of the true classes.

    Args:
        true_labels: Array-like of integer class indices, one per sample.
            Lists, NumPy arrays and pandas Series (with any index) are
            accepted; the values are flattened to one dimension.
        predicted_probs: Array-like of shape (n_samples, n_classes) where
            entry ``[i, k]`` is the predicted probability of class ``k``
            for sample ``i``.

    Returns:
        The average of ``-log(p_i)`` over all samples, where ``p_i`` is the
        probability predicted for sample ``i``'s true class, floored at
        1e-15 so that a zero probability yields a large finite penalty.

    Raises:
        ValueError: If the inputs are empty, ``predicted_probs`` is not
            two-dimensional, or the number of labels and rows differ.
    """
    labels = np.asarray(true_labels).reshape(-1)
    probs = np.asarray(predicted_probs, dtype=float)

    if probs.ndim != 2:
        raise ValueError("predicted_probs must be 2-D (n_samples, n_classes)")
    if len(labels) != len(probs):
        raise ValueError("true_labels and predicted_probs must have the same length")
    if len(labels) == 0:
        raise ValueError("nll_loss is undefined for empty input")

    # Positional gather: row i, column labels[i]. Working on plain arrays
    # avoids label-based lookups when a pandas Series is passed in.
    true_class_probs = probs[np.arange(len(labels)), labels]

    # Floor the probabilities to avoid log(0) = -inf.
    true_class_probs = np.maximum(true_class_probs, 1e-15)

    return -np.log(true_class_probs).mean()

In [3]:
# Source dataset: the last CSV column is the label, every other column is a feature.
data = pd.read_csv('../data/electricity_source.csv', index_col=[0])
source_label_col = data.columns[-1]
data[source_label_col] = data[source_label_col].astype(int)
X = data[list(data.columns[:-1])].values
y = data[source_label_col].values

# Target dataset (kept as X_ood / y_ood): same layout, with the row index reset to 0..n-1.
data_target = pd.read_csv('../data/electricity_target.csv', index_col=[0])
target_label_col = data_target.columns[-1]
data_target[target_label_col] = data_target[target_label_col].astype(int)
data_target = data_target.reset_index(drop=True)
X_ood = data_target[list(data_target.columns[:-1])].values
y_ood = data_target[target_label_col].values

In [4]:

# Build the AdversarialLabelDRO classifier; input_dim follows the number of
# feature columns in X.
# NOTE(review): eta_pi and r are passed through as-is -- their exact meaning
# is defined by AdversarialLabelDRO; confirm against that class.
model = AdversarialLabelDRO(input_dim=X.shape[1], hidden_dims=[64, 32], eta_pi=0.1, r=0.1)

# Fit on the source data only.
model.fit(X, y, epochs=20, batch_size=64)

# Predict on the target set; the F1 score is the cell's displayed value.
predictions = model.predict(X_ood)
f1_score(y_ood, predictions)

0.5347036463466142

In [14]:
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import TensorDataset, DataLoader
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


def generate_multi_domain_data(X, y, n_domains=2, method='random', random_state=42):
    """
    Assign every sample to one of ``n_domains`` pseudo-domains.

    Args:
        X: Input features, shape (n_samples, n_features)
        y: Target labels, shape (n_samples,)
        n_domains: Number of domains to create
        method:
            'random' - each sample gets a uniformly random domain id
            'kmeans' - domain id is the KMeans cluster of the sample
            'label_shift' - a logistic regression is fitted on a stratified
                half of the data; samples are then binned by the model's
                confidence (max predicted class probability), using
                quantile edges computed on the held-out half
        random_state: Random seed for reproducibility

    Returns:
        X, y, domain_labels: The inputs, returned unchanged, and an integer
        array of length ``len(X)`` with values in ``[0, n_domains)``.
        ``domain_labels[i]`` always refers to sample ``X[i]``.

    Raises:
        ValueError: If ``method`` is not one of the supported names.

    Note:
        Seeds NumPy's global random generator with ``random_state`` as a
        side effect.
    """
    np.random.seed(random_state)

    if method == 'random':
        # Randomly assign each sample to a domain
        domain_labels = np.random.randint(0, n_domains, size=len(X))

    elif method == 'kmeans':
        # Use KMeans clustering to create domain boundaries
        kmeans = KMeans(n_clusters=n_domains, random_state=random_state, n_init='auto')
        domain_labels = kmeans.fit_predict(X)

        # If some cluster ids ended up empty, fill exactly those ids. Each
        # one takes a random sample from the currently largest domain, so
        # no other domain can be emptied by the repair.
        missing_ids = np.setdiff1d(np.arange(n_domains), np.unique(domain_labels))
        for missing_id in missing_ids:
            donor = np.argmax(np.bincount(domain_labels, minlength=n_domains))
            candidates = np.flatnonzero(domain_labels == donor)
            domain_labels[np.random.choice(candidates)] = missing_id

    elif method == 'label_shift':
        # Fit on one stratified half, calibrate the bin edges on the other.
        X_train, X_cal, y_train, _ = train_test_split(
            X, y, test_size=0.5, stratify=y, random_state=random_state
        )

        # Train simple logistic regression model
        clf = LogisticRegression(max_iter=1000, random_state=random_state)
        clf.fit(X_train, y_train)

        # Confidence = max predicted probability; quantile edges come from
        # the calibration half only.
        cal_confidences = np.max(clf.predict_proba(X_cal), axis=1)
        bin_edges = np.quantile(cal_confidences, np.linspace(0, 1, n_domains + 1))

        # train_test_split shuffles, so the two halves are NOT the first and
        # last rows of X. Score the full dataset in its original order so
        # every domain label lines up with its own sample.
        all_confidences = np.max(clf.predict_proba(X), axis=1)
        domain_labels = np.digitize(all_confidences, bin_edges[1:-1], right=False)

    else:
        raise ValueError("method must be 'random', 'kmeans', or 'label_shift'")

    return X, y, domain_labels

In [15]:
# Seed PyTorch so DataLoader shuffling (and any torch-based randomness inside
# IRMClassifier) is repeatable across runs.
SEED = 42
torch.manual_seed(SEED)

# Assign each source sample to a pseudo-domain via KMeans clusters.
# X and y are returned unchanged by generate_multi_domain_data.
X, y, domains = generate_multi_domain_data(X, y, method='kmeans', random_state=SEED)


# Create one shuffled DataLoader per domain; the list is passed to IRMClassifier.fit
domain_loaders = []
for domain_id in np.unique(domains):
    domain_mask = (domains == domain_id)
    X_domain = X[domain_mask]
    y_domain = y[domain_mask]

    # Convert to PyTorch tensors
    X_tensor = torch.tensor(X_domain, dtype=torch.float32)
    y_tensor = torch.tensor(y_domain, dtype=torch.long)

    # Create DataLoader for this domain
    dataset = TensorDataset(X_tensor, y_tensor)
    loader = DataLoader(dataset, batch_size=32, shuffle=True)
    domain_loaders.append(loader)

    print(f"Domain {domain_id}: {len(X_domain)} samples")

# Create test DataLoader over the target set (order preserved)
X_test_tensor = torch.tensor(X_ood, dtype=torch.float32)
y_test_tensor = torch.tensor(y_ood, dtype=torch.long)
test_loader = DataLoader(
    TensorDataset(X_test_tensor, y_test_tensor),
    batch_size=128,
    shuffle=False
)

# Initialize IRM model
# NOTE(review): the irm_* hyperparameters are passed through as-is; their
# exact semantics are defined by IRMClassifier -- confirm there.
model = IRMClassifier(
    input_size=X.shape[1],
    num_classes=len(np.unique(y)),
    hidden_size=64,
    dropout=0.2,
    learning_rate=1e-3,
    irm_lambda=0.8,
    irm_penalty_anneal_iters=500
)

# Train the model
model.fit(domain_loaders, n_iterations=3000)

# Evaluate on test set
model.eval()
test_preds = []
test_true = []

with torch.no_grad():
    for x_batch, y_batch in test_loader:
        preds = model.predict(x_batch)
        test_preds.extend(preds)
        test_true.extend(y_batch.numpy())

# The metric is F1 (the same metric used for the DRO baseline), so name and
# report it as F1 rather than accuracy.
test_f1 = f1_score(test_true, test_preds)
print(f"\nTest F1: {test_f1:.4f}")



[WinError 2] The system cannot find the file specified
  File "c:\Users\Irina\anaconda3\envs\synth_shift\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "c:\Users\Irina\anaconda3\envs\synth_shift\Lib\subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Irina\anaconda3\envs\synth_shift\Lib\subprocess.py", line 1026, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "c:\Users\Irina\anaconda3\envs\synth_shift\Lib\subprocess.py", line 1538, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^


Domain 0: 9252 samples
Domain 1: 1063 samples

Test Accuracy: 0.6877
