In [10]:
import torch
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
import sys
import os

# Import the project-local Cube dataset.
# NOTE(review): nothing in this cell modifies sys.path, so the `common` package
# must already be importable (installed, or the kernel started from the right
# directory) for this line to work.
from common.datasets import CubeDataset

# Seed the global torch and numpy RNGs; the dataset, the split and the forest
# below additionally receive SEED explicitly.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

# Build the Cube dataset (20 feature columns, 1000 rows).
print("Generating Cube dataset...")
dataset = CubeDataset(
    n_features=20,
    n_samples=1000,  # Reduced for faster execution
    seed=SEED,
    non_informative_feature_mean=0.5,
    informative_feature_variance=0.2,
    non_informative_feature_variance=0.3
)
# NOTE(review): CubeDataset may already generate in its constructor - confirm
# whether this explicit call is needed.
dataset.generate_data()

# Pull the full feature matrix and label vector and print a quick summary.
features, labels = dataset.get_all_data()
print(f"Dataset shape: {features.shape}")
print(f"Number of classes: {len(torch.unique(labels))}")
# minlength=8 pads the histogram so it always has at least eight bins.
print(f"Class distribution: {torch.bincount(labels.long(), minlength=8)}")

# scikit-learn works on numpy arrays, so convert the tensors.
X = features.numpy()
y = labels.numpy()

# Hold out 20% of the rows for testing; stratify=y keeps the class proportions
# the same in both splits.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED, stratify=y
)

print(f"Train set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")

# Fit a 100-tree random forest (depth capped at 10) on every feature column.
print("\nTraining Random Forest classifier...")
rf_classifier = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=SEED,
    n_jobs=-1  # Use all available cores
)
rf_classifier.fit(X_train, y_train)

# Predict the held-out rows.
y_pred = rf_classifier.predict(X_test)

# Report plain accuracy on the held-out rows.
accuracy = accuracy_score(y_test, y_pred)
print(f"\nTest accuracy: {accuracy:.4f}")



Generating Cube dataset...
Dataset shape: torch.Size([1000, 20])
Number of classes: 8
Class distribution: tensor([133, 117, 113, 146, 125, 113, 119, 134])
Train set shape: (800, 20)
Test set shape: (200, 20)

Training Random Forest classifier...

Test accuracy: 0.8600


In [19]:
import torch
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
import sys
import os

# Import the project-local AFAContext dataset.
# NOTE(review): nothing in this cell modifies sys.path, so the `common` package
# must already be importable for this line to work.
from common.datasets import AFAContextDataset

# Seed the global torch and numpy RNGs; the dataset and the split below
# additionally receive SEED explicitly.
SEED = 49
torch.manual_seed(SEED)
np.random.seed(SEED)

# Build the AFAContext dataset.
print("Generating AFAContext dataset...")
dataset = AFAContextDataset(
    n_samples=1000,  # Reduced for faster execution
    sigma_bin=0.1,
    sigma_cube=0.3,
    bin_feature_cost=5.0,
    n_dummy_features=10,
    seed=SEED,
    non_informative_feature_mean=0.5,
    non_informative_feature_variance=0.3
)
# NOTE(review): the imported class may already generate in its constructor -
# confirm whether this explicit call is needed.
dataset.generate_data()

# Pull the full feature matrix and label vector and print a quick summary.
features, labels = dataset.get_all_data()
print(f"Dataset shape: {features.shape}")
print(f"Number of classes: {len(torch.unique(labels))}")
# minlength=8 pads the histogram so it always has at least eight bins.
print(f"Class distribution: {torch.bincount(labels.long(), minlength=8)}")

# scikit-learn works on numpy arrays, so convert the tensors.
X = features.numpy()
y = labels.numpy()

# Hold out 20% of the rows for testing; stratify=y keeps the class proportions
# the same in both splits.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED, stratify=y
)

print(f"Train set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")

# Function to train and evaluate a random forest on a subset of features
def train_and_evaluate_rf(X_train, X_test, y_train, y_test, feature_indices, description):
    """Fit a random forest on a subset of columns and report test accuracy.

    Args:
        X_train: Training feature matrix; columns are selected with
            ``X_train[:, feature_indices]``.
        X_test: Test feature matrix, sliced with the same column indices.
        y_train: Training labels.
        y_test: Test labels the predictions are scored against.
        feature_indices: Column indices to keep for both matrices.
        description: Human-readable name of the subset, used in the log line.

    Returns:
        The accuracy of the fitted forest on the selected test columns.

    The forest is seeded with the notebook-level ``SEED``.
    """
    print(f"\nTraining Random Forest classifier on {description}...")

    # Restrict both splits to the requested columns.
    train_columns = X_train[:, feature_indices]
    test_columns = X_test[:, feature_indices]

    # 100 trees, depth capped at 10, trained on every available core.
    forest = RandomForestClassifier(
        n_estimators=100, max_depth=10, random_state=SEED, n_jobs=-1
    )
    forest.fit(train_columns, y_train)

    # Score the held-out predictions.
    predictions = forest.predict(test_columns)
    score = accuracy_score(y_test, predictions)
    print(f"Test accuracy: {score:.4f}")

    return score

# Every experiment below fits a fresh forest on one slice of the feature
# columns, using the same train/test split each time.

# 1. Columns 0-9 only
first_10_features = [*range(10)]
first_10_accuracy = train_and_evaluate_rf(
    X_train, X_test, y_train, y_test,
    feature_indices=first_10_features,
    description="first 10 features",
)

# 2. Columns 10-19 only
next_10_features = [*range(10, 20)]
next_10_accuracy = train_and_evaluate_rf(
    X_train, X_test, y_train, y_test,
    feature_indices=next_10_features,
    description="next 10 features",
)

# 3. Every column
all_features = [*range(X.shape[1])]
all_features_accuracy = train_and_evaluate_rf(
    X_train, X_test, y_train, y_test,
    feature_indices=all_features,
    description="all features",
)

# 4. Everything from column 10 onwards (drops columns 0-9)
all_except_first_10 = [*range(10, X.shape[1])]
all_except_first_10_accuracy = train_and_evaluate_rf(
    X_train, X_test, y_train, y_test,
    feature_indices=all_except_first_10,
    description="all features except first 10",
)

# 5. Columns 0-9 plus everything from column 20 onwards (drops columns 10-19)
all_except_second_10 = [*range(0, 10), *range(20, X.shape[1])]
all_except_second_10_accuracy = train_and_evaluate_rf(
    X_train, X_test, y_train, y_test,
    feature_indices=all_except_second_10,
    description="all features except second 10",
)

# Side-by-side summary of the five runs
print("\nAccuracy Comparison:")
print(
    "\n".join(
        f"{label}: {value:.4f}"
        for label, value in (
            ("First 10 features", first_10_accuracy),
            ("Next 10 features", next_10_accuracy),
            ("All features", all_features_accuracy),
            ("All features except first 10", all_except_first_10_accuracy),
            ("All features except second 10", all_except_second_10_accuracy),
        )
    )
)


Generating AFAContext dataset...
Dataset shape: torch.Size([1000, 30])
Number of classes: 8
Class distribution: tensor([111, 110, 156, 129, 129, 118, 109, 138])
Train set shape: (800, 30)
Test set shape: (200, 30)

Training Random Forest classifier on first 10 features...
Test accuracy: 0.9900

Training Random Forest classifier on next 10 features...
Test accuracy: 0.7950

Training Random Forest classifier on all features...
Test accuracy: 0.9050

Training Random Forest classifier on all features except first 10...
Test accuracy: 0.8350

Training Random Forest classifier on all features except second 10...
Test accuracy: 0.9600

Accuracy Comparison:
First 10 features: 0.9900
Next 10 features: 0.7950
All features: 0.9050
All features except first 10: 0.8350
All features except second 10: 0.9600


In [24]:
# Peek at the label of the training sample at index 1.
y_train[1]

np.int64(2)

In [25]:
# Peek at the feature row of the same training sample (index 1).
X_train[1]

array([ 2.        ,  0.700197  ,  0.35703996, -0.00740662,  0.35492504,
       -0.27242595,  0.19037262, -0.01277946,  0.96048504, -0.10163771,
        0.77107656,  0.25822085, -0.33570838,  0.8615771 ,  0.14473464,
        0.4954611 ,  0.64497924,  1.0047779 ,  0.6330979 ,  0.6313545 ,
        0.4355412 ,  0.48715666,  0.7985137 ,  0.57150555,  0.5015381 ,
        0.13134798,  0.28174898,  0.6638046 ,  0.7241597 ,  0.62216496],
      dtype=float32)

In [27]:
import torch
from jaxtyping import Float
from torch import Tensor
from torch.utils.data import Dataset
from torchvision import datasets, transforms
import math
import pandas as pd
import os

from common.custom_types import FeatureMask, MaskedFeatures, Features, Label
class AFAContextDataset(Dataset):
    """
    A PyTorch Dataset merging AFA structure with cube-dataset dummy-feature behavior.

    Intended to implement the AFADataset protocol (defined elsewhere in the project).

    Column layout of ``features``, in order:

    * ``context`` (1 column): the context group id in ``{0, 1, 2}``, as float.
    * ``bin_*`` (9 columns): three groups of three columns. Only the group
      selected by the context carries the label's 3-bit binary code plus
      ``N(0, sigma_bin)`` noise; the other groups are pure noise.
    * ``cube_*`` (10 columns): columns ``(label + j) % 10`` for ``j`` in
      ``0..2`` carry the same 3-bit code plus ``N(0, sigma_cube)`` noise; the
      rest are pure noise.
    * ``dummy_*`` (``n_dummy_features`` columns): pure noise.

    Noise columns are drawn from a normal distribution with mean
    ``non_informative_feature_mean`` and variance
    ``non_informative_feature_variance``. ``labels`` holds the integer class
    ids (int64) and ``costs`` holds one acquisition cost per column
    (``bin_feature_cost`` for the ``bin_*`` columns, 1 for all others).
    """

    def __init__(
        self,
        n_samples: int = 1000,
        sigma_bin: float = 0.1,
        sigma_cube: float = 1.0,
        bin_feature_cost: float = 5.0,
        n_dummy_features: int = 10,
        seed: int = 123,
        non_informative_feature_mean: float = 0.5,
        non_informative_feature_variance: float = 0.3,
    ):
        """Store the configuration and immediately generate the data.

        Args:
            n_samples: Number of rows to generate.
            sigma_bin: Noise std added to the active binary-code group.
            sigma_cube: Noise std added to the informative cube columns.
            bin_feature_cost: Cost assigned to each ``bin_*`` column.
            n_dummy_features: Number of trailing pure-noise columns.
            seed: Seed of the private generator used by ``generate_data``.
            non_informative_feature_mean: Mean of the noise columns.
            non_informative_feature_variance: Variance of the noise columns.

        Raises:
            ValueError: If ``non_informative_feature_variance`` is negative
                (raised by ``math.sqrt``).
        """
        super().__init__()
        self._set_config(
            n_samples=n_samples,
            sigma_bin=sigma_bin,
            sigma_cube=sigma_cube,
            bin_feature_cost=bin_feature_cost,
            n_dummy_features=n_dummy_features,
            seed=seed,
            non_informative_feature_mean=non_informative_feature_mean,
            non_informative_feature_variance=non_informative_feature_variance,
        )

        # Generate upon initialization
        self.generate_data()

    def _set_config(
        self,
        n_samples: int,
        sigma_bin: float,
        sigma_cube: float,
        bin_feature_cost: float,
        n_dummy_features: int,
        seed: int,
        non_informative_feature_mean: float,
        non_informative_feature_variance: float,
    ) -> None:
        """Record hyper-parameters, fixed constants and empty data placeholders."""
        self.n_samples = n_samples
        self.sigma_bin = sigma_bin
        self.sigma_cube = sigma_cube
        self.bin_feature_cost = bin_feature_cost
        self.n_dummy_features = n_dummy_features
        self.seed = seed
        self.non_info_mean = non_informative_feature_mean
        # Keep the variance exactly as configured so save() can write it back
        # without a sqrt -> square floating-point round trip.
        self.non_info_variance = non_informative_feature_variance
        self.non_info_std = math.sqrt(non_informative_feature_variance)

        # Constants
        self.n_classes = 8
        self.n_context_groups = 3
        self.group_size = 3
        self.n_bin_features = self.n_context_groups * self.group_size
        self.n_cube_features = 10

        # Placeholder attributes, filled by generate_data() or load()
        self.features = None
        self.labels = None
        self.costs = None
        self.feature_names = None

    def generate_data(self) -> None:
        """(Re)generate ``features``, ``labels``, ``costs`` and ``feature_names``.

        A fresh generator is seeded with ``self.seed`` on every call, so
        calling this repeatedly reproduces the same data.
        """
        rng = torch.Generator()
        rng.manual_seed(self.seed)

        # Draw labels and context
        y_int = torch.randint(0, self.n_classes, (self.n_samples,), dtype=torch.int64, generator=rng)
        S = torch.randint(0, self.n_context_groups, (self.n_samples,), dtype=torch.int64, generator=rng)

        # Binary codes for labels (8×3): row i is the 3-bit encoding of i
        binary_codes = torch.stack([
            torch.tensor([int(b) for b in format(i, '03b')], dtype=torch.float32)
            for i in range(self.n_classes)
        ], dim=0)

        # Initialize feature blocks; bin/cube/dummy start as pure noise
        X_context = S.unsqueeze(1).float()

        X_bin = torch.normal(
            mean=self.non_info_mean,
            std=self.non_info_std,
            size=(self.n_samples, self.n_bin_features),
            generator=rng,
        )

        X_cube = torch.normal(
            mean=self.non_info_mean,
            std=self.non_info_std,
            size=(self.n_samples, self.n_cube_features),
            generator=rng,
        )

        X_dummy = torch.normal(
            mean=self.non_info_mean,
            std=self.non_info_std,
            size=(self.n_samples, self.n_dummy_features),
            generator=rng,
        )

        # Insert informative signals. The per-sample draw order (bin, then
        # cube) determines the generated values for a given seed, so the loop
        # is kept sequential.
        for i, (lbl, ctx) in enumerate(zip(y_int.tolist(), S.tolist())):
            mu_bin = binary_codes[lbl]

            # Binary features in active group
            start = ctx * self.group_size
            end = start + self.group_size
            X_bin[i, start:end] = torch.normal(
                mean=0.0,
                std=self.sigma_bin,
                size=(self.group_size,),
                generator=rng
            ) + mu_bin

            # Cube features: 3 bumps starting at the label's own column
            idxs = [(lbl + j) % self.n_cube_features for j in range(3)]
            X_cube[i, idxs] = torch.normal(
                mean=0.0,
                std=self.sigma_cube,
                size=(3,),
                generator=rng
            ) + mu_bin

        # Concatenate all features
        self.features = torch.cat([X_context, X_bin, X_cube, X_dummy], dim=1)

        # Build costs vector: 1 everywhere except the bin_* columns
        total_dim = self.features.shape[1]
        costs = torch.ones(total_dim)
        costs[1:1 + self.n_bin_features] = self.bin_feature_cost
        self.costs = costs

        # Labels are kept as integer class ids (not one-hot)
        self.labels = y_int

        # Feature names, in column order
        names = ['context']
        names += [f'bin_{i}' for i in range(self.n_bin_features)]
        names += [f'cube_{i}' for i in range(self.n_cube_features)]
        names += [f'dummy_{i}' for i in range(self.n_dummy_features)]
        self.feature_names = names

    def __getitem__(self, idx: int):
        """Return the ``(features, label)`` pair at ``idx``."""
        return self.features[idx], self.labels[idx]

    def __len__(self):
        """Return the number of rows in ``features``."""
        return self.features.size(0)

    def get_all_data(self):
        """Return the full ``(features, labels)`` tensors."""
        return self.features, self.labels

    def save(self, path: str) -> None:
        """Serialize the data tensors, feature names and configuration to ``path``."""
        torch.save(
            {
                'features': self.features,
                'labels': self.labels,
                'costs': self.costs,
                'feature_names': self.feature_names,
                'config': {
                    'n_samples': self.n_samples,
                    'sigma_bin': self.sigma_bin,
                    'sigma_cube': self.sigma_cube,
                    'bin_feature_cost': self.bin_feature_cost,
                    'n_dummy_features': self.n_dummy_features,
                    'seed': self.seed,
                    'non_informative_feature_mean': self.non_info_mean,
                    'non_informative_feature_variance': self.non_info_variance,
                },
            },
            path,
        )

    @staticmethod
    def load(path: str) -> 'AFAContextDataset':
        """Rebuild a dataset from a file written by ``save``.

        The stored tensors are attached directly; no data is regenerated.
        """
        data = torch.load(path)
        # Bypass __init__ so the per-sample generation loop is not run only to
        # have its result overwritten by the stored tensors.
        ds = AFAContextDataset.__new__(AFAContextDataset)
        Dataset.__init__(ds)
        ds._set_config(**data['config'])
        ds.features = data['features']
        ds.labels = data['labels']
        ds.costs = data['costs']
        ds.feature_names = data['feature_names']
        return ds

In [28]:
# Seed the global torch and numpy RNGs for any later stochastic steps. The
# dataset itself draws from its own generator seeded with `seed=SEED`.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

# Build the AFAContext dataset with the class defined in this notebook.
# Its constructor already calls generate_data(), and generate_data() reseeds
# from `seed` on every call, so a second explicit call would only repeat the
# same per-sample loop and produce identical tensors.
print("Generating AFAContext dataset...")
dataset = AFAContextDataset(
    n_samples=1000,  # Reduced for faster execution
    sigma_bin=0.1,
    sigma_cube=1.0,
    bin_feature_cost=5.0,
    n_dummy_features=10,
    seed=SEED,
    non_informative_feature_mean=0.5,
    non_informative_feature_variance=0.3
)

# Pull the full feature matrix and label vector and print a quick summary.
features, labels = dataset.get_all_data()
print(f"Dataset shape: {features.shape}")
print(f"Number of classes: {len(torch.unique(labels))}")
# minlength=8 pads the histogram so it always has at least eight bins.
print(f"Class distribution: {torch.bincount(labels.long(), minlength=8)}")

# scikit-learn works on numpy arrays, so convert the tensors.
X = features.numpy()
y = labels.numpy()

Generating AFAContext dataset...
torch.Size([8, 3])
torch.Size([8, 3])
Dataset shape: torch.Size([1000, 30])
Number of classes: 8
Class distribution: tensor([133, 117, 113, 146, 125, 113, 119, 134])
