## FixMatch active learning analysis

## FixMatch Baseline Analysis

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
from torch.optim.lr_scheduler import LambdaLR
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, confusion_matrix
import os
import math
import pandas as pd
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, roc_auc_score, average_precision_score
)


# Global matplotlib styling: bold 14pt base font, bold axis labels/titles,
# 13pt ticks and legend text, and no frame around legends.
plt.rc("font", size=14, weight="bold")
plt.rc("axes", labelweight="bold", titlesize=16, titleweight="bold")
plt.rc("xtick", labelsize=13)
plt.rc("ytick", labelsize=13)
plt.rc("legend", fontsize=13, frameon=False)

# Module-level run identifier; it is reassigned by the driver cell and read by
# the training/evaluation code when naming output files.
strategy = ""


# === Classifier Definition ===
class Classifier(nn.Module):
    """MLP made of a 4-layer ReLU encoder followed by a 3-layer classification head.

    The encoder maps ``input_dim`` features down to a 128-dimensional
    representation (512 -> 384 -> 256 -> 128). The head applies two
    100-unit ReLU layers, each followed by Dropout(0.2), and a final linear
    layer producing ``num_classes`` raw logits.
    """

    def __init__(self, input_dim, num_classes):
        super().__init__()

        # Encoder: Linear + ReLU pairs between consecutive widths.
        encoder_widths = [input_dim, 512, 384, 256, 128]
        encoder_layers = []
        for fan_in, fan_out in zip(encoder_widths[:-1], encoder_widths[1:]):
            encoder_layers += [nn.Linear(fan_in, fan_out), nn.ReLU()]
        self.encoder = nn.Sequential(*encoder_layers)

        # Head: two hidden blocks (Linear, ReLU, Dropout) then the logit layer.
        head_layers = []
        for fan_in in (128, 100):
            head_layers += [nn.Linear(fan_in, 100), nn.ReLU(), nn.Dropout(0.2)]
        head_layers.append(nn.Linear(100, num_classes))
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, x):
        """Return class logits for ``x`` (encoder followed by the head)."""
        hidden = self.encoder(x)
        return self.classifier(hidden)

    def encode(self, x):
        """Encode input features to a lower-dimensional representation."""
        return self.encoder(x)



def get_cosine_schedule_with_warmup(optimizer,
                                    num_warmup_steps,
                                    num_training_steps,
                                    num_cycles=7./16.,
                                    last_epoch=-1):
    """Build a LambdaLR schedule: linear warmup followed by cosine decay.

    For steps below ``num_warmup_steps`` the multiplier grows linearly from 0.
    Afterwards it is ``cos(pi * num_cycles * progress)`` clipped at 0, where
    ``progress`` is the fraction of the post-warmup steps already taken.

    Args:
        optimizer: Optimizer whose learning rate is scaled.
        num_warmup_steps: Number of warmup steps.
        num_training_steps: Total number of scheduler steps.
        num_cycles: Cosine cycle factor applied to the progress fraction.
        last_epoch: Forwarded to ``LambdaLR``.

    Returns:
        The configured ``LambdaLR`` scheduler.
    """
    # Denominators are floored at 1 so neither phase divides by zero.
    warmup_span = float(max(1, num_warmup_steps))
    decay_span = float(max(1, num_training_steps - num_warmup_steps))

    def _lr_lambda(current_step):
        if current_step >= num_warmup_steps:
            progress = float(current_step - num_warmup_steps) / decay_span
            return max(0., math.cos(math.pi * num_cycles * progress))
        return float(current_step) / warmup_span

    return LambdaLR(optimizer, _lr_lambda, last_epoch)



def split_labeled_unlabeled(X, y, labeled_ratio=0.1, stratify=True, random_state=42):
    """Split ``(X, y)`` into a labeled part and an unlabeled part.

    Args:
        X: Feature container supporting ``len``.
        y: Labels aligned with ``X``.
        labeled_ratio: Fraction of samples placed in the labeled part; the
            count is ``int(len(X) * labeled_ratio)``.
        stratify: When true, the split is stratified on ``y``.
        random_state: Seed forwarded to ``train_test_split``.

    Returns:
        Tuple ``(X_labeled, y_labeled, X_unlabeled, y_unlabeled)``.
    """
    n_labeled = int(len(X) * labeled_ratio)

    split_kwargs = {"train_size": n_labeled, "random_state": random_state}
    if stratify:
        split_kwargs["stratify"] = y

    X_lab, X_unlab, y_lab, y_unlab = train_test_split(X, y, **split_kwargs)
    return X_lab, y_lab, X_unlab, y_unlab

def random_bit_flip(x, n_bits=1):
    """Return a copy of ``x`` with ``n_bits`` randomly chosen features per row replaced by ``1 - value``.

    Args:
        x: 2-D tensor (rows are samples, columns are features).
        n_bits: Number of distinct feature positions flipped in each row.

    Returns:
        A new tensor; ``x`` itself is not modified.
    """
    flipped = x.clone()
    n_rows, n_cols = x.shape
    for row in range(n_rows):
        # A fresh permutation per row gives distinct positions for that sample.
        cols = torch.randperm(n_cols)[:n_bits]
        flipped[row, cols] = 1 - x[row, cols]
    return flipped

def random_bit_flip_bernoulli(x, p=None, n_bits=None):
    """
    Flip each entry of the input tensor independently with probability p.

    A Bernoulli(p) mask with the same shape, dtype and device as ``x`` is
    drawn, and the result is ``|x - mask|``: for 0/1 inputs this flips
    exactly the masked entries.

    Args:
        x: Tensor of any shape; the flip semantics assume 0/1 values.
        p: Probability of flipping each entry (float between 0 and 1).
            Defaults to 0.01 when None.
        n_bits: Accepted for call-site compatibility but currently ignored;
            the flip rate is controlled by ``p`` only.
    Returns:
        A new tensor with entries flipped; ``x`` is not modified.
    """
    flip_prob = 0.01 if p is None else float(p)
    # full_like inherits shape, dtype and device from x.
    flip_mask = torch.bernoulli(torch.full_like(x, flip_prob))
    return torch.abs(x - flip_mask)

def random_feature_mask(x, n_mask=1):
    """Return a copy of ``x`` with ``n_mask`` randomly chosen features per row set to 0.

    Args:
        x: 2-D tensor (rows are samples, columns are features).
        n_mask: Number of distinct feature positions zeroed in each row.

    Returns:
        A new tensor; ``x`` itself is not modified.
    """
    masked = x.clone()
    _, n_cols = x.shape
    # Iterating the clone yields row views, so in-place writes land in `masked`.
    for sample in masked:
        zero_cols = torch.randperm(n_cols)[:n_mask]
        sample[zero_cols] = 0
    return masked

def random_bit_flip_and_mask(x, n_bits=1, n_mask=1):
    """Apply ``random_bit_flip`` and then ``random_feature_mask`` to ``x``.

    Args:
        x: 2-D input tensor.
        n_bits: Number of features flipped per row in the first step.
        n_mask: Number of features zeroed per row in the second step.

    Returns:
        The augmented tensor.
    """
    return random_feature_mask(random_bit_flip(x, n_bits=n_bits), n_mask=n_mask)


def train_fixmatch_drift_eval(
    bit_flip, model, optimizer, X_labeled, y_labeled, X_unlabeled, y_unlabeled,
    args, num_classes=2, threshold=0.95, lambda_u=1.0, epochs=200, batch_size=64
):
    """
    Train ``model`` with FixMatch-style pseudo-labeling, then evaluate it on monthly test files.

    Training: every step draws one labeled and one unlabeled batch, builds a
    weakly and a strongly augmented view of the unlabeled batch, and minimizes
    ``CE(labeled) + lambda_u * masked CE(strong view, pseudo-label)``, where
    pseudo-labels come from the weak view and are kept only when their softmax
    confidence is at least ``threshold``. The weights of the epoch with the
    lowest summed training loss are restored before evaluation.

    Evaluation: for each month of 2013-2018 whose ``.npz`` file exists, a t-SNE
    plot is produced via ``plot_malware_tsne_with_boundary`` and classification
    metrics are computed. Metrics are written to ``analysis/<strategy>.csv``,
    where ``strategy`` is the module-level global.

    Args:
        bit_flip: Number of bits flipped in the strong view when
            ``args.aug == "random_bit_flip"``.
        model: Network to train; data is moved to the device of its parameters.
        optimizer: Optimizer already bound to ``model``'s parameters.
        X_labeled, y_labeled: Labeled training tensors.
        X_unlabeled: Unlabeled training tensor.
        y_unlabeled: Labels of the unlabeled set; only forwarded to the plot helper.
        args: Namespace providing ``local_rank``, ``warmup``, ``aug`` and ``T``.
        num_classes: Unused; kept for call compatibility.
        threshold: Minimum pseudo-label confidence for the unlabeled loss.
        lambda_u: Weight of the unlabeled loss.
        epochs: Number of training epochs (also the scheduler's total steps).
        batch_size: Batch size of both loaders (incomplete batches are dropped).

    Raises:
        ValueError: If ``args.aug`` names an unknown augmentation.
    """
    # Follow the model's device instead of hard-coding CUDA.
    device = next(model.parameters()).device

    labeled_ds = TensorDataset(X_labeled, y_labeled)
    unlabeled_ds = TensorDataset(X_unlabeled)

    train_sampler = RandomSampler if args.local_rank == -1 else DistributedSampler

    labeled_loader = DataLoader(labeled_ds, sampler=train_sampler(labeled_ds), batch_size=batch_size, drop_last=True)
    unlabeled_loader = DataLoader(unlabeled_ds, sampler=train_sampler(unlabeled_ds), batch_size=batch_size, drop_last=True)
    criterion = nn.CrossEntropyLoss(reduction='mean')

    # The scheduler is stepped once per epoch, so `epochs` is its total step count.
    scheduler = get_cosine_schedule_with_warmup(optimizer, args.warmup, epochs)

    best_loss = float('inf')
    best_state_dict = None

    for epoch in range(epochs):
        model.train()
        total_loss = 0

        # zip stops as soon as either loader is exhausted.
        for (x_l, y_l), (x_u,) in zip(labeled_loader, unlabeled_loader):
            x_l, y_l = x_l.to(device), y_l.to(device)
            x_u = x_u.to(device)

            # Weak and strong augmentations of the unlabeled batch
            if args.aug == "random_bit_flip":
                x_u_w = random_bit_flip(x_u, n_bits=1)
                x_u_s = random_bit_flip(x_u, n_bits=bit_flip)
            elif args.aug == "random_bit_flip_bernoulli":
                x_u_w = random_bit_flip_bernoulli(x_u, p=0.01, n_bits=None)
                x_u_s = random_bit_flip_bernoulli(x_u, p=0.05, n_bits=None)
            else:
                raise ValueError(f"Unknown augmentation function: {args.aug}")

            # Single forward pass over labeled + weak + strong inputs
            inputs = torch.cat([x_l, x_u_w, x_u_s], dim=0)
            logits = model(inputs)

            # Local name chosen so the `batch_size` parameter is not shadowed.
            n_lab = x_l.shape[0]
            logits_x = logits[:n_lab]
            logits_u_w, logits_u_s = logits[n_lab:].chunk(2)

            # Labeled loss
            loss_x = criterion(logits_x, y_l)

            # Unlabeled loss (FixMatch pseudo-labeling); no gradient flows through the targets
            with torch.no_grad():
                pseudo_probs = F.softmax(logits_u_w / args.T, dim=1)
                max_probs, pseudo_labels = torch.max(pseudo_probs, dim=1)
                mask = max_probs.ge(threshold).float()

            loss_u = (F.cross_entropy(logits_u_s, pseudo_labels, reduction='none') * mask).mean()
            loss = loss_x + lambda_u * loss_u

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        if total_loss < best_loss:
            best_loss = total_loss
            best_state_dict = {k: v.cpu().clone() for k, v in model.state_dict().items()}

        print(f"Epoch {epoch+1}: loss={total_loss:.4f}")
        scheduler.step()

    # Restore best model after initial training
    if best_state_dict is not None:
        model.load_state_dict(best_state_dict)

    # === Evaluate on each year's test set ===
    path = "/home/mhaque3/myDir/data/gen_apigraph_drebin/"
    # Plots and the CSV are written under analysis/; create it up front so a
    # missing directory cannot be mistaken for a missing data file below.
    os.makedirs("analysis", exist_ok=True)
    metrics_list = []
    model.eval()
    with torch.no_grad():
        for year in range(2013, 2019):
            for month in range(1, 13):
                # Only the file load may legitimately be skipped; keep the try minimal.
                try:
                    with np.load(f"{path}{year}-{month:02d}_selected.npz") as data:
                        X_raw = data["X_train"]
                        y_raw = data["y_train"]
                except FileNotFoundError:
                    continue

                y_true = (y_raw > 0).astype(int)
                X_test = torch.tensor(X_raw, dtype=torch.float32).to(device)
                y_test = torch.tensor(y_true, dtype=torch.long).to(device)
                plot_malware_tsne_with_boundary(
                    model,
                    X_labeled, y_labeled,
                    X_unlabeled, y_unlabeled,
                    X_test, y_test,
                    strategy=strategy + f"_{year}_{month}",
                    boundary_k=200
                )

                logits = model(X_test)
                probs = torch.softmax(logits, dim=1) if logits.shape[1] > 1 else torch.sigmoid(logits)
                y_pred = logits.argmax(dim=1).cpu().numpy()
                probs_np = probs.cpu().numpy()
                is_binary = probs.shape[1] == 2

                acc = accuracy_score(y_true, y_pred)
                prec = precision_score(y_true, y_pred, zero_division=0)
                rec = recall_score(y_true, y_pred, zero_division=0)
                f1 = f1_score(y_true, y_pred, zero_division=0)
                cm = confusion_matrix(y_true, y_pred)
                if cm.shape == (2, 2):
                    tn, fp, fn, tp = cm.ravel()
                    fnr = fn / (fn + tp) if (fn + tp) > 0 else 0
                    fpr = fp / (fp + tn) if (fp + tn) > 0 else 0
                else:
                    fnr = fpr = float('nan')

                # ROC-AUC and PR-AUC (binary or multiclass); reported as NaN
                # when scikit-learn rejects the inputs as undefined.
                try:
                    if is_binary:
                        y_score = probs_np[:, 1]
                        roc_auc = roc_auc_score(y_true, y_score)
                        pr_auc = average_precision_score(y_true, y_score)
                    else:
                        roc_auc = roc_auc_score(y_true, probs_np, multi_class='ovr')
                        pr_auc = average_precision_score(y_true, probs_np, average='weighted')
                except ValueError:
                    roc_auc = pr_auc = float('nan')

                metrics_list.append({
                    'year': f"{year}_{month}",
                    'accuracy': acc,
                    'precision': prec,
                    'recall': rec,
                    'f1': f1,
                    'fnr': fnr,
                    'fpr': fpr,
                    'roc_auc': roc_auc,
                    'pr_auc': pr_auc
                })

                print(f"Year {year}_{month}: Acc={acc:.4f}, Prec={prec:.4f}, Rec={rec:.4f}, F1={f1:.4f}, FNR={fnr:.4f}, FPR={fpr:.4f}, ROC-AUC={roc_auc:.4f}, PR-AUC={pr_auc:.4f}")

    if not metrics_list:
        # Nothing was evaluated: avoid a KeyError on the missing metric columns.
        print(f"No monthly test files found under {path}; no metrics written.")
        return

    # Save results to CSV
    metrics_df = pd.DataFrame(metrics_list)
    metrics_df.to_csv(f"analysis/{strategy}.csv", index=False)

    print(f"Mean F1 Scores: {metrics_df['f1'].mean():.4f}")
    print(f"Mean False Negative Rates: {metrics_df['fnr'].mean()}")
    print(f"Mean False Positive Rates: {metrics_df['fpr'].mean()}")
    


In [None]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

def plot_malware_tsne_with_boundary(model, X_labeled, y_labeled, X_unlabeled, y_unlabeled, X_test, y_test, strategy, boundary_k=200):
    """
    Plots t-SNE visualization of labeled, unlabeled, and test data with decision boundary samples highlighted.

    All three sets are passed through ``model.encode``, embedded jointly in 2-D
    with t-SNE, and drawn per class (label 0 = benign, label 1 = malware).
    "Boundary" samples are the test samples with the smallest gap between the
    two largest softmax probabilities. The figure is saved to
    ``analysis/<strategy>_tsne_boundary.png``, shown, and then closed.

    Parameters:
        model: Trained model with an encode() method.
        X_labeled, y_labeled: labeled data (tensor) and labels (tensor or array)
        X_unlabeled, y_unlabeled: unlabeled data (tensor) and (true or pseudo) labels (tensor or array)
        X_test, y_test: test data (tensor) and true labels (tensor or array)
        strategy: str - run identifier used as the output file name prefix
        boundary_k: int - number of boundary samples to highlight
    """
    import os  # local import: this cell's import list does not include os

    def _labels_to_numpy(labels):
        # Accept tensors on any device as well as NumPy arrays.
        return torch.as_tensor(labels).detach().cpu().numpy()

    model.eval()
    with torch.no_grad():
        # Encode all sets using the model
        X_labeled_enc = model.encode(X_labeled).cpu()
        X_unlabeled_enc = model.encode(X_unlabeled).cpu()
        X_test_enc = model.encode(X_test).cpu()

        # ---------- Find boundary samples in X_test ----------
        probs_test = F.softmax(model(X_test), dim=1)
        top2 = torch.topk(probs_test, 2, dim=1).values
        margins = top2[:, 0] - top2[:, 1]
        # Smallest margins first; slicing simply yields fewer than boundary_k
        # indices when the test set is smaller.
        boundary_indices = torch.argsort(margins)[:boundary_k].cpu().numpy()

    # Stack for t-SNE (explicit NumPy array for scikit-learn)
    X_all = torch.cat([X_labeled_enc, X_unlabeled_enc, X_test_enc], dim=0).numpy()

    # Apply t-SNE
    X_2d = TSNE(n_components=2, perplexity=30, random_state=42).fit_transform(X_all)

    # Split into individual groups
    n_lab = X_labeled.shape[0]
    n_unlab = X_unlabeled.shape[0]
    X_lab_2d = X_2d[:n_lab]
    X_unlab_2d = X_2d[n_lab:n_lab + n_unlab]
    X_test_2d = X_2d[n_lab + n_unlab:]
    X_boundary_2d = X_test_2d[boundary_indices]

    # ---------- Plot ----------
    fig = plt.figure(figsize=(10, 8))

    # (points, labels, marker, alpha, ((color, legend) for class 0, class 1))
    groups = [
        (X_lab_2d, _labels_to_numpy(y_labeled), 'o', 0.6,
         (('green', 'Labeled Benign'), ('red', 'Labeled Malware'))),
        (X_unlab_2d, _labels_to_numpy(y_unlabeled), 's', 0.4,
         (('lightgreen', 'Unlabeled Benign'), ('salmon', 'Unlabeled Malware'))),
        (X_test_2d, _labels_to_numpy(y_test), '^', 0.6,
         (('blue', 'Test Benign'), ('purple', 'Test Malware'))),
    ]
    for points, labels, marker, alpha, class_styles in groups:
        for class_id, (color, legend) in enumerate(class_styles):
            selected = points[labels == class_id]
            plt.scatter(selected[:, 0], selected[:, 1], c=color, label=legend, alpha=alpha, marker=marker)

    # Boundary samples
    plt.scatter(X_boundary_2d[:, 0], X_boundary_2d[:, 1], facecolors='none', edgecolors='black',
                linewidths=1.5, s=100, label=f'Boundary Samples (top {boundary_k})')

    plt.title("t-SNE of Malware Dataset with Decision Boundary Samples")
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    # Make sure the output directory exists before saving.
    os.makedirs("analysis", exist_ok=True)
    plt.savefig(f"analysis/{strategy}_tsne_boundary.png", dpi=300)
    plt.show()
    # This function is called once per evaluated month; close the figure so
    # figures do not accumulate in memory.
    plt.close(fig)


In [None]:

import argparse

# Arguments for this notebook run, written as they would appear on a command line.
args_list = [
    "--bit_flip", "11",
    "--labeled_ratio", "0.4",
    "--aug", "random_bit_flip",
    "--seed", "0",
    "--lambda-u", "1",
    "--T", "1",
    "--wdecay", "0.0005",
    "--nesterov",
    "--lr", "0.03",
    "--warmup", "0",
    "--local_rank", "-1"
]

parser = argparse.ArgumentParser(description="Run FixMatch with Bit Flip Augmentation on MLP")
parser.add_argument("--bit_flip", type=int, default=11, help="Number of bits to flip per sample")
parser.add_argument("--labeled_ratio", type=float, default=0.4, help="Ratio of labeled data")
parser.add_argument("--aug", type=str, default="random_bit_flip", help="Augmentation function to use")
parser.add_argument("--seed", type=int, default=0, help="Random seed for reproducibility")
parser.add_argument('--lambda-u', default=1, type=float, help='coefficient of unlabeled loss')
parser.add_argument('--T', default=1, type=float, help='pseudo label temperature')
parser.add_argument('--wdecay', default=5e-4, type=float, help='weight decay')
parser.add_argument('--nesterov', action='store_true', default=True, help='use nesterov momentum')
# `--nesterov` defaults to True, so on its own it could never be switched off;
# this companion flag writes False to the same destination.
parser.add_argument('--no-nesterov', dest='nesterov', action='store_false', help='disable nesterov momentum')
parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
parser.add_argument('--lr', '--learning-rate', default=0.03, type=float, help='initial learning rate')
parser.add_argument('--warmup', default=0, type=float, help='warmup epochs (unlabeled data based)')

# Simulate command-line arguments
args = parser.parse_args(args_list)



# Set random seeds for reproducibility
np.random.seed(args.seed)
torch.manual_seed(args.seed)

# Load data
path = "/home/mhaque3/myDir/data/gen_apigraph_drebin/"
file_path = f"{path}2012-01to2012-12_selected.npz"
# NOTE(review): allow_pickle=True lets the archive run arbitrary pickled code
# on load; only use it with trusted files.
# The context manager closes the archive once both arrays have been read.
with np.load(file_path, allow_pickle=True) as data:
    X, y = data['X_train'], data['y_train']
# Binarize labels: 0 stays 0, every other label becomes 1.
y = (np.asarray(y) != 0).astype(int)

n_bit_flip = args.bit_flip
labeled_ratio = args.labeled_ratio

# Run identifier used for output file names.
strategy = f"fixmatch_wo_al_{args.aug}_{n_bit_flip}_lbr_{labeled_ratio}_seed_{args.seed}"
print(f"Running {strategy}...")
print(f"Using {n_bit_flip} bits to flip per sample. Labeled ratio: {labeled_ratio}, Seed: {args.seed}")

# Split according to the labeled_ratio argument, seeded with the run seed.
X_labeled, y_labeled, X_unlabeled, y_unlabeled = split_labeled_unlabeled(X, y, labeled_ratio=labeled_ratio, random_state=args.seed)

X_2012_labeled = torch.tensor(X_labeled, dtype=torch.float32).cuda()
y_2012_labeled = torch.tensor(y_labeled, dtype=torch.long).cuda()
X_2012_unlabeled = torch.tensor(X_unlabeled, dtype=torch.float32).cuda()
y_2012_unlabeled = torch.tensor(y_unlabeled, dtype=torch.long).cuda()

input_dim = X_2012_labeled.shape[1]
num_classes = len(torch.unique(y_2012_labeled))



model = Classifier(input_dim=input_dim, num_classes=num_classes).cuda()

# Parameters whose name contains one of these tags get no weight decay.
no_decay = ['bias', 'bn']
_decayed, _undecayed = [], []
for _name, _param in model.named_parameters():
    _exempt = any(tag in _name for tag in no_decay)
    (_undecayed if _exempt else _decayed).append(_param)

grouped_parameters = [
    {'params': _decayed, 'weight_decay': args.wdecay},
    {'params': _undecayed, 'weight_decay': 0.0},
]
# SGD with momentum 0.9; Nesterov momentum is controlled by args.nesterov.
optimizer = torch.optim.SGD(
    grouped_parameters,
    lr=args.lr,
    momentum=0.9,
    nesterov=args.nesterov,
)

train_fixmatch_drift_eval(
    n_bit_flip,
    model,
    optimizer,
    X_2012_labeled,
    y_2012_labeled,
    X_2012_unlabeled,
    # Pass the tensor version of the unlabeled labels: the t-SNE helper calls
    # .cpu() on them, which the raw NumPy array `y_unlabeled` does not support.
    y_2012_unlabeled,
    args,
    num_classes=num_classes,
    # Forward the parsed --lambda-u so the CLI option actually takes effect.
    lambda_u=args.lambda_u,
)



Running fixmatch_wo_al_random_bit_flip_11_lbr_0.4_seed_0...
Using 11 bits to flip per sample. Labeled ratio: 0.4, Seed: 0
Epoch 1: loss=65.2853


KeyboardInterrupt: 