In [36]:
import torch
import torch.nn.functional as F
from tqdm import tqdm
import os
import json
import shortuuid
import random
from torch.distributions.normal import Normal
from transformers import AutoModelForCausalLM, AutoTokenizer
import copy
from fastchat.conversation import get_conv_template
import warnings
warnings.filterwarnings("ignore")

In [37]:
# func define
# form inputs
def get_inputs(tokenizer, sentence):
    DEFAULT_TEMPLATE = get_conv_template("vicuna_v1.1")
    # DEFAULT_TEMPLATE = get_conv_template("llama-2")
    DEFAULT_TEMPLATE.sep2 = DEFAULT_TEMPLATE.sep2.strip()
    DEFAULT_TEMPLATE.append_message(DEFAULT_TEMPLATE.roles[0], sentence)
    DEFAULT_TEMPLATE.append_message(DEFAULT_TEMPLATE.roles[1], None)
    prompt = DEFAULT_TEMPLATE.get_prompt()
    # print(prompt)
    indexed_tokens = tokenizer.encode(prompt)
    tokens_tensor = torch.tensor([indexed_tokens]).cuda()
    return tokens_tensor, indexed_tokens

def get_activation_hook(layer_name,activations):
    def hook(model, input, output):
        activations[layer_name] = output[0][-1, -1, :].cpu()
    return hook

def get_activations(model, tokenizer, prompt):
    inputs = get_inputs(tokenizer, prompt)
    with torch.no_grad():
        
        activations={}
        handles=[]
        for i, block in enumerate(model.model.layers):
            handles.append(block.register_forward_hook(get_activation_hook(i,activations)))
        p = model(inputs[0].to(model.device), output_attentions=True, return_dict=True)
        for hook in handles:
            hook.remove()
        return activations,p

def get_subsequent_activation_hook(layer_name,activations):
    def hook(model, input, output):
        if layer_name not in activations:
            activations[layer_name] = []
        activations[layer_name].append(output[0][-1, -1, :].cpu())
    return hook

def get_subsequent_activations_archive(model,tokenizer,prompt): #OOM
    inputs = get_inputs(tokenizer, prompt)
    with torch.no_grad():
        
        activations={}
        handles=[]
        for i, block in enumerate(model.model.layers):
            handles.append(block.register_forward_hook(get_subsequent_activation_hook(i,activations)))
        for i in range(256):
            p = model(inputs[0].to(model.device), output_attentions=True, return_dict=True)
            next_token = p.logits.argmax(-1)[0, -1].item()
            if tokenizer.decode(next_token) == tokenizer.sep_token:
                break
            print(tokenizer.decode(next_token), end='')
            inputs = (torch.cat([inputs[0], p.logits.argmax(-1)], dim=-1),)
            del p  # 显式删除临时变量
            torch.cuda.empty_cache()  # 清理显存缓存，确保显存释放
        for hook in handles:
            hook.remove()
        return activations
    
def gen_model(model,tokenizer,prompt):
    input_ids = get_inputs(tokenizer, prompt)
    stop=len(input_ids[0])
    gen_config = model.generation_config
    with torch.no_grad():
        attn_masks = torch.ones_like(input_ids[0]).to(model.device)
        output_ids = model.generate(input_ids[0],
                                    attention_mask=attn_masks,
                                    generation_config=gen_config,
                                    pad_token_id=tokenizer.pad_token_id,
                                    # top_p=0.9,
                                    do_sample=False,
                                    max_new_tokens=48,
                                    # temperature=0.7
                                    )[0]
        
        gen_str=tokenizer.decode(output_ids[stop:]).strip()
    return gen_str    

def get_subsequent_activations(model, tokenizer, prompt):
    """
    逐步流式生成并输出新 token 的模型生成函数。
    """
    # 初始化输入
    all_activations = []
    handles=[]
    input_ids = get_inputs(tokenizer, prompt)  # 获取初始输入
    stop = len(input_ids[0])  # 记录初始长度，避免重新输出 prompt
    input_ids = input_ids[0].to(model.device)  # 转移到设备
    attention_mask = torch.ones_like(input_ids).to(model.device)  # 注意力掩码

    generated_ids = input_ids  # 初始化生成序列
    gen_config = model.generation_config  # 获取生成配置
    eos_token_id = tokenizer.eos_token_id  # 结束 token ID
    sep_token = tokenizer.sep_token  # 可选，停止生成的标志
    
    print("Generated text: ", end=" ", flush=True)  # 提示生成开始
    with torch.no_grad():
        for _ in range(128):  # 限制最大生成 token 数
            # 模型前向推理，获取 logits
            activations={}
            for i, block in enumerate(model.model.layers):
                handles.append(block.register_forward_hook(get_activation_hook(i,activations)))
                
            outputs = model(
                input_ids=generated_ids,
                attention_mask=attention_mask,
                use_cache=True,  # 使用缓存提升性能
                return_dict=True,
            )
            logits = outputs.logits[:, -1, :]  # 只取最后一个 token 的 logits

            # 根据 logits 选择下一个 token（如采样或贪婪解码）
            next_token_id = torch.argmax(logits, dim=-1).unsqueeze(0)  # 贪婪解码
            next_token = tokenizer.decode(next_token_id.item())  # 解码新 token

            # 输出新生成的 token
            print(next_token, end=" ", flush=True)

            # 停止条件：检测到结束标志（如 eos 或 sep_token）
            if next_token_id.item() == eos_token_id or next_token == sep_token:
                break

            # 更新生成序列，用于下一步推理
            generated_ids = torch.cat([generated_ids, next_token_id], dim=-1)
            attention_mask = torch.cat(
                [attention_mask, torch.ones_like(next_token_id)], dim=-1
            )
            all_activations.append(activations)
            for hook in handles:
                hook.remove()

    print()  # 生成结束换行
    # generated_text = tokenizer.decode(generated_ids[stop:], skip_special_tokens=True)
    return all_activations




In [38]:
# get positive and negative activations
# load model
model_path = '/mnt/data/users/Lang_Gao/proj/models/vicuna-7b-v1.5'
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path, device_map='cuda')

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [39]:
# load data
positive_data = [i.strip() for i in open('/mnt/data/users/Lang_Gao/proj/My_Proj/Fairness_Mechanisms/data/simply_good.txt','r').readlines()]
negative_data = [i.strip() for i in open('/mnt/data/users/Lang_Gao/proj/My_Proj/Fairness_Mechanisms/data/simply_bad.txt','r').readlines()]

In [40]:
# sort
positive_data = sorted(positive_data, key=lambda x: len(x))
negative_data = sorted(negative_data, key=lambda x: len(x))


In [41]:
# obtain activations and form datasets
from tqdm import tqdm
data=[]
labels=[]
subsets=[]
for i in tqdm(positive_data):
    activations = get_activations(model, tokenizer, i)[0]
    # move data
    for k,v in activations.items():
        data.append(v)
        labels.append(1) # 1=positive
        subsets.append(k)
for i in tqdm(negative_data):
    activations = get_activations(model, tokenizer, i)[0]
    # move data
    for k,v in activations.items():
        data.append(v)
        labels.append(0) # 0=negative
        subsets.append(k)

  0%|          | 0/1000 [00:00<?, ?it/s]

100%|██████████| 1000/1000 [01:21<00:00, 12.33it/s]
100%|██████████| 1000/1000 [01:28<00:00, 11.36it/s]


In [6]:
# interrupt: using reduced data
import torch
positive_data=torch.load('/mnt/data/users/Lang_Gao/proj/My_Proj/Fairness_Mechanisms/tensors/reduced_pos_data_simply_good.pt')
negative_data=torch.load('/mnt/data/users/Lang_Gao/proj/My_Proj/Fairness_Mechanisms/tensors/reduced_neg_data_simply_bad.pt')

# obtain activations and form datasets
from tqdm import tqdm
data=[]
labels=[]
subsets=[]
# move data
for k in range(32):
    for v in positive_data[k]:
        
        data.append(v)
        labels.append(1) # 1=positive
        subsets.append(k)
    for v in negative_data[k]:
        data.append(v)
        labels.append(0) # 0=negative
        subsets.append(k)

  positive_data=torch.load('/mnt/data/users/Lang_Gao/proj/My_Proj/Fairness_Mechanisms/tensors/reduced_pos_data.pt')
  negative_data=torch.load('/mnt/data/users/Lang_Gao/proj/My_Proj/Fairness_Mechanisms/tensors/reduced_neg_data.pt')


In [44]:
# make data into tensor
data = torch.stack(data)
labels = torch.tensor(labels)
subsets = torch.tensor(subsets)

In [56]:
# formalize dataset
import torch
from torch.utils.data import Dataset, DataLoader
class ActivationDataset(Dataset):
    def __init__(self, data, labels,subset_idxs,device='cpu'):
        self.data = data.to(device)
        self.labels = labels.to(device)
        self.subset_idxs = subset_idxs.to(device)
        self.subset=None

    def select_subset(self,subset_idx):
        self.subset= ActivationDataset(self.data[self.subset_idxs==subset_idx], self.labels[self.subset_idxs==subset_idx], self.subset_idxs[self.subset_idxs==subset_idx])
    
    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx], self.subset_idxs[idx] # for convenient check
    
dataset=ActivationDataset(data,labels,subsets,device='cpu')

In [57]:
# reform data
pos_reform_data={i:[] for i in range(32)}
neg_reform_data={i:[] for i in range(32)}
for item in dataset:
    # print(item)
    # break
    if item[1]==1:
        pos_reform_data[item[2].item()].append(item[0])
    else:
        neg_reform_data[item[2].item()].append(item[0])
pos_data=[torch.stack(pos_reform_data[i]) for i in range(32)]
neg_data=[torch.stack(neg_reform_data[i]) for i in range(32)]
torch.save(pos_data,'/mnt/data/users/Lang_Gao/proj/My_Proj/Fairness_Mechanisms/tensors/pos_data_simply_good.pt')
torch.save(neg_data,'/mnt/data/users/Lang_Gao/proj/My_Proj/Fairness_Mechanisms/tensors/neg_data_simply_bad.pt')

In [48]:
# make a logistic regression model:
# input: 4096dim vec
# output a prob of binary classification
# structure: single mlp
import torch
import torch.nn as nn
import torch.nn.functional as F

# 定义 Logistic Regression 模型
class LogisticRegressionModel(nn.Module):
    def __init__(self, input_dim=4096):
        super(LogisticRegressionModel, self).__init__()
        # 单层 MLP (全连接层)
        self.fc = nn.Linear(input_dim, 1)  # 输出为 1，用于二元分类

    def forward(self, x):
        # 前向传播
        x = self.fc(x)          # 全连接层
        x = torch.sigmoid(x)    # 使用 sigmoid 激活函数，输出概率值
        return x

In [49]:
dataset.select_subset(31)
dataset.subset[0]

(tensor([-1.2794, -1.9551, -0.2962,  ..., -0.5957, -1.8432, -0.6108],
        device='cuda:0'),
 tensor(1, device='cuda:0'),
 tensor(31, device='cuda:0'))

In [50]:
len(dataset.subset)

2000

In [17]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, random_split
from copy import deepcopy
from tqdm import tqdm

criterion = nn.BCELoss()  # 二元交叉熵损失
input_dim = 2  # 输入向量的维度
num_epochs = 1000  # 训练轮数
batch_size = 32
best_models = []  # 存储每层验证集上表现最好的模型

for layer in range(32):
    # Prepare dataset
    dataset.select_subset(layer)
    assert dataset.subset[0][-1] == layer

    # 划分训练集、验证集和测试集
    total_size = len(dataset.subset)
    val_size = int(0.05 * total_size)  # 5% 验证集
    test_size = int(0.05 * total_size)  # 5% 测试集
    train_size = total_size - val_size - test_size

    train_set, val_set, test_set = random_split(dataset.subset, [train_size, val_size, test_size])

    # 创建 DataLoader
    train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_set, batch_size=batch_size, shuffle=False)
    test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False)

    # Init model
    model = LogisticRegressionModel(input_dim).to('cuda')
    model.train()  # 训练模式

    # Optimizer
    optimizer = optim.SGD(model.parameters(), lr=0.05)  # 随机梯度下降，学习率 0.01

    # 记录验证集上最佳性能
    best_val_acc = 0.0
    best_model_state = None  # 保存最佳模型的状态

    # Train model
    with tqdm(total=num_epochs, desc=f"Training Layer {layer}", unit="epoch") as pbar:
        for epoch in range(num_epochs):
            running_loss = 0.0  # 累计损失

            # 训练模型
            for batch_idx, (data, labels, _) in enumerate(train_loader):
                data, labels = data.to('cuda'), labels.to('cuda')  # 将数据放到 GPU
                optimizer.zero_grad()  # 清零梯度
                outputs = model(data)  # 前向传播
                loss = criterion(outputs, labels.float().unsqueeze(1))  # 计算损失

                loss.backward()  # 反向传播
                optimizer.step()  # 更新权重
                running_loss += loss.item()  # 累积损失

            # 计算当前 epoch 的平均损失
            avg_loss = running_loss / len(train_loader)

            # 验证模型
            model.eval()  # 验证时切换到评估模式
            correct = 0
            total = 0
            with torch.no_grad():  # 禁用梯度计算
                for val_data, val_labels, _ in val_loader:
                    val_data, val_labels = val_data.to('cuda'), val_labels.to('cuda')
                    val_outputs = model(val_data)
                    val_predictions = (val_outputs >= 0.5).float()  # 转化为硬标签
                    correct += (val_predictions.squeeze(1) == val_labels).sum().item()
                    total += val_labels.size(0)
            val_acc = correct / total  # 验证集准确率

            # 如果验证集准确率更高，保存当前模型
            if val_acc > best_val_acc:
                best_val_acc = val_acc
                best_model_state = deepcopy(model.state_dict())  # 保存当前模型参数

            model.train()  # 恢复训练模式

            # 更新进度条，显示当前 epoch 的损失和验证集准确率
            pbar.set_postfix(loss=avg_loss, val_acc=val_acc)
            pbar.update(1)  # 更新进度条

    # 保存每层验证集上表现最好的模型
    best_model = LogisticRegressionModel(input_dim).to('cuda')
    best_model.load_state_dict(best_model_state)  # 加载最佳模型参数
    best_models.append(best_model)  # 保存最佳模型

    # 清理当前模型以释放 GPU 内存
    del model
    del optimizer
    torch.cuda.empty_cache()

# 测试模型
for layer, model in enumerate(best_models, start=15):
    model.eval()  # 测试时切换到评估模式
    correct = 0
    total = 0
    with torch.no_grad():  # 禁用梯度计算
        for test_data, test_labels, _ in test_loader:
            test_data, test_labels = test_data.to('cuda'), test_labels.to('cuda')
            test_outputs = model(test_data)
            test_predictions = (test_outputs >= 0.5).float()  # 转化为硬标签
            correct += (test_predictions.squeeze(1) == test_labels).sum().item()
            total += test_labels.size(0)
    test_acc = correct / total  # 测试集准确率
    print(f"Layer {layer} Test Accuracy: {test_acc:.4f}")

Training Layer 0: 100%|██████████| 1000/1000 [00:23<00:00, 41.84epoch/s, loss=0.69, val_acc=0.57]
Training Layer 1: 100%|██████████| 1000/1000 [00:23<00:00, 41.95epoch/s, loss=0.71, val_acc=0.49]
Training Layer 2: 100%|██████████| 1000/1000 [00:23<00:00, 41.76epoch/s, loss=0.647, val_acc=0.66]
Training Layer 3: 100%|██████████| 1000/1000 [00:23<00:00, 41.81epoch/s, loss=0.667, val_acc=0.57]
Training Layer 4: 100%|██████████| 1000/1000 [00:23<00:00, 41.75epoch/s, loss=1.18, val_acc=0.54]
Training Layer 5: 100%|██████████| 1000/1000 [00:23<00:00, 42.01epoch/s, loss=1.08, val_acc=0.43]
Training Layer 6: 100%|██████████| 1000/1000 [00:24<00:00, 41.48epoch/s, loss=0.743, val_acc=0.69]
Training Layer 7: 100%|██████████| 1000/1000 [00:24<00:00, 41.35epoch/s, loss=0.634, val_acc=0.62]
Training Layer 8: 100%|██████████| 1000/1000 [00:24<00:00, 41.17epoch/s, loss=1.22, val_acc=0.55]
Training Layer 9: 100%|██████████| 1000/1000 [00:24<00:00, 41.24epoch/s, loss=1.13, val_acc=0.71]
Training Layer 1

Layer 15 Test Accuracy: 0.5300
Layer 16 Test Accuracy: 0.5000
Layer 17 Test Accuracy: 0.4800
Layer 18 Test Accuracy: 0.5200
Layer 19 Test Accuracy: 0.5400
Layer 20 Test Accuracy: 0.4800
Layer 21 Test Accuracy: 0.5300
Layer 22 Test Accuracy: 0.4800
Layer 23 Test Accuracy: 0.4800
Layer 24 Test Accuracy: 0.4800
Layer 25 Test Accuracy: 0.4900
Layer 26 Test Accuracy: 0.5200
Layer 27 Test Accuracy: 0.5200
Layer 28 Test Accuracy: 0.5300
Layer 29 Test Accuracy: 0.5200
Layer 30 Test Accuracy: 0.5200
Layer 31 Test Accuracy: 0.5200
Layer 32 Test Accuracy: 0.5200
Layer 33 Test Accuracy: 0.4900
Layer 34 Test Accuracy: 0.5200
Layer 35 Test Accuracy: 0.5700
Layer 36 Test Accuracy: 0.4800
Layer 37 Test Accuracy: 0.5200
Layer 38 Test Accuracy: 0.5000
Layer 39 Test Accuracy: 0.4600
Layer 40 Test Accuracy: 0.5200
Layer 41 Test Accuracy: 0.5000
Layer 42 Test Accuracy: 0.6500
Layer 43 Test Accuracy: 0.4800
Layer 44 Test Accuracy: 0.5700
Layer 45 Test Accuracy: 0.4800
Layer 46 Test Accuracy: 0.4900


SKlearn-based CAV extraction

In [51]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
accs=[]
all_logreg = []
# 将数据从 GPU 转移到 CPU 并转换为 numpy 数组
for layer in range(32):
    dataset.select_subset(layer)
    data_cpu = dataset.subset.data.cpu().numpy()
    labels_cpu = dataset.subset.labels.cpu().numpy()

    # 划分训练集和测试集
    X_train, X_test, y_train, y_test = train_test_split(data_cpu, labels_cpu, test_size=0.2, random_state=42)

    # 初始化逻辑回归模型
    log_reg = LogisticRegression()

    # 训练模型
    log_reg.fit(X_train, y_train)

    # 预测
    y_pred = log_reg.predict(X_test)

    # 计算准确率
    accuracy = accuracy_score(y_test, y_pred)
    accs.append(accuracy)
    print(f"layer{layer} Accuracy: {accuracy:.4f}")
    all_logreg.append(deepcopy(log_reg))
    del log_reg

layer0 Accuracy: 0.8475
layer1 Accuracy: 0.8425
layer2 Accuracy: 0.9075
layer3 Accuracy: 0.9600
layer4 Accuracy: 0.9900
layer5 Accuracy: 0.9925
layer6 Accuracy: 0.9950
layer7 Accuracy: 0.9950
layer8 Accuracy: 0.9950
layer9 Accuracy: 0.9950
layer10 Accuracy: 0.9950
layer11 Accuracy: 0.9950
layer12 Accuracy: 0.9950
layer13 Accuracy: 0.9950
layer14 Accuracy: 0.9975
layer15 Accuracy: 0.9975
layer16 Accuracy: 0.9975
layer17 Accuracy: 0.9975
layer18 Accuracy: 0.9975
layer19 Accuracy: 0.9975
layer20 Accuracy: 0.9975
layer21 Accuracy: 0.9975
layer22 Accuracy: 0.9975
layer23 Accuracy: 0.9950
layer24 Accuracy: 0.9950
layer25 Accuracy: 0.9950
layer26 Accuracy: 0.9975
layer27 Accuracy: 0.9950
layer28 Accuracy: 0.9975
layer29 Accuracy: 0.9975
layer30 Accuracy: 0.9975
layer31 Accuracy: 0.9975


In [52]:
valid_dircs
sorted_accs = sorted(enumerate(accs), key=lambda x: x[1], reverse=True)
for idx, value in sorted_accs:
    print(f"Index: {idx}, Accuracy: {value}")

Index: 14, Accuracy: 0.9975
Index: 15, Accuracy: 0.9975
Index: 16, Accuracy: 0.9975
Index: 17, Accuracy: 0.9975
Index: 18, Accuracy: 0.9975
Index: 19, Accuracy: 0.9975
Index: 20, Accuracy: 0.9975
Index: 21, Accuracy: 0.9975
Index: 22, Accuracy: 0.9975
Index: 26, Accuracy: 0.9975
Index: 28, Accuracy: 0.9975
Index: 29, Accuracy: 0.9975
Index: 30, Accuracy: 0.9975
Index: 31, Accuracy: 0.9975
Index: 6, Accuracy: 0.995
Index: 7, Accuracy: 0.995
Index: 8, Accuracy: 0.995
Index: 9, Accuracy: 0.995
Index: 10, Accuracy: 0.995
Index: 11, Accuracy: 0.995
Index: 12, Accuracy: 0.995
Index: 13, Accuracy: 0.995
Index: 23, Accuracy: 0.995
Index: 24, Accuracy: 0.995
Index: 25, Accuracy: 0.995
Index: 27, Accuracy: 0.995
Index: 5, Accuracy: 0.9925
Index: 4, Accuracy: 0.99
Index: 3, Accuracy: 0.96
Index: 2, Accuracy: 0.9075
Index: 0, Accuracy: 0.8475
Index: 1, Accuracy: 0.8425


In [54]:
weights = [torch.tensor(log_reg.coef_).squeeze() for log_reg in all_logreg]
weights_tensor_list = [w for w in weights]
torch.save(weights_tensor_list,'/mnt/data/users/Lang_Gao/proj/My_Proj/Fairness_Mechanisms/tensors/cavs_simply_good_bad.pt')


In [23]:
torch.load('/mnt/data/users/Lang_Gao/proj/My_Proj/Fairness_Mechanisms/tensors/cavs_length.pt')

  torch.load('/mnt/data/users/Lang_Gao/proj/My_Proj/Fairness_Mechanisms/tensors/cavs_length.pt')


[tensor([-0.0037, -0.0147, -0.0138,  ..., -0.0036,  0.0276, -0.0007]),
 tensor([-0.0236, -0.0168, -0.0200,  ...,  0.0079, -0.0116, -0.0190]),
 tensor([-0.0241, -0.0123, -0.0055,  ...,  0.0132,  0.0051,  0.0043]),
 tensor([-0.0194,  0.0142, -0.0126,  ...,  0.0195, -0.0055,  0.0088]),
 tensor([-0.0118, -0.0060,  0.0106,  ...,  0.0272, -0.0093, -0.0076]),
 tensor([-0.0167, -0.0020,  0.0057,  ...,  0.0152, -0.0017,  0.0078]),
 tensor([ 0.0097, -0.0115, -0.0031,  ..., -0.0041, -0.0034, -0.0057]),
 tensor([ 0.0057, -0.0185, -0.0208,  ...,  0.0011, -0.0134, -0.0049]),
 tensor([-0.0103, -0.0162, -0.0062,  ...,  0.0109, -0.0029, -0.0049]),
 tensor([-0.0082, -0.0094,  0.0133,  ...,  0.0108,  0.0025,  0.0023]),
 tensor([ 0.0058, -0.0067,  0.0078,  ...,  0.0139,  0.0025,  0.0042]),
 tensor([ 0.0057, -0.0022,  0.0139,  ..., -0.0033,  0.0088, -0.0180]),
 tensor([-0.0081,  0.0042,  0.0146,  ..., -0.0040,  0.0103, -0.0026]),
 tensor([-0.0052, -0.0053, -0.0059,  ...,  0.0018,  0.0080, -0.0067]),
 tenso

In [19]:
# reference data
a=torch.load('/mnt/data/users/Lang_Gao/proj/My_Proj/Fairness_Mechanisms/tensors/bkg_acts_alpaca_tensor.pt')
a

[tensor([[-0.0561, -0.0199,  0.0241,  ...,  0.0231, -0.0102,  0.0173],
         [-0.0562, -0.0206,  0.0216,  ...,  0.0220, -0.0154,  0.0184],
         [-0.0546, -0.0227,  0.0148,  ...,  0.0211, -0.0092,  0.0155],
         ...,
         [-0.0464, -0.0305,  0.0067,  ...,  0.0178, -0.0087,  0.0136],
         [-0.0491, -0.0295,  0.0097,  ...,  0.0201, -0.0114,  0.0133],
         [-0.0502, -0.0304,  0.0107,  ...,  0.0201, -0.0099,  0.0134]]),
 tensor([[-0.0706, -0.0041,  0.0222,  ...,  0.0173, -0.0060,  0.0029],
         [-0.0757, -0.0035,  0.0211,  ...,  0.0191, -0.0179,  0.0031],
         [-0.0791,  0.0043,  0.0129,  ...,  0.0140, -0.0065,  0.0205],
         ...,
         [-0.0687, -0.0253, -0.0131,  ...,  0.0062, -0.0106,  0.0273],
         [-0.0726, -0.0248, -0.0091,  ...,  0.0054, -0.0208,  0.0236],
         [-0.0721, -0.0249, -0.0107,  ...,  0.0060, -0.0188,  0.0235]]),
 tensor([[-0.0469,  0.0483,  0.0085,  ..., -0.0172,  0.0357,  0.0558],
         [-0.0528,  0.0461,  0.0095,  ..., -0

In [21]:
models[0].fc.weight.shape

torch.Size([1, 4096])

In [23]:
cavs=[]
for model in models:
    w=model.fc.weight.squeeze()
    cavs.append(w/torch.norm(w))
torch.save(cavs,'/mnt/data/users/Lang_Gao/proj/My_Proj/Fairness_Mechanisms/tensors/cavs_length.pt')
    