In [1]:
%%capture
!mamba install --force-reinstall aiohttp -y
!pip install -U "xformers<0.0.26" --index-url https://download.pytorch.org/whl/cu121
!pip install "unsloth[kaggle-new] @ git+https://github.com/unslothai/unsloth.git"

# Temporary fix for https://github.com/huggingface/datasets/issues/6753
!pip install datasets==2.16.0 fsspec==2023.10.0 gcsfs==2023.10.0

import os
os.environ["WANDB_DISABLED"] = "true"

In [2]:
from unsloth import FastLanguageModel

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


In [1]:
import torch
import pandas as pd

In [4]:
torch.__version__

'2.2.2+cu121'

In [5]:
DATA_FILE_PATH = "/kaggle/input/test-gemma-resp/test_gemma_resp.csv"

In [6]:
df = pd.read_csv(DATA_FILE_PATH)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4110 entries, 0 to 4109
Data columns (total 29 columns):
 #   Column                                      Non-Null Count  Dtype  
---  ------                                      --------------  -----  
 0   dataset                                     4110 non-null   object 
 1   question_id                                 4110 non-null   object 
 2   question                                    4110 non-null   object 
 3   context                                     4109 non-null   object 
 4   answer                                      4110 non-null   object 
 5   formatted                                   4110 non-null   object 
 6   tokenized                                   4110 non-null   object 
 7   gemma-2-9b-it-bnb-4bit_finetuned_cuda4      4110 non-null   object 
 8   gemma-2-9b-it-bnb-4bit_finetuned_cuda1      4101 non-null   object 
 9   gemma-2-9b-it-bnb-4bit_finetuned_cuda2      4110 non-null   object 
 10  gemma-2-9b-i

In [8]:
!git clone https://github.com/Hallucination-LLM/Research.git

  pid, fd = os.forkpty()


Cloning into 'Research'...
remote: Enumerating objects: 66, done.[K
remote: Counting objects: 100% (66/66), done.[K
remote: Compressing objects: 100% (61/61), done.[K
remote: Total 66 (delta 11), reused 48 (delta 2), pack-reused 0 (from 0)[K
Receiving objects: 100% (66/66), 1.71 MiB | 9.31 MiB/s, done.
Resolving deltas: 100% (11/11), done.


In [9]:
%cd Research/api

/kaggle/working/Research/api


In [10]:
from src.utils_train import fit, validate, to_dataloader
from src.prompts import QUERY_INTRO_NO_ANS, SYSTEM_MSG
from config import *

In [11]:
def prepare_prompt(
        tokenizer,
        user_input: str, 
        system_input: str = "",
        has_system_role: bool = False) -> list:
    
    messages = []
    
    if has_system_role:
        messages.append({"role": "system", "content": system_input})

    messages = [
        {
            "role": "user", 
            "content": f"{system_input}{user_input}" 
                if not has_system_role 
                else user_input
        },
    ]

    prompt = tokenizer.apply_chat_template(
        messages, 
        tokenize=False, 
        add_generation_prompt=True
    )

    return prompt

In [12]:
MODEL_ID = "unsloth/gemma-2-9b-it-bnb-4bit"
HF_TOKEN = "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"
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"

In [13]:
DEVICE

'cuda:0'

In [14]:
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL_ID,
    max_seq_length=8192,
    dtype=None,
    device_map={"": DEVICE},
    load_in_4bit=True
)

FastLanguageModel.for_inference(model)

==((====))==  Unsloth 2024.8: Fast Gemma2 patching. Transformers = 4.44.0.
   \\   /|    GPU: Tesla T4. Max memory: 14.741 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.2.2+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.25.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/6.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/47.0k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

Gemma2ForCausalLM(
  (model): Gemma2Model(
    (embed_tokens): Embedding(256000, 3584)
    (layers): ModuleList(
      (0-41): 42 x Gemma2DecoderLayer(
        (self_attn): Gemma2Attention(
          (q_proj): Linear4bit(in_features=3584, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=3584, out_features=2048, bias=False)
          (v_proj): Linear4bit(in_features=3584, out_features=2048, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=3584, bias=False)
          (rotary_emb): GemmaFixedRotaryEmbedding()
        )
        (mlp): Gemma2MLP(
          (gate_proj): Linear4bit(in_features=3584, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=3584, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=3584, bias=False)
          (act_fn): PytorchGELUTanh()
        )
        (input_layernorm): Gemma2RMSNorm((3584,), eps=1e-06)
        (post_attention_layernorm): Gemma2R

In [15]:
input_text = "Who are you? Please, answer in pirate-speak."
input_ids = tokenizer(input_text, return_tensors="pt").to(DEVICE)

In [16]:
pred = model.forward(
    input_ids=input_ids.get("input_ids"),
    output_hidden_states=True,
#     output_attentions=True # this parameter does not work for models from FastLanguageModel !!!
)

In [17]:
dir(pred)

['__annotations__',
 '__class__',
 '__class_getitem__',
 '__contains__',
 '__dataclass_fields__',
 '__dataclass_params__',
 '__delattr__',
 '__delitem__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__ior__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__match_args__',
 '__module__',
 '__ne__',
 '__new__',
 '__or__',
 '__post_init__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__reversed__',
 '__ror__',
 '__setattr__',
 '__setitem__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 'attentions',
 'clear',
 'copy',
 'fromkeys',
 'get',
 'hidden_states',
 'items',
 'keys',
 'logits',
 'loss',
 'move_to_end',
 'past_key_values',
 'pop',
 'popitem',
 'setdefault',
 'to_tuple',
 'update',
 'values']

In [19]:
pred.attentions # https://github.com/unslothai/unsloth/issues/950

In [20]:
del pred

In [21]:
del model
del tokenizer
torch.cuda.empty_cache()
import gc
gc.collect()

639

In [23]:
from transformers import AutoTokenizer, AutoModelForCausalLM

In [24]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=HF_TOKEN)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    token=HF_TOKEN
)

tokenizer_config.json:   0%|          | 0.00/47.0k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now set to True since model is quantized.


In [26]:
pred = model.forward(
    input_ids=input_ids.get("input_ids"),
)

AttributeError: 'Gemma2Model' object has no attribute 'max_seq_length'

In [25]:
del model
del tokenizer
torch.cuda.empty_cache()
import gc
gc.collect()

10489

In [26]:
#########################################################

In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM

In [3]:
MODEL_ID = "meta-llama/Llama-2-7b-chat-hf"
HF_TOKEN = "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"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=HF_TOKEN)
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=torch.float16, token=HF_TOKEN)
model.to(DEVICE)

tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )
    )
    (no

In [4]:
prompts = torch.load("/kaggle/input/nq-inputs/nq-inputs.pt")

  prompts = torch.load("/kaggle/input/nq-inputs/nq-inputs.pt")


In [5]:
X_attn = []
X_hidden = []

for prompt in prompts[:2]:
    try:
        text = prompt["full_input_text"]
        
        inputs = tokenizer(text, return_tensors="pt").to(DEVICE)
        
        with torch.no_grad():
            outputs = model.forward(**inputs, output_attentions=True, output_hidden_states=True)
                    
        attentions = outputs.attentions
        hidden_states = outputs.hidden_states
        
        attn_tensor = torch.stack(attentions)
        hidden_tensor = torch.stack(hidden_states)
        
        X_attn.append(attn_tensor.cpu())
        X_hidden.append(hidden_tensor.cpu())
                
        del inputs, outputs, attentions, hidden_states, attn_tensor, hidden_tensor
        torch.cuda.empty_cache()

    except Exception as e:
        print(f"Error processing prompt: {e}")


We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


In [6]:
del model
del tokenizer
torch.cuda.empty_cache()
import gc
gc.collect()

30

In [7]:
X_attn[0].shape, X_attn[1].shape

(torch.Size([32, 1, 32, 535, 535]), torch.Size([32, 1, 32, 373, 373]))

In [8]:
X_hidden[0].shape, X_hidden[1].shape

(torch.Size([33, 1, 535, 4096]), torch.Size([33, 1, 373, 4096]))

In [9]:
X_attn_reduced = []

for attn_tensor in X_attn:
    attn_tensor = attn_tensor.squeeze(1)
    attn_tensor = attn_tensor[:, :, -1, :]
    attn_tensor = attn_tensor.mean(dim=-1)
    attn_flat = attn_tensor.flatten()
    X_attn_reduced.append(attn_flat)

In [10]:
X_attn_reduced[0].shape

torch.Size([1024])

In [11]:
X_hidden_reduced = []
for hidden_tensor in X_hidden:
    hidden_reduced = hidden_tensor.mean(dim=0).squeeze(0)[-1]
    X_hidden_reduced.append(hidden_reduced)

In [12]:
X_hidden_reduced[0].shape

torch.Size([4096])

In [13]:
from torch import nn

class MLP(nn.Module):

    def __init__(self, input_dim, hidden_dim, output_dim):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        self.layer_norm = nn.LayerNorm(hidden_dim)
        self.elu = nn.ELU()
        self.dropout = nn.Dropout(0.25)

    def forward(self, x, mode: str = "train"):
        x = self.fc1(x)
        x = self.layer_norm(x)
        x = self.elu(x)

        if mode == "train":
            x = self.dropout(x)

        x = self.fc2(x)
        return x

In [14]:
labels = torch.load("/kaggle/input/nq-labels/nq-labels.pt")

  labels = torch.load("/kaggle/input/nq-labels/nq-labels.pt")


In [20]:
labels = labels[:2]
labels = torch.tensor(labels, dtype=torch.long).unsqueeze(1)  

  labels = torch.tensor(labels, dtype=torch.long).unsqueeze(1)


In [16]:
from torch.utils.data import DataLoader, TensorDataset, random_split

X_attn_reduced = torch.stack(X_attn_reduced).float()

dataset = TensorDataset(X_attn_reduced, labels)

train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [21]:
import torch.optim as optim

model = MLP(input_dim=1024, hidden_dim=2048, output_dim=1)
model = model.to(DEVICE)

optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.BCEWithLogitsLoss()

In [20]:
# del model
# del optimizer
# del criterion

In [18]:
def train_model(model, train_loader, optimizer, criterion, epochs):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for inputs, targets in train_loader:
            inputs, targets = inputs.to(DEVICE), targets.to(DEVICE)
            
            optimizer.zero_grad()

            outputs = model(inputs, mode="train")
            
            loss = criterion(outputs, targets)
            
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}")    

In [22]:
train_model(model, train_loader, optimizer, criterion, epochs=10)

ValueError: Target size (torch.Size([1])) must be the same as input size (torch.Size([1, 1]))

In [32]:
import numpy as np

model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for inputs, targets in test_loader:
        inputs, targets = inputs.to(DEVICE), targets.to(DEVICE)

        outputs = model(inputs, mode="eval")
        _, preds = torch.max(outputs, 1)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(targets.cpu().numpy())

f1 = f1_score(all_labels, all_preds)
return f1

IndentationError: unexpected indent (2909986071.py, line 4)

In [None]:
#########################################################

In [9]:
# df_test = df[:1]

In [12]:
# for i, row in df_test.iterrows():
#     query = row[QUERY_COL]
#     context = row[CONTEXT_COL]
    
#     augumented_prompt = QUERY_INTRO_NO_ANS.format(query = query, context = context)
#     prompt = prepare_prompt(tokenizer, augumented_prompt, SYSTEM_MSG)
#     print(prompt)

NameError: name 'tokenizer' is not defined

In [14]:
# model, tokenizer = FastLanguageModel.from_pretrained(
#     model_name=MODEL_ID,
#     max_seq_length=8192,
#     dtype=None,
#     device_map={"": DEVICE},
#     load_in_4bit=True
# )

# FastLanguageModel.for_inference(model)

==((====))==  Unsloth 2024.8: Fast Gemma2 patching. Transformers = 4.44.0.
   \\   /|    GPU: Tesla P100-PCIE-16GB. Max memory: 15.888 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.0+cu121. CUDA = 6.0. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.26.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/6.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/47.0k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

Gemma2ForCausalLM(
  (model): Gemma2Model(
    (embed_tokens): Embedding(256000, 3584)
    (layers): ModuleList(
      (0-41): 42 x Gemma2DecoderLayer(
        (self_attn): Gemma2Attention(
          (q_proj): Linear4bit(in_features=3584, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=3584, out_features=2048, bias=False)
          (v_proj): Linear4bit(in_features=3584, out_features=2048, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=3584, bias=False)
          (rotary_emb): GemmaFixedRotaryEmbedding()
        )
        (mlp): Gemma2MLP(
          (gate_proj): Linear4bit(in_features=3584, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=3584, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=3584, bias=False)
          (act_fn): PytorchGELUTanh()
        )
        (input_layernorm): Gemma2RMSNorm((3584,), eps=1e-06)
        (post_attention_layernorm): Gemma2R

In [16]:
# pred = model.forward(
#     input_ids=input_ids.get("input_ids"),
#     output_hidden_states=True,
# )

W0824 14:42:04.498000 137771340830528 torch/_dynamo/convert_frame.py:824] WON'T CONVERT fast_rms_layernorm_gemma2_compiled /opt/conda/lib/python3.10/site-packages/unsloth/models/gemma2.py line 65 
W0824 14:42:04.498000 137771340830528 torch/_dynamo/convert_frame.py:824] due to: 
W0824 14:42:04.498000 137771340830528 torch/_dynamo/convert_frame.py:824] Traceback (most recent call last):
W0824 14:42:04.498000 137771340830528 torch/_dynamo/convert_frame.py:824]   File "/opt/conda/lib/python3.10/site-packages/torch/_dynamo/convert_frame.py", line 786, in _convert_frame
W0824 14:42:04.498000 137771340830528 torch/_dynamo/convert_frame.py:824]     result = inner_convert(
W0824 14:42:04.498000 137771340830528 torch/_dynamo/convert_frame.py:824]   File "/opt/conda/lib/python3.10/site-packages/torch/_dynamo/convert_frame.py", line 400, in _convert_frame_assert
W0824 14:42:04.498000 137771340830528 torch/_dynamo/convert_frame.py:824]     return _compile(
W0824 14:42:04.498000 137771340830528 tor

BackendCompilerFailed: backend='inductor' raised:
RuntimeError: Found Tesla P100-PCIE-16GB which is too old to be supported by the triton GPU compiler, which is used as the backend. Triton only supports devices of CUDA Capability >= 7.0, but your device is of CUDA capability 6.0

Set TORCH_LOGS="+dynamo" and TORCHDYNAMO_VERBOSE=1 for more information


In [None]:
# !pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# !pip install "unsloth[cu121-torch240] @ git+https://github.com/unslothai/unsloth.git"

In [1]:
# !pip install trl==0.8.6

Collecting trl==0.8.6
  Downloading trl-0.8.6-py3-none-any.whl.metadata (11 kB)
Collecting tyro>=0.5.11 (from trl==0.8.6)
  Downloading tyro-0.8.8-py3-none-any.whl.metadata (8.4 kB)
Collecting shtab>=1.5.6 (from tyro>=0.5.11->trl==0.8.6)
  Downloading shtab-1.7.1-py3-none-any.whl.metadata (7.3 kB)
Downloading trl-0.8.6-py3-none-any.whl (245 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m245.2/245.2 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading tyro-0.8.8-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.6/104.6 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading shtab-1.7.1-py3-none-any.whl (14 kB)
Installing collected packages: shtab, tyro, trl
Successfully installed shtab-1.7.1 trl-0.8.6 tyro-0.8.8


In [2]:
# !pip install peft==0.12.0

Collecting peft==0.12.0
  Downloading peft-0.12.0-py3-none-any.whl.metadata (13 kB)
Downloading peft-0.12.0-py3-none-any.whl (296 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m296.4/296.4 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0mm
[?25hInstalling collected packages: peft
Successfully installed peft-0.12.0


In [3]:
# !pip install bitsandbytes==0.43.3

Collecting bitsandbytes==0.43.3
  Downloading bitsandbytes-0.43.3-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Downloading bitsandbytes-0.43.3-py3-none-manylinux_2_24_x86_64.whl (137.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m137.5/137.5 MB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.43.3


In [4]:
# !pip install xformers==0.0.26.post1
# !pip install xformers==0.0.27.post2
# !pip install xformers

Collecting xformers==0.0.26.post1
  Downloading xformers-0.0.26.post1-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.0 kB)
Collecting torch==2.3.0 (from xformers==0.0.26.post1)
  Downloading torch-2.3.0-cp310-cp310-manylinux1_x86_64.whl.metadata (26 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch==2.3.0->xformers==0.0.26.post1)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch==2.3.0->xformers==0.0.26.post1)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch==2.3.0->xformers==0.0.26.post1)
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch==2.3.0->xformers==0.0.26.post1)
  Downloading nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-