In [2]:
!nvidia-smi

Mon Jan 29 19:02:12 2024       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.116.04   Driver Version: 525.116.04   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA RTX A4000    Off  | 00000000:00:05.0 Off |                  Off |
| 41%   45C    P8    15W / 140W |   1107MiB / 16376MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## Mamba-SSM

In [14]:
!pip install causal-conv1d>=1.1.0
!pip install transformers==4.37.2
!pip install beautifulsoup4
!pip install mamba-ssm

Collecting argparse
  Using cached argparse-1.4.0-py2.py3-none-any.whl (23 kB)
Installing collected packages: argparse
Successfully installed argparse-1.4.0
[0m

## Example

In [1]:
import torch
from mamba_ssm import Mamba

batch, length, dim = 2, 64, 16
x = torch.randn(batch, length, dim).to("cuda")
model = Mamba(
    # This module uses roughly 3 * expand * d_model^2 parameters
    d_model=dim, # Model dimension d_model
    d_state=16,  # SSM state expansion factor
    d_conv=4,    # Local convolution width
    expand=2,    # Block expansion factor
).to("cuda")
y = model(x)
assert y.shape == x.shape

In [8]:
import torch
from mamba_ssm.models.mixer_seq_simple import MambaLMHeadModel
from transformers import AutoTokenizer, TrainingArguments

# Load model
model = MambaLMHeadModel.from_pretrained(
  "state-spaces/mamba-1.4b", 
  device="cuda", 
  dtype=torch.bfloat16)

# Load Tokenizer
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b") 

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [9]:
prompt=\
"""A conversation between a user and a smart AI assistant.

### User: Hello!
### Assistant:"""

prompt_tokenized=tokenizer(prompt, return_tensors="pt").to("cuda")

# from https://github.com/state-spaces/mamba/blob/main/benchmarks/benchmark_generation_mamba_simple.py#L54
output_tokenized = model.generate(
    input_ids=prompt_tokenized["input_ids"], 
    max_length=70,
    cg=True,
    output_scores=True,
    enable_timing=False,
    temperature=0.7,
    top_k=40,
    top_p=0.1,
    )
output=tokenizer.decode(output_tokenized[0])

print(output)

A conversation between a user and a smart AI assistant.

### User: Hello!
### Assistant: Hello!

### User: I'm hungry.
### Assistant: I'm hungry.

### User: I'm thirsty.
### Assistant: I'm thirsty.

### User: I'm tired.



In [46]:
prompt=\
"""Tell me about the history of India."""

# """Which are the best places to visit in India?"""

prompt_tokenized=tokenizer(prompt, return_tensors="pt").to("cuda")

# from https://github.com/state-spaces/mamba/blob/main/benchmarks/benchmark_generation_mamba_simple.py#L54
output_tokenized = model.generate(
    input_ids=prompt_tokenized["input_ids"], 
    max_length=50,
    cg=True,
    output_scores=True,
    enable_timing=False,
    temperature=0.5,
    top_k=40,
    top_p=0.9,
    return_dict_in_generate=True,
    return_hidden_states=True,
    )
output=tokenizer.decode(output_tokenized[0][0])

print(output)

TypeError: decode() got an unexpected keyword argument 'return_hidden_states'

In [31]:
output_tokenized

GenerateDecoderOnlyOutput(sequences=tensor([[17570,   479,   670,   253,  2892,   273,  5427,    15,   187,   187,
           510,  2892,   273,  5427,   310,   247,  1077,  9542,  2892,    15,
           733,   556,   644,   247,  1077,  9542,  2892,    15,   380,  2892,
           273,  5427,   310,  1077,  2570,    15,   733,   556,   644,   247,
          1077,  2570,  2892,    15,   733,   556,   644,   247,  1077,  2570,
          2892,    15,   733,   556,   644,   247,  1077,  2570,  2892,    15,
           733,   556,   644,   247,  1077,  2570,  2892,    15,   733,   556]],
       device='cuda:0'), scores=(tensor([[  2.7969, -16.2500,  -2.5781,  ..., -15.3750, -15.6250, -15.5000]],
       device='cuda:0', dtype=torch.bfloat16), tensor([[  7.3438, -10.1250,   1.4297,  ...,  -9.7500,  -9.9375,  -9.2500]],
       device='cuda:0', dtype=torch.bfloat16), tensor([[ 3.0312, -0.0217, 10.1875,  ...,  0.5195,  0.4160,  0.7812]],
       device='cuda:0', dtype=torch.bfloat16), tensor([[ 

In [45]:
len(output_tokenized[1]) # number of tokens in output
# output_tokenized[1][0].shape # scores for each token

42

In [49]:
hidden_state = model.backbone(prompt_tokenized["input_ids"], 
            #    inference_params=inference_params
               )

hidden_state.shape # batch, seq_len, dim (B, L, D)

torch.Size([1, 8, 2048])

# TODO
- Mamba hidden states used for classification
- Fine-tune mamba for classification or use mamba as feature extractor
- Mamba fine-tuned for QA