In [4]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")
lmmodel = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B")

In [5]:
# Load model directly
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq

processor = AutoProcessor.from_pretrained("openai/whisper-large-v2")
asrmodel = AutoModelForSpeechSeq2Seq.from_pretrained("openai/whisper-large-v2")


In [6]:
asrmodel

WhisperForConditionalGeneration(
  (model): WhisperModel(
    (encoder): WhisperEncoder(
      (conv1): Conv1d(80, 1280, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv2): Conv1d(1280, 1280, kernel_size=(3,), stride=(2,), padding=(1,))
      (embed_positions): Embedding(1500, 1280)
      (layers): ModuleList(
        (0-31): 32 x WhisperEncoderLayer(
          (self_attn): WhisperSdpaAttention(
            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)
            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)
            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)
            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1280, out_features=5120, bias=True)
          (fc2): Linear(in_features=5120, out_features=1280, bias

In [7]:
lmmodel

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((2048,), eps=1e-05)
    (rotary_emb):

In [1]:
import torch
import torch.nn as nn
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
from transformers import AutoTokenizer, AutoModelForCausalLM

class WhisperEncoderLlamaDecoder(nn.Module):
    def __init__(self, 
                 freeze_whisper_encoder: bool = False,
                 freeze_llama_decoder: bool = False):
        super().__init__()

        # 1. Whisper
        self.whisper = AutoModelForSpeechSeq2Seq.from_pretrained("openai/whisper-large-v2")
        # Whisper에서 encoder만 추출
        self.encoder = self.whisper.model.encoder

        # 2. Llama
        self.llama = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B")
        # llama.model -> LlamaModel, 그 안의 layers는 전체 Decoder Layers
        llama_layers = self.llama.model.layers

        # 3. Llama 디코더에서 맨 앞 2개 + 맨 뒤 2개 layer만 추출
        self.decoder_layers = nn.ModuleList([
            llama_layers[0],
            llama_layers[1],
            llama_layers[-2],
            llama_layers[-1],
        ])
        
        # Llama의 token embedding, RMSNorm 등도 필요하면 가져와야 함
        self.embed_tokens = self.llama.model.embed_tokens
        self.norm = self.llama.model.norm  # LlamaRMSNorm
        # lm_head 도 사용하려면 self.llama.lm_head 를 쓸 수도 있음

        # 4. 차원 불일치 보정용 브릿지 레이어(Whisper enc -> Llama dec)
        #    Whisper encoder는 hidden_size=1280, Llama는 2048
        whisper_hidden_size = 1280
        llama_hidden_size = 2048
        
        self.bridge = nn.Linear(whisper_hidden_size, llama_hidden_size)
        

        # 5. (옵션) 학습 고정
        if freeze_whisper_encoder:
            for param in self.encoder.parameters():
                param.requires_grad = False
        if freeze_llama_decoder:
            for param in self.decoder_layers.parameters():
                param.requires_grad = False

    def forward(self, 
                input_features: torch.Tensor, 
                decoder_input_ids: torch.Tensor,
                attention_mask: torch.Tensor = None):
        """
        input_features: Whisper encoder에 들어갈 audio feature (batch, mel_bins, frames)
        decoder_input_ids: Llama decoder에 들어갈 텍스트 토큰 (batch, seq_len)
        attention_mask: 필요 시 디코더용 마스크
        
        반환: 최종 출력(예: logits)
        """

        # 1. Whisper encoder 전방향
        #    WhisperForConditionalGeneration의 input_features 모양에 맞춤
        #    (batch_size, feature_size, sequence_length) or (batch_size, sequence_length, feature_size)에 따라 다름
        encoder_outputs = self.encoder(input_features=input_features, return_dict=True)
        print("Whisper encoder 지남")
        # encoder_outputs: BaseModelOutput 클래스(또는 비슷한 dict)
        # encoder_outputs.last_hidden_state: (batch_size, seq_len, hidden_size=1280)
        encoder_hidden_states = encoder_outputs.last_hidden_state
        print("Encoder output shape:", encoder_hidden_states.shape)
        # 기대: (batch_size, audio_seq_len, 1280)

        # 2. 차원 브릿지
        encoder_hidden_states = self.bridge(encoder_hidden_states)
        print("After bridge shape:", encoder_hidden_states.shape)
        # 기대: (batch_size, audio_seq_len, 2048)
        # 3. Llama token 임베딩
        #    decoder_input_ids -> (batch_size, seq_len) => embed_tokens -> (batch_size, seq_len, hidden_size)
        hidden_states = self.embed_tokens(decoder_input_ids)
        print("Decoder token emb shape:", hidden_states.shape)
        # 기대: (batch_size, tgt_seq_len, 2048)
        # 여기서는 “encoder-decoder 구조”라기보다는, Llama의 일부 레이어를 순서대로 통과시킨다고만 간단히 가정
        # 실제로는 cross-attention 추가, causal mask, rotary embedding, position ids 등 처리가 필요함
        

            # 4. Llama의 일부 레이어(맨 앞 2 + 맨 뒤 2개) 순회
        #    layer() 호출 결과는 (hidden_states, ...) 형태의 튜플이므로, 첫 번째 요소만 받아 다음 레이어로 전달
        for layer in self.decoder_layers:
            layer_outputs = layer(
                hidden_states, 
                attention_mask=attention_mask,
                # 필요하다면 encoder_hidden_states=encoder_hidden_states 등 cross-attn 인수도 추가
            )
            # LlamaDecoderLayer의 결과에서 hidden_states만 추출
            hidden_states = layer_outputs[0]

        # 4. Llama 마지막 norm
        hidden_states = self.norm(hidden_states)

        # 5. lm_head를 통해 vocab logits 예측 (선택)
        logits = self.llama.lm_head(hidden_states)

        return logits



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from datasets import load_dataset
processor = AutoProcessor.from_pretrained("openai/whisper-large-v2")

ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
sample = ds[0]["audio"]
input_features = processor(sample["array"], sampling_rate=sample["sampling_rate"], return_tensors="pt").input_features 

In [4]:

if __name__ == "__main__":
    # 예시: 모델 초기화
    custom_model = WhisperEncoderLlamaDecoder(
        freeze_whisper_encoder=True,
        freeze_llama_decoder=False
    )

    # 가짜 입력
    batch_size = 1
    #dummy_input_features = torch.randn(batch_size, 80, 3000)  # (batch, mel_bins, frames) 정도 가정
    dummy_decoder_input_ids = torch.randint(0, 1000, (batch_size, 16))  # 임의 토큰
    # forward
    outputs = custom_model(
        input_features=input_features, 
        decoder_input_ids=dummy_decoder_input_ids
    )
    print(outputs.shape)  # (batch_size, seq_len, vocab_size) 형태가 기대됨


Whisper encoder 지남
Encoder output shape: torch.Size([1, 1500, 1280])
After bridge shape: torch.Size([1, 1500, 2048])
Decoder token emb shape: torch.Size([1, 16, 2048])


TypeError: cannot unpack non-iterable NoneType object