# 학습코드

In [1]:
!pip install wandb

Defaulting to user installation because normal site-packages is not writeable
Collecting wandb
  Downloading wandb-0.16.3-py3-none-any.whl.metadata (9.9 kB)
Collecting sentry-sdk>=1.0.0 (from wandb)
  Downloading sentry_sdk-1.40.3-py2.py3-none-any.whl.metadata (9.7 kB)
Collecting docker-pycreds>=0.4.0 (from wandb)
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Collecting setproctitle (from wandb)
  Downloading setproctitle-1.3.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.9 kB)
Collecting appdirs>=1.4.3 (from wandb)
  Downloading appdirs-1.4.4-py2.py3-none-any.whl (9.6 kB)
Downloading wandb-0.16.3-py3-none-any.whl (2.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading sentry_sdk-1.40.3-py2.py3-none-any.whl (257 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m257.8/257.8 kB[0m [3

In [12]:
import wandb
import random

In [13]:
import os
import glob
import random
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize
from PIL import Image
from TTS.encoder.models.i2v_encoder import I2VEncoder


In [14]:
torch.cuda.is_available()

True

In [15]:
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, image_paths, audio_embeddings, transforms=None):
        self.image_paths = image_paths
        self.audio_embeddings = audio_embeddings
        self.transforms = transforms

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        image_path = self.image_paths[idx]
        image = load_image(image_path)
        character = "Bill_Williamson" 
        audio_embedding = self.audio_embeddings.get(character)

        return image, audio_embedding

In [16]:
def load_image(image_path):
    image = Image.open(image_path).convert("RGB")
    transform = Compose([
        Resize(224),
        CenterCrop(224),
        ToTensor(),  # 이미지를 [0, 1] 범위의 텐서로 변환
    ])
    return transform(image)
    
def get_image_paths(root_dir, character_name):
    pattern = os.path.join(root_dir, "RedDeadRedemption2", character_name, "image", "*.jpg")
    print(pattern)
    return glob.glob(pattern)

def get_audio_embeddings(root_dir, character_name):
    audio_files = glob.glob(os.path.join(root_dir, "RedDeadRedemption2", character_name, "speaker_embedding", "*.pt"))
    if audio_files:
        return {character_name: audio_files}
    return {}
    
def encode_images(images):
    inputs = clip_processor(images=images, return_tensors="pt")
    outputs = clip_model.get_image_features(**inputs)
    return outputs


In [17]:
encoder = I2VEncoder()
encoder.load_checkpoint('/home/jupyter-attention/TTS/TTS/encoder/models/i2v_baseline.pt')

criterion = nn.CosineSimilarity(dim=1)
optimizer = optim.Adam(encoder.parameters(), lr=0.001)
root_dir = "/home/jupyter-attention/dataset_cleaned/"
character_name = "Bill_Williamson"
image_paths = get_image_paths(root_dir, character_name)
audio_embeddings = get_audio_embeddings(root_dir, character_name)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

/home/jupyter-attention/dataset_cleaned/RedDeadRedemption2/Bill_Williamson/image/*.jpg
Using device: cuda


In [18]:
num_epochs = 100
batch_size = 16
selected_embeddings = {}
torch.cuda.empty_cache()
encoder.to(device)

I2VEncoder(
  (vision_model): CLIPVisionTransformer(
    (embeddings): CLIPVisionEmbeddings(
      (patch_embedding): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14), bias=False)
      (position_embedding): Embedding(257, 1024)
    )
    (pre_layrnorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-23): 24 x CLIPEncoderLayer(
          (self_attn): CLIPAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (layer_norm1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=1024, out_features=4096, bias=True)

In [23]:
wandb.init(
    project="inu-attention",
    config={
    "learning_rate": 0.001,
    "epochs": num_epochs,
    "batch_size": batch_size,
    }
)

wandb.watch(encoder, criterion, log="all", log_freq=10)

for epoch in range(num_epochs):
    selected_embeddings = {}
    print(f"Epoch [{epoch+1}/{num_epochs}]")

    for character, files in audio_embeddings.items():
        if not files:
            continue
        selected_file = random.choice(files)
        # 오디오 임베딩을 CPU 또는 GPU로 로드
        selected_embeddings[character] = torch.load(selected_file, map_location=device)
        
        print(f"Character: {character}, Selected Audio Embedding File: {selected_file}")

    dataset = CustomDataset(image_paths, selected_embeddings)
    data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    total_loss = 0
    for images, audio_embedding in data_loader:
        # 데이터를 현재 디바이스로 이동
        images = images.to(device)
        audio_embedding = audio_embedding.to(device)
        
        image_features = encoder.compute_embedding(images)
        loss = 1 - criterion(image_features, audio_embedding).mean()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    
    avg_loss = total_loss / len(data_loader)
    print(f"Loss: {avg_loss:.4f}\n")
    wandb.log({"epoch": epoch + 1, "loss": avg_loss})
wandb.finish()


Epoch [1/100]
Character: Bill_Williamson, Selected Audio Embedding File: /home/jupyter-attention/dataset_cleaned/RedDeadRedemption2/Bill_Williamson/speaker_embedding/Bill williamson_00331.pt
Loss: 0.7344

Epoch [2/100]
Character: Bill_Williamson, Selected Audio Embedding File: /home/jupyter-attention/dataset_cleaned/RedDeadRedemption2/Bill_Williamson/speaker_embedding/Bill williamson_00352.pt
Loss: 0.6914

Epoch [3/100]
Character: Bill_Williamson, Selected Audio Embedding File: /home/jupyter-attention/dataset_cleaned/RedDeadRedemption2/Bill_Williamson/speaker_embedding/Bill williamson_00322.pt
Loss: 0.7891

Epoch [4/100]
Character: Bill_Williamson, Selected Audio Embedding File: /home/jupyter-attention/dataset_cleaned/RedDeadRedemption2/Bill_Williamson/speaker_embedding/Bill williamson_00409.pt
Loss: 0.6308

Epoch [5/100]
Character: Bill_Williamson, Selected Audio Embedding File: /home/jupyter-attention/dataset_cleaned/RedDeadRedemption2/Bill_Williamson/speaker_embedding/Bill williamso



VBox(children=(Label(value='0.004 MB of 0.004 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
loss,▆█▇▅▇█▇▇▅▄▆▃▃▅▆▇▃▅▇▅▄▅█▅▅▄▅▁▆▃▆▆▇▃▆▆▇█▅▅

0,1
epoch,100.0
loss,0.67578


In [20]:
# 검증 데이터셋 준비
# 이 부분은 실제 검증 데이터셋에 맞게 수정해야 합니다.
val_image_paths = get_image_paths(val_root_dir, character_name)  # 검증 이미지 경로
val_audio_embeddings = get_audio_embeddings(val_root_dir, character_name)  # 검증 오디오 임베딩
val_dataset = CustomDataset(val_image_paths, val_audio_embeddings, transforms=transforms)
val_data_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# 모델 평가
encoder.eval()  # 모델을 평가 모드로 설정
with torch.no_grad():
    total_val_loss = 0
    for images, audio_embedding in val_data_loader:
        image_features = encoder.compute_embedding(images)
        loss = 1 - criterion(image_features, audio_embedding).mean()
        total_val_loss += loss.item()
    
    avg_val_loss = total_val_loss / len(val_data_loader)
    print(f"Validation Loss: {avg_val_loss:.4f}")
encoder.train()  # 모델을 다시 학습 모드로 설정


NameError: name 'val_root_dir' is not defined