In [1]:
# --- 1. 환경 설정 ---

# 1. 라이브러리 설치
!pip install -q transformers accelerate bitsandbytes
!pip install -q ftfy regex tqdm
!pip install -q git+https://github.com/openai/CLIP.git
!pip install -q librosa

# 2. 라이브러리 임포트
import os
import io
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from torch import nn
from PIL import Image
from google.colab import files
from IPython.display import Audio, display
from pprint import pprint
from transformers import (
    ClapModel, ClapProcessor,
    Blip2ForConditionalGeneration, AutoProcessor,
    InstructBlipForConditionalGeneration,
    CLIPModel, CLIPProcessor as HFCLIPProcessor
)

# 3. Mount Google Drive so the embedding / checkpoint paths configured in the
#    next cell (all under /content/drive) become readable.
from google.colab import drive

drive.mount("/content/drive")

print("✅ 1. 환경 설정 완료")

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 MB[0m [31m38.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for clip (setup.py) ... [?25l[?25hdone
Mounted at /content/drive
✅ 1. 환경 설정 완료


In [2]:
# --- 2. Paths and global settings ---

# === Basic settings ===
# Half precision is selected only when a CUDA device is present; CPU runs in float32.
if torch.cuda.is_available():
    DEVICE = "cuda"
    DTYPE_TORCH = torch.float16
else:
    DEVICE = "cpu"
    DTYPE_TORCH = torch.float32

# Number of tracks to recommend.
TOPK = 3

# === File paths (adjust to your own environment) ===
MUSIC_EMB_PATH = "/content/drive/MyDrive/Embeddings/jamendo_clap_weighted2/jamendo_clap_filtered2.npy"
MUSIC_META_PATH = "/content/drive/MyDrive/Embeddings/jamendo_clap_weighted2/jamendo_clap_filtered2.csv"
CLIP_EMOTION_CKPT = "/content/drive/MyDrive/image-music-recommendation-system_final/clip_emotion_classifier.pt"

# === Image path(s) to test with (adjust to your own environment) ===
IMAGE_PATHS = [
    "/content/drive/MyDrive/Datasets/EmoSet-118K/image/amusement/amusement_00000.jpg",
]

# === Model checkpoints ===
# CLAP: must be the same checkpoint that was used to build the music embeddings.
CLAP_CKPT = "laion/clap-htsat-unfused"

# BLIP-2: short caption.
BLIP2_CKPT = "Salesforce/blip2-flan-t5-xl"

# InstructBLIP: long description.
INSTRUCTBLIP_CKPT = "Salesforce/instructblip-flan-t5-xl"

# CLIP backbone used for emotion prediction.
CLIP_CHECKPOINT = "openai/clip-vit-base-patch32"

# === Emotion class labels ===
# The position in this list is the class index used by the emotion head.
EMO_LABELS = [
    "amusement",
    "anger",
    "awe",
    "contentment",
    "disgust",
    "excitement",
    "fear",
    "sadness",
]

print(f"✅ 2. 설정 완료. Device: {DEVICE}, DType: {DTYPE_TORCH}")

✅ 2. 설정 완료. Device: cuda, DType: torch.float16


In [3]:
# --- 3. 모든 AI 모델 로드 ---

# (1) CLAP: loaded with default precision, moved to DEVICE, switched to eval mode.
clap_model = ClapModel.from_pretrained(CLAP_CKPT)
clap_model = clap_model.to(DEVICE).eval()
clap_processor = ClapProcessor.from_pretrained(CLAP_CKPT)
print("✅ CLAP 로드 완료.")

# (2) BLIP-2: loaded in DTYPE_TORCH; device placement is delegated to device_map="auto".
blip2_processor = AutoProcessor.from_pretrained(BLIP2_CKPT)
blip2_model = Blip2ForConditionalGeneration.from_pretrained(
    BLIP2_CKPT,
    torch_dtype=DTYPE_TORCH,
    device_map="auto",
)
blip2_model = blip2_model.eval()
print("✅ BLIP-2 로드 완료.")

# (3) InstructBLIP: same loading scheme as BLIP-2.
instruct_processor = AutoProcessor.from_pretrained(INSTRUCTBLIP_CKPT)
instruct_model = InstructBlipForConditionalGeneration.from_pretrained(
    INSTRUCTBLIP_CKPT,
    torch_dtype=DTYPE_TORCH,
    device_map="auto",
)
instruct_model = instruct_model.eval()
print("✅ InstructBLIP 로드 완료.")

# (4) Fine-tuned CLIP Emotion Classifier
class CLIPEmotionHead(nn.Module):
    """Single linear layer mapping a feature vector to per-class logits.

    The layer is stored as ``self.classifier``, so its parameters appear in the
    state dict as ``classifier.weight`` and ``classifier.bias``.
    """

    def __init__(self, in_dim, num_classes):
        """Create the head.

        Args:
            in_dim: Size of the last dimension of the input features.
            num_classes: Number of output logits.
        """
        super().__init__()
        self.classifier = nn.Linear(in_dim, num_classes)

    def forward(self, x):
        """Return the raw (un-normalised) logits for ``x``."""
        logits = self.classifier(x)
        return logits

# --- Load the CLIP backbone (in DTYPE_TORCH, on DEVICE, eval mode) and its processor ---
clip_backbone = CLIPModel.from_pretrained(CLIP_CHECKPOINT, torch_dtype=DTYPE_TORCH)
clip_backbone = clip_backbone.to(DEVICE).eval()
clip_proc = HFCLIPProcessor.from_pretrained(CLIP_CHECKPOINT)

# --- Build the emotion head ---
# Its input size is the backbone's projection dimension; it emits one logit per
# entry of EMO_LABELS and uses the same device / dtype as the backbone.
IN_DIM = clip_backbone.config.projection_dim
emotion_head = CLIPEmotionHead(in_dim=IN_DIM, num_classes=len(EMO_LABELS))
emotion_head = emotion_head.to(DEVICE, dtype=DTYPE_TORCH).eval()

# Load the fine-tuned emotion-head weights from the checkpoint file (CLIP_EMOTION_CKPT).
def _extract_classifier_state(raw_ckpt):
    """Return a state dict holding exactly ``classifier.weight`` and ``classifier.bias``.

    Accepted checkpoint layouts, checked per parameter in this order:
      1. keys already named ``classifier.weight`` / ``classifier.bias``
         (head-only or whole-model state dict);
      2. bare ``weight`` / ``bias`` keys (only the ``nn.Linear`` was saved);
      3. a single key ending in ``.classifier.weight`` / ``.classifier.bias``
         (the head was nested under some prefix).
    A dict wrapping the real state dict under ``"state_dict"`` or
    ``"model_state_dict"`` is unwrapped first.

    Args:
        raw_ckpt: Object returned by ``torch.load``.

    Returns:
        dict: ``{"classifier.weight": ..., "classifier.bias": ...}``.

    Raises:
        TypeError: If the checkpoint (after unwrapping) is not a dict.
        KeyError: If a parameter cannot be located unambiguously.
    """
    state = raw_ckpt
    for wrapper_key in ("state_dict", "model_state_dict"):
        if isinstance(state, dict) and isinstance(state.get(wrapper_key), dict):
            state = state[wrapper_key]
            break
    if not isinstance(state, dict):
        raise TypeError(
            f"Expected a state dict in the checkpoint, got {type(state).__name__}"
        )

    extracted = {}
    for param in ("weight", "bias"):
        target = f"classifier.{param}"
        if target in state:
            extracted[target] = state[target]
        elif param in state:
            extracted[target] = state[param]
        else:
            # Match on the full suffix so that e.g. "module.classifier.weight"
            # is remapped to "classifier.weight" rather than "module.weight".
            suffix_hits = [
                key for key in state
                if isinstance(key, str) and key.endswith("." + target)
            ]
            if len(suffix_hits) != 1:
                raise KeyError(
                    f"Could not locate a unique '{target}' in checkpoint; "
                    f"available keys: {list(state)[:20]}"
                )
            extracted[target] = state[suffix_hits[0]]
    return extracted


# NOTE(review): torch.load unpickles the file, which can execute arbitrary code.
# Only load checkpoints you trust; if the installed torch version supports it and
# the file contains tensors only, consider passing weights_only=True.
ckpt = torch.load(CLIP_EMOTION_CKPT, map_location=DEVICE)

final_state_dict = _extract_classifier_state(ckpt)
emotion_head.load_state_dict(final_state_dict)
print("✅ CLIP 로드 완료.")

print("✅ 3. 모든 모델 로드 완료")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/615M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/614M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/541 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/384 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

✅ CLAP 로드 완료.


processor_config.json:   0%|          | 0.00/68.0 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


preprocessor_config.json:   0%|          | 0.00/432 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


config.json: 0.00B [00:00, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.96G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/5.81G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

✅ BLIP-2 로드 완료.


processor_config.json:   0%|          | 0.00/75.0 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/439 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/43.0 [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/833 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/6.11G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

✅ InstructBLIP 로드 완료.


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

✅ CLIP 로드 완료.
✅ 3. 모든 모델 로드 완료


In [9]:
# --- 4. 음악 DB 로드 및 핵심 로직 함수 정의 ---

# (1) Load the music database: one embedding row per metadata row.
song_embeds = torch.from_numpy(np.load(MUSIC_EMB_PATH)).float().to(DEVICE)
song_meta = pd.read_csv(MUSIC_META_PATH)

# Scores are later mapped back to metadata by row position, so a row-count
# mismatch would silently attach scores to the wrong tracks. Fail early instead.
if song_embeds.ndim != 2 or song_embeds.shape[0] != len(song_meta):
    raise ValueError(
        f"Embedding matrix shape {tuple(song_embeds.shape)} does not match "
        f"{len(song_meta)} metadata rows"
    )

# L2-normalise each row so a dot product with a unit-length query is a cosine
# similarity. The norm is clamped so an all-zero row stays zero instead of
# becoming NaN (NaN scores would sort to the top of a descending argsort).
song_embeds = song_embeds / song_embeds.norm(dim=-1, keepdim=True).clamp_min(1e-12)
print(f"✅ 음악 DB 로드 완료: {len(song_meta)} 곡")

# (2) AI 파이프라인 함수들
@torch.inference_mode()
def predict_emotion(image):
    """Predict the dominant emotion of an image with the CLIP backbone + emotion head.

    Args:
        image: Image accepted by ``clip_proc`` (the demo passes an RGB PIL image).

    Returns:
        dict with keys:
            "label":     entry of ``EMO_LABELS`` with the highest probability,
            "prob":      that probability as a Python float,
            "all_probs": NumPy array of probabilities for the first (only) image.
    """
    # Move the processor output to the model's device and dtype.
    batch = clip_proc(images=image, return_tensors="pt")
    batch = batch.to(DEVICE, dtype=DTYPE_TORCH)

    # CLIP image features -> emotion logits (both computed in DTYPE_TORCH).
    features = clip_backbone.get_image_features(**batch)
    scores = emotion_head(features)

    # Softmax is evaluated in float32 for numerical stability; [0] selects the
    # single image in the batch.
    distribution = torch.softmax(scores.float(), dim=-1)[0]

    best = torch.argmax(distribution).item()
    result = {
        "label": EMO_LABELS[best],
        "prob": distribution[best].item(),
        "all_probs": distribution.cpu().numpy(),
    }
    return result

@torch.inference_mode()
def blip2_caption(image):
    """Generate a short caption for ``image`` with BLIP-2 (at most 30 new tokens).

    Returns:
        str: The first decoded sequence, special tokens removed and whitespace stripped.
    """
    batch = blip2_processor(images=image, return_tensors="pt")
    batch = batch.to(DEVICE, DTYPE_TORCH)
    generated = blip2_model.generate(**batch, max_new_tokens=30)
    decoded = blip2_processor.batch_decode(generated, skip_special_tokens=True)
    return decoded[0].strip()

@torch.inference_mode()
def instructblip_describe(image):
    """Generate a long description of ``image`` with InstructBLIP (at most 160 new tokens).

    A fixed instruction prompt asks for mood, scene, colors, lighting and context.

    Returns:
        str: The first decoded sequence, special tokens removed and whitespace stripped.
    """
    prompt = "Write a detailed, vivid description of the image focusing on mood, scene, colors, lighting, and context."
    batch = instruct_processor(images=image, text=prompt, return_tensors="pt")
    batch = batch.to(DEVICE, DTYPE_TORCH)
    generated = instruct_model.generate(**batch, max_new_tokens=160)
    decoded = instruct_processor.batch_decode(generated, skip_special_tokens=True)
    return decoded[0].strip()

def build_texts_for_image(pil_img):
    """Build the text queries used for music retrieval from one image.

    Returns:
        tuple ``(cap_short, cap_combo, cap_long, emo)`` where
            cap_short: BLIP-2 caption,
            cap_combo: sentence combining the predicted emotion label with ``cap_short``,
            cap_long:  InstructBLIP description,
            emo:       dict returned by ``predict_emotion``.
    """
    short_caption = blip2_caption(pil_img)
    emotion = predict_emotion(pil_img)

    label = emotion["label"]
    combined = f"This image conveys {label} and feels like {label} mood. Caption: {short_caption}"

    long_caption = instructblip_describe(pil_img)
    return short_caption, combined, long_caption, emotion

@torch.inference_mode()
def score_all_songs_with_text(text: str):
    """Score every song in ``song_embeds`` against a text query using CLAP.

    The text embedding is L2-normalised and multiplied with the transposed song
    embedding matrix.

    Args:
        text: Query sentence.

    Returns:
        Tensor of scores with the leading batch dimension (size 1) squeezed out.
    """
    batch = clap_processor(text=[text], return_tensors="pt", padding=True).to(DEVICE)
    query = clap_model.get_text_features(**batch)
    query = query / query.norm(dim=-1, keepdim=True)
    scores = query @ song_embeds.T
    return scores.squeeze(0)

def topk_emotions_from_probs(prob_array, labels, k=3):
    """Return the ``k`` most probable ``(label, probability)`` pairs, highest first.

    Args:
        prob_array: Sequence or array of probabilities, indexable by position.
        labels: Sequence of labels aligned with ``prob_array``.
        k: Maximum number of pairs to return.

    Returns:
        list[tuple]: ``(labels[i], float(prob_array[i]))`` in descending probability order.
    """
    descending = np.argsort(np.asarray(prob_array))[::-1]
    ranked = []
    for position in descending[:k]:
        ranked.append((labels[position], float(prob_array[position])))
    return ranked

def _short_track_label(full_path):
    """Return ``"<parent dir>/<file name>"`` for a track path.

    Falls back to the file name alone when the path has no parent directory, and
    to the unmodified input when it is not a path-like value at all.
    """
    try:
        dir_path, file_name = os.path.split(full_path)
    except TypeError:
        # os.path.split rejects non-path values (e.g. NaN from an empty CSV cell).
        return full_path
    dir_name = os.path.basename(dir_path)
    return f"{dir_name}/{file_name}" if dir_name else file_name


def show_topk_audio(sims_1d, k=3, title="Results"):
    """Print the ``k`` best-scoring tracks and show an audio player for each.

    Args:
        sims_1d: Tensor of similarity scores; row ``i`` of ``song_meta`` is looked
            up for score ``i`` (assumes the scores are aligned with the metadata
            rows — TODO confirm against how the embeddings were exported).
        k: Number of tracks to show.
        title: Heading printed above the list.

    Only the exception types that path parsing can raise are caught there, so
    Ctrl-C / kernel interrupts are no longer swallowed. Audio preview failures
    are reported inline and do not abort the listing.
    """
    sims_cpu = sims_1d.detach().float().cpu().numpy()
    idx = sims_cpu.argsort()[::-1][:k]  # indices of the k highest scores, descending

    print(f"\n== {title} (Top-{k}) ==")

    for r, i in enumerate(idx, start=1):
        score = float(sims_cpu[i])
        row = song_meta.iloc[i]

        # "N/A" is used when the metadata has no "path" column.
        full_path = row.get("path", "N/A")
        meaningful_path = _short_track_label(full_path)

        print(f"[{r:02d}] score={score:.3f} | title='{meaningful_path}'")

        try:
            display(Audio(filename=full_path))
        except Exception as e:
            # Best effort: a missing or unreadable file must not stop the listing.
            print(f"  (Audio preview failed for path: {full_path}. Error: {e})")

# (3) 메인 데모 함수
def demo_recommend_3plus3(pil_image, lam=0.6, top_k=3):
    """Run the full image -> music recommendation demo and print/display the results.

    Stage 1 ranks songs with the emotion + short-caption query. Stage 2 re-ranks
    with ``lam * stage1 + (1 - lam) * long_description_score``.

    Args:
        pil_image: Input image, passed through to the captioning / emotion models.
        lam: Weight of the stage-1 score in the fused score.
        top_k: Number of tracks shown per stage.
    """
    # Build the three text views of the image plus the emotion prediction.
    short_caption, fused_query, long_caption, emotion = build_texts_for_image(pil_image)

    # Analysis summary.
    banner = "=" * 100
    print(banner)
    print("[IMAGE ANALYSIS RESULTS]")
    top_emotions = topk_emotions_from_probs(emotion["all_probs"], EMO_LABELS, k=3)
    emotion_summary = ", ".join([f"{name}({prob:.2f})" for name, prob in top_emotions])
    print("CLIP Emotions (Top-3):", emotion_summary)
    print(f"BLIP-2 Caption: {short_caption}")
    print(f"InstructBLIP Caption (Long): {long_caption}")
    print(banner)

    # Stage 1: emotion + short caption query.
    stage1_scores = score_all_songs_with_text(fused_query)
    show_topk_audio(
        stage1_scores,
        k=top_k,
        title="▶ Stage-1 Recommendations (Emotion + Facts)",
    )

    # Stage 2: blend in the score from the long description and re-rank.
    stage2_scores = score_all_songs_with_text(long_caption)
    fused_scores = lam * stage1_scores + (1.0 - lam) * stage2_scores
    show_topk_audio(
        fused_scores,
        k=top_k,
        title="▶ Stage-2 Re-ranked Recommendations (Context Enhanced)",
    )

print("✅ 4. 핵심 로직 함수 정의 완료")

✅ 음악 DB 로드 완료: 18466 곡
✅ 4. 핵심 로직 함수 정의 완료


In [10]:
# --- 5. Run the demo ---

# Prompt the user and open the Colab upload dialog.
print("--- 데모 실행 중... ---")
print("이미지 파일을 업로드 하세요:")

uploaded = files.upload()

if not uploaded:
    # Nothing was uploaded (dialog cancelled or empty selection).
    print("업로드된 이미지가 없습니다.")
else:
    # Only the first uploaded file is used.
    filename, image_bytes = next(iter(uploaded.items()))

    print(f"\n'{filename}' 업로드 성공.")

    try:
        # Decode the raw bytes into an RGB PIL image.
        pil_image = Image.open(io.BytesIO(image_bytes)).convert("RGB")

        # Preview the uploaded image.
        plt.figure(figsize=(6, 6))
        plt.imshow(pil_image)
        plt.title("uploaded image")
        plt.axis('off')
        plt.show()

        # Run the main recommendation pipeline.
        demo_recommend_3plus3(pil_image, lam=0.6, top_k=3)

    except Exception as err:
        # Best effort: report the failure instead of raising out of the cell.
        print(f"❌ 처리 중 오류 발생: {err}")

Output hidden; open in https://colab.research.google.com to view.