# 상품 추천 결과 생성 및 DB 입력용 데이터 추출

# Step 1: 필요한 라이브러리 설치

In [1]:
!pip install transformers
!pip install sentence-transformers
!pip install torch
!pip install Pillow
!pip install tqdm


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.11.0->sentence-transformers)
 

# Step 2: CSV 파일 업로드


In [2]:
# Colab-only upload helper; this cell does not run outside Google Colab.
from google.colab import files
# Prompts for a file upload. In the recorded run this saved
# store_product_DB.csv, which Step 3 then reads by that exact name.
# The return value is kept in `uploaded` but is not used by the later cells
# visible in this notebook.
uploaded = files.upload()

Saving store_product_DB.csv to store_product_DB.csv


# Step 3: 데이터 불러오기 및 전처리

In [3]:
import pandas as pd

# Load the product catalogue uploaded in the previous step.
df = pd.read_csv("store_product_DB.csv")
# Replace missing values with empty strings so the text concatenation below
# never has to deal with NaN.
df = df.fillna("")

# Columns joined (space-separated, in this order) into the text that is later
# fed to the sentence encoder.
TEXT_COLUMNS = ["BRAND_ID", "NAME", "DESCRIPTION", "CATEGORY", "STYLE", "FIT"]

# Cast every column to str, not only BRAND_ID: a column that pandas parsed as
# numeric would otherwise raise TypeError when concatenated with " ".
combined_text = df[TEXT_COLUMNS[0]].astype(str)
for column in TEXT_COLUMNS[1:]:
    combined_text = combined_text + " " + df[column].astype(str)
df["combined_text"] = combined_text


# Step 4: 모델 불러오기

In [4]:
from sentence_transformers import SentenceTransformer
from transformers import CLIPProcessor, CLIPModel
import torch
from PIL import Image
from io import BytesIO
import numpy as np
import requests
from tqdm import tqdm

# Sentence encoder used for the product text.
# NOTE: despite the variable name `kobert`, the checkpoint loaded here is
# KR-SBERT; the name is kept because later cells refer to it.
KR_SBERT_CHECKPOINT = 'snunlp/KR-SBERT-V40K-klueNLI-augSTS'
kobert = SentenceTransformer(KR_SBERT_CHECKPOINT)

# CLIP model and its preprocessor, both loaded from the same checkpoint.
CLIP_CHECKPOINT = "openai/clip-vit-base-patch32"
clip_model = CLIPModel.from_pretrained(CLIP_CHECKPOINT)
clip_processor = CLIPProcessor.from_pretrained(CLIP_CHECKPOINT)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/707 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/467M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/394 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/467M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

# Step 5: 임베딩 생성 함수

In [5]:
# Season labels (spring, summer, autumn, winter). Their order fixes the
# positions of the multi-hot vector produced by encode_season.
SEASONS = ["봄", "여름", "가을", "겨울"]

def encode_text(text):
    """Embed a single string with the sentence encoder.

    Args:
        text: The string to embed.

    Returns:
        The embedding of ``text``: the first (and only) row of the encoder's
        batch output, requested with ``normalize_embeddings=True``.
    """
    batch = kobert.encode([text], normalize_embeddings=True)
    return batch[0]

def encode_image(image_url):
    """Download an image and embed it with CLIP.

    This is deliberately best-effort: a product without a usable image must
    not abort the whole embedding run, so any failure yields a zero vector.

    Args:
        image_url: URL of the product image. Empty or ``None`` means the
            product has no image.

    Returns:
        A 1-D float32 NumPy array. On success it is the L2-normalised CLIP
        image feature vector; on a missing URL or any failure it is a
        512-element zero vector.
    """
    import warnings

    # 512 is presumably the image-feature width of the CLIP checkpoint loaded
    # in Step 4 -- confirm if the checkpoint is ever changed.
    # float32 keeps the fallback consistent with the success path.
    fallback = np.zeros(512, dtype=np.float32)

    # Missing values were replaced by "" when the CSV was loaded; skip the
    # network round-trip for those rows.
    if not image_url:
        return fallback

    try:
        response = requests.get(image_url, timeout=5)
        # Fail fast on 4xx/5xx instead of trying to decode an error page.
        response.raise_for_status()
        image = Image.open(BytesIO(response.content)).convert("RGB")
        inputs = clip_processor(images=image, return_tensors="pt")
        with torch.no_grad():
            img_vec = clip_model.get_image_features(**inputs)
            # L2-normalise the feature vector.
            img_vec = img_vec / img_vec.norm(dim=-1, keepdim=True)
        return img_vec.squeeze().cpu().numpy()
    except Exception as exc:
        # `except Exception` (not a bare `except`) lets KeyboardInterrupt and
        # SystemExit propagate, so a long run can still be cancelled.
        warnings.warn(
            f"encode_image: using zero vector for {image_url!r}: {exc}"
        )
        return fallback

def encode_season(season_str):
    """Encode a season description as a multi-hot vector.

    Args:
        season_str: Text in which the season labels are searched.

    Returns:
        A float32 array with one entry per label in ``SEASONS`` (same order);
        an entry is 1 when that label occurs in ``season_str``, else 0.
    """
    flags = np.zeros(len(SEASONS), dtype=np.float32)
    for position, season in enumerate(SEASONS):
        if season in season_str:
            flags[position] = 1
    return flags

def build_embedding(row):
    """Build the combined embedding for one product row.

    Args:
        row: A mapping-like row providing ``"combined_text"``,
            ``"IMAGE_URL"`` and ``"SEASON"``.

    Returns:
        The text, image and season vectors concatenated in that order and
        scaled to unit L2 norm. If the concatenated vector has zero norm it
        is returned unscaled.
    """
    parts = [
        encode_text(row["combined_text"]),
        encode_image(row["IMAGE_URL"]),
        encode_season(row["SEASON"]),
    ]
    joined = np.concatenate(parts)
    length = np.linalg.norm(joined)
    # Guard against division by zero for an all-zero vector.
    if length == 0:
        return joined
    return joined / length


# Step 6: 임베딩 생성 및 유사도 계산 (상품별 상위 10개 추출)

In [6]:
import heapq  # stdlib; used for the top-k selection below

# Number of most-similar products kept for each source product.
TOP_K = 10

# One (product ID, embedding) pair per catalogue row. tqdm shows progress
# because every row triggers an image download and two model forward passes.
embeddings = [
    (row["ID"], build_embedding(row))
    for _, row in tqdm(df.iterrows(), total=len(df))
]

# For every product, score it against all other products and keep the TOP_K
# best matches. build_embedding returns unit-length vectors (unless the vector
# is all zeros), so the dot product is the cosine similarity.
similarities = []
for id1, vec1 in embeddings:
    scores = [
        (id2, np.dot(vec1, vec2))
        for id2, vec2 in embeddings
        if id1 != id2  # never recommend a product to itself
    ]
    # nlargest is equivalent to sorting in descending order and slicing,
    # without sorting the candidates that will be discarded.
    top_k = heapq.nlargest(TOP_K, scores, key=lambda pair: pair[1])
    for target_id, score in top_k:
        similarities.append({
            "SOURCE_ID": id1,
            "TARGET_ID": target_id,
            # Store a plain Python float rather than a NumPy scalar.
            "SCORE": round(float(score), 4),
        })



  0%|          | 0/32 [00:00<?, ?it/s][A
  3%|▎         | 1/32 [00:03<01:39,  3.19s/it][A
  6%|▋         | 2/32 [00:03<00:50,  1.67s/it][A
  9%|▉         | 3/32 [00:04<00:35,  1.22s/it][A
 12%|█▎        | 4/32 [00:05<00:32,  1.17s/it][A
 16%|█▌        | 5/32 [00:06<00:30,  1.14s/it][A
 19%|█▉        | 6/32 [00:07<00:26,  1.02s/it][A
 22%|██▏       | 7/32 [00:08<00:22,  1.12it/s][A
 25%|██▌       | 8/32 [00:09<00:21,  1.10it/s][A
 28%|██▊       | 9/32 [00:09<00:20,  1.14it/s][A
 31%|███▏      | 10/32 [00:10<00:16,  1.31it/s][A
 34%|███▍      | 11/32 [00:11<00:16,  1.29it/s][A
 38%|███▊      | 12/32 [00:11<00:15,  1.31it/s][A
 41%|████      | 13/32 [00:12<00:14,  1.35it/s][A
 44%|████▍     | 14/32 [00:13<00:12,  1.49it/s][A
 47%|████▋     | 15/32 [00:13<00:11,  1.47it/s][A
 50%|█████     | 16/32 [00:15<00:14,  1.10it/s][A
 53%|█████▎    | 17/32 [00:16<00:16,  1.09s/it][A
 56%|█████▋    | 18/32 [00:18<00:16,  1.15s/it][A
 59%|█████▉    | 19/32 [00:19<00:16,  1.25s/it]

# Step 7: DB 입력용 SQL 출력

In [7]:
from datetime import datetime

# A single timestamp shared by every generated row.
now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

# One INSERT statement per similarity record; the primary key comes from the
# RECOMMEND_RESULT_SEQ sequence and CREATED_BY is fixed to 0.
insert_sqls = [
    "INSERT INTO RECOMMEND_RESULT (ID, SOURCE_ID, TARGET_ID, SCORE, CREATED_AT, CREATED_BY) "
    f"VALUES (RECOMMEND_RESULT_SEQ.NEXTVAL, {rec['SOURCE_ID']}, {rec['TARGET_ID']}, "
    f"{rec['SCORE']}, TO_TIMESTAMP('{now}', 'YYYY-MM-DD HH24:MI:SS'), 0);"
    for rec in similarities
]

# Save the statements, one per line, as a UTF-8 SQL script.
sql_script = "\n".join(insert_sqls)
with open("recommend_results.sql", "w", encoding="utf-8") as f:
    f.write(sql_script)


# Step 8: SQL 다운로드

In [8]:
# Same Colab helper module already imported in Step 2; re-imported here so the
# cell can be run on its own.
from google.colab import files
# Hands the SQL script written in Step 7 to Colab's download helper
# (presumably a browser download -- the recorded output shows Javascript objects).
files.download("recommend_results.sql")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>