In [1]:
# -*- coding: utf-8 -*-
import re, json
import pandas as pd
import numpy as np
from difflib import get_close_matches
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

CSV_PATH = "./movies_metadata.csv"

# 1) Load
df = pd.read_csv(CSV_PATH, low_memory=False)

# 2) JSON-like string -> extract "name" list (robust)
def parse_name_list(x):
    """
    TMDB CSV의 'genres', 'production_companies' 같은 컬럼은
    보통 "[{'id':..,'name':'Animation'}, ...]" 문자열입니다.
    여기서 name만 안전하게 뽑아 'Animation Comedy'처럼 합쳐 반환.
    """
    if x is None or (isinstance(x, float) and np.isnan(x)):
        return ""
    if isinstance(x, dict):
        items = [x]
    elif isinstance(x, (list, tuple)):
        items = x
    elif isinstance(x, str):
        s = x.strip()
        if not s:
            return ""
        try:
            parsed = json.loads(s.replace("'", '"'))
            if isinstance(parsed, dict):
                items = [parsed]
            elif isinstance(parsed, (list, tuple)):
                items = parsed
            else:
                return ""  # "0" 같은 스칼라로 파싱된 경우 무시
        except Exception:
            # fallback: 정규식으로 name만 뽑기
            items = []
            for m in re.finditer(r"'name':\s*'([^']+)'|\"name\":\s*\"([^\"]+)\"", s):
                name = m.group(1) or m.group(2)
                if name:
                    items.append({"name": name})
    else:
        return ""

    names = []
    for it in items:
        if isinstance(it, dict):
            n = it.get("name")
            if n:
                names.append(str(n))
        elif isinstance(it, str):
            names.append(it)
    return " ".join(names)

# 3) (기본) 필드 가중치 — 벡터라이저 학습에 사용
BASE_WEIGHTS = {
    "genres": 2.0,
    "overview": 1.0,
    "tagline": 1.2,
    "collection": 1.2,
    "prod_companies": 0.8,
    "prod_countries": 0.6,
    "spoken_languages": 0.6,
    "original_language": 0.4
}

def build_weighted_text_row(row, w=BASE_WEIGHTS):
    """
    한 영화(row)의 텍스트를 필드별 가중치로 묶어 하나의 문자열로 만듭니다.
    가중치는 '텍스트 반복' 방식으로 간단히 반영합니다.
    """
    parts = []

    def rep(txt, weight):
        if txt and weight > 0:
            parts.extend([txt] * int(round(weight)))

    # genres
    rep(parse_name_list(row.get("genres")), w.get("genres", 1.0))
    # overview
    ov = row.get("overview"); rep(ov if isinstance(ov, str) else "", w.get("overview", 1.0))
    # tagline
    tg = row.get("tagline"); rep(tg if isinstance(tg, str) else "", w.get("tagline", 1.0))
    # belongs_to_collection (collection name)
    coll = row.get("belongs_to_collection")
    if isinstance(coll, str) and coll.strip():
        m = re.search(r"'name':\s*'([^']+)'|\"name\":\s*\"([^\"]+)\"", coll)
        if m:
            rep(m.group(1) or m.group(2), w.get("collection", 1.0))
    # production_companies / countries / spoken_languages
    rep(parse_name_list(row.get("production_companies")), w.get("prod_companies", 1.0))
    rep(parse_name_list(row.get("production_countries")), w.get("prod_countries", 1.0))
    rep(parse_name_list(row.get("spoken_languages")), w.get("spoken_languages", 1.0))
    # original_language
    ol = row.get("original_language"); rep(ol if isinstance(ol, str) else "", w.get("original_language", 1.0))

    return " ".join(parts)

# 4) 코퍼스 텍스트(기본 가중치) 생성 및 TF-IDF 학습
df["weighted_text"] = df.apply(build_weighted_text_row, axis=1).fillna("").str.lower()

# 공백 기반 토큰화(이미 전처리 끝난 문자열이므로)
vectorizer = TfidfVectorizer(
    token_pattern=r"[^ ]+",      # 공백으로만 토큰 분리
    ngram_range=(1, 2),          # unigram + bigram
    max_features=150_000         # 필요시 조절
)
X = vectorizer.fit_transform(df["weighted_text"])

# 5) 제목 → 인덱스 매핑 & 근사 매칭
title_to_idx = {str(t).strip().lower(): i
                for i, t in enumerate(df["title"].fillna("").astype(str))
                if str(t).strip()}

def resolve_title(user_title: str):
    key = user_title.strip().lower()
    if key in title_to_idx:
        return key
    # 오타/부분입력 대응
    cand = get_close_matches(key, list(title_to_idx.keys()), n=1, cutoff=0.6)
    return cand[0] if cand else None

# 6) 추천 함수
def recommend(
    title: str,
    topn: int = 10,
    weights: dict | None = None,
    include_cols=("title", "genres", "overview", "tagline", "original_language")
):
    """
    title: 기준 영화 제목 (오타/부분입력 허용)
    topn: 추천 개수
    weights: 질의마다 덮어쓸 가중치 dict (None이면 기본 가중치 사용)
             예) {"genres":3.0, "overview":0.7, "prod_companies":1.5}
    include_cols: 결과에 보여줄 컬럼들
    """
    key = resolve_title(title)
    if not key:
        return pd.DataFrame({"error":[f"제목 '{title}'을(를) 찾지 못했습니다."]})

    idx = title_to_idx[key]

    # 쿼리 전용 가중치를 쓰는 경우: 쿼리 1개 행만 재가중 → 벡터라이저로 변환
    if weights:
        qi = df.loc[idx:idx].copy()
        qi["weighted_text"] = qi.apply(lambda r: build_weighted_text_row(r, w=weights), axis=1).fillna("").str.lower()
        xq = vectorizer.transform(qi["weighted_text"])
    else:
        xq = X[idx]

    # 코사인 유사도 (한 벡터 vs 전체 행렬)
    sims = cosine_similarity(xq, X)[0]
    sims[idx] = -1.0  # 자기 자신 제외

    # Top-N
    top_idx = np.argpartition(-sims, range(topn))[:topn]
    top_idx = top_idx[np.argsort(-sims[top_idx])]

    cols = [c for c in include_cols if c in df.columns]
    res = df.loc[top_idx, cols].copy()
    res.insert(1, "similarity", np.round(sims[top_idx], 4))
    res = res.reset_index(drop=True)
    return res

# 7) 사용 예시
print("=== 기본 가중치로 [Toy Story] 유사작 Top-5 ===")
display(recommend("Toy Story", topn=5))

print("=== 장르를 더 강하게(genres*3), 줄거리는 약하게(overview*0.7) ===")
custom_w = {**BASE_WEIGHTS, "genres": 3.0, "overview": 0.7}
display(recommend("Toy Story", topn=5, weights=custom_w))

print("=== 제작국가/언어 비중을 키운 예시(prod_countries*2, spoken_languages*2) ===")
custom_w2 = {**BASE_WEIGHTS, "prod_countries": 2.0, "spoken_languages": 2.0}
display(recommend("Toy Story", topn=5, weights=custom_w2))


=== 기본 가중치로 [Toy Story] 유사작 Top-5 ===


Unnamed: 0,title,similarity,genres,overview,tagline,original_language
0,Toy Story 2,0.3445,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...","Andy heads off to Cowboy Camp, leaving his toy...",The toys are back!,en
1,Superstar Goofy,0.2944,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",,,it
2,Small Fry,0.2474,"[{'id': 16, 'name': 'Animation'}, {'id': 10751...",A fast food restaurant mini variant of Buzz fo...,,en
3,Toy Story 3,0.2472,"[{'id': 16, 'name': 'Animation'}, {'id': 10751...","Woody, Buzz, and the rest of Andy's toys haven...",No toy gets left behind.,en
4,Dug's Special Mission,0.224,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...","Dug's Special Mission will give ""a little bit ...",,en


=== 장르를 더 강하게(genres*3), 줄거리는 약하게(overview*0.7) ===


Unnamed: 0,title,similarity,genres,overview,tagline,original_language
0,Superstar Goofy,0.4187,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",,,it
1,Toy Story 2,0.367,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...","Andy heads off to Cowboy Camp, leaving his toy...",The toys are back!,en
2,Dug's Special Mission,0.2784,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...","Dug's Special Mission will give ""a little bit ...",,en
3,Botsman i Popugay,0.2767,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",Animated story about the adventures of boatswa...,,ru
4,Small Fry,0.2616,"[{'id': 16, 'name': 'Animation'}, {'id': 10751...",A fast food restaurant mini variant of Buzz fo...,,en


=== 제작국가/언어 비중을 키운 예시(prod_countries*2, spoken_languages*2) ===


Unnamed: 0,title,similarity,genres,overview,tagline,original_language
0,Toy Story 2,0.345,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...","Andy heads off to Cowboy Camp, leaving his toy...",The toys are back!,en
1,Superstar Goofy,0.2913,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",,,it
2,Small Fry,0.251,"[{'id': 16, 'name': 'Animation'}, {'id': 10751...",A fast food restaurant mini variant of Buzz fo...,,en
3,Toy Story 3,0.249,"[{'id': 16, 'name': 'Animation'}, {'id': 10751...","Woody, Buzz, and the rest of Andy's toys haven...",No toy gets left behind.,en
4,Dug's Special Mission,0.2288,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...","Dug's Special Mission will give ""a little bit ...",,en
