<a href="https://colab.research.google.com/github/JGG-Mwahaha/easy/blob/main/2025_07_18.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
# ============================================
# IMDB 영화 데이터로 콘텐츠 기반 필터링 모델 구현
# ============================================

# -----------------------
# 1. 데이터 로드 & 병합
# -----------------------
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import ast

# 데이터 로드
url1 = 'https://raw.githubusercontent.com/sd12832/MoViZ/master/src/tmdb_5000_credits.csv'
url2 = 'https://raw.githubusercontent.com/sd12832/MoViZ/master/src/tmdb_5000_movies.csv'
df1 = pd.read_csv(url1)
df2 = pd.read_csv(url2)

# 컬럼명 변경 후 병합
df1.columns = ['id', 'title', 'cast', 'crew']
df = pd.merge(df1, df2, on='id')
print(f"데이터셋 크기: {df.shape}")

# -----------------------
# 2. 평점 기반 추천
# -----------------------
print("\n===== 평점 기반 추천 =====")

# IMDB 가중평점 공식
C = df['vote_average'].mean()
m = df['vote_count'].quantile(0.9)

def weighted_rating(x, m=m, C=C):
    v = x['vote_count']
    R = x['vote_average']
    return (v/(v+m) * R) + (m/(m+v) * C)

# 상위 10% 투표 영화만
q_movies = df.loc[df['vote_count'] >= m].copy()
q_movies['score'] = q_movies.apply(weighted_rating, axis=1)
q_movies = q_movies.sort_values('score', ascending=False)

# 비영어권 영화
f_movies = df.loc[df['original_language'] != 'en'].copy()
f_movies['score'] = f_movies.apply(weighted_rating, axis=1)
f_movies = f_movies.sort_values('score', ascending=False)

print("\n✅ 영어권 TOP 10")
print(q_movies[['original_title', 'vote_average', 'vote_count', 'score']].head(10))

print("\n✅ 비영어권 TOP 10")
print(f_movies[['original_title', 'vote_average', 'vote_count', 'score']].head(10))

# -----------------------
# 3. 줄거리 기반 추천 (TF-IDF + 코사인 유사도)
# -----------------------
print("\n===== 줄거리 기반 추천 (TF-IDF) =====")

df['overview'] = df['overview'].fillna('')
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['overview'])
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
indices = pd.Series(df.index, index=df['original_title']).drop_duplicates()

def get_similar_movies(title, top_n=10):
    if title not in indices:
        return f"❌ '{title}'는 데이터에 없습니다."
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:top_n+1]
    movie_indices = [i[0] for i in sim_scores]
    return df[['original_title', 'overview']].iloc[movie_indices]

# 예시 실행
print("\n✅ 'The Dark Knight'와 줄거리 유사 영화 TOP 10")
display(get_similar_movies('The Dark Knight'))

# -----------------------
# 4. 메타데이터 기반 추천 (cast, crew, keywords, genres)
# -----------------------
print("\n===== 메타데이터 기반 추천 =====")

# JSON 파싱
def parse_list(x):
    try:
        return [d['name'] for d in ast.literal_eval(x)]
    except:
        return []

def get_director(x):
    try:
        for d in ast.literal_eval(x):
            if d['job'] == 'Director': return d['name']
    except:
        return ''

df['cast'] = df['cast'].apply(parse_list)
df['keywords'] = df['keywords'].apply(parse_list)
df['genres'] = df['genres'].apply(parse_list)
df['crew'] = df['crew'].apply(get_director)

# soup 생성
df['cast'] = df['cast'].apply(lambda x: ' '.join(x[:3]))
df['keywords'] = df['keywords'].apply(lambda x: ' '.join(x))
df['genres'] = df['genres'].apply(lambda x: ' '.join(x))
df['crew'] = df['crew'].apply(lambda x: x.replace(" ", ""))

df['soup'] = df['cast'] + ' ' + df['keywords'] + ' ' + df['genres'] + ' ' + df['crew']

count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(df['soup'])
cosine_sim_meta = cosine_similarity(count_matrix, count_matrix)

indices = pd.Series(df.index, index=df['original_title']).drop_duplicates()

def get_similar_movies_by_metadata(title, top_n=10):
    if title not in indices:
        return f"❌ '{title}'는 데이터에 없습니다."
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim_meta[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:top_n+1]
    movie_indices = [i[0] for i in sim_scores]
    return df[['original_title', 'genres', 'cast', 'keywords']].iloc[movie_indices]

# 예시 실행
print("\n✅ 'The Dark Knight'와 메타데이터 유사 영화 TOP 10")
display(get_similar_movies_by_metadata('The Dark Knight'))

# ============================================
# END
# ============================================


데이터셋 크기: (4803, 23)

===== 평점 기반 추천 =====

✅ 영어권 TOP 10
                                     original_title  vote_average  vote_count  \
1881                       The Shawshank Redemption           8.5        8205   
662                                      Fight Club           8.3        9413   
65                                  The Dark Knight           8.2       12002   
3232                                   Pulp Fiction           8.3        8428   
96                                        Inception           8.1       13752   
3337                                  The Godfather           8.4        5893   
95                                     Interstellar           8.1       10867   
809                                    Forrest Gump           8.2        7927   
329   The Lord of the Rings: The Return of the King           8.1        8064   
1990                        The Empire Strikes Back           8.2        5879   

         score  
1881  8.059258  
662   7.939256  
6

Unnamed: 0,original_title,overview
3,The Dark Knight Rises,Following the death of District Attorney Harve...
428,Batman Returns,"Having defeated the Joker, Batman now faces th..."
3854,"Batman: The Dark Knight Returns, Part 2",Batman has stopped the reign of terror that Th...
299,Batman Forever,The Dark Knight of Gotham City confronts a das...
1359,Batman,The Dark Knight of Gotham City begins his war ...
119,Batman Begins,"Driven by tragedy, billionaire Bruce Wayne ded..."
1181,JFK,New Orleans District Attorney Jim Garrison dis...
9,Batman v Superman: Dawn of Justice,Fearing the actions of a god-like Super Hero l...
2507,Slow Burn,A district attorney (Ray Liotta) is involved i...
210,Batman & Robin,Along with crime-fighting partner Robin and ne...



===== 메타데이터 기반 추천 =====


AttributeError: 'NoneType' object has no attribute 'replace'