<a href="https://colab.research.google.com/github/LeeMooHeon/muhun/blob/main/final_algorithm_by_user(SVD).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# **한글**

In [2]:
!sudo apt-get install -y fonts-nanum
!sudo fc-cache -fv
!rm ~/.cache/matplotlib -rf


plt.rc('font', family='NanumGothic')
plt.rcParams['axes.unicode_minus'] =False

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
fonts-nanum is already the newest version (20200506-1).
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.
/usr/share/fonts: caching, new cache contents: 0 fonts, 1 dirs
/usr/share/fonts/truetype: caching, new cache contents: 0 fonts, 3 dirs
/usr/share/fonts/truetype/humor-sans: caching, new cache contents: 1 fonts, 0 dirs
/usr/share/fonts/truetype/liberation: caching, new cache contents: 16 fonts, 0 dirs
/usr/share/fonts/truetype/nanum: caching, new cache contents: 12 fonts, 0 dirs
/usr/local/share/fonts: caching, new cache contents: 0 fonts, 0 dirs
/root/.local/share/fonts: skipping, no such directory
/root/.fonts: skipping, no such directory
/usr/share/fonts/truetype: skipping, looped directory detected
/usr/share/fonts/truetype/humor-sans: skipping, looped directory detected
/usr/share/fonts/truetype/liberation: skipping, looped directory detected
/usr/share/fonts/truetype/

# **데이터 불러오기 및 전처리**

In [3]:
from google.colab import drive

# 1. 구글 드라이브 마운트
drive.mount('/content/drive')

# 2. 데이터 불러오기
base_path = "/content/drive/MyDrive"

links = pd.read_csv(f"{base_path}/links.csv")
movies = pd.read_csv(f"{base_path}/movies.csv")
ratings = pd.read_csv(f"{base_path}/ratings.csv")
tags = pd.read_csv(f"{base_path}/tags.csv")



# 확인
#print(links.shape, movies.shape, ratings.shape, tags.shape)
#

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:

# ratings, tags의 timestamp를 datetime으로 변환
ratings['timestamp'] = pd.to_datetime(ratings['timestamp'], unit='s')
tags['timestamp'] = pd.to_datetime(tags['timestamp'], unit='s')

# 2020년 1월 1일 이후 데이터만 추출
cutoff_date = pd.Timestamp('2020-01-01')
ratings = ratings[ratings['timestamp'] >= cutoff_date].copy()
tags = tags[tags['timestamp'] >= cutoff_date].copy()

# movies, links도 ratings 기준으로 필터링
movie_ids = ratings['movieId'].unique()
movies = movies[movies['movieId'].isin(movie_ids)].copy()
links = links[links['movieId'].isin(movie_ids)].copy()

# 마지막 날짜 확인
last_rating_date = ratings['timestamp'].max()
last_tag_date = tags['timestamp'].max()

# 확인
print("ratings shape:", ratings.shape)
#print("tags shape:", tags.shape)
#print("movies shape:", movies.shape)
#print("links shape:", links.shape)
#print("ratings 마지막 날짜:", last_rating_date)
#print("tags 마지막 날짜:", last_tag_date)


ratings shape: (4635230, 4)


# **Heavy Rater 가중치 설정**

In [5]:

user_stats = ratings.groupby("userId").agg(rating_count=("rating","count")).reset_index()
threshold = np.percentile(user_stats["rating_count"], 90)
heavy_raters = set(user_stats[user_stats["rating_count"] >= threshold]["userId"])

ratings['weight_hr'] = np.where(ratings['userId'].isin(heavy_raters), 1.2, 1.0)


# **Content Age 가중치 설정**

In [6]:

movies['release_year'] = movies['title'].str.extract(r'\((\d{4})\)').astype(float)

ratings = ratings.merge(movies[['movieId','release_year']], on='movieId', how='left')
ratings['rating_year'] = ratings['timestamp'].dt.year
ratings['content_age'] = ratings['rating_year'] - ratings['release_year']

user_age_pref = ratings.groupby('userId')['content_age'].mean().reset_index()
median_age = user_age_pref['content_age'].median()

user_age_pref['age_pref'] = np.where(user_age_pref['content_age'] >= median_age, "retro", "trend")

ratings = ratings.merge(user_age_pref[['userId','age_pref']], on='userId', how='left')
ratings['weight_ca'] = np.where(ratings['age_pref']=="trend", 1.1, 1.0)



# **장르 선호도 가중치 설정**

In [7]:
# 1. 장르 분리

movies['genres_list'] = movies['genres'].str.split('|')

# ratings에 genres_list 붙이기
if 'genres_list' in ratings.columns:
    ratings = ratings.drop(columns=['genres_list'])

ratings = ratings.merge(movies[['movieId','genres_list']], on='movieId', how='left')

# 장르별로 행 분리
ratings_exploded = ratings.explode('genres_list')

# 2. 유저별 장르별 평균 평점

user_genre_pref = (
    ratings_exploded
    .groupby(['userId','genres_list'])['rating']
    .mean()
    .reset_index()
)

# 3. 각 유저별 상위 10% 장르 판별

def top_genres(df):
    cutoff = np.percentile(df['rating'], 90)  # 상위 10% 기준
    df['is_top_genre'] = np.where(df['rating'] >= cutoff, 1, 0)
    return df

user_genre_pref = (
    user_genre_pref
    .groupby('userId')
    .apply(top_genres)
    .reset_index(drop=True)
)

# 4. ratings에 Top Genre 플래그/가중치 부여

ratings_exploded = ratings_exploded.merge(
    user_genre_pref[['userId','genres_list','is_top_genre']],
    on=['userId','genres_list'], how='left'
)

ratings_exploded['weight_tg'] = np.where(ratings_exploded['is_top_genre']==1, 1.2, 1.0)


  .apply(top_genres)


# **가중치 설정**

In [8]:
# 최종 가중치
ratings_exploded['final_weight'] = (
    ratings_exploded['weight_hr'] *
    ratings_exploded['weight_ca'] *
    ratings_exploded['weight_tg']
)

ratings_exploded['weighted_rating'] = ratings_exploded['rating'] * ratings_exploded['final_weight']

# 가중치 정규화
max_rating = ratings_exploded['weighted_rating'].max()
ratings_exploded['weighted_rating_norm'] = (
    ratings_exploded['weighted_rating'] / max_rating * 5
)


# **Random Search for 가중치**

In [9]:
!pip install scikit-surprise==1.1.3
!pip install numpy==1.26.4  # surprise는 numpy<2 필요
import pandas as pd
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy

# -----------------------------
# 1. 데이터 준비 (weighted_rating_norm 사용)
# -----------------------------
# surprise 형식: user, item, rating
svd_data = ratings_exploded[['userId', 'movieId', 'weighted_rating_norm']]

reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(svd_data, reader)

# -----------------------------
# 2. Train/Test Split
# -----------------------------
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# -----------------------------
# 3. SVD 모델 학습
# -----------------------------
model = SVD(n_factors=100, random_state=42)
model.fit(trainset)

# -----------------------------
# 4. 모델 평가
# -----------------------------
predictions = model.test(testset)
print("RMSE:", accuracy.rmse(predictions))
print("MAE:", accuracy.mae(predictions))

# -----------------------------
# 5. 추천 함수
# -----------------------------
def recommend_for_user_svd(user_id, top_n=10):
    # 이미 본 영화
    seen_movies = svd_data[svd_data['userId']==user_id]['movieId'].unique()

    # 전체 영화에 대한 예측
    all_movies = svd_data['movieId'].unique()
    preds = []
    for mid in all_movies:
        if mid not in seen_movies:
            preds.append((mid, model.predict(user_id, mid).est))

    # 정렬 후 top_n 추출
    preds = sorted(preds, key=lambda x: x[1], reverse=True)[:top_n]
    preds_df = pd.DataFrame(preds, columns=['movieId','predicted_rating'])

    return preds_df.merge(movies[['movieId','title']], on='movieId', how='left')

# -----------------------------
# 6. 추천 테스트
# -----------------------------
print(recommend_for_user_svd(130520, top_n=10))


Collecting scikit-surprise==1.1.3
  Using cached scikit-surprise-1.1.3.tar.gz (771 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp312-cp312-linux_x86_64.whl size=3513885 sha256=be083f061cf36b9bc6dc2c91bb297ca7c981cef1c5f04ec1872a173c34450117
  Stored in directory: /root/.cache/pip/wheels/ee/08/67/4176eedbed1c63c15db21a526f1893ca43ee8453182a239afc
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
  Attempting uninstall: scikit-surprise
    Found existing installation: scikit-surprise 1.1.4
    Uninstalling scikit-surprise-1.1.4:
      Successfully uninstalled scikit-surprise-1.1.4
Successfully installed scikit-surprise-1.1.3
RMSE: 0.4678
RMSE: 0.4677924111459172
MAE:  0.3492
MAE: 0.3491655402134502
   movieId  predicted_rating  \
0    82143          4.09

# **간단 알고리즘**

In [10]:
# -----------------------------
# 1. (userId, movieId) 단위로 집계
# -----------------------------
user_movie_ratings = (
    ratings_exploded
    .groupby(['userId','movieId'])['weighted_rating']
    .mean()   # 장르 explode된 영화들 평균
    .reset_index()
)

# -----------------------------
# 2. User-Item Matrix 생성
# -----------------------------
user_item_matrix = user_movie_ratings.pivot(
    index="userId", columns="movieId", values="weighted_rating"
).fillna(0)

# -----------------------------
# 3. KNN 모델 학습 (사용자 기반)
# -----------------------------
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

sparse_matrix = csr_matrix(user_item_matrix.values)

model_knn = NearestNeighbors(metric="cosine", algorithm="brute")
model_knn.fit(sparse_matrix)

# -----------------------------
# 4. 추천 함수 정의
# -----------------------------
def recommend_for_user(user_id, n_neighbors=5, top_n=10):
    if user_id not in user_item_matrix.index:
        raise ValueError(f"userId {user_id} not found in data")

    # 유저 인덱스 찾기
    user_index = user_item_matrix.index.get_loc(user_id)

    # 가까운 이웃 유저 탐색
    distances, indices = model_knn.kneighbors(
        sparse_matrix[user_index], n_neighbors=n_neighbors+1
    )

    # 자기 자신 제외
    neighbor_ids = [user_item_matrix.index[i] for i in indices.flatten() if i != user_index]

    # 이웃 유저들의 평점 가져오기
    neighbor_ratings = user_movie_ratings[user_movie_ratings['userId'].isin(neighbor_ids)]

    # 내가 이미 본 영화 제외
    seen_movies = user_movie_ratings[user_movie_ratings['userId']==user_id]['movieId'].unique()

    candidate_movies = (
        neighbor_ratings[~neighbor_ratings['movieId'].isin(seen_movies)]
        .groupby('movieId')['weighted_rating']
        .mean()
        .sort_values(ascending=False)
        .head(top_n)
        .reset_index()
    )

    # 영화 제목 붙이기
    candidate_movies = candidate_movies.merge(
        movies[['movieId','title']], on='movieId', how='left'
    )

    return candidate_movies


In [11]:
import random

# 현재 user_item_matrix에 존재하는 유저들
all_users = user_item_matrix.index.tolist()

# 랜덤으로 한 명 뽑기
random_user = random.choice(all_users)
print("추천 테스트 유저 ID:", random_user)

# 추천 실행
print(recommend_for_user(random_user, n_neighbors=5, top_n=10))


추천 테스트 유저 ID: 69476
   movieId  weighted_rating                                              title
0   171495             7.92                                             Cosmos
1   183869             7.92                                  Hereditary (2018)
2     6947             7.04  Master and Commander: The Far Side of the Worl...
3    55052             7.04                                   Atonement (2007)
4     1682             6.60                            Truman Show, The (1998)
5     1485             6.60                                   Liar Liar (1997)
6    55721             6.60                Elite Squad (Tropa de Elite) (2007)
7   140267             6.60                                   The Witch (2015)
8    94959             6.60                            Moonrise Kingdom (2012)
9     1266             6.60                                  Unforgiven (1992)
