코드 참고 및 출처
https://velog.io/@hwanython/Movielens-Recommendation-%EB%AA%A8%EB%8D%B8-%ED%95%B4%EB%B3%B4%EA%B8%
https://github.com/pgvector/pgvector

### Library Import

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import gensim 
import psycopg2
import pgvector
from implicit.als import AlternatingLeastSquares
import os
from scipy.sparse import csr_matrix
from psycopg2.extras import execute_values
from pgvector.psycopg2 import register_vector
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings(action='ignore')

# implicit 라이브러리에서 권장하고 있는 부분
os.environ['OPENBLAS_NUM_THREADS']='1'
os.environ['KMP_DUPLICATE_LIB_OK']='True'
os.environ['MKL_NUM_THREADS']='1'

### Data Preparation

In [2]:
# 데이터 준비
path = './'
ratings = pd.read_csv(path + 'ratings.csv', low_memory=False, sep = '|')
movies = pd.read_csv(path + 'movies.csv', low_memory=False, sep = '|')

ratings = pd.merge(ratings, movies[['movieId', 'title']], how='left', on='movieId')
ratings = ratings[ratings['title'].notnull()].reset_index(drop=True)

# 태그 데이터 로드
tags = pd.read_csv('tags.csv', sep = '|')

# 영화별 태그를 하나의 문자열로 결합
tags_grouped = tags.groupby('movieId')['tag'].apply(lambda x: ' '.join(x)).reset_index()

### Vectorization

In [3]:
### als 기반 유저 벡터와 아이템 벡터 생성
# rating 3 이상만 사용
filtered_ratings = ratings[ratings['rating']>=3]
filtered_data_size = len(filtered_ratings)

print(f'orginal_data_size: {len(ratings)}, filtered_data_size: {filtered_data_size}')
print(f'Ratio of Remaining Data is {filtered_data_size / len(ratings):.2%}')

filtered_ratings.rename(columns={'rating':'count'}, inplace=True)
filtered_ratings.drop(columns = ['timestamp', 'title'], axis =1 , inplace = True)

movies_unique = movies['title'].unique()
title_to_idx = {k:v for k,v in enumerate(movies_unique)}

num_user = filtered_ratings['userId'].nunique()
num_movie = filtered_ratings['movieId'].nunique()
print(num_user)
num_movie

filtered_ratings=filtered_ratings.astype('int64')

csr_data = csr_matrix((filtered_ratings["count"], (filtered_ratings.userId, filtered_ratings.movieId)))

# als 모델은 input으로 (item X user 꼴의 matrix를 받기 때문에 Transpose.)
csr_data_transpose = csr_data.T
csr_data_transpose

# Implicit AlternatingLeastSquares 모델의 선언
als_model = AlternatingLeastSquares(factors=100, regularization=0.01, use_gpu=False, iterations=15, dtype=np.float32)

# 모델 훈련
als_model.fit(csr_data_transpose)

# 사용자-아이템 상호작용 행렬을 분해하여 숨겨진 요인(latent factors)을 학습 -> 사용자와 아이템 벡터의 내적을 통해 추천
# name_vector, movie_vector = als_model.user_factors[1], als_model.item_factors[610]
# np.dot(name_vector, movie_vector)


## 태그 벡터 생성
# TF-IDF 벡터라이저를 사용해 태그를 벡터화
vectorizer = TfidfVectorizer(stop_words='english')
tag_vectors = vectorizer.fit_transform(tags_grouped['tag'])

# 태그 벡터를 리스트로 변환
tag_vectors_list = tag_vectors.toarray().tolist()

orginal_data_size: 100836, filtered_data_size: 81763
Ratio of Remaining Data is 81.09%
609


  0%|          | 0/15 [00:00<?, ?it/s]

### DB INSERT

In [4]:
## DB INSERT
database = 
user = 
password = 
host = 
port = 
data_path = "./"

# Connect to PostgreSQL database in Timescale using connection string
conn = psycopg2.connect(
        database=database, user=user, password=password, host=host, port=port
    )
cur = conn.cursor()

# #install pgvector
cur.execute("CREATE EXTENSION IF NOT EXISTS vector");
conn.commit()


# Register the vector type with psycopg2
register_vector(conn)

# 영화 임베딩 테이블 생성
cur.execute("""
    CREATE TABLE IF NOT EXISTS movie_embeddings (
        movieId INTEGER UNIQUE NOT NULL,
        title VARCHAR,
        movieEmbedding VECTOR(100),
        FOREIGN KEY (movieId) REFERENCES movies(id)
    );
""")
conn.commit()

# 영화 임베딩 데이터 준비
movie_titles = {int(row['movieId']): row['title'] for _, row in movies.iterrows()}
data_list = [(movieid, movie_titles.get(int(movieid), "Unknown"), als_model.item_factors[movieid].tolist())
             for movieid in range(len(als_model.item_factors))]
data_list.pop(0)

# 데이터베이스에 배치 삽입
execute_values(cur, "INSERT INTO movie_embeddings (movieId, title, movieEmbedding) VALUES %s ON CONFLICT (movieId) DO NOTHING", data_list)
conn.commit()


# 사용자 임베딩 테이블 생성
cur.execute("""
    CREATE TABLE IF NOT EXISTS user_embeddings (
        userId INTEGER UNIQUE NOT NULL,
        userEmbedding VECTOR(100),
        FOREIGN KEY (userId) REFERENCES users(id)
    );
""")
conn.commit()

# 사용자 임베딩 데이터 준비
user_data_list = [(user_id, als_model.user_factors[user_id].tolist())
                  for user_id in range(len(als_model.user_factors))]
user_data_list.pop(0)

# 데이터베이스에 배치 삽입
execute_values(cur, "INSERT INTO user_embeddings (userId, userEmbedding) VALUES %s ON CONFLICT (userId) DO NOTHING", user_data_list)
conn.commit()

# 태그 임베딩 테이블 생성
cur.execute("""
    CREATE TABLE IF NOT EXISTS tag_embeddings (
        movieId INTEGER PRIMARY KEY,
        tagEmbedding VECTOR(%s),
        FOREIGN KEY (movieId) REFERENCES movies(id),
    );
""", (len(tag_vectors_list[0]),))
conn.commit()

# 태그 임베딩 데이터 준비
data_list = [(int(tags_grouped['movieId'][i]), tag_vectors_list[i]) for i in range(len(tag_vectors_list))]
data_list.pop(0)

# 데이터베이스에 배치 삽입
execute_values(cur, "INSERT INTO tag_embeddings (movieId, tagEmbedding) VALUES %s ON CONFLICT (movieId) DO NOTHING", data_list)
conn.commit()


# 연결 종료
cur.close()
conn.close()

### TEST CODE

In [6]:
# 사용자별로 선호할 만한 영화 추천
import psycopg2
import numpy as np
from scipy.spatial.distance import cosine

database = 
user = 
password = 
host = 
port = 
data_path = "./"

# Connect to PostgreSQL database in Timescale using connection string
conn = psycopg2.connect(
        database=database, user=user, password=password, host=host, port=port
    )
cur = conn.cursor()

# 사용자 임베딩 데이터 가져오기
def get_user_embedding(user_id):
    cur.execute("SELECT userEmbedding FROM user_embeddings WHERE userId = %s", [user_id])
    embedding = cur.fetchone()[0]
    return np.array(embedding)

# 영화 임베딩 데이터 가져오기
def get_movie_embeddings():
    cur.execute("SELECT movieId, title, movieEmbedding FROM movieEmbeddings")
    all_movies = cur.fetchall()
    movie_embeddings = {movieid: (title, np.array(movie_embedding)) for movieid, title, movie_embedding in all_movies}
    return movie_embeddings

# 유사한 영화 추천 함수
def recommend_movies(user_id, top_n=5):
    user_embedding = get_user_embedding(user_id)
    movie_embeddings = get_movie_embeddings()
    
    scores = {movieid: (title, np.dot(user_embedding, movie_embedding)) for movieid, (title, movie_embedding) in movie_embeddings.items()}
    recommended_movies = sorted(scores.items(), key=lambda x: x[1][1], reverse=True)[:top_n]
    
    return recommended_movies

# 예시: 사용자 ID가 1인 사용자를 위한 영화 추천
recommended_movies = recommend_movies(1)
for movie in recommended_movies:
    print(f"Movie ID: {movie[0]}, Title: {movie[1][0]}, Score: {movie[1][1]:.4f}")


# 연결 종료
cur.close()
conn.close()

Movie ID: 167, Title: Unknown, Score: 1.4864
Movie ID: 381, Title: When a Man Loves a Woman, Score: 1.4550
Movie ID: 290, Title: Once Were Warriors, Score: 1.2186
Movie ID: 219, Title: Cure, The, Score: 1.2170
Movie ID: 247, Title: Heavenly Creatures, Score: 1.1716


In [7]:
## 영화 아이디로 검색 시, 유사한 tag 기반으로 영화 추천
import psycopg2
import numpy as np
from scipy.spatial.distance import cosine

database = 
user = 
password = 
host = 
port = 
data_path = "./"

# Connect to PostgreSQL database in Timescale using connection string
conn = psycopg2.connect(
        database=database, user=user, password=password, host=host, port=port
    )
cur = conn.cursor()


# 영화 임베딩 데이터 가져오기
def get_tag_embedding(movieid):
    cur.execute("SELECT tagEmbedding FROM tag_embeddings WHERE movieId = %s", [movieid])
    result = cur.fetchone()
    if result:
        embedding = result[0]
        return np.array(embedding)
    return None

# 유사한 영화 추천 함수
def recommend_similar_movies(movieid, top_n=5):
    target_embedding = get_tag_embedding(movieid)
    if target_embedding is None:
        print(f"No embedding found for movieid: {movieid}")
        return []
    
    cur.execute("""
        SELECT te.movieId, m.title, te.tagEmbedding
        FROM tagEmbedding te
        JOIN movies m ON te.movieId = m.id
        WHERE te.movieId != %s
    """, [movieid])
    all_movies = cur.fetchall()
    
    similarities = []
    for movie in all_movies:
        other_movieid, title, embedding = movie
        if embedding is not None:
            embedding = np.array(embedding)
            similarity = 1 - cosine(target_embedding, embedding)
            similarities.append((other_movieid, title, similarity))
    
    # 유사도가 높은 순으로 정렬하여 상위 N개 반환
    similar_movies = sorted(similarities, key=lambda x: x[2], reverse=True)[:top_n]
    return similar_movies

# 예시: 영화 ID가 50인 영화와 유사한 영화 추천
similar_movies = recommend_similar_movies(50)
for movie in similar_movies:
    print(f"Movie ID: {movie[0]}, Title: {movie[1]}, Similarity: {movie[2]:.4f}")

# 연결 종료
cur.close()
conn.close()

Movie ID: 199, Title: Umbrellas of Cherbourg, The (Parapluies de Cherbourg, Les), Similarity: 1.0000
Movie ID: 3160, Title: Magnolia, Similarity: 1.0000
Movie ID: 6273, Title: In a Lonely Place, Similarity: 1.0000
Movie ID: 25886, Title: Random Harvest, Similarity: 1.0000
Movie ID: 1625, Title: Game, The, Similarity: 0.5010


In [13]:
## 태그 임베팅 벡터 기반 유사한 영화 검색
import psycopg2
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.spatial.distance import cosine

database = 
user = 
password = 
host = 
port = 
data_path = "./"

# Connect to PostgreSQL database in Timescale using connection string
conn = psycopg2.connect(
        database=database, user=user, password=password, host=host, port=port
    )
cur = conn.cursor()

# 영화 태그 임베딩 데이터 가져오기
def get_all_movie_embeddings():
    cur.execute("""
        SELECT te.movieId, m.title, te.tagEmbedding
        FROM tag_embeddings te
        JOIN movies m ON te.movieid = m.movieId
    """)
    return cur.fetchall()

# 태그 임베딩 벡터화 함수
def vectorize_tags(tag_list):
    vectorizer = TfidfVectorizer(stop_words='english')
    tag_vectors = vectorizer.fit_transform(tag_list)
    return tag_vectors, vectorizer

# 사용자 입력 태그를 벡터화하여 유사한 영화 추천 함수
def recommend_movies_by_tags(input_tags, top_n=5):
    all_movies = get_all_movie_embeddings()
    
    # 태그 임베딩 벡터와 제목을 분리
    movieids, titles, embeddings = zip(*[(movieid, title, " ".join(map(str, embedding))) for movieid, title, embedding in all_movies])
    
    # 태그 벡터화
    tag_texts = list(embeddings)
    _, vectorizer = vectorize_tags(tag_texts)
    input_vector = vectorizer.transform([input_tags]).toarray()[0]
    
    similarities = []
    for movieid, title, embedding in zip(movieids, titles, tag_texts):
        embedding_vector = vectorizer.transform([embedding]).toarray()[0]
        similarity = 1 - cosine(input_vector, embedding_vector)
        similarities.append((movieid, title, similarity))
    
    # 유사도가 높은 순으로 정렬하여 상위 N개 반환
    similar_movies = sorted(similarities, key=lambda x: x[2], reverse=True)[:top_n]
    return similar_movies

# 예시: 사용자가 입력한 태그로 유사한 영화 추천
input_tags = "action adventure"
similar_movies = recommend_movies_by_tags(input_tags)
for movie in similar_movies:
    print(f"Movie ID: {movie[0]}, Title: {movie[1]}, Similarity: {movie[2]:.4f}")

# 연결 종료
cur.close()
conn.close()

Movie ID: 2, Title: Jumanji, Similarity: 1.0000
Movie ID: 3, Title: Grumpier Old Men, Similarity: 1.0000
Movie ID: 5, Title: Father of the Bride Part II, Similarity: 1.0000
Movie ID: 7, Title: Sabrina, Similarity: 1.0000
Movie ID: 11, Title: American President, The, Similarity: 1.0000
