# 장르 속성을 이용한 영화 컨텐츠 기반 필터링

In [33]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
pd.set_option('max_colwidth',100)
import numpy as np
from tqdm import tqdm
tqdm.pandas()

In [2]:
movies = pd.read_csv('/home/bmw/.cache/kagglehub/datasets/tmdb/tmdb-movie-metadata/versions/2/tmdb_5000_movies.csv')
print(movies.shape)
movies.head(1)

(4803, 20)


Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


In [3]:
movies_df = movies[['id','title','genres','vote_average','vote_count','popularity','keywords','overview']]

In [5]:
movies_df[['genres','keywords']][:1]

Unnamed: 0,genres,keywords
0,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""name"": ""Fantasy""}, {...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"": 2964, ""name"": ""future""}, {""id"": 3386, ""name"": ""sp..."


In [6]:
from ast import literal_eval
movies_df['genres'] = movies_df['genres'].apply(literal_eval)
movies_df['keywords'] = movies_df['keywords'].apply(literal_eval)

In [10]:
movies_df['genres'] = movies_df['genres'].apply(lambda x : [y['name'] for y in x])
movies_df['keywords'] = movies_df['keywords'].apply(lambda x : [y['name'] for y in x])

movies_df[['genres','keywords']][:1]

Unnamed: 0,genres,keywords
0,"[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colony, society, space travel, futuristic, romance, spa..."


## 장르 컨텐츠 유사도 측정
- 문자열로 변환된 genres 컬럼을 count 기반으로 피처 벡터화 변환
- genres 문자열을 피처 벡트화 행렬로 변환환 데이터 세트를 코사인 유사도를 통해 비교
- 데이터 세트의 레코드별로 타 레코드와 장르에서 코사인 유사도 값을 가지는 객체를 생성
- 장르 유사도가 높은 영화 중에 평점이 높은 순으로 영화를 추천

### 컬럼을 문자열로 변환 후 CountVectorizer 이용하여 피처 벡터 행렬 변환

In [37]:
from sklearn.feature_extraction.text import CountVectorizer
# CountVectorizer를 적용하기 위해 공백문자로 word 단위가 구분되는 문자열로 변환
movies_df['genres_literal'] = movies_df['genres'].apply(lambda x : (' ').join(x))
count_vect = CountVectorizer(min_df = 0.0 ,ngram_range=(1,2))
genre_mat = count_vect.fit_transform(movies_df['genres_literal'])
print(genre_mat.shape)

from sklearn.metrics.pairwise import cosine_similarity

genre_sim = cosine_similarity(genre_mat, genre_mat)
print(genre_sim.shape)
print(genre_sim[:1])

genre_sim_sorted_ind = genre_sim.argsort()[:,::-1]
print(genre_sim_sorted_ind[:1])

similar_movies = genre_sim_sorted_ind[0, :5]
print("첫 번째 영화와 유사한 영화들:", similar_movies)

def find_sim_movie(df, sorted_ind, title_name, top_n=10):
    # 인자로 입력된 movies_df DataFrame에서 'title' 컬럼이 입력된 title_name값인 DataFrame 추출
    title_movie = df[df['title'] == title_name]
    # title_named을 가진 DataFrame의 index 객체를 ndarray로 반환하고
    # sorted_ind 인자로 입력된 genre_sim_sorted_ind 객체에서 유사도 순으로 top_n개의 index 추출
    title_index = title_movie.index
    similar_indexes = sorted_ind[title_index, :(top_n)]

    # 추출된 top_n index 출력, top_n index는 2차원 데이터임
    # dataframe에서 index로 사용하기 위해서 1차원 array로 변경
    print(similar_indexes)
    similar_indexes = similar_indexes.reshape(-1)

    return df.iloc[similar_indexes]

movies_df[movies_df['title'] == 'The Godfather'].index

similar_movies = find_sim_movie(movies_df, genre_sim_sorted_ind, 'The Godfather', 10)

similar_movies[['title','vote_average']]

(4803, 276)
(4803, 4803)
[[1.         0.59628479 0.4472136  ... 0.         0.         0.        ]]
[[   0 3494  813 ... 3038 3037 2401]]
첫 번째 영화와 유사한 영화들: [   0 3494  813  870   46]
[[2731 1243 3636 1946 2640 4065 1847 4217  883 3866]]


Unnamed: 0,title,vote_average
2731,The Godfather: Part II,8.3
1243,Mean Streets,7.2
3636,Light Sleeper,5.7
1946,The Bad Lieutenant: Port of Call - New Orleans,6.0
2640,Things to Do in Denver When You're Dead,6.7
4065,Mi America,0.0
1847,GoodFellas,8.2
4217,Kids,6.8
883,Catch Me If You Can,7.7
3866,City of God,8.1


### 정리된 코드

In [None]:
import warnings#
warnings.filterwarnings('ignore')

# 데이터 분석에 필요한 라이브러리 임포트
import pandas as pd
pd.set_option('max_colwidth', 100)  # Pandas 출력 옵션 설정
import numpy as np

# 영화 데이터 로드 및 기본 확인
movies = pd.read_csv('/home/bmw/.cache/kagglehub/datasets/tmdb/tmdb-movie-metadata/versions/2/tmdb_5000_movies.csv')
print(movies.shape)  # 데이터의 행과 열 확인
movies.head(1)  # 데이터 첫 번째 행 확인

# 필요한 열만 선택하여 새로운 데이터프레임 생성
movies_df = movies[['id', 'title', 'genres', 'vote_average', 'vote_count', 'popularity', 'keywords', 'overview']]

# genres와 keywords 데이터를 리스트로 변환
from ast import literal_eval
movies_df['genres'] = movies_df['genres'].apply(literal_eval)
movies_df['keywords'] = movies_df['keywords'].apply(literal_eval)

# 리스트 내 딕셔너리에서 'name' 값만 추출
movies_df['genres'] = movies_df['genres'].apply(lambda x: [y['name'] for y in x])
movies_df['keywords'] = movies_df['keywords'].apply(lambda x: [y['name'] for y in x])

# genres 데이터를 공백으로 구분된 문자열로 변환
from sklearn.feature_extraction.text import CountVectorizer
movies_df['genres_literal'] = movies_df['genres'].apply(lambda x: (' ').join(x))

# CountVectorizer를 이용해 벡터화
count_vect = CountVectorizer(min_df=0.0, ngram_range=(1, 2))
genre_mat = count_vect.fit_transform(movies_df['genres_literal'])
print(genre_mat.shape)  # 벡터화된 데이터의 크기 확인

# 코사인 유사도 계산
from sklearn.metrics.pairwise import cosine_similarity
genre_sim = cosine_similarity(genre_mat, genre_mat)
print(genre_sim.shape)  # 유사도 행렬의 크기 확인
print(genre_sim[:1])  # 첫 번째 영화와 다른 영화 간 유사도 확인

# 유사도 행렬을 내림차순으로 정렬
genre_sim_sorted_ind = genre_sim.argsort()[:, ::-1]
print(genre_sim_sorted_ind[:1])  # 첫 번째 영화와 유사한 영화 인덱스 확인

# 첫 번째 영화와 유사한 상위 5개 영화의 인덱스 확인
similar_movies = genre_sim_sorted_ind[0, :5]
print("첫 번째 영화와 유사한 영화들:", similar_movies)

# 유사 영화를 검색하는 함수 정의
def find_sim_movie(df, sorted_ind, title_name, top_n=10):
    # 영화 제목으로 데이터프레임에서 해당 영화 검색
    title_movie = df[df['title'] == title_name]
    # 해당 영화의 인덱스를 가져오고, 유사도 행렬에서 상위 top_n 인덱스 추출
    title_index = title_movie.index
    similar_indexes = sorted_ind[title_index, :(top_n)]
    # 2차원 배열로 반환된 결과를 1차원으로 변환
    similar_indexes = similar_indexes.reshape(-1)
    # 유사한 영화 정보 반환
    return df.iloc[similar_indexes]

# 'The Godfather'와 유사한 상위 10개 영화 검색
similar_movies = find_sim_movie(movies_df, genre_sim_sorted_ind, 'The Godfather', 10)

# 유사 영화의 제목과 평점 출력
print(similar_movies[['title', 'vote_average']])

# Nearest Neighbor Collaborative-filtering
- 사용자 기반
- 아이템 기반

## Item Based
- 사용자 기반보다 추천 정확도가 더 뛰어남
- 사용자-영화 평점 행렬 데이터 세이트가 필요함

### Data import
- Grouplens/Movielens

In [9]:
ratings = pd.read_csv('./ml-latest-small/ratings.csv')
movies = pd.read_csv('./ml-latest-small/movies.csv')

In [10]:
# 필요한 평점 데이터만 취득
ratings = ratings[['userId','movieId','rating']]
# 평점 데이터에서는 영화 이름을 볼 수 없음
# ratings_matrix = ratings.pivot_table('rating',index='userId',columns='movieId')

# 영화정보 테이블과 병합
ratings_movies = pd.merge(ratings,movies, on = 'movieId')
# 사용자(row)-영화(col) 행렬(값은 평점으로)
ratings_matrix = ratings_movies.pivot_table('rating',index='userId',columns='title')
# 평점의 최소는  0.5이기 때문에 Nan값은 0으로 채기기
ratings_matrix.fillna(0,inplace=True)
ratings_matrix.head(3)

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### 지금 상태에서 코사인 유사도를 구하면 사용자간 유사도를 구하게 됨
- 행렬을 바꿔야함 -> "전치행렬" 이라고 부름
- pd.DataFrame.transpose() or pd.DataFrame.T

In [15]:
ratings_matrix_T = ratings_matrix.T

In [17]:
from sklearn.metrics.pairwise import cosine_similarity
item_sim = cosine_similarity(ratings_matrix_T,ratings_matrix_T)

# cosine_similarity()로 반환된 numpy 행렬에 영화명을 매핑하여 DataFrame으로 변환
item_sim_df = pd.DataFrame(data=item_sim, index=ratings_matrix.columns, columns=ratings_matrix.columns)
print(item_sim_df.shape)

display(item_sim_df.head(3))

(9719, 9719)


title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.141653,0.0,...,0.0,0.342055,0.543305,0.707107,0.0,0.0,0.139431,0.327327,0.0,0.0
'Hellboy': The Seeds of Creation (2004),0.0,1.0,0.707107,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Round Midnight (1986),0.0,0.707107,1.0,0.0,0.0,0.0,0.176777,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
item_sim_df["Godfather, The (1972)"].sort_values(ascending=False)[:6]

title
Godfather, The (1972)                        1.000000
Godfather: Part II, The (1974)               0.821773
Goodfellas (1990)                            0.664841
One Flew Over the Cuckoo's Nest (1975)       0.620536
Star Wars: Episode IV - A New Hope (1977)    0.595317
Fargo (1996)                                 0.588614
Name: Godfather, The (1972), dtype: float64

In [25]:
item_sim_df["Inception (2010)"].sort_values(ascending=False)[1:6]

title
Dark Knight, The (2008)          0.727263
Inglourious Basterds (2009)      0.646103
Shutter Island (2010)            0.617736
Dark Knight Rises, The (2012)    0.617504
Fight Club (1999)                0.615417
Name: Inception (2010), dtype: float64

#### 영화 기반으로 가장 유사도가 높은 영화를 골라줌

## 아이템 기반 최근접 이웃 협업 필터링으로 개인화된 영화 추천

$$ \hat R_{u,i} = \frac{\sum N(S_{i,N} \times R_{u,N})}{\sum N (|S_{i,N}|)}$$

- $ \hat R_{u,i} $ : 사용자 u, 아이템 i 의 개인화된 예측 평점 값
- $ S_{i,N} $ : 아이템 i와 가장 유사도가 높은 Top-N개 아이템의 유사도 벡터
- $ R_{u,N} $ : 사용자 u의 아이템 i와 가장 유사도가 높은 Top-N개 아이템에 대한 실제 평점 벡터
- $ N $ : 아이템의 최근접 이웃 범위 계수(item neighbor)를 의미함, 특정 아이템과 유사도가 가장 높은 Top-N개의 다른 아이템을 추출하는데 사용됨

### 영화간 유사도(item_sim_df)와 사용자-영화 평점(ratings_matrix)
- 사용자별로 최적화된 평점 스코어를 예측하는 함수 predict_rating()
- ratings_matrix와 item_sim_df를 numpy matrix로 변환하여 시행
- N의 범위에 제약을 두지 않는다면 사용자별 영화 예측 평점($ \hat R_{u,i} $)은 <br/> 사용자 u의 모든 영화에 대한 실제 평점과 영화 i의 다른 모든 영화와의
  코사인 유사도를 벡터 내적 곱(dot)한 값을 <br/> 정규화를 위해 $ \sum N (|S_{i,N}|) $로 나눈것을 의미함

In [26]:
def predict_rating(ratings_arr, item_sim_arr):
    ratings_pred = ratings_arr.dot(item_sim_arr)/ np.array([np.abs(item_sim_arr).sum(axis=1)])
    return ratings_pred

In [28]:
ratings_pred = predict_rating(ratings_matrix.values, item_sim_df.values)
ratings_pred_matrix = pd.DataFrame(data=ratings_pred, index=ratings_matrix.index, columns=ratings_matrix.columns)
ratings_pred_matrix.head(3)

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.070345,0.577855,0.321696,0.227055,0.206958,0.194615,0.249883,0.102542,0.157084,0.178197,...,0.113608,0.181738,0.133962,0.128574,0.006179,0.21207,0.192921,0.136024,0.292955,0.720347
2,0.01826,0.042744,0.018861,0.0,0.0,0.035995,0.013413,0.002314,0.032213,0.014863,...,0.01564,0.020855,0.020119,0.015745,0.049983,0.014876,0.021616,0.024528,0.017563,0.0
3,0.011884,0.030279,0.064437,0.003762,0.003749,0.002722,0.014625,0.002085,0.005666,0.006272,...,0.006923,0.011665,0.0118,0.012225,0.0,0.008194,0.007017,0.009229,0.01042,0.084501


- 예측평점이 사용자별 영화의 실제 평점과 영화의 코사인 유사도를 내적(dot)한 값이기 때문에 기존에 영화를 관람하지 않아 0에 해당했던 실제 영화 평점이 예측에서는 값이 부여되는 경우가 많이 발생한다.
- 예측 평점이 실제 평점에 비해 작을수도 있다. 이는 내적 결과를 코사인 유사도 벡터합으로 나누었기 때문에 생기는 현상

예측 결과가 실제 평점과 얼마나 차이가 있는지 확인 (get_mse())
- 예측 평가지표는 MSE를 적용
- MSE 측정 시 평점을 주지 않은 경우는 0으로 부과했으나 개인화된 예측 점수는 평점을 주지 않은 영화에 대해서도 아이템 유사도에 기반해 평점을 예측함
- 실제와 예측 평점의 차이는 기존에 평점이 부여된 데이터에 대해서만 오차 정도를 측정

In [30]:
from sklearn.metrics import mean_squared_error

def get_mse(pred,actual):
    # 평점이 있는 실제 영화만 추출
    pred = pred[actual.nonzero()].flatten()
    actual = actual[actual.nonzero()].flatten()
    return mean_squared_error(pred,actual)

print('아이템 기반 모든 최근접 이웃 MSE : ', get_mse(ratings_pred, ratings_matrix.values))

아이템 기반 모든 최근접 이웃 MSE :  9.895354759094706


- 앞선 predict_rating() 함수는 사용자별 영화의 예측 평점을 계산하기 위해 해당 영화와 다른 모든 영화간의 유사도 벡터를 적용한 것
- 많은 영화의 유사도 벡터를 이용하다 보니 상대적으로 평점 예측이 떨어짐
- 특정 영화와 가장 비슷한 유사도를 가지는 영화에 대해서만 유사도 벡터를 적용하는 함수로 변경

In [38]:
def predict_rating_topsim(ratings_arr,item_sim_arr,n=20):
    #사용자-아이템 평점 행렬 크기만큼 0으로 채욱 예측행렬 초기화
    pred = np.zeros(ratings_arr.shape)
    #사용자-아이템 평점 행렬의 열 크기 만큼 루프 수행
    for col in tqdm(range(ratings_arr.shape[1])):
        # 유사도 행렬에서 유사도가 큰 순으로 n개 데이터 행렬의 인덱스 반환
        top_n_items = [np.argsort(item_sim_arr[:,col])[:n-1:-1]]
        # 개인화된 예측 평점을 계산
        for row in range(ratings_arr.shape[0]):
            pred[row,col] = item_sim_arr[col, :][top_n_items].dot(ratings_arr[row,:][top_n_items].T)
            pred[row,col] /= np.sum(np.abs(item_sim_arr[col, :][top_n_items]))
    return pred

- 이전 함수와 달리 N 인자를 가지고 있어 TOP-N 유사도를 가지는 영화 유사도 벡터만 예측값을 계산하는데 적용함

In [37]:
ratings_pred = predict_rating_topsim(ratings_matrix.values, item_sim_df.values, n=20)
print('아이템 기반 최근접 TOP-20 이웃 MSE : ', get_mse(ratings_pred,ratings_matrix.values))

#계산된 예측 평점 데이터는 DataFrame으로 재생성
rating_pred_matrix = pd.DataFrame(data=ratings_pred, index=ratings_matrix.index, columns=ratings_matrix.columns )

100%|██████████| 9719/9719 [04:03<00:00, 39.90it/s]

아이템 기반 최근접 TOP-20 이웃 MSE :  9.895354759094706





In [48]:
ratings_matrix.loc[9,:]

title
'71 (2014)                                   0.0
'Hellboy': The Seeds of Creation (2004)      0.0
'Round Midnight (1986)                       0.0
'Salem's Lot (2004)                          0.0
'Til There Was You (1997)                    0.0
                                            ... 
eXistenZ (1999)                              0.0
xXx (2002)                                   1.0
xXx: State of the Union (2005)               0.0
¡Three Amigos! (1986)                        0.0
À nous la liberté (Freedom for Us) (1931)    0.0
Name: 9, Length: 9719, dtype: float64

In [55]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
from tqdm import tqdm

movies = pd.read_csv('./ml-latest-small/movies.csv')
ratings = pd.read_csv('./ml-latest-small/ratings.csv')

# Merging ratings and movies DataFrame
ratings_movies = pd.merge(ratings, movies, on="movieId", how="inner")

# Creating the user-item ratings matrix
ratings_matrix = ratings_movies.pivot_table('rating', index='userId', columns='title', fill_value=0)

# Transpose the matrix to calculate item-item similarity
ratings_matrix_T = ratings_matrix.T

# Calculate cosine similarity for items
item_sim = cosine_similarity(ratings_matrix_T, ratings_matrix_T)
item_sim_df = pd.DataFrame(data=item_sim, index=ratings_matrix.columns, columns=ratings_matrix.columns)

# Function to predict ratings using the entire similarity matrix
def predict_rating(ratings_arr, item_sim_arr):
    return ratings_arr.dot(item_sim_arr) / np.array([np.abs(item_sim_arr).sum(axis=1)])

# Predict ratings using all neighbors
ratings_pred = predict_rating(ratings_matrix.values, item_sim_df.values)

# Function to calculate mean squared error (MSE)
def get_mse(pred, actual):
    pred = pred[actual.nonzero()].flatten()
    actual = actual[actual.nonzero()].flatten()
    return mean_squared_error(pred, actual)

# Calculate MSE for all neighbors
mse_all_neighbors = get_mse(ratings_pred, ratings_matrix.values)

# Function to predict ratings using Top-N similar items
def predict_rating_topsim(ratings_arr, item_sim_arr, n=2):  # Using small n due to small dataset size
    pred = np.zeros(ratings_arr.shape)
    for col in tqdm(range(ratings_arr.shape[1])):
        # Get Top-N similar items for the current item
        top_n_items = [np.argsort(item_sim_arr[:, col])[:n-1:-1]]
        for row in range(ratings_arr.shape[0]):
            pred[row, col] = item_sim_arr[col, :][top_n_items].dot(ratings_arr[row, :][top_n_items].T)
            pred[row, col] /= np.sum(np.abs(item_sim_arr[col, :][top_n_items]))
    return pred

# Predict ratings using Top-2 similar items
ratings_pred_topsim = predict_rating_topsim(ratings_matrix.values, item_sim_df.values, n=20)

# Calculate MSE for Top-2 neighbors
mse_top_2_neighbors = get_mse(ratings_pred_topsim, ratings_matrix.values)

# Results
print("MSE using all neighbors:", mse_all_neighbors)
print("MSE using Top-2 neighbors:", mse_top_2_neighbors)


100%|██████████| 9719/9719 [04:01<00:00, 40.20it/s]

MSE using all neighbors: 9.895354759094706
MSE using Top-2 neighbors: 9.895354759094706





In [60]:
ratings_matrix.shape

(610, 9719)

In [61]:
item_sim_df[]

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),1.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.141653,0.000000,...,0.000000,0.342055,0.543305,0.707107,0.0,0.000000,0.139431,0.327327,0.000000,0.0
'Hellboy': The Seeds of Creation (2004),0.000000,1.000000,0.707107,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0
'Round Midnight (1986),0.000000,0.707107,1.000000,0.000000,0.000000,0.0,0.176777,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0
'Salem's Lot (2004),0.000000,0.000000,0.000000,1.000000,0.857493,0.0,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0
'Til There Was You (1997),0.000000,0.000000,0.000000,0.857493,1.000000,0.0,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
eXistenZ (1999),0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.211467,0.216295,0.097935,0.132489,...,0.000000,0.000000,0.000000,0.000000,0.0,1.000000,0.192259,0.000000,0.170341,0.0
xXx (2002),0.139431,0.000000,0.000000,0.000000,0.000000,0.0,0.089634,0.000000,0.276512,0.019862,...,0.069716,0.305535,0.173151,0.246482,0.0,0.192259,1.000000,0.270034,0.100396,0.0
xXx: State of the Union (2005),0.327327,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.156764,0.000000,...,0.000000,0.382543,0.177838,0.231455,0.0,0.000000,0.270034,1.000000,0.000000,0.0
¡Three Amigos! (1986),0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.372876,0.180009,0.169385,0.249586,...,0.180009,0.000000,0.000000,0.000000,0.0,0.170341,0.100396,0.000000,1.000000,0.0
