## Contents-based filtering
* TF-IDF + cosine similarity

In [1]:
import os
from tqdm import tqdm
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

import warnings
warnings.filterwarnings("ignore")

In [2]:
path = "./movielens/movielens100k/"
tags_df = pd.read_csv(os.path.join(path, 'tags.csv'), encoding='utf-8')
ratings_df = pd.read_csv(os.path.join(path, 'ratings.csv'), index_col = 'userId', encoding='utf-8')
movies_df = pd.read_csv(os.path.join(path, 'movies.csv'), index_col = 'movieId', encoding='utf-8')

In [3]:
total_count = len(movies_df.index)
total_genres = list(set([genre for sublist in list(map(lambda x: x.split('|'), movies_df['genres'])) for genre in sublist]))

In [4]:
total_genres

['Romance',
 'War',
 'Crime',
 'IMAX',
 'Film-Noir',
 'Musical',
 'Horror',
 'Comedy',
 'Drama',
 'Action',
 'Fantasy',
 '(no genres listed)',
 'Western',
 'Documentary',
 'Sci-Fi',
 'Adventure',
 'Mystery',
 'Animation',
 'Thriller',
 'Children']

In [5]:
genre_count = dict.fromkeys(total_genres)

for each_genre_list in movies_df['genres']:
    for genre in each_genre_list.split('|'):
        if genre_count[genre] == None:
            genre_count[genre] = 1
        else:
            genre_count[genre] = genre_count[genre]+1

In [6]:
# create genre representations
genre_representation = pd.DataFrame(columns=sorted(total_genres), index=movies_df.index)
for index, each_row in tqdm(movies_df.iterrows()):
    dict_temp = {i: genre_count[i] for i in each_row['genres'].split('|')}
    row_to_add = pd.DataFrame(dict_temp, index=[index])
    genre_representation.update(row_to_add)

genre_representation

9125it [00:07, 1213.11it/s]


Unnamed: 0_level_0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,,,1117.0,447.0,583.0,3315.0,,,,654.0,,,,,,,,,,
2,,,1117.0,,583.0,,,,,654.0,,,,,,,,,,
3,,,,,,3315.0,,,,,,,,,,1545.0,,,,
4,,,,,,3315.0,,,4365.0,,,,,,,1545.0,,,,
5,,,,,,3315.0,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
162672,,,1117.0,,,,,,4365.0,,,,,,,1545.0,,,,
163056,,1545.0,1117.0,,,,,,,654.0,,,,,,,792.0,,,
163949,,,,,,,,495.0,,,,,,,,,,,,
164977,,,,,,3315.0,,,,,,,,,,,,,,


In [7]:
# get unique tag
tag_column = list(map(lambda x: x.split(','), tags_df['tag']))
unique_tags = list(set(list(map(lambda x: x.strip(), list([tag for sublist in tag_column for tag in sublist])))))

print(unique_tags)

['dull story', 'sword fight', 'plot holes', 'not by book', 'holes80s', 'gritty', 'dance', 'franchise', 'childish plot', 'alex pettyfer', 'complicated', 'hugh grant', 'aviation', 'sufficiently explodey to be good', 'dark humor', 'artificial intelligence', 'interesting concept - bad execution', 'guy movie', 'Nudity (Topless)', 'Brad Pitt', 'cooking', 'not too thrilled', 'horror', 'ocean', 'billy bob thorton', 'violent', 'christmas', 'toplist16', 'Takashi Miike', 'long', 'photography', 'unique look', 'vengeance', 'joseph fiennes', 'toplist13', 'Spanked', 'tarantino', 'genius', 'science fiction', 'jodi foster', 'must see', 'travel', 'cheerleading', 'toplist15', 'intelligent', 'guy ritchie', 'funny', 'sci-fi', 'devotion', 'Guillermo del Toro', 'I loved it! Seen it five times already!', 'getdvd', 'sexist', 'denzel washington', 'books', 'predictable', 'freedom', 'based on a true story', 'Studio Ghibli', 'sentimental', 'covers a lifespan', 'Zooey Deschanel', 'road trip', 'renegade art', 'dvd',

In [8]:
# Compute IDF for tag
total_movie_count = len(set(tags_df['movieId']))
# key: tag, value: number of movies with such tag
tag_count_dict = dict.fromkeys(unique_tags)

for each_movie_tag_list in tags_df['tag']:
    for tag in each_movie_tag_list.split(","):
        if tag_count_dict[tag.strip()] == None:
            tag_count_dict[tag.strip()] = 1
        else:
            tag_count_dict[tag.strip()] += 1

tag_idf = dict()
for each_tag in tag_count_dict:
    tag_idf[each_tag] = np.log10(total_movie_count / tag_count_dict[each_tag])

tag_idf

{'dull story': 2.8382192219076257,
 'sword fight': 2.8382192219076257,
 'plot holes': 2.8382192219076257,
 'not by book': 2.8382192219076257,
 'holes80s': 1.6920911862293877,
 'gritty': 2.8382192219076257,
 'dance': 2.5371892262436444,
 'franchise': 2.8382192219076257,
 'childish plot': 2.8382192219076257,
 'alex pettyfer': 2.8382192219076257,
 'complicated': 2.8382192219076257,
 'hugh grant': 2.8382192219076257,
 'aviation': 2.8382192219076257,
 'sufficiently explodey to be good': 2.8382192219076257,
 'dark humor': 2.5371892262436444,
 'artificial intelligence': 2.8382192219076257,
 'interesting concept - bad execution': 2.8382192219076257,
 'guy movie': 2.8382192219076257,
 'Nudity (Topless)': 1.993121181893369,
 'Brad Pitt': 2.139249217571607,
 'cooking': 2.8382192219076257,
 'not too thrilled': 2.8382192219076257,
 'horror': 2.3610979671879635,
 'ocean': 2.5371892262436444,
 'billy bob thorton': 2.8382192219076257,
 'violent': 2.3610979671879635,
 'christmas': 2.8382192219076257,
 

In [9]:
# Create movie representations
tag_representation = pd.DataFrame(columns=sorted(unique_tags), index=list(set(tags_df['movieId'])))
for name, group in tqdm(tags_df.groupby(by='movieId')):
    temp_list = list(map(lambda x: x.split(','), list(group['tag'])))
    temp_tag_list = list(set(list(map(lambda x: x.strip(), list([tag for sublist in temp_list for tag in sublist])))))

    dict_temp = {i: tag_idf[i.strip()] for i in temp_tag_list}
    row_to_add = pd.DataFrame(dict_temp, index=[group['movieId'].values[0]])
    tag_representation.update(row_to_add)

tag_representation = tag_representation.sort_index()
tag_representation

100%|██████████| 689/689 [00:00<00:00, 2482.49it/s]


Unnamed: 0,1940's feel,80's classic,Abigail Breslin,Adam Sandler,Aging,Aging Disorder,Alex van Warmerdam,Alfred Hitchcock,Amanda Peet,Ang Lee,...,war,weird,well done,whimsical,who done it,will farell,witty,witty!,wrongful imprisonment,zombies
1,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
47,,,,,,,,,,,...,,,,,,,,,,
50,,,,,,,,,,,...,,,,,,,,,,
104,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
161582,,,,,,,,,,,...,,,,,,,,,,
163056,,,,,,,,,,,...,,,,,,,,,,
163949,,,,,,,,,,,...,,,,,,,,,,
164977,,,,,,,,,,,...,,,,,,,,,,


In [10]:
movie_representation = pd.concat([genre_representation, tag_representation], axis=1).fillna(0)
print(movie_representation.shape)
print(movie_representation.describe())

(9125, 606)
       (no genres listed)       Action    Adventure    Animation     Children   
count         9125.000000  9125.000000  9125.000000  9125.000000  9125.000000  \
mean             0.035507   261.591781   136.733041    21.896877    37.248110   
std              0.798707   579.453051   366.127809    96.485496   142.584902   
min              0.000000     0.000000     0.000000     0.000000     0.000000   
25%              0.000000     0.000000     0.000000     0.000000     0.000000   
50%              0.000000     0.000000     0.000000     0.000000     0.000000   
75%              0.000000     0.000000     0.000000     0.000000     0.000000   
max             18.000000  1545.000000  1117.000000   447.000000   583.000000   

            Comedy        Crime  Documentary        Drama      Fantasy  ...   
count  9125.000000  9125.000000  9125.000000  9125.000000  9125.000000  ...  \
mean   1204.298630   132.602740    26.852055  2088.024658    46.872986  ...   
std    1594.425716   

In [11]:
from sklearn.metrics.pairwise import cosine_similarity

def cos_sim_matrix(a, b):
    cos_sim = cosine_similarity(a, b)
    result_df = pd.DataFrame(data=cos_sim, index=[a.index])

    return result_df

In [12]:
cs_df = cos_sim_matrix(movie_representation, movie_representation)
cs_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9115,9116,9117,9118,9119,9120,9121,9122,9123,9124
1,1.0,0.390671,0.826878,0.531054,0.912274,0.0,0.826878,0.346744,0.0,0.133408,...,0.0,0.153079,0.0,0.0,0.0,0.072086,0.212907,0.0,0.912274,0.0
2,0.390671,1.0,0.0,0.0,0.0,0.0,0.0,0.887561,0.0,0.341485,...,0.0,0.391837,0.0,0.0,0.0,0.184518,0.544977,0.0,0.0,0.0
3,0.826878,0.0,1.0,0.64224,0.906392,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.281476,0.137023,0.0,0.0,0.906393,0.0
4,0.531054,0.0,0.64224,1.0,0.582121,0.0,0.64224,0.0,0.0,0.0,...,0.700517,0.0,0.766504,0.766504,0.180775,0.790428,0.0,0.0,0.582121,0.0
5,0.912274,0.0,0.906392,0.582121,1.0,0.0,0.906392,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [13]:
print(movies_df.loc[1])
print(movies_df.loc[46972])
print(movies_df.loc[2043])
print(movies_df.loc[2399])

title                                Toy Story (1995)
genres    Adventure|Animation|Children|Comedy|Fantasy
Name: 1, dtype: object
title     Night at the Museum (2006)
genres    Action|Comedy|Fantasy|IMAX
Name: 46972, dtype: object
title     Darby O'Gill and the Little People (1959)
genres                   Adventure|Children|Fantasy
Name: 2043, dtype: object
title     Santa Claus: The Movie (1985)
genres       Adventure|Children|Fantasy
Name: 2399, dtype: object


In [14]:
# split train and test set
train_df, test_df = train_test_split(ratings_df.reset_index(), test_size=.2, random_state=123)
print(train_df.shape)
print(test_df.shape)

(80003, 4)
(20001, 4)


In [15]:
test_userids = list(set(test_df.userId.values))

In [16]:
result_df = pd.DataFrame()

for user_id in tqdm(test_userids):
    user_record_df = train_df.loc[train_df.userId == int(user_id), :]
    
    user_sim_df = cs_df.loc[user_record_df['movieId']]  
    user_rating_df = user_record_df[['rating']]  
    sim_sum = np.sum(user_sim_df.T.to_numpy(), -1) 
    # print("user_id=", user_id, user_record_df.shape, user_sim_df.T.shape, user_rating_df.shape, sim_sum.shape)

    prediction = np.matmul(user_sim_df.T.to_numpy(), user_rating_df.to_numpy()).flatten() / (sim_sum+1) 

    prediction_df = pd.DataFrame(prediction, index=cs_df.index).reset_index()
    prediction_df.columns = ['movieId', 'pred_rating']    
    prediction_df = prediction_df[['movieId', 'pred_rating']][prediction_df.movieId.isin(test_df[test_df.userId == user_id]['movieId'].values)]

    temp_df = prediction_df.merge(test_df[test_df.userId == user_id], on='movieId')
    result_df = pd.concat([result_df, temp_df], axis=0)



100%|██████████| 671/671 [00:03<00:00, 202.18it/s]


In [17]:
result_df.head()

Unnamed: 0,movieId,pred_rating,userId,rating,timestamp
0,1061,1.870937,1,3.0,1260759182
1,1339,1.675645,1,3.5,1260759125
2,2105,1.802382,1,4.0,1260759139
0,47,2.914872,2,4.0,835355552
1,50,2.96899,2,4.0,835355586


In [18]:
mse = mean_squared_error(y_true=result_df['rating'].values, y_pred=result_df['pred_rating'].values)
rmse = np.sqrt(mse)

print(mse, rmse)

0.9776362023305245 0.9887548747442535
