In [1]:
import os
import torch
import pickle
import numpy as np
import pandas as pd
from datetime import datetime
from surprise import Reader, Dataset, SVD
from surprise.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer

print("MPS available: ", torch.backends.mps.is_available())

  from .autonotebook import tqdm as notebook_tqdm


MPS available:  True


In [2]:
ratings = pd.read_json('ratings.jsonl', lines=True).drop(columns=['Timestamp'])
ratings.rename(columns={col: col[0].lower() + col[1:] for col in ratings.columns}, inplace=True)
content = pd.read_json('content.jsonl', lines=True)
content.rename(columns={col: col[0].lower() + col[1:] for col in content.columns}, inplace=True)
targets = pd.read_csv('targets.csv')
targets.rename(columns={col: col[0].lower() + col[1:] for col in targets.columns}, inplace=True)

print("Ratings columns: ", ratings.columns)
print("Ratings size: ", len(ratings))
print("Content columns: ", content.columns)
print("Content size: ", len(content))

Ratings columns:  Index(['userId', 'itemId', 'rating'], dtype='object')
Ratings size:  659720
Content columns:  Index(['itemId', 'title', 'year', 'rated', 'released', 'runtime', 'genre',
       'director', 'writer', 'actors', 'plot', 'language', 'country', 'awards',
       'poster', 'ratings', 'metascore', 'imdbRating', 'imdbVotes', 'type',
       'dVD', 'boxOffice', 'production', 'website', 'response', 'totalSeasons',
       'season', 'episode', 'seriesID'],
      dtype='object')
Content size:  38012


In [3]:
content =  content.drop(columns=[
    'year', 
    'rated', 
    'released', 
    'runtime', 
    'language', 
    'country', 
    'poster', 
    'ratings', 
    'metascore',
    'type', 
    'dVD', 
    'boxOffice', 
    'production', 
    'website', 
    'response', 
    'totalSeasons', 
    'season', 
    'episode', 
    'seriesID',
    'imdbVotes'
])
content

Unnamed: 0,itemId,title,genre,director,writer,actors,plot,awards,imdbRating
0,c9f0f895fb,Edison Kinetoscopic Record of a Sneeze,"Documentary, Short",William K.L. Dickson,,Fred Ott,A man (Edison's assistant) takes a pinch of sn...,1 win,5.5
1,d3d9446802,Leaving the Factory,"Documentary, Short",Louis Lumière,,,A man opens the big gates to the Lumière facto...,,6.9
2,c20ad4d76f,The Arrival of a Train,"Documentary, Short","Auguste Lumière, Louis Lumière",,"Madeleine Koehler, Marcel Koehler, Mrs. August...",A group of people are standing in a straight l...,,7.5
3,8e296a067a,The Oxford and Cambridge University Boat Race,"Short, News, Sport",Birt Acres,,,Although the content of this film is primitive...,,4.2
4,54229abfcf,The House of the Devil,"Short, Horror",Georges Méliès,Georges Méliès,"Jehanne d'Alcy, Jules-Eugène Legris, Georges M...",A bat flies into an ancient castle and transfo...,,6.7
...,...,...,...,...,...,...,...,...,...
38007,6c0ffc79d0,Yara,"Crime, Drama, Thriller",Marco Tullio Giordana,"Graziano Diana, Giacomo Martelli","Isabella Ragonese, Alessio Boni, Thomas Trabacchi",A determined prosecutor becomes consumed with ...,,6.2
38008,e02f371f8c,Lords of Scam,"Documentary, Crime",Guillaume Nicloux,Olivier Bouchara,,This documentary traces the rise and crash of ...,,6.3
38009,8c2a2a22b8,Cash,"Comedy, Drama",Rishab Seth,"Vishesh Bhatt, Rishab Seth, Aarsh Vora","Amol Parashar, Smiriti Kalra, Gulshan Grover",The government announces demonetization. The s...,,7.2
38010,ae74ba6bb7,Sompoy,"Comedy, Romance","Anawat Phromchae, Aroonakorn Pick","Anawat Phromchae, Aroonakorn Pick","Pijakkana Wongsarattanasin, Tanapol Jarujittra...","A love triangle story of a young woman named ""...",,


In [4]:
content['imdbRating'] = pd.to_numeric(content['imdbRating'], errors='coerce')
print("imdbRating NaN count: ", content['imdbRating'].isna().sum())
content['imdbRating'] = content['imdbRating'].fillna(content['imdbRating'].mean())

content['title'] = content['title'].replace('N/A', None)
content['genre'] = content['genre'].replace('N/A', None)
content['director'] = content['director'].replace('N/A', None)
content['writer'] = content['writer'].replace('N/A', None)
content['actors'] = content['actors'].replace('N/A', None)
content['plot'] = content['plot'].replace('N/A', None)
content['awards'] = content['awards'].replace('N/A', None)
content

imdbRating NaN count:  349


Unnamed: 0,itemId,title,genre,director,writer,actors,plot,awards,imdbRating
0,c9f0f895fb,Edison Kinetoscopic Record of a Sneeze,"Documentary, Short",William K.L. Dickson,,Fred Ott,A man (Edison's assistant) takes a pinch of sn...,1 win,5.500000
1,d3d9446802,Leaving the Factory,"Documentary, Short",Louis Lumière,,,A man opens the big gates to the Lumière facto...,,6.900000
2,c20ad4d76f,The Arrival of a Train,"Documentary, Short","Auguste Lumière, Louis Lumière",,"Madeleine Koehler, Marcel Koehler, Mrs. August...",A group of people are standing in a straight l...,,7.500000
3,8e296a067a,The Oxford and Cambridge University Boat Race,"Short, News, Sport",Birt Acres,,,Although the content of this film is primitive...,,4.200000
4,54229abfcf,The House of the Devil,"Short, Horror",Georges Méliès,Georges Méliès,"Jehanne d'Alcy, Jules-Eugène Legris, Georges M...",A bat flies into an ancient castle and transfo...,,6.700000
...,...,...,...,...,...,...,...,...,...
38007,6c0ffc79d0,Yara,"Crime, Drama, Thriller",Marco Tullio Giordana,"Graziano Diana, Giacomo Martelli","Isabella Ragonese, Alessio Boni, Thomas Trabacchi",A determined prosecutor becomes consumed with ...,,6.200000
38008,e02f371f8c,Lords of Scam,"Documentary, Crime",Guillaume Nicloux,Olivier Bouchara,,This documentary traces the rise and crash of ...,,6.300000
38009,8c2a2a22b8,Cash,"Comedy, Drama",Rishab Seth,"Vishesh Bhatt, Rishab Seth, Aarsh Vora","Amol Parashar, Smiriti Kalra, Gulshan Grover",The government announces demonetization. The s...,,7.200000
38010,ae74ba6bb7,Sompoy,"Comedy, Romance","Anawat Phromchae, Aroonakorn Pick","Anawat Phromchae, Aroonakorn Pick","Pijakkana Wongsarattanasin, Tanapol Jarujittra...","A love triangle story of a young woman named ""...",,6.317643


In [5]:
unique_users = ratings['userId'].unique()
unique_items = content['itemId'].unique()
print("Unique users: ", len(unique_users), "Unique items: ", len(unique_items))

user_indexes = {user: i for i, user in enumerate(unique_users)}
item_indexes = {item: i for i, item in enumerate(unique_items)}

reverse_user_indexes = {user: i for i, user in user_indexes.items()}
reverse_item_indexes = {item: i for i, item in item_indexes.items()}

Unique users:  51671 Unique items:  38012


In [6]:
ratings['userId'] = ratings['userId'].apply(lambda x: user_indexes[x])
ratings['itemId'] = ratings['itemId'].apply(lambda x: item_indexes[x])
ratings

Unnamed: 0,userId,itemId,rating
0,0,9230,8
1,1,15690,9
2,1,18435,8
3,1,19072,1
4,2,15860,8
...,...,...,...
659715,51667,31634,7
659716,51667,32134,7
659717,51668,26184,4
659718,51669,18189,1


In [7]:
content['itemId'] = content['itemId'].apply(lambda x: item_indexes[x])
content

Unnamed: 0,itemId,title,genre,director,writer,actors,plot,awards,imdbRating
0,0,Edison Kinetoscopic Record of a Sneeze,"Documentary, Short",William K.L. Dickson,,Fred Ott,A man (Edison's assistant) takes a pinch of sn...,1 win,5.500000
1,1,Leaving the Factory,"Documentary, Short",Louis Lumière,,,A man opens the big gates to the Lumière facto...,,6.900000
2,2,The Arrival of a Train,"Documentary, Short","Auguste Lumière, Louis Lumière",,"Madeleine Koehler, Marcel Koehler, Mrs. August...",A group of people are standing in a straight l...,,7.500000
3,3,The Oxford and Cambridge University Boat Race,"Short, News, Sport",Birt Acres,,,Although the content of this film is primitive...,,4.200000
4,4,The House of the Devil,"Short, Horror",Georges Méliès,Georges Méliès,"Jehanne d'Alcy, Jules-Eugène Legris, Georges M...",A bat flies into an ancient castle and transfo...,,6.700000
...,...,...,...,...,...,...,...,...,...
38007,38007,Yara,"Crime, Drama, Thriller",Marco Tullio Giordana,"Graziano Diana, Giacomo Martelli","Isabella Ragonese, Alessio Boni, Thomas Trabacchi",A determined prosecutor becomes consumed with ...,,6.200000
38008,38008,Lords of Scam,"Documentary, Crime",Guillaume Nicloux,Olivier Bouchara,,This documentary traces the rise and crash of ...,,6.300000
38009,38009,Cash,"Comedy, Drama",Rishab Seth,"Vishesh Bhatt, Rishab Seth, Aarsh Vora","Amol Parashar, Smiriti Kalra, Gulshan Grover",The government announces demonetization. The s...,,7.200000
38010,38010,Sompoy,"Comedy, Romance","Anawat Phromchae, Aroonakorn Pick","Anawat Phromchae, Aroonakorn Pick","Pijakkana Wongsarattanasin, Tanapol Jarujittra...","A love triangle story of a young woman named ""...",,6.317643


In [8]:
targets['userId'] = targets['userId'].apply(lambda x: user_indexes[x])
targets['itemId'] = targets['itemId'].apply(lambda x: item_indexes[x])
targets = targets.astype({'userId': 'int32', 'itemId': 'int32'})
targets

Unnamed: 0,userId,itemId
0,31471,21530
1,31471,5449
2,31471,24104
3,31471,26480
4,31471,32061
...,...,...
616195,29367,35855
616196,29367,17530
616197,29367,1540
616198,29367,19908


In [9]:
def tfidf_cobination(row):
    template = ""

    if pd.notna(row['title']):
        template += f'{row["title"]} '
    if pd.notna(row['genre']):
        template += f'{row["genre"]} '
    if pd.notna(row['director']):
        template += f'{row["director"]} '
    if pd.notna(row['writer']):
        template += f'{row["writer"]} '
    if pd.notna(row['actors']):
        template += f'{row["actors"]} '
    if pd.notna(row['plot']):
        template += f'{row["plot"]} '
    if pd.notna(row['awards']):
        template += f'{row["awards"]} '
    
    return template

def embedding_combination(row):
    template = ''

    if pd.notna(row['title']):
        template += f'Title: {row["title"]}\n'
    if pd.notna(row['genre']):
        template += f'Genre: {row["genre"]}\n'
    if pd.notna(row['director']):
        template += f'Director: {row["director"]}\n'
    if pd.notna(row['writer']):
        template += f'Writer: {row["writer"]}\n'
    if pd.notna(row['actors']):
        template += f'Actors: {row["actors"]}\n'
    if pd.notna(row['plot']):
        template += f'Plot: {row["plot"]}\n'
    if pd.notna(row['awards']):
        template += f'Awards: {row["awards"]}\n'

    return template

content['tfidf_combination'] = content.apply(tfidf_cobination, axis=1)
content['embedding_combination'] = content.apply(embedding_combination, axis=1)
content = content[['itemId', 'tfidf_combination', 'embedding_combination', 'imdbRating']]
content

Unnamed: 0,itemId,tfidf_combination,embedding_combination,imdbRating
0,0,Edison Kinetoscopic Record of a Sneeze Documen...,Title: Edison Kinetoscopic Record of a Sneeze\...,5.500000
1,1,"Leaving the Factory Documentary, Short Louis L...",Title: Leaving the Factory\nGenre: Documentary...,6.900000
2,2,"The Arrival of a Train Documentary, Short Augu...",Title: The Arrival of a Train\nGenre: Document...,7.500000
3,3,The Oxford and Cambridge University Boat Race ...,Title: The Oxford and Cambridge University Boa...,4.200000
4,4,"The House of the Devil Short, Horror Georges M...","Title: The House of the Devil\nGenre: Short, H...",6.700000
...,...,...,...,...
38007,38007,"Yara Crime, Drama, Thriller Marco Tullio Giord...","Title: Yara\nGenre: Crime, Drama, Thriller\nDi...",6.200000
38008,38008,"Lords of Scam Documentary, Crime Guillaume Nic...","Title: Lords of Scam\nGenre: Documentary, Crim...",6.300000
38009,38009,"Cash Comedy, Drama Rishab Seth Vishesh Bhatt, ...","Title: Cash\nGenre: Comedy, Drama\nDirector: R...",7.200000
38010,38010,"Sompoy Comedy, Romance Anawat Phromchae, Aroon...","Title: Sompoy\nGenre: Comedy, Romance\nDirecto...",6.317643


In [10]:
# tfidf_matrix = None
# if os.path.exists('tfidf_matrix.pkl'):
#     with open('tfidf_matrix.pkl', 'rb') as f:
#         tfidf_matrix = pickle.load(f)
# else:
#     tfidf_vectorizer = TfidfVectorizer(stop_words='english')
#     tfidf_matrix = tfidf_vectorizer.fit_transform(content['tfidf_combination'])
#     with open('tfidf_matrix.pkl', 'wb') as f:
#         pickle.dump(tfidf_matrix, f)

# print(tfidf_matrix.shape)

In [11]:
embedding_matrix = None
if os.path.exists('embedding_matrix.pkl'):
    with open('embedding_matrix.pkl', 'rb') as f:
        embedding_matrix = pickle.load(f)
else:
    bert_vectorizer = SentenceTransformer('paraphrase-MiniLM-L3-v2')
    embedding_matrix = bert_vectorizer.encode(content['embedding_combination'], device='mps', show_progress_bar=True)
    with open('embedding_matrix.pkl', 'wb') as f:
        pickle.dump(embedding_matrix, f)

print(embedding_matrix.shape)

(38012, 384)


In [12]:
class RochioRecommender():
    def __init__(self, ratings, content, embedding_matrix):
        self.ratings = ratings
        self.content = content
        self.item_embeddings = embedding_matrix
        self.user_embeddings = None

    def fit(self):
        self.user_embeddings = np.zeros((len(unique_users), self.item_embeddings.shape[1]))
        for user_id in np.arange(len(unique_users)):
            user_ratings = self.ratings[self.ratings['userId'] == user_id]
            user_items = user_ratings['itemId'].values
            user_ratings = user_ratings['rating'].values
            user_embedding = np.zeros(self.item_embeddings.shape[1])
            for i, item_id in enumerate(user_items):
                user_embedding += self.item_embeddings[item_id] * user_ratings[i]
            user_embedding /= np.linalg.norm(user_embedding)
            self.user_embeddings[user_id] = user_embedding

    def predict(self, user_id, item_id):
        return 10 * cosine_similarity([self.user_embeddings[user_id]], [self.item_embeddings[item_id]])[0][0]

In [13]:
recommender = RochioRecommender(ratings, content, embedding_matrix)
recommender.fit()

targets['Score'] = targets.apply(lambda row: recommender.predict(row['userId'], row['itemId']), axis=1)

targets

  user_embedding /= np.linalg.norm(user_embedding)


Unnamed: 0,userId,itemId,Score
0,31471,21530,4.928200
1,31471,5449,6.162136
2,31471,24104,5.623519
3,31471,26480,5.024976
4,31471,32061,5.070276
...,...,...,...
616195,29367,35855,4.789409
616196,29367,17530,5.000597
616197,29367,1540,5.966721
616198,29367,19908,4.404861


In [15]:
submission = targets.rename(columns={'userId': 'UserId', 'itemId': 'ItemId'})

submission['UserId'] = submission['UserId'].apply(lambda x: reverse_user_indexes[x])
submission['ItemId'] = submission['ItemId'].apply(lambda x: reverse_item_indexes[x])

submission.sort_values(by=['UserId', 'Score'], ascending=[True, False])[['UserId', 'ItemId']].to_csv(f'./submissions/rocchio-submission_{datetime.now().strftime("%Y%m%d_%H%M%S")}.csv', index=False)