In [1]:
from datetime import datetime
from pathlib import Path

import pandas as pd
from lightfm import LightFM
from scipy.sparse import coo_matrix, hstack
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler, StandardScaler

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def _lower_first(name):
    """Return ``name`` with only its first character lower-cased (e.g. 'DVD' -> 'dVD')."""
    return name[0].lower() + name[1:]


# Load the three inputs and normalise every column name with _lower_first.
# The ratings 'Timestamp' column is dropped right after loading.
ratings = (
    pd.read_json('ratings.jsonl', lines=True)
    .drop(columns=['Timestamp'])
    .rename(columns=_lower_first)
)
content = pd.read_json('content.jsonl', lines=True).rename(columns=_lower_first)
targets = pd.read_csv('targets.csv').rename(columns=_lower_first)

# Quick sanity report on what was loaded.
print("Ratings columns: ", ratings.columns)
print("Ratings size: ", len(ratings))
print("Content columns: ", content.columns)
print("Content size: ", len(content))

Ratings columns:  Index(['userId', 'itemId', 'rating'], dtype='object')
Ratings size:  659720
Content columns:  Index(['itemId', 'title', 'year', 'rated', 'released', 'runtime', 'genre',
       'director', 'writer', 'actors', 'plot', 'language', 'country', 'awards',
       'poster', 'ratings', 'metascore', 'imdbRating', 'imdbVotes', 'type',
       'dVD', 'boxOffice', 'production', 'website', 'response', 'totalSeasons',
       'season', 'episode', 'seriesID'],
      dtype='object')
Content size:  38012


In [3]:
# Distinct raw ids: users are taken from the ratings, items from the content table.
unique_users = ratings['userId'].unique()
unique_items = content['itemId'].unique()

n_usr, n_itm = len(unique_users), len(unique_items)

# Position in the unique array is the dense index. Build index -> raw id first,
# then invert it to get raw id -> index.
user_reverse_indexes = dict(enumerate(unique_users))
item_reverse_indexes = dict(enumerate(unique_items))
user_indexes = {user: idx for idx, user in user_reverse_indexes.items()}
item_indexes = {item: idx for idx, item in item_reverse_indexes.items()}

# Overwrite the raw ids with dense indexes in every frame.
# A raw id missing from the lookup raises KeyError.
for frame in (ratings, targets):
    frame['userId'] = frame['userId'].map(lambda raw_id: user_indexes[raw_id])
for frame in (ratings, content, targets):
    frame['itemId'] = frame['itemId'].map(lambda raw_id: item_indexes[raw_id])

In [4]:
# Sparse user x item matrix whose stored values are the ratings.
# The shape is passed explicitly: without it, coo_matrix infers the dimensions
# from the largest row/column index present in `ratings`, so the item axis
# could end up shorter than the number of items indexed from `content`.
interaction_matrix = coo_matrix(
    (ratings['rating'], (ratings['userId'], ratings['itemId'])),
    shape=(n_usr, n_itm),
)

In [5]:
# Turn the literal placeholder 'N/A' into a real missing value in every text column.
# A boolean mask is used instead of `.replace('N/A', None)`: depending on the pandas
# version, passing value=None to replace() can be interpreted as "no value given" and
# forward-fill the previous row's text into the gap instead of blanking it.
TEXT_COLUMNS = ['title', 'genre', 'director', 'writer', 'actors', 'plot', 'awards']
content[TEXT_COLUMNS] = content[TEXT_COLUMNS].mask(content[TEXT_COLUMNS].eq('N/A'))

def combination(row):
    """Build one space-separated text document from an item's descriptive fields.

    The fields are visited in a fixed order (title, genre, director, writer,
    actors, plot, awards). Every value that ``pd.notna`` accepts is appended
    followed by a single space, so a non-empty result always ends with a
    trailing space; missing fields are skipped.

    Parameters
    ----------
    row : mapping-like
        Any object indexable by the field names above (for example a
        DataFrame row passed by ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    str
        The concatenated text, or ``""`` when every field is missing.
    """
    fields = ('title', 'genre', 'director', 'writer', 'actors', 'plot', 'awards')
    return ''.join(f'{row[field]} ' for field in fields if pd.notna(row[field]))

# One text document per content row, built by combination().
item_documents = content.apply(combination, axis=1)

# TF-IDF representation of those documents, capped at 1000 vocabulary terms.
vectorizer = TfidfVectorizer(max_features=1000)
text_features = vectorizer.fit_transform(item_documents)

In [6]:
# imdbRating: coerce to numbers (unparseable values become NaN), then fill the
# gaps with the column median.
content['imdbRating'] = pd.to_numeric(content['imdbRating'], errors='coerce')
content['imdbRating'] = content['imdbRating'].fillna(content['imdbRating'].median())

# imdbVotes: strip thousands separators before parsing. A value such as "1,234"
# cannot be parsed by pd.to_numeric, so it would be coerced to NaN and then
# zeroed by the fillna below, discarding every count of 1000 or more.
# NOTE(review): assumes vote counts are comma-formatted strings, as in OMDb
# exports — confirm against the raw data. Values without commas are unaffected.
votes_text = content['imdbVotes'].astype(str).str.replace(',', '', regex=False)
content['imdbVotes'] = pd.to_numeric(votes_text, errors='coerce').fillna(0)

# Rescale both columns with MinMaxScaler and write the result back into
# `content`, then expose them as a sparse matrix.
scaler = MinMaxScaler()
content[['imdbRating', 'imdbVotes']] = scaler.fit_transform(content[['imdbRating', 'imdbVotes']])
num_features = coo_matrix(content[['imdbRating', 'imdbVotes']].values)

In [7]:
# Place the TF-IDF columns and the two numeric columns side by side
# (one row per content item), then expose the result as a COO matrix.
feature_blocks = (text_features, num_features)
combined_features = hstack(feature_blocks)
item_features_matrix = coo_matrix(combined_features)

In [8]:
# LightFM model with WARP loss and 50 latent components.
# random_state fixes the seed so a fresh kernel run does not produce a
# different model (and a different submission) each time.
# NOTE(review): with num_threads > 1 training may still not be bit-for-bit
# repeatable — confirm if exact reproducibility matters.
# NOTE(review): the interaction values are the ratings; check whether the WARP
# loss uses their magnitude or only treats stored entries as positives.
model = LightFM(loss='warp', no_components=50, random_state=42)

# Item representations come solely from item_features_matrix.
model.fit(
    interaction_matrix,
    item_features=item_features_matrix,
    epochs=30,
    num_threads=4,
)

<lightfm.lightfm.LightFM at 0x3335b6ef0>

In [9]:
# Score every (user, item) pair listed in `targets`, passing the same item
# feature matrix that was supplied to fit().
target_user_idx = targets['userId'].to_numpy()
target_item_idx = targets['itemId'].to_numpy()
targets['Score'] = model.predict(
    target_user_idx,
    target_item_idx,
    item_features=item_features_matrix,
)

In [10]:
# Translate the dense indexes back to the original ids for the submission file.
targets['UserId'] = targets['userId'].map(user_reverse_indexes)
targets['ItemId'] = targets['itemId'].map(item_reverse_indexes)

# Rank items per user: ascending user id, descending predicted score.
targets = targets.sort_values(by=['UserId', 'Score'], ascending=[True, False])

# Create the output directory first; to_csv does not create missing parent
# directories and would otherwise fail after the whole pipeline has run.
submission_dir = Path('./submissions')
submission_dir.mkdir(parents=True, exist_ok=True)
submission_path = submission_dir / f'lightfm-cb-submission_{datetime.now().strftime("%Y%m%d_%H%M%S")}.csv'

# Only the id columns are written, in ranked order.
targets.to_csv(submission_path, index=False, columns=['UserId', 'ItemId'])