In [85]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

sns.set_style('darkgrid')

In [86]:
data_dir = Path('../data/ml-100k')


ratings = pd.read_csv(data_dir / 'u.data', sep='\t', names=['user_id', 'movie_id', 'rating', 'timestamp'])
ratings['timestamp'] = pd.to_datetime(ratings['timestamp'], unit='s')
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3,1997-12-04 15:55:49
1,186,302,3,1998-04-04 19:22:22
2,22,377,1,1997-11-07 07:18:36
3,244,51,2,1997-11-27 05:02:03
4,166,346,1,1998-02-02 05:33:16


In [87]:
movies = pd.read_csv(data_dir/'u.item', sep='|', encoding='ISO-8859-1', names=['movie_id', 'movie_title', 'release_date', 'video_release_date', 'IMDb_URL', 
    'unknown', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 
    'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 
    'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western'])

movies.head()

Unnamed: 0,movie_id,movie_title,release_date,video_release_date,IMDb_URL,unknown,Action,Adventure,Animation,Children,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [88]:
users = pd.read_csv(data_dir/'u.user', sep='|', names=['user_id', 'age', 'gender', 'occupation', 'zip_code'])
users.head()

Unnamed: 0,user_id,age,gender,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [89]:
data = pd.merge(ratings, users, on='user_id')
data = pd.merge(data, movies[['movie_id', 'movie_title']], on='movie_id')

data.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,age,gender,occupation,zip_code,movie_title
0,196,242,3,1997-12-04 15:55:49,49,M,writer,55105,Kolya (1996)
1,186,302,3,1998-04-04 19:22:22,39,F,executive,0,L.A. Confidential (1997)
2,22,377,1,1997-11-07 07:18:36,25,M,writer,40206,Heavyweights (1994)
3,244,51,2,1997-11-27 05:02:03,28,M,technician,80525,Legends of the Fall (1994)
4,166,346,1,1998-02-02 05:33:16,47,M,educator,55113,Jackie Brown (1997)


In [90]:
from surprise import  Dataset, Reader, SVD, accuracy
from surprise.model_selection import  train_test_split, cross_validate

reader = Reader()
dataset = Dataset.load_from_df(data[['user_id', 'movie_id', 'rating']], reader)

trainset, testset = train_test_split(dataset, test_size=0.2)

# Learn SVD
model = SVD(n_epochs=30)
model.fit(trainset)

# Score on test
predictions = model.test(testset)

# RMSE
accuracy.rmse(predictions)


RMSE: 0.9375


0.9375348014428726

In [91]:
def get_recommendations(user_id, model, data, num_recommendations=5):
    # Watched movie list for user
    watched = data[data['user_id'] == user_id]['movie_id'].tolist()

    # Marking all movies in stock and show what movies haven't watched by user 
    all_movies = data['movie_id'].unique()
    movies_predict = [movie for movie in all_movies if movie not in watched]

    # Rating predict for movies
    predictions = [model.predict(user_id, movie_id) for movie_id in movies_predict]

    # Sorting by rating
    top_recommendations = sorted(predictions, key=lambda x: x.est, reverse=True)[:num_recommendations]
    
    for recommendation in top_recommendations:
        movie_title = movies[movies['movie_id'] == recommendation.iid]['movie_title'].values[0]
        print(f"Movie: {movie_title}, Predicted Rating: {recommendation.est:.2f}")

# Example for user 1
get_recommendations(user_id=1, model=model, data=data)


Movie: Schindler's List (1993), Predicted Rating: 5.00
Movie: Lawrence of Arabia (1962), Predicted Rating: 5.00
Movie: Third Man, The (1949), Predicted Rating: 5.00
Movie: Harold and Maude (1971), Predicted Rating: 5.00
Movie: Man Who Would Be King, The (1975), Predicted Rating: 5.00


In [92]:
from scipy.sparse import coo_matrix
from lightfm import LightFM
from lightfm.evaluation import precision_at_k

# Prepropcessing data
user_ids = ratings['user_id'].astype('category').cat.codes
item_ids = ratings['movie_id'].astype('category').cat.codes
 
interactions = coo_matrix((ratings['rating'], (user_ids, item_ids)))

# Movie features (genre)
item_features = movies.set_index('movie_id')[['Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 
                                              'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 
                                              'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']].astype(float)

# Fit model
model = LightFM(loss='warp', item_alpha=1e-6, user_alpha=1e-6)
model.fit(interactions, epochs=50, num_threads=4)

# Predict model
train_precision = precision_at_k(model, interactions, k=5).mean()
print(f'Precision@k: {train_precision}')

user_id = 1
n_items = interactions.shape[1]

print(f"User ID: {user_id}, Number of items: {n_items}")

# Predict all movie for user_id
try:
    scores = model.predict(user_id, np.arange(n_items))
    print(f"Scores: {scores}")
    
    # Top 10 recommendation
    top_items = np.argsort(-scores)[:10]
    print(f"Recommended items for user {user_id}: {top_items}")
except Exception as e:
    print(f"Error: {e}")


Precision@k: 0.8214209675788879
User ID: 1, Number of items: 1682
Scores: [-1.4127393 -5.9268    -3.530757  ... -4.783074  -5.3314285 -5.266567 ]
Recommended items for user 1: [285 268  99 284 257 275 301  13 288 318]
