In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Select seaborn's 'darkgrid' plot style.
sns.set_style(style='darkgrid')

In [2]:
# Directory holding the ml-100k files, relative to the notebook.
data_dir = Path('../data/ml-100k')

# Read the tab-separated ratings file with explicit column names, then
# convert the 'timestamp' column from epoch seconds to datetimes.
ratings = (
    pd.read_csv(
        data_dir / 'u.data',
        sep='\t',
        names=['user_id', 'movie_id', 'rating', 'timestamp'],
    )
    .assign(timestamp=lambda frame: pd.to_datetime(frame['timestamp'], unit='s'))
)
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3,1997-12-04 15:55:49
1,186,302,3,1998-04-04 19:22:22
2,22,377,1,1997-11-07 07:18:36
3,244,51,2,1997-11-27 05:02:03
4,166,346,1,1998-02-02 05:33:16


In [3]:
# Read the pipe-separated movie file (ISO-8859-1 encoded) with explicit column names.
movies = pd.read_csv(
    data_dir / 'u.item',
    sep='|',
    encoding='ISO-8859-1',
    names=[
        # Descriptive fields.
        'movie_id', 'movie_title', 'release_date', 'video_release_date', 'IMDb_URL',
        # Genre columns (the displayed rows show them as 0/1 flags).
        'unknown', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy',
        'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
        'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
    ],
)

movies.head()

Unnamed: 0,movie_id,movie_title,release_date,video_release_date,IMDb_URL,unknown,Action,Adventure,Animation,Children,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [4]:
# Read the pipe-separated user file with explicit column names.
users = pd.read_csv(
    data_dir / 'u.user',
    sep='|',
    names=['user_id', 'age', 'gender', 'occupation', 'zip_code'],
)
users.head()

Unnamed: 0,user_id,age,gender,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [5]:
# Attach the user attributes to every rating, then add each movie's title.
# Both joins use pandas' default (inner) merge on the shared key column.
data = (
    ratings
    .merge(users, on='user_id')
    .merge(movies[['movie_id', 'movie_title']], on='movie_id')
)

data.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,age,gender,occupation,zip_code,movie_title
0,196,242,3,1997-12-04 15:55:49,49,M,writer,55105,Kolya (1996)
1,186,302,3,1998-04-04 19:22:22,39,F,executive,0,L.A. Confidential (1997)
2,22,377,1,1997-11-07 07:18:36,25,M,writer,40206,Heavyweights (1994)
3,244,51,2,1997-11-27 05:02:03,28,M,technician,80525,Legends of the Fall (1994)
4,166,346,1,1998-02-02 05:33:16,47,M,educator,55113,Jackie Brown (1997)


In [6]:
from surprise import  Dataset, Reader, SVD, accuracy
from surprise.model_selection import  train_test_split, cross_validate

# Fixed seed so the train/test split and the SVD fit give the same result on
# every Restart & Run All.
RANDOM_STATE = 42

# Reader() is left at its defaults; only the (user, item, rating) columns are
# handed to Surprise.
reader = Reader()
dataset = Dataset.load_from_df(data[['user_id', 'movie_id', 'rating']], reader)

# Hold out 20% of the ratings for evaluation.
trainset, testset = train_test_split(dataset, test_size=0.2, random_state=RANDOM_STATE)

# Fit SVD for 30 epochs on the training portion.
model = SVD(n_epochs=30, random_state=RANDOM_STATE)
model.fit(trainset)

# Predict the held-out ratings.
predictions = model.test(testset)

# RMSE on the held-out set (last expression, so the value is also displayed).
accuracy.rmse(predictions)


RMSE: 0.9390


0.9390342114030306

In [7]:
def get_recommendations(user_id, model, data, num_recommendations=5, movies_df=None):
    """Print the highest-predicted movies that a user has not rated yet.

    Parameters
    ----------
    user_id
        Identifier of the user, matched against ``data['user_id']`` and passed
        to ``model.predict``.
    model
        Object exposing ``predict(user_id, movie_id)`` whose result has ``iid``
        and ``est`` attributes.
    data : pandas.DataFrame
        Frame with at least ``user_id`` and ``movie_id`` columns. Every distinct
        ``movie_id`` in it is a candidate.
    num_recommendations : int, default 5
        Maximum number of movies to print.
    movies_df : pandas.DataFrame, optional
        Frame with ``movie_id`` and ``movie_title`` columns used to resolve
        titles. When omitted, the notebook-level ``movies`` frame is used.

    Returns
    -------
    None
        One line per recommendation is written to stdout.
    """
    if movies_df is None:
        # Backward-compatible default: fall back to the notebook-level frame.
        movies_df = movies

    # A set gives constant-time membership tests in the filter below.
    watched = set(data.loc[data['user_id'] == user_id, 'movie_id'].tolist())

    # Candidates are all distinct movies, in first-seen order, minus the watched ones.
    all_movies = data['movie_id'].unique()
    movies_predict = [movie for movie in all_movies if movie not in watched]

    # Predicted rating for every candidate movie.
    predictions = [model.predict(user_id, movie_id) for movie_id in movies_predict]

    # Highest estimates first; sorted() is stable, so ties keep candidate order.
    top_recommendations = sorted(predictions, key=lambda x: x.est, reverse=True)[:num_recommendations]

    for recommendation in top_recommendations:
        # Take the first title whose movie_id matches the predicted item.
        title_matches = movies_df.loc[movies_df['movie_id'] == recommendation.iid, 'movie_title']
        movie_title = title_matches.values[0]
        print(f"Movie: {movie_title}, Predicted Rating: {recommendation.est:.2f}")

# Print recommendations for user 1 using the fitted model and the merged frame.
get_recommendations(1, model, data)


Movie: Private Parts (1997), Predicted Rating: 5.00
Movie: One Flew Over the Cuckoo's Nest (1975), Predicted Rating: 4.99
Movie: Secrets & Lies (1996), Predicted Rating: 4.93
Movie: Charade (1963), Predicted Rating: 4.91
Movie: Night on Earth (1991), Predicted Rating: 4.86


In [8]:
# Bucket users into four fixed age bands. pd.cut intervals are right-closed by
# default, so these edges give (0, 25], (25, 30], (30, 45], (45, inf).
# (An earlier pd.qcut assignment to the same column was overwritten immediately
# by this one and has been removed as dead code.)
# NOTE(review): age 45 lands in '31 - 45', so the '>= 45' label effectively
# means "over 45". The label text is left unchanged in case later cells match on it.
users['age_bin'] = pd.cut(users['age'],
                bins=[0,25,30,45,np.inf],
                labels=['<= 25', '26 - 30', '31 - 45', '>= 45'])

users.head()

Unnamed: 0,user_id,age,gender,occupation,zip_code,age_bin
0,1,24,M,technician,85711,<= 25
1,2,53,F,other,94043,>= 45
2,3,23,M,writer,32067,<= 25
3,4,24,M,technician,43537,<= 25
4,5,33,F,other,15213,31 - 45


In [9]:
# Binary flag: 1 when the rating is 3 or higher, otherwise 0.
ratings['liked'] = (ratings['rating'] >= 3).astype(int)
ratings

Unnamed: 0,user_id,movie_id,rating,timestamp,liked
0,196,242,3,1997-12-04 15:55:49,1
1,186,302,3,1998-04-04 19:22:22,1
2,22,377,1,1997-11-07 07:18:36,0
3,244,51,2,1997-11-27 05:02:03,0
4,166,346,1,1998-02-02 05:33:16,0
...,...,...,...,...,...
99995,880,476,3,1997-11-22 05:10:44,1
99996,716,204,5,1997-11-17 19:39:03,1
99997,276,1090,1,1997-09-20 22:49:55,0
99998,13,225,2,1997-12-17 22:52:36,0


In [10]:
# Show the first five user rows.
users.head(n=5)

Unnamed: 0,user_id,age,gender,occupation,zip_code,age_bin
0,1,24,M,technician,85711,<= 25
1,2,53,F,other,94043,>= 45
2,3,23,M,writer,32067,<= 25
3,4,24,M,technician,43537,<= 25
4,5,33,F,other,15213,31 - 45
