In [1]:
import pandas as pd 
import numpy as np
from scipy.spatial.distance import cosine
from sklearn.feature_extraction.text import TfidfVectorizer 

## Calculate TF-IDF Matrix

In [2]:
movies_df = pd.read_csv("generated_data/movies_small.csv")
movies_df = movies_df.set_index("movieId")
movies_df = movies_df[~movies_df.index.duplicated(keep="first")] # drop rows with duplicate indices

In [3]:
movies_df

Unnamed: 0_level_0,adult,belongs_to_collection,budget,genres,homepage,original_language,original_title,overview,popularity,poster_path,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
2,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
3,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.712900,/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
4,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",3.859495,/16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg,...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
5,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,8.387519,/e64sOI48hQXyru7naBFyssKFxVd.jpg,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
161918,False,"{'id': 286023, 'name': 'Sharknado Collection',...",0,"[{'id': 35, 'name': 'Comedy'}, {'id': 27, 'nam...",http://www.syfy.com/sharknado4,en,Sharknado 4: The 4th Awakens,The new installment of the Sharknado franchise...,4.574494,/jcP3HFXF1BIW9LmBrDusbbDZjBG.jpg,...,2016-07-31,0.0,85.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,"What happens in Vegas, stays in Vegas. Unless ...",Sharknado 4: The 4th Awakens,False,4.3,88.0
161944,False,,8000000,"[{'id': 18, 'name': 'Drama'}]",,en,The Last Brickmaker in America,A man must cope with the loss of his wife and ...,0.038998,/yWp7PgydSlxlhl7benKhTnCvRjN.jpg,...,2001-09-23,0.0,85.0,[],Released,,The Last Brickmaker in America,False,7.0,1.0
162542,False,,1000000,"[{'id': 53, 'name': 'Thriller'}, {'id': 10749,...",,hi,रुस्तम,"Rustom Pavri, an honourable officer of the Ind...",7.333139,/q1lrN6ZrIsOs077lQB86aPGKZRF.jpg,...,2016-08-12,0.0,150.0,"[{'iso_639_1': 'hi', 'name': 'हिन्दी'}]",Released,Decorated Officer. Devoted Family Man. Defendi...,Rustom,False,7.3,25.0
162672,False,,15050000,"[{'id': 12, 'name': 'Adventure'}, {'id': 18, '...",,hi,Mohenjo Daro,"Village lad Sarman is drawn to big, bad Mohenj...",1.423358,/q2XVemXiWSa18mbaVpI3rbLXG2u.jpg,...,2016-08-11,16180000.0,155.0,"[{'iso_639_1': 'hi', 'name': 'हिन्दी'}]",Released,,Mohenjo Daro,False,6.7,26.0


In [4]:
# compute TFIDF matrix for movie overviews 
vectorizer = TfidfVectorizer(analyzer="word", stop_words="english")
matrix = vectorizer.fit_transform(movies_df["overview"])
tokens = vectorizer.get_feature_names_out()
tfidf_df = pd.DataFrame(data=matrix.toarray(), index=movies_df.index, columns=tokens)

## Predict Ratings

In [5]:
ratings_df = pd.read_csv("data/ratings_small.csv")
ratings_df = ratings_df[ratings_df["movieId"].isin(movies_df.index)]

In [6]:
ratings_df

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205
...,...,...,...,...
99999,671,6268,2.5,1065579370
100000,671,6269,4.0,1065149201
100001,671,6365,4.0,1070940363
100002,671,6385,2.5,1070979663


In [7]:
# store similarity scores to avoid re-computation
seen = {} 

In [8]:
def make_predictions(user_id):
    assert(user_id in ratings_df["userId"].unique())
    
    # use previous ratings to predict future ratings
    sample = ratings_df[ratings_df["userId"] == user_id].sort_values("timestamp")
    
    n = int(len(sample) * 0.80) # 80-20 split for train-test data
    train_df = sample.iloc[:n, :].copy()
    test_df = sample.iloc[n:, :].copy()
    
    previous_ratings = train_df[["movieId", "rating"]].set_index("movieId")
    seen_movies = previous_ratings.index
    
    def func(row):
        my_id = int(row.movieId)
        weighted_rating = 0 
        total_cosine_similarity = 0 
        
        # predict rating by weighting ratings of previously seen movies with cosine similarity score
        for other_id in seen_movies:
            other_id = int(other_id)
            if (my_id, other_id) in seen:
                cosine_similarity = seen[(my_id, other_id)]
            elif (other_id, my_id) in seen: 
                cosine_similarity = seen[(other_id, my_id)]
            else:
                cosine_similarity = 1 - cosine(tfidf_df.loc[my_id, :], tfidf_df.loc[other_id, :])
                seen[(my_id, other_id)] = cosine_similarity 
                
            weighted_rating += previous_ratings.loc[other_id, "rating"] * cosine_similarity
            total_cosine_similarity += cosine_similarity
        
        # if movie is not similar to any previously seen movies, use average rating as prediction
        if total_cosine_similarity == 0:
            return train_df["rating"].mean()
        
        return weighted_rating / total_cosine_similarity
    
    test_df["predicted_rating"] = test_df.apply(func, axis=1)
    return test_df

In [9]:
# evaluate model for a single user by calculating RMSE and how many good recommendations were made 
def evaluate_predictions(df, verbose=True):    
    predicted_rmse = np.sqrt(np.square(test_df["rating"] - test_df["predicted_rating"]).mean())
    
    movies_sorted_by_rating = test_df.sort_values("rating").index
    movies_sorted_by_predicted_rating = test_df.sort_values("predicted_rating").index
    
    actual_top_3_movies = set(movies_sorted_by_rating[-3:])
    predicted_top_3_movies = set(movies_sorted_by_predicted_rating[-3:])
    n = len(actual_top_3_movies.intersection(predicted_top_3_movies))
    
    if verbose:
        print("Predicted RMSE", predicted_rmse)
        print("Movies sorted by rating, ascending", movies_sorted_by_rating.to_list())
        print("Movies sorted by predicted rating, ascending", movies_sorted_by_predicted_rating.to_list())
        print(f"Correctly recommended {n} of the top 3 movies")
    
    return predicted_rmse, n

In [10]:
test_df = make_predictions(1)

In [11]:
test_df

Unnamed: 0,userId,movieId,rating,timestamp,predicted_rating
15,1,2193,2.0,1260759198,2.822113
18,1,2968,1.0,1260759200,3.137708
11,1,1405,1.0,1260759203,3.137676
4,1,1172,4.0,1260759205,2.0


In [12]:
evaluate_predictions(test_df);

Predicted RMSE 1.8584484398142926
Movies sorted by rating, ascending [18, 11, 15, 4]
Movies sorted by predicted rating, ascending [4, 15, 11, 18]
Correctly recommended 2 of the top 3 movies


In [13]:
# evaluate model over all users
total_predicted_rmse = 0 
total_n = 0
count = 0 

for user_id in ratings_df.userId.unique():
    try:
        test_df = make_predictions(user_id)
        predicted_rmse, n = evaluate_predictions(test_df, verbose=False)
        total_predicted_rmse += predicted_rmse 
        total_n += n 
        count += 1
    except:
        print(f"Encountered error for user {user_id}")
    
average_predicted_rmse = total_predicted_rmse / count
average_n = total_n / count

print("Averaged predicted RMSE", average_predicted_rmse)
print(f"Correctly recommended {average_n} of the top 3 movies on average")

  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)


Averaged predicted RMSE 0.9760713328324635
Correctly recommended 0.8450074515648286 of the top 3 movies on average
