Goal: Trying to improve recommendation system from first iteration. 

Challenges:
- Similar movies generated using user-based filtering do not seem similar. Similar movies usually occur very close to the target movie in the distribution of number of ratings. 
- Incorporate additional features to make recommendations more "interesting". Year of release? Tags?

In [None]:
import timeit
import pandas as pd
import os
import numpy as np

In [None]:
movies_file = 'data/ml-10M100K/movies.dat'
ratings_file = 'data/ml-10M100K/ratings.dat'

# set static variables
_pickle_dir = os.path.join(os.path.dirname(
    os.path.abspath('eda2.ipynb')), 'data\pickled')

# set appropriate paths for pickle files
_ratings_pickle_filename = 'movie_ratings.pkl'
ratings_pickle_path = os.path.join(_pickle_dir, _ratings_pickle_filename)
_movies_pickle_filename = 'movies.pkl'
movies_pickle_path = os.path.join(_pickle_dir, _movies_pickle_filename)

# try to load from pickle if available
try:
    # load movies df from pickle if it exists
    if os.path.exists(movies_pickle_path+"2"):
        movies_df = pd.read_pickle(movies_pickle_path)
    else :
        movies_df = pd.read_csv(movies_file, engine='python', encoding='utf-8',
                                        sep='::', header=None, names=['movie_id', 'name', 'genres'],
                                        dtype={'movie_id': np.int32, 'name': np.chararray, 'genres': np.chararray})

    # load ratings df from pickle if it exists
    if os.path.exists(ratings_pickle_path+"2"):
        ratings_df = pd.read_pickle(ratings_pickle_path)
    else:
        ratings_df = pd.read_csv(ratings_file, engine='python', encoding='utf-8',
                                        sep='::', header=None, names=['user_id', 'movie_id', 'rating', 'timestamp'],
                                        dtype={'user_id': np.int32, 'movie_id': np.int32, 'rating': np.int32, 'timestamp': np.float64})
        ratings_df['timestamp'] = pd.to_datetime(ratings_df['timestamp'], unit='s', utc=True)
# catch exceptions
except Exception as e:
    print(e)


In [None]:
# merge dataframes using inner join
merged_df = pd.merge(ratings_df, movies_df, how="inner", on="movie_id")

# make a set of movie_id where movie has Children genre
# children_movie_ids = set(merged_df.loc[merged_df['genres'].str.contains('Children', na=False), 'movie_id'])
# print(children_movie_ids)

# drop unnecessary columns for performance
# merged_df.drop(['timestamp', 'genres'], axis=1, inplace=True)

# take the average if user has multiple reviews for movie
merged_df = merged_df.groupby(
    by=['movie_id', 'user_id'], as_index=False).agg({"rating": "mean"})

# pivot to make movies as rows and users as columns
merged_df = merged_df.pivot(index='movie_id', columns='user_id', values='rating').fillna(0)

# print(merged_df.shape)


In [None]:
existing_df = pd.read_json('recs-app/public/top_rated_similars_1000.json', orient="index")
print(merged_df.loc[296])
print(existing_df.columns)
existing_df['similar_movie_ids'] = [[]] * len(existing_df)


Update similar movies using model using item-based filtering. Find the three most similar movies for each movie

In [None]:
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

# use csr_matrix data type to represent pivot table of ratings as sparse matrix
sparse_item_df = csr_matrix(merged_df.values)
# create model from matrix
model = NearestNeighbors(metric='cosine', algorithm='brute')
model.fit(sparse_item_df)

for i in existing_df['movie_id'].values:
    weights, neighbors = model.kneighbors([merged_df.loc[i]], n_neighbors=50)
    neighbors = neighbors[0]  
    print(neighbors[:5])
    similar_movie_ids = list(merged_df.iloc[neighbors[1:4]].index)
    existing_df.at[existing_df[existing_df['movie_id'] == i].index[0], 'similar_movie_ids'] = similar_movie_ids

    for idx, n in enumerate(neighbors[1:4]):
        print(movies_df.loc[movies_df['movie_id'] == i]['name'].values[0],movies_df.loc[movies_df['movie_id'] == merged_df.index[n]]['name'].values[0])


Export the udpated

In [None]:
# save the updated dataframe
# existing_df.head()
# existing_df.to_json('data/top_rated_similars_1000.json', orient='index')


Examine the distribution of year of release and rating timestamps. It might be that distribution of ratings and year of release is skewed, influencing the recommendation model. Ideas to examine this:
- graph distributions
- compare descriptive statistics

In [None]:
import re
import matplotlib.pyplot as plt

# helper func to extract 4 digit year from movie title
def extract_year(name):
    pattern = r'\((\d{4})\)'
    match = re.search(pattern, name)
    if match:
        return int(match.group(1))
    else:
        return None

# create year column by applying extract_year
movies_df['year'] = movies_df['name'].apply(extract_year)
# merge movies and ratings
stats_df = pd.merge(movies_df, ratings_df, how='inner', on='movie_id')

# extract year values into list
years = movies_df['year'].values
# cast to pd Series and create histogram
pd.Series(years).hist(bins=50)

# set the plot title and axis labels
plt.title('Distribution of Movies by Release Year')
plt.xlabel('Year')
plt.ylabel('Frequency')

# show the plot
# plt.show()


In [None]:
import matplotlib.pyplot as plt

# print(ratings_df['timestamp'].head())
# pd.Series(ratings_df['timestamp']).hist(bins=50)

# Assume the timestamp column has already been converted to UTC datetime format
ratings_df['year'] = ratings_df['timestamp'].dt.year

# Plot a histogram of the year column
ratings_df['year'].hist()


# Incorporate tags into the recommendation system

In [167]:
# import tags by reading csv file
tags_df = pd.read_csv('data/ml-10M100K/tags.dat', sep='::', engine='python', encoding='utf-8', header=None, 
                      names=['user_id', 'movie_id', 'tag', 'timestamp'],
                      dtype={'user_id': np.int32, 'movie_id': np.int32, 'tag': np.chararray, 'timestamp': np.float64})
# drop columns that won't be used
tags_df.drop(['timestamp', 'user_id'], axis=1, inplace=True)

# convert all tags to upper for case-insensitive comparison
tags_df['tag'] = tags_df['tag'].str.upper()

# group by movie_id and tag and count occurrences
grouped = tags_df.groupby(['movie_id', 'tag']).size().reset_index(name='count')

# rank tags within each movie_id group
ranked = grouped.groupby('movie_id')['count'].rank(ascending=False)
grouped['rank'] = ranked.astype(int)

# sort by movie_id and count in descending order
sorted_grouped_tags = grouped.sort_values(by=['movie_id', 'count'], ascending=[True, False])

# # filter out tags that only occur once
sorted_grouped_tags = sorted_grouped_tags[sorted_grouped_tags['count'] > 1]

# group by movie_id and select top 3 tags for each group
top_3_tags = sorted_grouped_tags.groupby('movie_id').apply(lambda x: x.nlargest(3, 'count')).reset_index(drop=True)

# join with movie titles
top_3_tags = pd.merge(top_3_tags, movies_df, on='movie_id')

# display top 3 tags for each movie
for movie_id in existing_df['movie_id'][:10]:
    movie_title = movies_df[movies_df['movie_id'] == movie_id]['name']
    movie_tags = top_3_tags[top_3_tags['movie_id'] == movie_id]['tag']
    print(f"{movie_title.values[0]}: {', '.join(movie_tags)}")


Pulp Fiction (1994): QUENTIN TARANTINO, TARANTINO, SAMUEL L. JACKSON
Forrest Gump (1994): TOM HANKS, OSCAR (BEST PICTURE), VIETNAM
Silence of the Lambs, The (1991): SERIAL KILLER, CANNIBALISM, JODIE FOSTER
Jurassic Park (1993): DINOSAURS, STEVEN SPIELBERG, ACTION
Shawshank Redemption, The (1994): PRISON, STEPHEN KING, CLASSIC
Braveheart (1995): MEL GIBSON, DRAMA, OVERRATED
Fugitive, The (1993): ACTION, HARRISON FORD, CHASE
Terminator 2: Judgment Day (1991): TIME TRAVEL, ACTION, ROBOTS
Star Wars: Episode IV - A New Hope (a.k.a. Star Wars) (1977): SCI-FI, GEORGE LUCAS, SPACE
Apollo 13 (1995): SPACE, TOM HANKS, TRUE STORY


In [None]:
# select distinct tags for two similar movies
toy_story_tags = set(tags_df[tags_df['movie_id'] == 1]['tag'].values)
lion_king_tags = set(tags_df[tags_df['movie_id'] == 364]['tag'].values)

# case insensitive comparison
toy_story_tags = set([tag.upper() for tag in toy_story_tags])
lion_king_tags = set([tag.upper() for tag in lion_king_tags])

print('Toy Story Tags: ', sorted(list(toy_story_tags)))
print('Lion King Tags: ', sorted(list(lion_king_tags)))

# observe the tags that are shared
print('Shared by both movies: ', toy_story_tags.intersection(lion_king_tags))


The meaning is not clear or irrelevant for some, like  `want`, `Tumey's VHS`, and `AVI `. 

Some of these tags are redundant with the genres listed for each movie, like `Animated`, `Animation`, `Children`.
 
Useful tags represent meaningful characteristics of the movie, like `Disney`.

Now comparing the shared tags and their counts, examine which look like they are likely useful

In [168]:
movie_id1 = 50 # The Usual Suspects
movie_id2 = 296 # Pulp Fiction

tags1 = sorted_grouped_tags[sorted_grouped_tags['movie_id'] == movie_id1]
tags2 = sorted_grouped_tags[sorted_grouped_tags['movie_id'] == movie_id2]

tags1 = tags1['tag'].values
for idx, row in tags2.iterrows():
    if row['tag'] in tags1:
        print(f' {row["tag"]}, {row["count"]}  is shared')


 CRIME, 10  is shared
 ORGANIZED CRIME, 5  is shared
 DVD, 3  is shared
 SEEN AT THE CINEMA, 2  is shared


For **The Usual Suspects** and **Pulp Fiction**, it looks like `CRIME` and `ORGANIZED CRIME` are shared tags that are meaningful. `DVD` and `SEEN AT THE CINEMA` are not.

`CRIME` is a substring of `ORGANIZED CRIME`, so filtering out substrings might not be the best approach. 

The top 3 tags for each movie as well as their similar movies' shared tags should be appended to the metadata .json for observation in the app.

In [179]:
# load existing top rated movies metadata
existing_df = pd.read_json('recs-app/public/top_rated_similars_1000.json', orient='index')

# create a new DataFrame to store the modified values
new_df = pd.DataFrame(columns=existing_df.columns)

# iterate through movies
for idx, row in existing_df.iterrows():
    similar_movie_tags = list()
    # iterate through each similar movie
    for sim_movie_id in row['similar_movie_ids']:
        sim_tags = list()
        tags1 = sorted_grouped_tags[sorted_grouped_tags['movie_id'] == row['movie_id']]
        tags2 = sorted_grouped_tags[sorted_grouped_tags['movie_id'] == sim_movie_id]

        tags1 = tags1['tag'].values
        for idx, row2 in tags2.iterrows():
            if len(sim_tags) == 3:
                continue
            if row2['tag'] in tags1:
                sim_tags.append(row2['tag'])
        similar_movie_tags.append(sim_tags)
    row['similar_movie_tags'] = similar_movie_tags
    # add the modified row to the new DataFrame
    new_df = new_df.append(row)

# replace the original DataFrame with the new one
existing_df = new_df
print(existing_df.head())

existing_df.to_json('data/metadata/top_rated_similars3_1000.json', orient='index')


  new_df = new_df.append(row)
  new_df = new_df.append(row)
  new_df = new_df.append(row)
  new_df = new_df.append(row)
  new_df = new_df.append(row)
  new_df = new_df.append(row)
  new_df = new_df.append(row)
  new_df = new_df.append(row)
  new_df = new_df.append(row)
  new_df = new_df.append(row)
  new_df = new_df.append(row)
  new_df = new_df.append(row)
  new_df = new_df.append(row)
  new_df = new_df.append(row)
  new_df = new_df.append(row)
  new_df = new_df.append(row)
  new_df = new_df.append(row)
  new_df = new_df.append(row)
  new_df = new_df.append(row)
  new_df = new_df.append(row)
  new_df = new_df.append(row)
  new_df = new_df.append(row)
  new_df = new_df.append(row)
  new_df = new_df.append(row)
  new_df = new_df.append(row)
  new_df = new_df.append(row)
  new_df = new_df.append(row)
  new_df = new_df.append(row)
  new_df = new_df.append(row)
  new_df = new_df.append(row)
  new_df = new_df.append(row)
  new_df = new_df.append(row)
  new_df = new_df.append(row)
  new_df =

      substituted_name                                        description  \
0   Fruit Juice (1994)  This film follows the intertwining lives of va...   
1          Active Hill  A man with limited intelligence experiences se...   
2  Hushes of the Lambs  A trainee of the FBI enlists the aid of an int...   
3      Dinosaur Island  A team of experts visits an island where prehi...   
4         Prison Break  A man serving a life sentence befriends a fell...   

   avg_rating num_ratings                              name  \
0    4.091757       34864               Pulp Fiction (1994)   
1    3.942740       34457               Forrest Gump (1994)   
2    4.136985       33668  Silence of the Lambs, The (1991)   
3    3.590206       32631              Jurassic Park (1993)   
4    4.389771       31126  Shawshank Redemption, The (1994)   

                             genres movie_id ranking similar_movie_ids  \
0                Comedy Crime Drama      296       1    [593, 318, 50]   
1         

  new_df = new_df.append(row)
  new_df = new_df.append(row)
