In this project we propose to create an item-item recommendation system that helps users choose the next title to watch based on a specific title they input. We adopt this approach instead of building user profiles, so the input title is used in lieu of a user profile. User profile data is available, however, and we may incorporate it into our project later, time allowing.

We have begun setting up our item-item system, but we have not progressed very far. We are currently using a truncated version of our ratings file.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import ast

# Load the truncated ratings file; switch to the commented line for the full data set.
ratings_df = pd.read_csv("./input/ratings_small.csv", dtype = 'float')
# ratings_df = pd.read_csv("./input/ratings.csv", dtype = 'float')

ratings_df.head()

movies_df = pd.read_csv("./input/movies_metadata.csv")
movies_df.head()

# Keep only rows whose id consists solely of digits (some rows carry non-integer ids).
# .copy() gives an independent frame so the dtype conversion below does not
# assign into a slice of the original frame.
movies_df = movies_df[movies_df['id'].astype(str).str.isdigit()].copy()
movies_df['id'] = movies_df['id'].astype(np.int64)

# Attach the movie title to every rating row.
ratings_df = pd.merge(ratings_df, movies_df[['title', 'id']], left_on='movieId', right_on='id')
ratings_df.head()

ratings_df.drop(['timestamp', 'id'], axis=1, inplace=True)

# Mean rating and number of ratings per title, computed in one grouped pass
# instead of re-filtering the whole ratings frame once per title.
# sort=False keeps titles in order of first appearance, as unique() did.
# NOTE: groupby skips rows whose title is missing, so no NaN-title row is produced.
Leaderboard = (
    ratings_df.groupby('title', sort=False)['rating']
    .agg(Rating='mean', Votes='count')
    .reset_index()
    .rename(columns={'title': 'Title'})
)
Leaderboard = Leaderboard[['Title', 'Rating', 'Votes']]

# Kept for any later cell that still reads the intermediate sequences.
titles = Leaderboard['Title'].to_numpy()
titlesRate = Leaderboard['Rating'].tolist()
titlesVote = Leaderboard['Votes'].tolist()


  interactivity=interactivity, compiler=compiler, result=result)


KeyboardInterrupt: 

In [None]:
# Print two views of the leaderboard: first ordered by rating (ties broken by
# votes), then ordered by votes (ties broken by rating). Leaderboard is rebound
# after each sort, so it ends up in the "most viewed" order.
for heading, sort_keys in (
    ("Top rated movies", ['Rating', 'Votes']),
    ("Most viewed movies", ['Votes', 'Rating']),
):
    print(heading)
    ordered = Leaderboard.sort_values(by=sort_keys, ascending=False)
    Leaderboard = ordered.reset_index(drop=True)
    print(Leaderboard.head(10))

In [None]:
top_rated_movies = Leaderboard.sort_values(by=['Rating','Votes'], ascending = False).reset_index(drop = True)

# Bar chart of every title's mean rating, drawn in ascending order.
top_rated_movies_y = sorted(top_rated_movies['Rating'].values.tolist())
top_rated_movies_x = list(range(len(top_rated_movies_y)))

plt.bar(top_rated_movies_x, top_rated_movies_y)

plt.title('Rankings of movies')
plt.xlabel("Movie Number")
plt.ylabel("Movie Rating")
plt.show()

# Bar chart of every title's vote count, drawn in ascending order.
# The x positions are sized from the vote list itself rather than from the
# ratings list, so the two charts no longer depend on each other.
top_voted_movies_y = sorted(top_rated_movies['Votes'].values.tolist())
top_voted_movies_x = list(range(len(top_voted_movies_y)))
plt.bar(top_voted_movies_x, top_voted_movies_y, color = 'red')

plt.title('Votes of movies')
plt.xlabel("Movie Number")
plt.ylabel("Number of Votes")
plt.show()


As you can see, we have cleaned and merged some data from our csv files. From these we calculated the most popular and highest-rated titles, as well as the related averages. This is very important for recommendations! The better rated and ranked an item, the more likely it is to be recommended. It is also important to think about which items are popular in general and which are popular only with a specific audience. Moving forward we will have to run kNN on the movies, where the closest k movies will be suggested. Perhaps we can incorporate the rating of the movie in the way we list the movie recommendations to the user.

In [None]:
import re
import unidecode

# Raw overview values, kept unmodified so they can be inspected later.
movie_text_check = movies_df['overview'].values.tolist()

# Everything except ASCII letters, digits and whitespace is stripped.
_non_alnum = re.compile(r'[^a-zA-Z0-9\s]')

# One set of words per movie: transliterate to ASCII, drop punctuation, split on whitespace.
# NOTE(review): str() of a missing (NaN) overview is 'nan', so such rows would
# end up with the word set {'nan'} -- confirm whether that is intended.
movie_text = []
for overview in movie_text_check:
    ascii_overview = unidecode.unidecode(str(overview))
    movie_text.append(set(_non_alnum.sub('', ascii_overview).split()))

movie_title = movies_df['original_title'].values.tolist()
movie_title_set = set(movie_title)

# Title -> word set and title -> raw overview lookups.
# When a title occurs more than once, the last occurrence wins.
movie_word_set = dict(zip(movie_title, movie_text))
check_words = dict(zip(movie_title, movie_text_check))


In [None]:
# similarity based off similarity of movie descriptions, Uses jaccard similarity on 1 word grams
def description_similarity(title,toSort = False, title_set = movie_title_set, return_scores = False, return_text = False, return_all = False):
    """Score every movie by Jaccard similarity of its overview words to `title`'s.

    Parameters:
        title: title whose overview word set (from movie_word_set) is the query.
        toSort: when True, order the results by descending similarity;
            otherwise they stay in movie_title order.
        title_set: collection of known titles used for the membership check.
        return_scores: when True, return (title, score) tuples instead of titles.
        return_text: when True, return (title, score, word_set) tuples.
        return_all: when False, only the first 10 entries are kept. Combined
            with toSort=False these are simply the first 10 movies in
            movie_title order, not the 10 most similar.

    Returns:
        [] (after printing a message) if `title` is not in `title_set`;
        otherwise a list of tuples or a list of titles as selected above.
    """
    if title not in title_set:
        print("Title not found!")
        return []

    # Word set of the query movie. Kept under its own name so the title_set
    # parameter is not silently rebound to something of a different meaning.
    query_words = movie_word_set[title]

    current_leader = []
    for m_title, words in zip(movie_title, movie_text):
        union_size = len(words.union(query_words))
        # Two empty word sets have an empty union; score them 0.0 rather than
        # dividing by zero.
        if union_size:
            description_sim = len(words.intersection(query_words)) / union_size
        else:
            description_sim = 0.0

        if return_text:
            current_leader.append((m_title, description_sim, words))
        else:
            current_leader.append((m_title, description_sim))

    if toSort:
        current_leader.sort(key = lambda x:x[1], reverse = True)

    # Keep only the first 10 entries unless everything was requested.
    if not return_all:
        current_leader = current_leader[:10]

    if return_scores or return_text:
        return current_leader

    # Return just the titles.
    return [x[0] for x in current_leader]
        
title = "Toy Story"
# toSort=True so the ten rows kept are the ten highest-scoring movies; without
# it the function returns the first ten movies in file order.
temp = np.array(description_similarity(title, toSort = True, return_scores = True, return_all=False))
# print([(x[0], x[1]) for x in temp])
print(temp)



In [None]:
# Sanity check: show the raw overviews of two related movies, then the words
# their cleaned word sets share and the sizes that feed the Jaccard score.
title1, title2 = 'Toy Story', 'Toy Story 2'

if title1 in check_words:
    print(check_words[title1])
print()
if title2 in check_words:
    print(check_words[title2])

# Locate each title's position to fetch its cleaned word set.
ii = movie_title.index(title1)
iii = movie_title.index(title2)
title_set, title_set_two = movie_text[ii], movie_text[iii]

shared_words = title_set.intersection(title_set_two)
combined_words = title_set.union(title_set_two)

print(shared_words)
print()
print(len(shared_words), len(combined_words))

In [None]:
collection = movies_df['belongs_to_collection']
collection_titles = movies_df['original_title'].values.tolist()

# Title -> collection name. A string entry is parsed as a Python literal and
# its 'name' field is kept; any non-string entry is stored as None.
collection_store = {}
for movie_name, raw_collection in zip(collection_titles, collection.values.tolist()):
    if isinstance(raw_collection, str):
        collection_store[movie_name] = ast.literal_eval(raw_collection)['name']
    else:
        collection_store[movie_name] = None

# similarity, binary, 1 = same collection, 0 = not same collection
def collection_similarity(title, toSort = False, title_collection = None, return_all = False):
    """Binary similarity: 1 if a movie is in the same collection as `title`, else 0.

    Parameters:
        title: title whose collection (from collection_store) is the reference,
            unless `title_collection` is given.
        toSort: when True, order results by descending score.
        title_collection: explicit reference collection name; looked up from
            `title` when None.
        return_all: when True return every (title, score) pair, otherwise only
            the first 10.

    Returns:
        List of (title, score) tuples, one per entry of movie_title before
        truncation. Movies without a collection always score 0.
    """
    if title_collection is None:
        title_collection = collection_store[title]

    def same_collection(candidate):
        name = collection_store[candidate]
        # Unknown/no collection never counts as a match.
        if name is None or name != title_collection:
            return 0
        return 1

    scored = [(candidate, same_collection(candidate)) for candidate in movie_title]

    if toSort:
        scored = sorted(scored, key=lambda pair: pair[1], reverse=True)

    return scored if return_all else scored[:10]
        
# toSort=True so the ten rows shown are the best matches rather than the first
# ten movies in file order.
temp = np.array(collection_similarity('Toy Story', toSort = True))
print(temp)

In [None]:
# Title -> value of the 'adult' column for that movie.
adult = movies_df['adult'].values.tolist()
adult_store = dict(zip(collection_titles, adult))

# similarity, binary, 1 = same, 0 = not same 
def adult_similarity(title,toSort = False, title_collection = None, return_all = False):
    """Binary similarity: 1 if a movie has the same 'adult' value as `title`, else 0.

    Parameters:
        title: title whose adult_store value is the reference, unless
            `title_collection` is given.
        toSort: when True, order results by descending score.
        title_collection: explicit reference value; looked up from `title`
            when None.
        return_all: when True return every (title, score) pair, otherwise only
            the first 10.

    Returns:
        List of (title, score) tuples, one per entry of movie_title before
        truncation. A stored value of None always scores 0.
    """
    if title_collection is None:
        title_collection = adult_store[title]

    scored = []
    for candidate in movie_title:
        flag = adult_store[candidate]
        # No associated maturity value, or a different one: not a match.
        mismatch = flag is None or flag != title_collection
        scored.append((candidate, 0 if mismatch else 1))

    if toSort:
        scored = sorted(scored, key=lambda pair: pair[1], reverse=True)

    return scored if return_all else scored[:10]
        
# toSort=True so the ten rows shown are the best matches rather than the first
# ten movies in file order.
temp = np.array(adult_similarity('Toy Story', toSort = True))
print(temp)

In [None]:
# Title -> value of the 'original_language' column for that movie.
language = movies_df['original_language'].values.tolist()
language_store = dict(zip(collection_titles, language))

# similarity, binary, 1 = same, 0 = not same 
def language_similarity(title, toSort = False,title_collection = None, return_all = False):
    """Binary similarity: 1 if a movie has the same original language as `title`, else 0.

    Parameters:
        title: title whose language_store value is the reference, unless
            `title_collection` is given.
        toSort: when True, order results by descending score.
        title_collection: explicit reference language; looked up from `title`
            when None.
        return_all: when True return every (title, score) pair, otherwise only
            the first 10.

    Returns:
        List of (title, score) tuples, one per entry of movie_title before
        truncation. A movie with no language recorded is treated as the same
        language and scores 1.
    """
    if title_collection is None:
        # The reference must come from the language table (the previous code
        # read the adult flag here, so nothing ever compared equal).
        title_collection = language_store[title]

    temp = []
    for m_title in movie_title:
        candidate = language_store[m_title]

        # Missing language: None, or NaN (the only value not equal to itself).
        # presumably pandas stores missing languages as NaN -- verify against the data
        if candidate is None or candidate != candidate:
            temp.append((m_title, 1))  # no language associated: treat as the same language
        elif candidate == title_collection:
            temp.append((m_title, 1))  # same language
        else:
            temp.append((m_title, 0))  # different language

    if toSort:
        temp.sort(key = lambda x:x[1], reverse = True)

    if return_all:
        return temp
    return temp[:10]
        
# toSort=True so the ten rows shown are the highest-scoring movies rather than
# the first ten movies in file order.
temp = np.array(language_similarity('Toy Story', toSort = True))
print(temp)


In [None]:
# Each 'genres' cell is parsed as a Python literal; the loop below expects a
# list of dicts that each carry a 'name' key.
genre = movies_df['genres'].values.tolist()
genre_list = [ast.literal_eval(raw_genres) for raw_genres in genre]

# Title -> set of genre names. When a title occurs more than once, the last
# occurrence wins.
genre_store = {
    movie_name: {entry['name'] for entry in entries}
    for movie_name, entries in zip(movie_title, genre_list)
}

# similarity jaccard, genre tags for movie
def genre_similarity(title,toSort = False, title_collection = None, return_all = False):
    """Score every movie by Jaccard similarity of its genre tags to the reference tags.

    Parameters:
        title: title whose genre_store entry is the reference, unless
            `title_collection` is given.
        toSort: when True, order results by descending score.
        title_collection: explicit iterable of reference genre names; looked up
            from `title` when None.
        return_all: when True return every (title, score) pair, otherwise only
            the first 10.

    Returns:
        List of (title, score) tuples, one per entry of movie_title before
        truncation. When neither side has any genre the pair counts as a match
        (1.0) instead of dividing by zero.
    """
    if title_collection is None:
        title_collection = genre_store[title]

    temp = []
    for m_title in movie_title:
        genres = genre_store[m_title]

        if genres is None:
            # No genres associated with the movie: count as a match.
            temp.append((m_title, 1))
            continue

        union_size = len(genres.union(title_collection))
        if union_size == 0:
            # Both tag sets are empty, so the Jaccard ratio is undefined.
            sim = 1.0
        else:
            sim = len(genres.intersection(title_collection)) / union_size
        temp.append((m_title, sim))

    if toSort:
        temp.sort(key = lambda x:x[1], reverse = True)

    if return_all:
        return temp
    return temp[:10]
        
# toSort=True so the ten rows shown are the closest genre matches rather than
# the first ten movies in file order.
temp = np.array(genre_similarity('Batman v Superman: Dawn of Justice', toSort = True))
print(temp)
print(genre_store['Batman v Superman: Dawn of Justice'], genre_store['The Scorpion King'])

In [None]:
# Combine all the features for a weighted similarity score, will show the top n movies and corresponding score
# Combine all the features for a weighted similarity score, will show the top n movies and corresponding score
def combine(title, scores = [2,2,2,2,2], n = 10):
    """Rank movies by a weighted sum of the five per-feature similarity scores.

    Parameters:
        title: title to find recommendations for.
        scores: five weights, applied in order to collection, description,
            language, adult and genre similarity. The default list is only
            read, never modified. The weights are assumed not to sum to zero.
        n: number of recommendations to return.

    Returns:
        [] (after printing a message) if `title` is not in movie_title;
        otherwise the `n` best (title, score) tuples, where score is the
        weighted sum scaled by 100 / sum(scores) and rounded to 4 decimals.
    """
    if title not in movie_title:
        print('"',title,'" is not part of our database')
        return []

    # Each call returns one unsorted (movie, score) tuple per entry of
    # movie_title, so position i refers to the same movie in all five lists.
    collection = collection_similarity(title, return_all = True) # 1 means same collection
    description = description_similarity(title, return_scores = True, return_all = True) # closer 1 means more similar
    language = language_similarity(title, return_all = True) # 1 means same language
    adult = adult_similarity(title, return_all = True) # 1 means same maturity
    genre = genre_similarity(title, return_all = True) # closer 1 means more similar

    weight_total = np.sum(scores)
    temp = []

    # Walk every movie; the earlier range(len(movie_title)-2) left the last
    # two movies out of the ranking.
    for i, candidate in enumerate(movie_title):
        # Do not recommend the movie title inputted
        if candidate == title or candidate not in adult_store:
            continue

        sim = (
            scores[0]*collection[i][1]
            + scores[1]*description[i][1]
            + scores[2]*language[i][1]
            + scores[3]*adult[i][1]
            + scores[4]*genre[i][1]
        )
        temp.append((candidate, np.round(sim*100/weight_total,4)))

    temp.sort(key = lambda x:x[1], reverse = True)

    return temp[:n]

# Combined-score recommendations for "Toy Story" with the default weights and n.
toy_story_matches = combine("Toy Story")
temp = np.array(toy_story_matches)
print(temp)

In [None]:
# Combined-score recommendations for "Despicable Me" with the default weights and n.
despicable_me_matches = combine("Despicable Me")
temp = np.array(despicable_me_matches)
print(temp)

In [None]:
# Combined-score recommendations for "Her" with the default weights and n.
her_matches = combine("Her")
temp = np.array(her_matches)
print(temp)