In [1]:
# User-Based Collaborative Filtering
import pandas as pd
import numpy as np
import os

# Project data directory. Set the PROJECT4_DIR environment variable to point at
# your own project4 folder; the fallback is the original author's local path
# (note that the folder name ends with a trailing space).
wd = os.environ.get(
    "PROJECT4_DIR",
    "/Users/yueyingteng/Documents/2016.1/applied data science/project 4 ",
)
# The CSV files read later in the notebook use paths relative to this directory.
os.chdir(wd)
os.getcwd()


'/Users/yueyingteng/Documents/2016.1/applied data science/project 4 '

In [2]:
# Movie IDs present in the filtered data, as a plain Python list
movies_list = list(pd.read_csv("./movies_filtered.csv")['product_productid'])

# User table, with every NaN replaced by "" so that text columns can be split safely
users = pd.read_csv("./users_filtered.csv").fillna("")

# Column names of the user table, in order (used for positional column lookup)
users_colnames = list(users)

In [3]:
# userids: row index of the user table -> user ID
userids = {idx: users['review_userid'][idx] for idx in range(len(users))}

# users_data: user ID -> {movie ID: review score}
# e.g. {'A123456789': {'B00000000': 5, 'B000099999': 3}, 'A987654321': {'B00000000': 4}}
# Look up one user's ratings with users_data[userids[0]] or users_data['A123456789'].
#
# Columns 14..18 of the user table each hold a whitespace-separated string of movie IDs;
# the first of those columns is scored 1 and the last 5. If a movie appears in more
# than one column, the later (higher) score overwrites the earlier one.
star_columns = [users_colnames[col] for col in range(14, 19)]

users_data = {}
for idx in range(len(users)):
    ratings = {}
    for star, colname in enumerate(star_columns, 1):
        for movie in users[colname][idx].split():
            ratings[movie] = star
    users_data[userids[idx]] = ratings
    

In [4]:
def cossimilarity(ratings1, ratings2):
    '''
    Cosine similarity between two users' rating dictionaries.

    Only movies present in both dictionaries contribute to the vectors.
    Input: ratings1, dict of movie ID -> score
    Input: ratings2, dict of movie ID -> score
    Output: 0 when the users share no movies or when either shared-rating
            vector has zero length; otherwise dot(x, y) / (|x| * |y|)
    '''
    # Movies rated by both users, in ratings1's iteration order
    shared = [movie for movie in ratings1 if movie in ratings2]
    if len(shared) == 0:
        return 0

    vec1 = np.asarray([ratings1[movie] for movie in shared])
    vec2 = np.asarray([ratings2[movie] for movie in shared])

    sq_norm1 = np.dot(vec1, vec1)
    sq_norm2 = np.dot(vec2, vec2)
    # Guard against dividing by zero when a vector is all zeros
    if sq_norm1 == 0 or sq_norm2 == 0:
        return 0
    return np.dot(vec1, vec2) / np.sqrt(sq_norm1 * sq_norm2)
        

In [5]:
def most_similar_users(username, userids):
    '''
    Scores every other user against `username` using cossimilarity().

    Ratings are read from the module-level users_data dict.
    Input: username, str, the user to compare everyone else against
    Input: userids, dict mapping integer index (0 .. len-1) -> user ID
    Output: list of (similarity rounded to 2 decimals,
                     number of movies rated in common,
                     user ID) tuples, sorted in descending order
    '''
    scored = []
    for idx in range(len(userids)):
        other = userids[idx]
        if other == username:
            # never compare the user with themselves
            continue
        similarity = cossimilarity(users_data[other], users_data[username])
        shared_movies = set(users_data[other].keys()) & set(users_data[username].keys())
        scored.append((round(similarity, 2), len(shared_movies), other))
    # Highest similarity first; ties are broken by overlap size, then by user ID
    return sorted(scored, reverse = True)

In [6]:
def recommend(username, userids, n=3):
    '''
    Recommends up to `n` movies based on user-based collaborative filtering.

    Ratings are read from the module-level users_data dict and neighbours come
    from most_similar_users().

    Input: username, str, e.g. 'A2OXDJP1Z3LNOK', must be a key of users_data
    Input: userids, dict mapping integer index -> user ID
    Input: n, int, maximum number of movies to return (default 3)
    Output: list of at most n movie IDs, highest score first. Fewer than n are
            returned when fewer than n distinct scores exist.
            A str message is returned instead when the user has no ratings or
            when every candidate movie scores 0.
    '''
    own_ratings = users_data[username]
    if len(own_ratings) == 0:
        # Checked first so the similarity search is skipped when it cannot help
        return "Cannot recommend without any ratings"

    similar_users = most_similar_users(username, userids)

    # Neighbourhood size: one neighbour per movie the user has rated, at most 10,
    # and never more than the number of other users actually available
    # (indexing past the end of similar_users would raise IndexError).
    k = min(len(own_ratings), 10, len(similar_users))
    neighbours = similar_users[:k]

    # Obtain the set of all movies seen by the neighbours and not yet seen by this user
    new_movies = set()
    for neighbour in neighbours:
        new_movies = new_movies | (set(users_data[neighbour[2]].keys()) - set(own_ratings.keys()))
    new_movies = list(new_movies)

    # Create a matrix with the score for each neighbour-movie combination.
    # Each score is the neighbour's rating weighted by the cosine similarity and by
    # the number of movies the neighbour has in common with this user.
    score_matrix = np.zeros((k, len(new_movies)))
    for row, (similarity, n_common, neighbour_id) in enumerate(neighbours):
        neighbour_ratings = users_data[neighbour_id]
        for col, movie in enumerate(new_movies):
            if movie in neighbour_ratings:
                score_matrix[row, col] = neighbour_ratings[movie] * similarity * n_common
    ranking = score_matrix.mean(axis = 0)
    if sum(ranking) == 0:
        return "Cosine similarity value is 0 for all users"

    # Obtain the top n UNIQUE scores and match each to the first movie holding it.
    # (Not specifically looking for UNIQUE scores will yield duplicate movies:
    # Amazon has different ASINs for different versions of the same movie,
    # e.g. VHS, DVD, Anniversary Edition, web streaming, etc., and those
    # versions all share the same reviews.)
    # np.unique returns the scores in ascending order, so walk it backwards; the
    # slice keeps this safe when fewer than n distinct scores exist.
    unique_scores = np.unique(ranking)
    recommended_movies = []
    for score in unique_scores[::-1][:max(n, 0)]:
        first_match = np.where(ranking == score)[0].tolist()[0]
        recommended_movies.append(new_movies[first_match])
    return recommended_movies


In [17]:
# Workflow for a new user ('newuser1'):
#   1. use the Amazon API to turn each entered movie-title keyword into ASINs
#   2. put the ASINs into the recommendation system to get three recommended ASINs
#   3. use the API again to turn the recommended ASINs into movie titles and URLs
#
# The titles the new user rated are stored in the list `keywords`, in the same
# order as the star ratings later passed to create_new_user_data().

# keywords to ASIN
import os
from amazonproduct import API

# Credentials are read from the environment instead of being written into the
# notebook. The key pair that used to be hard-coded here has been exposed and
# should be revoked and replaced.
api = API(
    os.environ["AWS_ACCESS_KEY_ID"],
    os.environ["AWS_SECRET_ACCESS_KEY"],
    "us",
    os.environ.get("AWS_ASSOCIATE_TAG", "yueyingteng-20"),
)

ASIN = {}
keywords = ["The Last Samurai","Harry Potter: Years 1-5","World Cup Soccer in Africa: Who Really Wins","spirited away","the Walking Dead"]
for keyword in keywords:
    ASIN[keyword] = []
    results = api.item_search('DVD', Title = keyword)
    for item in results:
        # Store the ASIN as a plain string so it can be compared with movies_list
        # and used as a dictionary key.
        # NOTE(review): assumes item.ASIN is an XML element wrapper whose str() is
        # the ASIN text - confirm against the amazonproduct response type.
        ASIN[keyword].append(str(item.ASIN))
        

# userids[len(userids)] = 'newuser1'
# We don't need this if we're not going to create a new username, we can just overwrite the original 'newuser1'
        
def create_new_user_data(username, keywords, ratings):
    '''
    Creates (or overwrites) the users_data entry for username.

    Reads the module-level ASIN dict and movies_list, and writes to the
    module-level users_data dict.

    Input: username, str, e.g. 'newuser3'; need not already exist in users_data
    Input: keywords, list of keywords used to index ASIN dictionary
    Input: ratings, list of ratings, ratings[i] being the score for keywords[i]
    Output: None; users_data[username] is set to a dict of movie ID -> rating

    Example: create_new_user_data('newuser3', ['the mask', 'harry potter'], [5, 4])
    '''
    # Built once: the catalogue does not change while the keywords are processed
    known_movies = set(movies_list)
    new_ratings = {}
    for i in range(len(keywords)):
        matches = set(ASIN[keywords[i]]) & known_movies
        # if there are no ASINs in common between the Amazon API results and our data, do not create an entry
        if len(matches) == 0:
            continue
        # take one entry from the intersection of the Amazon API results and the ASINs in our data
        new_ratings[list(matches)[0]] = ratings[i]
    users_data[username] = new_ratings

# Build the users_data entry for 'newuser1'; ratings are matched to keywords by position
create_new_user_data('newuser1', keywords, [5, 4, 3, 2, 1])



In [18]:
# Recommendation for 'newuser1', using whatever ratings are currently stored
# for that user in users_data
testrun = recommend('newuser1', userids)
print(testrun)
# It recommends Little Miss Sunshine, Good Night and Good Luck [HD DVD] and Taken

['B000MR1V22', 'B000V7O0IK', 'B0026145WK']


In [19]:
# ASIN to movie details
# The result is a dictionary with each recommended movie ASIN as key and the list
# [movie title, movie URL, movie poster URL] as value

import os
from amazonproduct import API

# Credentials are read from the environment instead of being written into the
# notebook. The key pair that used to be hard-coded here has been exposed and
# should be revoked and replaced.
api = API(
    os.environ["AWS_ACCESS_KEY_ID"],
    os.environ["AWS_SECRET_ACCESS_KEY"],
    "us",
    os.environ.get("AWS_ASSOCIATE_TAG", "yueyingteng-20"),
)

# recommend() returns a message string instead of a list when it cannot recommend;
# looping over that string would look up every single character as an ASIN.
recommended_asins = [] if isinstance(testrun, str) else testrun

movies = {}
for movie in recommended_asins:
    movies[movie] = []
    # First lookup (default response): title and product link
    for item in api.item_lookup(str(movie)).Items.Item:
        title = item.ItemAttributes.Title
        URL = item.ItemLinks.ItemLink.URL
        movies[movie].append(title)
        movies[movie].append(URL)
    # Second lookup ('Images' response group): poster image link
    for items in api.item_lookup(str(movie), ResponseGroup='Images').Items.Item:
        imageURL = items.ImageSets.ImageSet.LargeImage.URL
        movies[movie].append(imageURL)
movies



{'B000MR1V22': ['Little Miss Sunshine',
  'http://www.amazon.com/Little-Miss-Sunshine-Abigail-Breslin/dp/tech-data/B000MR1V22%3FSubscriptionId%3DAKIAJGEEABW2F4H7ZB4Q%26tag%3Dyueyingteng-20%26linkCode%3Dxm2%26camp%3D2025%26creative%3D386001%26creativeASIN%3DB000MR1V22',
  'http://ecx.images-amazon.com/images/I/41B-tctwtQL.jpg'],
 'B000V7O0IK': ['Good Night and Good Luck [HD DVD]',
  'http://www.amazon.com/Good-Night-Luck-HD-DVD/dp/tech-data/B000V7O0IK%3FSubscriptionId%3DAKIAJGEEABW2F4H7ZB4Q%26tag%3Dyueyingteng-20%26linkCode%3Dxm2%26camp%3D2025%26creative%3D386001%26creativeASIN%3DB000V7O0IK',
  'http://ecx.images-amazon.com/images/I/41b5Rn6u-IL.jpg'],
 'B0026145WK': ['Taken',
  'http://www.amazon.com/Taken-Liam-Neeson/dp/tech-data/B0026145WK%3FSubscriptionId%3DAKIAJGEEABW2F4H7ZB4Q%26tag%3Dyueyingteng-20%26linkCode%3Dxm2%26camp%3D2025%26creative%3D386001%26creativeASIN%3DB0026145WK',
  'http://ecx.images-amazon.com/images/I/41F9bWSlZ5L.jpg']}

In [20]:
# print the title of the first recommended movie
print(movies[testrun[0]][0])
# print the URL of the first recommended movie
print(movies[testrun[0]][1])
# print the poster URL of the first recommended movie
print(movies[testrun[0]][2])

Little Miss Sunshine
http://www.amazon.com/Little-Miss-Sunshine-Abigail-Breslin/dp/tech-data/B000MR1V22%3FSubscriptionId%3DAKIAJGEEABW2F4H7ZB4Q%26tag%3Dyueyingteng-20%26linkCode%3Dxm2%26camp%3D2025%26creative%3D386001%26creativeASIN%3DB000MR1V22
http://ecx.images-amazon.com/images/I/41B-tctwtQL.jpg


In [259]:
# Check whether this movie ID is present in the loaded movie list
"B005ZMUZDU" in movies_list

True

In [292]:
# Demonstration of the recommend() function:
# 1. On an existing user
demo = recommend('A2OXDJP1Z3LNOK', userids)
print(demo)
# It recommends Goodfellas, Sunset Boulevard, and The Silence of the Lambs

# 2. On a new user, generated with fake data
# When generating a new user, you must first give them an ID (you only need to do this once per new user) and then
# add them to users_data as follows.
# The membership check keeps this cell safe to re-run: registering the same ID twice
# would make the new user show up twice in every other user's list of neighbours.
if 'newuser1' not in userids.values():
    userids[len(userids)] = 'newuser1'
users_data['newuser1'] = {"078062551X": 5, "6301972066": 4, "B00005JLZK": 3, "B000W4HJ44": 2, "B0060D38EQ": 1}

demo2 = recommend('newuser1', userids)
print(demo2)
# It recommends Final Destination 2, The Devil Wears Prada, and Erin Brokovich

['B000P0J09M', 'B007HEOTHC', '6304524455']
['B000YHDDBC', 'B000J103PC', 'B00004U2N5']
