# Deisgn of AI systems

Building a recommender system using netflix ratings and users as input

## Assignment 2, Eric Johansson & Max Sonnelid

In [46]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [47]:
from sklearn.model_selection import train_test_split
  
# Read the CSV file.
movie_genres = pd.read_csv("data/movie_genres.csv", skiprows=0)
user_reviews = pd.read_csv("data/user_reviews.csv", skiprows=0)

In [48]:
# Getting an understanding of the two data sets

user_reviews.head()
#movie_genres.head()

Unnamed: 0.1,Unnamed: 0,User,The Net,Happily N'Ever After,Tomorrowland,American Hero,Das Boot,Final Destination 3,Licence to Kill,The Hundred-Foot Journey,...,The Martian,Micmacs,Solomon and Sheba,In the Company of Men,Silent House,Big Fish,Get Real,Trading Places,DOA: Dead or Alive,Hey Arnold! The Movie
0,0,Vincent,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,Edgar,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,Addilyn,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,Marlee,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,Javier,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [49]:
# Creating a list of all movies

movie_names = list(user_reviews.columns.values) 
movie_names = movie_names[2:] # Removing the first two indecies since they are not movie titles.

In [50]:
# Checking how many ratings each movie have.

rating_per_movies = []
for names in movie_names:
    rating_per_movies = np.append(rating_per_movies,user_reviews[names].sum())

rating_per_movies = np.vstack((movie_names, rating_per_movies)).T
print(rating_per_movies)

[['The Net' '58.0']
 ["Happily N'Ever After" '36.0']
 ['Tomorrowland' '40.0']
 ...
 ['Trading Places' '10.0']
 ['DOA: Dead or Alive' '28.0']
 ['Hey Arnold! The Movie' '29.0']]


In [51]:
# Checking the average value of each movie

average_rating = []
for names in movie_names:
    df = user_reviews.loc[(user_reviews[names]>0)]
    df = df[names]
    average_rating = np.append(average_rating,sum(df)/len(df))

average_rating = np.vstack((movie_names,average_rating)).T
print(average_rating)

[['The Net' '4.142857142857143']
 ["Happily N'Ever After" '3.6']
 ['Tomorrowland' '3.6363636363636362']
 ...
 ['Trading Places' '3.3333333333333335']
 ['DOA: Dead or Alive' '3.111111111111111']
 ['Hey Arnold! The Movie' '3.625']]


In [52]:
# Creating a dataframe of the data above.

df = pd.DataFrame({'Average_Rating':average_rating[:,1], 'Movie': average_rating[:,0]})
df['Average_Rating'] = df['Average_Rating'].astype(float)
df = df.sort_values(by=['Average_Rating'], ascending = False)
#df2 = df.loc[(df['Importance']>0)]
df

Unnamed: 0,Average_Rating,Movie
1346,5.000000,The Tempest
1971,5.000000,United 93
1550,5.000000,Edtv
630,4.909091,Chill Factor
1388,4.900000,The Hunting Party
...,...,...
1930,1.571429,One Missed Call
368,1.500000,Doc Holliday's Revenge
244,1.500000,The Final Destination
383,1.500000,Frenzy


# Building a model to predict ratings

In [53]:
import pandas as pd
import numpy as np

In [54]:
from sklearn.model_selection import train_test_split
  
# Read the CSV file.
movie_genres = pd.read_csv("data/movie_genres.csv", skiprows=0)
user_reviews = pd.read_csv("data/user_reviews.csv", skiprows=0)

In [55]:
user_reviews.drop('User', axis =1, inplace = True)
user_reviews.drop('Unnamed: 0', axis =1, inplace = True)


In [56]:
from scipy.sparse import csr_matrix

user_reviews
mat_user_reviews=csr_matrix(user_reviews.values)
mat_user_reviews = np.transpose(mat_user_reviews)

In [57]:
from sklearn.neighbors import NearestNeighbors

model_knn= NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20)
model_knn.fit(mat_user_reviews)

NearestNeighbors(algorithm='brute', metric='cosine', n_neighbors=20)

In [58]:
def retrieveIndex(name):
    for i in range(0,len(movie_names)-1):
        if movie_names[i]==name:
            return i

In [59]:
def recommender(movie_name, nr_of_suggestions):
    movie_index = retrieveIndex(movie_name)
    distances, indices = model_knn.kneighbors(mat_user_reviews[movie_index], n_neighbors=nr_of_suggestions+1)
    indices = indices[0]
    distances = distances[0]
    distances = distances[1:]
    indices = indices[1:]
    movie_suggestion = []
    for i in indices:
        movie_suggestion = np.append(movie_suggestion, movie_names[i])
    #print("Your recommended movies are:")
    #for j in range(0,len(movie_suggestion)-1):
        #print(movie_suggestion[j])
        #print("Relation to watched movie")
        #print(distances[j])

    return movie_suggestion, distances

In [60]:
## Adding weights to a movie recommendation given its relateds movies rating
def rateWeight (rating, suggestions, distances):
    res = [[]] * 0
    for i in range(0,len(suggestions)-1):
        res.append([suggestions[i],1/distances[i]*rating])

    return res


In [61]:
#Returns a DF with all movies that a user has rated with 5 stars
def ratedMovies(user):
    movies_watched = user_reviews.iloc[user]
    movies_watched = movies_watched.dropna()
    movies_watched = movies_watched.to_frame()
    movies_watched.columns = ['Avg rating']
    movies_watched = movies_watched[movies_watched['Avg rating'] > 3.0]
    return movies_watched

In [62]:
def Contains(string,list):
    for i in range(0,len(list)-1):
        if list[i][0] == string:
            index = i 
            return True, index
    index = -1
    return False, index

In [63]:
def recommenderSystem(user):
    res = ratedMovies(user)

    lst = []
    for i in range(0,len(res)-1):
        lst.append(i)

    for i in range(0,len(res)-1):
        rating = res.iloc[i][0] 
        movie = res.index[i]
        #print(movie)
        movie_suggestion, distances =  recommender(movie,5) 
        weightedRec = rateWeight(rating, movie_suggestion, distances)
        #print(weightedRec)
        lst[i] = weightedRec

    topList = lst[0]

    for i in range(1,len(lst)-1):
        for j in range(0,len(lst[i])-1):
            boolean, index = Contains(lst[i][j][0],topList)
            if boolean:
                topList[index][1]=topList[index][1]+lst[i][j][1]
            else:
                topList.append(lst[i][j])

    for i in range(0,len(topList)-1):
        for j in range(0,len(res)-1):
            if res.index[j]==topList[i][0]:
                #print(j)
                #print('True')
                topList[i][1]=0
            


    topListDf = pd.DataFrame(topList)
    topListDf = topListDf.sort_values(by=[1], ascending = False)


    return topListDf

# Final method

In [64]:
resultDf1 = recommenderSystem(0)
resultDf1 = recommenderSystem(1)
resultDf2 = recommenderSystem(2)
resultDf3 = recommenderSystem(3)
resultDf4 = recommenderSystem(4)
resultDf5 = recommenderSystem(5)

#resultDf1.head(5)
#resultDf2.head(5)
resultDf3.head(5)
#resultDf4.head(5)
#resultDf5.head(5)


Unnamed: 0,0,1
19,Tarnation,8.251577
7,Back to the Future Part II,8.170939
4,Hot Pursuit,7.693136
28,Point Blank,7.515756
9,August Rush,7.38538


In [66]:
recommenderSystem(2).head(5)

Unnamed: 0,0,1
0,The Big Wedding,81.474382
15,Sorority Boys,10.210549
16,Supporting Characters,8.215444
43,The Outrageous Sophie Tucker,8.1469
12,12 Rounds,7.755961
