In [None]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('white')
import warnings
warnings.filterwarnings('ignore')

# Column names assigned to u.user, which is parsed with '|' as the separator.
# NOTE(review): 'zip_score' looks like a typo for a zip-code field — confirm before renaming,
# since the label is the DataFrame column key.
u_cols=['user id',"age","sex","occupation","zip_score"]
users=pd.read_csv("u.user",sep='|',names=u_cols)

# Column names assigned to u.data, which is parsed as tab-separated.
r_cols =['user id',"movie id","rating","timestamp"]
ratings=pd.read_csv("u.data",sep='\t',names=r_cols)

# Column names assigned to u.item: five descriptive fields followed by one column per genre label.
# NOTE(review): 'documentrty', 'Roamance' and 'Triller' look misspelled; they are left as-is here
# because they are column keys — confirm nothing relies on them before correcting.
i_cols = ['movie id','movie title','release date','video release date','IMDb URL','Unknown','action','adventure','Animation',
          'children','comedy','crime','documentrty','drama','fantasy','film-noir','horror','musical','mystery','Roamance',
          'Sci-Fi','Triller','war','western']
movies_title= pd.read_csv("u.item", sep="|",header= None,names = i_cols, encoding = 'latin-1')

# Join every rating row with the metadata of the rated movie through the shared 'movie id' column.
df=pd.merge(ratings,movies_title,on="movie id")

In [None]:
# Number of rated movies per user: one row per 'user id', value = count of 'movie id' entries.
ratings1 = pd.DataFrame(df.groupby('user id').count()['movie id'])

# Histogram of those counts: the x axis is "how many movies a user rated",
# the y axis is "how many users fall into that bin" (the original labels were swapped).
plt.hist(ratings1['movie id'], bins=70)
plt.xlabel('no of movies rated')
plt.ylabel('No of users')
plt.show()

In [None]:
# Number of ratings received per movie title.
ratings2= pd.DataFrame(df.groupby('movie title').count()['rating'])

# Histogram of those counts: the x axis is the number of ratings a title received,
# the y axis is the number of titles in that bin (the original labelled x as 'movie'
# and y as 'no of ratings', which does not match what plt.hist draws).
plt.hist(ratings2['rating'],bins=70)
plt.xlabel('no of ratings')
plt.ylabel('no of movies')
plt.show()

In [None]:
# Average number of missing (null) ratings per user.
# `isnull()` yields a boolean Series; summing it counts the True entries, i.e. the nulls.
# (The original used `.count()`, which counts every row regardless of whether it is null.)
null_rat = df.groupby('user id').agg({'rating': lambda x: x.isnull().sum()})
null_rat['rating'].mean()

In [None]:
#User-user & item-item based recommendations

import numpy as np 
import pandas as pd
from collections import defaultdict
import matplotlib.pyplot as plt
import math
import numpy as np
import json

In [None]:
# The same three files are read in the first cell without skipping any line, because they
# carry no header row. The original `skiprows=1` therefore discarded the first real record
# of each file (and then re-added a bare {'user_id': 1} by hand to compensate for the lost
# user). Reading every line makes that workaround unnecessary.
u_columns = ['user_id','age','sex','occupation','zip_code']
users = pd.read_csv('u.user', sep='|', names = u_columns, encoding='latin-1')

r_columns = ['user_id','movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv('u.data', sep='\t', names=r_columns,encoding='latin-1')

i_columns = ['movie_id','movie_title','release_date','video_release_date','IMDb_url','unknown','Action','Adventure','Animation','Children','Comedy','Crime','Documentary','Drama','Fantasy','Film-Noir','Horror','Musical','Mystery','Romance','Sci-Fi','Thriller','War','Western']
items = pd.read_csv('u.item', sep='|', names=i_columns,encoding='latin-1')

# Mapping instance handed to `to_dict(..., into=...)` so each record comes back as a defaultdict.
dd = defaultdict(list)

# One record (column name -> value) per row of each frame.
usersList = users.to_dict('records',into=dd)
ratingsList = ratings.to_dict('records',into=dd)
itemsList = items.to_dict('records',into=dd)

In [None]:
# Tally how many rating records mention each user and each movie.
user_dict = defaultdict(int)
movie_dict = defaultdict(int)

# The index name `i` is kept deliberately: it stays bound at module level after the loop and
# create_userMovieRatingTable / create_movieUserRatingTable read a free variable called `i`.
for i, cell in enumerate(ratingsList):
    user_dict[cell['user_id']] += 1
    movie_dict[cell['movie_id']] += 1


def return_len(item):
    """Sort key for (key, count) pairs: return the count stored at position 1."""
    count = item[1]
    return count


# (id, count) pairs ordered by ascending count.
movielist = sorted(movie_dict.items(), key=return_len)
userlist = sorted(user_dict.items(), key=return_len)

# Just the counts, in the same ascending order.
movie_distribution = [pair[1] for pair in movielist]
user_distribution = [pair[1] for pair in userlist]


In [None]:
# User-User Collaborative Filtering
def create_userMovieRatingTable(usersList,ratingsList):
    """Build a user -> movie -> rating lookup from flat rating records.

    Parameters
    ----------
    usersList : sequence
        User records; only its length is used.
    ratingsList : sequence of mappings
        Each record must provide the keys 'user_id', 'movie_id' and 'rating'.

    Returns
    -------
    tuple
        (table, len(usersList), len(ratingsList)) where table[user_id][movie_id] is the rating.
        A later record for the same (user, movie) pair overwrites an earlier one.
    """
    userMovieTable = defaultdict(lambda: defaultdict())

    for rating in ratingsList:
        userMovieTable[rating['user_id']][rating['movie_id']] = rating['rating']

    # The original returned `i+1, j+1`. `i` was never bound inside this function, so it
    # either raised NameError or silently picked up whatever a previous cell left in a
    # module-level `i`; `j` was unbound for an empty ratingsList. Explicit lengths
    # remove both problems.
    return userMovieTable, len(usersList), len(ratingsList)

def get_min_max_movies(userMovieDict):
    """Record, for every user, the smallest and largest movie id among the movies they rated.

    Users whose movie mapping is empty get the fixed fallback range 0 .. 1642.
    Returns a nested defaultdict: result[user]['min'] / result[user]['max'].
    """
    bounds = defaultdict(lambda: defaultdict(list))

    for user, movieList in userMovieDict.items():
        if len(movieList) == 0:
            lowest, highest = 0, 1642
        else:
            lowest = min(movieList.keys())
            highest = max(movieList.keys())
        bounds[user]['min'] = lowest
        bounds[user]['max'] = highest

    return bounds

def createSimilarityMatrix(userMovieDict,similarityMoviesValues):
    """Compute a Pearson weight for every ordered pair of distinct users.

    A pair is scored only when its 'ratings_user1' list in `similarityMoviesValues`
    holds more than one rating. Returns result[user1][user2] = weight.
    """
    userSimilarityDict = defaultdict(lambda: defaultdict())

    for user1 in userMovieDict:
        for user2 in userMovieDict:
            pair = similarityMoviesValues[user1][user2]
            # Length is checked before identity, matching the original evaluation order.
            if len(pair['ratings_user1']) <= 1 or user1 == user2:
                continue
            userSimilarityDict[user1][user2] = pearson_coefficients(
                pair['ratings_user1'],
                pair['ratings_user2'],
                pair['mean_ratings_user1'],
                pair['mean_ratings_user2'],
            )

    return userSimilarityDict
    
def similar_movies_recommended(userMovieDict):
    """List, for every ordered pair of users, the movies that both of them rated.

    Returns result[user1][user2] = movie ids in the iteration order of user1's movies.
    Pairs with no movie in common get no entry. A user is also paired with themselves.
    """
    overlap = defaultdict(lambda: defaultdict(list))

    for user1, rated_by_1 in userMovieDict.items():
        for user2, rated_by_2 in userMovieDict.items():
            shared = [movie for movie in rated_by_1.keys() if movie in rated_by_2.keys()]
            if shared:
                overlap[user1][user2].extend(shared)

    return overlap

def get_average_user_rating(userMovieRating):
    """Return the mean (via np.mean) of each user's ratings, keyed by user."""
    means = defaultdict()

    for user in userMovieRating:
        scores = list(userMovieRating[user].values())
        means[user] = np.mean(scores)

    return means

def generate_similarity_matrix(similarMoviesDict,userMovieRating,average_user_rating):
    """Collect the inputs needed to score each user pair.

    For every (user1, user2) in `similarMoviesDict` the result holds:
      'ratings_user1' / 'ratings_user2'           - both users' ratings of the shared movies, aligned
      'mean_ratings_user1' / 'mean_ratings_user2' - each user's value from `average_user_rating`
    """
    pairStats = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))

    for user1, partners in similarMoviesDict.items():
        for user2, common_movies in partners.items():
            stats = pairStats[user1][user2]

            for movie in common_movies:
                stats['ratings_user1'].append(userMovieRating[user1][movie])
                stats['ratings_user2'].append(userMovieRating[user2][movie])

            stats['mean_ratings_user1'] = average_user_rating[user1]
            stats['mean_ratings_user2'] = average_user_rating[user2]

    return pairStats
    
def pearson_coefficients(val_user1,val_user2,mean_user1,mean_user2):
    """Pearson-style correlation between two aligned rating lists.

    Deviations are taken from the supplied means (not from the means of the lists
    themselves). Returns 0 when the denominator is zero.
    """
    # One (deviation1, deviation2) pair per position of val_user1.
    deviations = [
        (rating1 - mean_user1, val_user2[pos] - mean_user2)
        for pos, rating1 in enumerate(val_user1)
    ]

    covariance = sum(dev1 * dev2 for dev1, dev2 in deviations)
    spread1 = sum(math.pow(dev1, 2) for dev1, _ in deviations)
    spread2 = sum(math.pow(dev2, 2) for _, dev2 in deviations)

    norm = math.sqrt(spread1 * spread2)
    if norm == 0:
        return 0
    return covariance / norm

def comp_val(item):
    """Sort key for (key, value) pairs: the value at position 1."""
    return item[1]


def sorting_dict_by_lengthVal(clusterInfo):
    """Return the mapping's (key, value) pairs as a list ordered by descending value."""
    pairs = list(clusterInfo.items())
    pairs.sort(key=comp_val, reverse=True)
    return pairs

def sortDictDict(dictdict):
    """Replace every inner mapping of `dictdict`, in place, by its value-sorted pair list.

    The same (mutated) outer object is returned.
    """
    for entity1 in dictdict:
        dictdict[entity1] = sorting_dict_by_lengthVal(dictdict[entity1])

    return dictdict

def get_topTen(sortedDict):
    """Truncate every value of `sortedDict` to its first ten elements, in place, and return it."""
    for entity1 in sortedDict:
        sortedDict[entity1] = sortedDict[entity1][:10]

    return sortedDict

def get_topTenMovies(sortedDict,watchedMovieDict,userMinMaxDict,filterFlag=False):
    """Pick, per entity, the first ten listed movies that are not in its watched list.

    `sortedDict[entity]` is a list of tuples whose first element is the movie id.
    When `filterFlag` is truthy a movie is kept only if its id lies inside
    userMinMaxDict[entity]['min'] .. ['max'] (inclusive). Entities with an empty list get
    no entry; entities whose movies are all rejected get an empty list.
    """
    topTenDict = defaultdict(list)

    for entity1, list_entities in sortedDict.items():
        if len(list_entities) == 0:
            continue

        for movie in list_entities:
            movie_id = movie[0]
            if movie_id not in watchedMovieDict[entity1]:
                if filterFlag:
                    keep = (movie_id >= userMinMaxDict[entity1]['min']
                            and movie_id <= userMinMaxDict[entity1]['max'])
                else:
                    keep = (filterFlag == False)
                if keep:
                    topTenDict[entity1].append(movie)
            # Re-trimmed on every pass, so the list never grows beyond ten entries.
            topTenDict[entity1] = topTenDict[entity1][0:10]

    return topTenDict

def get_estimated_movie_rating(neighborhoodDict,average_user_rating,userMovieRating,usersList,itemsList):
    """Estimate a rating for every (user, movie) pair from the user's neighbours.

    For a user u and movie m the estimate is

        mean(u) + sum_v w(u,v) * (r(v,m) - mean(v)) / sum_v |w(u,v)|

    over the neighbours v in neighborhoodDict[u] that rated m. If no neighbour rated the
    movie (or all contributing weights are zero) the estimate is 0.

    Parameters
    ----------
    neighborhoodDict : mapping user_id -> iterable of (neighbour_id, weight) pairs
    average_user_rating : mapping user_id -> mean rating
    userMovieRating : mapping user_id -> {movie_id: rating}
    usersList : iterable of records with a 'user_id' key
    itemsList : iterable of records with a 'movie_id' key

    Returns
    -------
    Nested mapping result[user_id][movie_id] = estimated rating.
    """
    estimatedMovieTable = defaultdict(lambda: defaultdict())

    for user1 in usersList:
        user_id = user1['user_id']
        for movie in itemsList:
            movie_id = movie['movie_id']
            sum_numerator = 0
            sum_denominator = 0
            for neighborUser in neighborhoodDict[user_id]:
                neighbor_id, weight = neighborUser[0], neighborUser[1]
                if(movie_id in userMovieRating[neighbor_id].keys()):
                    sum_numerator += weight * (userMovieRating[neighbor_id][movie_id] - average_user_rating[neighbor_id])
                    # Normalise by the magnitude of the weights. Summing signed weights (as
                    # the original did) lets a negative similarity shrink or cancel the
                    # denominator, which inflates the estimate or zeroes it out.
                    sum_denominator += abs(weight)

            if(sum_denominator !=0 ):
                estimatedMovieTable[user_id][movie_id] = average_user_rating[user_id] + (sum_numerator/sum_denominator)
            else:
                estimatedMovieTable[user_id][movie_id] = 0

    return estimatedMovieTable


# Item-Item Collaborative Filtering
def create_movieUserRatingTable(usersList,ratingsList):
    """Build a movie -> user -> rating lookup from flat rating records.

    Parameters
    ----------
    usersList : sequence
        User records; only its length is used.
    ratingsList : sequence of mappings
        Each record must provide the keys 'user_id', 'movie_id' and 'rating'.

    Returns
    -------
    tuple
        (table, len(usersList), len(ratingsList)) where table[movie_id][user_id] is the rating.
    """
    movieUserTable = defaultdict(lambda: defaultdict())

    for rating in ratingsList:
        movieUserTable[rating['movie_id']][rating['user_id']] = rating['rating']

    # The original returned `i+1, j+1`. `i` was never bound in this function (NameError,
    # or a stale module-level value left by another cell), and `j` was unbound for an
    # empty ratingsList. Explicit lengths avoid both.
    return movieUserTable, len(usersList), len(ratingsList)
    
def similar_users_recommended(movieUserDict):
    """List, for every ordered pair of movies, the users who rated both.

    Returns result[movie1][movie2] = user ids in the iteration order of movie1's raters.
    Pairs without a common rater get no entry. A movie is also paired with itself.
    """
    overlap = defaultdict(lambda: defaultdict(list))

    for movie1, raters_of_1 in movieUserDict.items():
        for movie2, raters_of_2 in movieUserDict.items():
            shared = [user for user in raters_of_1.keys() if user in raters_of_2.keys()]
            if shared:
                overlap[movie1][movie2].extend(shared)

    return overlap

def get_average_movie_rating(movieUserRating):
    """Return the mean (via np.mean) of each movie's ratings, keyed by movie.

    Movies with no ratings are printed and left out of the result.
    """
    averages = defaultdict()

    for movie1, raters in movieUserRating.items():
        scores = list(raters.values())
        if scores:
            averages[movie1] = np.mean(scores)
        else:
            print(movie1)

    return averages

def already_watched_movies(userMovieRating):
    """Return a defaultdict(list) holding a fresh empty list for every user key.

    NOTE(review): despite the name, no movie ids are recorded. The original kept a
    commented-out variant that stored `movie_list.keys()` instead, so the empty lists
    look deliberate — confirm before changing.
    """
    return defaultdict(list, ((userId, []) for userId in userMovieRating))
    

def generate_similarity_movie_matrix(similarUsersDict,movieUserRating,average_movie_rating):
    """Collect the inputs needed to score each movie pair.

    For every (movie1, movie2) in `similarUsersDict` the result holds:
      'ratings_movie1' / 'ratings_movie2'           - the shared users' ratings of each movie, aligned
      'mean_ratings_movie1' / 'mean_ratings_movie2' - each movie's value from `average_movie_rating`
    """
    pairStats = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))

    for movie1, partners in similarUsersDict.items():
        for movie2, common_users in partners.items():
            stats = pairStats[movie1][movie2]

            for user in common_users:
                stats['ratings_movie1'].append(movieUserRating[movie1][user])
                stats['ratings_movie2'].append(movieUserRating[movie2][user])

            stats['mean_ratings_movie1'] = average_movie_rating[movie1]
            stats['mean_ratings_movie2'] = average_movie_rating[movie2]

    return pairStats

def createSimilarityMovieMatrix(movieUserDict,similarityUserValues):
    """Compute a Pearson weight for every ordered pair of distinct movies.

    A pair is scored only when its 'ratings_movie1' list in `similarityUserValues`
    holds more than one rating. Returns result[movie1][movie2] = weight.
    """
    movieSimilarityDict = defaultdict(lambda: defaultdict())

    for movie1 in movieUserDict:
        for movie2 in movieUserDict:
            pair = similarityUserValues[movie1][movie2]
            # Length is checked before identity, matching the original evaluation order.
            if len(pair['ratings_movie1']) <= 1 or movie1 == movie2:
                continue
            movieSimilarityDict[movie1][movie2] = pearson_coefficients(
                pair['ratings_movie1'],
                pair['ratings_movie2'],
                pair['mean_ratings_movie1'],
                pair['mean_ratings_movie2'],
            )

    return movieSimilarityDict

def get_estimated_movie_ratings_item_item(neighborhoodMovieDict,average_movie_rating,movieUserRating,usersList,itemsList):
    """Estimate a rating for every (movie, user) pair from the movie's neighbours.

    For a movie m and user u the estimate is

        sum_n w(m,n) * r(n,u) / sum_n |w(m,n)|

    over the neighbour movies n in neighborhoodMovieDict[m] that u has rated. If the user
    rated none of them (or all contributing weights are zero) the estimate is 0.

    Parameters
    ----------
    neighborhoodMovieDict : mapping movie_id -> iterable of (neighbour_movie_id, weight) pairs
    average_movie_rating : accepted for signature compatibility; not read by this function
    movieUserRating : mapping movie_id -> {user_id: rating}
    usersList : iterable of records with a 'user_id' key
    itemsList : iterable of records with a 'movie_id' key

    Returns
    -------
    Nested mapping result[movie_id][user_id] = estimated rating.
    """
    estimatedMovieTable = defaultdict(lambda: defaultdict())

    for movie1 in itemsList:
        movie_id = movie1['movie_id']
        for user in usersList:
            user_id = user['user_id']
            sum_numerator = 0
            sum_denominator = 0
            for neighborMovie in neighborhoodMovieDict[movie_id]:
                neighbor_id, weight = neighborMovie[0], neighborMovie[1]
                if(user_id in movieUserRating[neighbor_id].keys()):
                    sum_numerator += weight * (movieUserRating[neighbor_id][user_id])
                    # Normalise by the magnitude of the weights. A signed sum (as in the
                    # original) can shrink towards zero when similarities have mixed signs,
                    # which blows the estimate out of the rating range.
                    sum_denominator += abs(weight)

            if(sum_denominator !=0 ):
                estimatedMovieTable[movie_id][user_id] = (sum_numerator/sum_denominator)
            else:
                estimatedMovieTable[movie_id][user_id] = 0

    return estimatedMovieTable

def convert_movieTable_UserTable(movieUserTable):
    """Transpose a movie -> user -> rating mapping into user -> movie -> rating."""
    transposed = defaultdict(lambda: defaultdict())

    triples = (
        (user, movie, rating)
        for movie, per_user in movieUserTable.items()
        for user, rating in per_user.items()
    )
    for user, movie, rating in triples:
        transposed[user][movie] = rating

    return transposed

In [None]:
# --- User-user collaborative filtering ---
# Each step consumes the result of the one before it, so the statement order matters.
# user -> movie -> rating lookup, plus the two counters returned by the helper.
userMovieRating, totalUsers, totalMovies = create_userMovieRatingTable(usersList,ratingsList)
# Smallest / largest rated movie id per user.
userMinMaxDict = get_min_max_movies(userMovieRating)
# Per-user "already watched" lists (as written, the helper stores an empty list for every user).
alreadyWatchedMovies = already_watched_movies(userMovieRating)
# For every pair of users: the movies both of them rated.
similarMoviesDict = similar_movies_recommended(userMovieRating)
# Mean rating of each user.
average_user_rating = get_average_user_rating(userMovieRating)
# For every pair of users: aligned rating lists on the shared movies plus both means.
similarityMoviesValues = generate_similarity_matrix(similarMoviesDict,userMovieRating,average_user_rating)
# Pearson weight for each pair of distinct users with more than one shared movie.
similarityMatrix = createSimilarityMatrix(userMovieRating,similarityMoviesValues)
# sortDictDict and get_topTen mutate their argument and return it, so similarityMatrix,
# sortedSimilarityDict and neighborhoodDict all refer to the same object after these two lines:
# user -> list of the ten (other user, weight) pairs with the highest weights.
sortedSimilarityDict = sortDictDict(similarityMatrix)
neighborhoodDict = get_topTen(sortedSimilarityDict)
# Estimated rating for every (user, movie) pair, then each user's estimates sorted descending.
estimatedMovieRatings = get_estimated_movie_rating(neighborhoodDict,average_user_rating,userMovieRating,usersList,itemsList)
sortedEstimatedMovieRatings = sortDictDict(estimatedMovieRatings)

# --- Item-item collaborative filtering ---
# Mirror of the pipeline above with the roles of users and movies exchanged.
# Note: totalUsers and totalMovies are reassigned by this call.
movieUserRating, totalUsers, totalMovies = create_movieUserRatingTable(usersList,ratingsList)
# For every pair of movies: the users who rated both.
similarUsersDict = similar_users_recommended(movieUserRating)
# Mean rating of each movie (movies without ratings are printed and omitted by the helper).
average_movie_rating = get_average_movie_rating(movieUserRating)
# For every pair of movies: aligned rating lists from the shared users plus both means.
similarityUserValues = generate_similarity_movie_matrix(similarUsersDict,movieUserRating,average_movie_rating)
# Pearson weight for each pair of distinct movies with more than one shared rater.
similarityMovieMatrix = createSimilarityMovieMatrix(movieUserRating,similarityUserValues)
# As above, the next two calls rewrite similarityMovieMatrix in place:
# movie -> list of the ten (other movie, weight) pairs with the highest weights.
sortedSimilarityMovieDict = sortDictDict(similarityMovieMatrix)
neighborhoodMovieDict = get_topTen(sortedSimilarityMovieDict)
# Estimates are produced as movie -> user -> rating, transposed to user -> movie -> rating,
# and finally sorted per user in descending order of estimated rating.
estimatedMovieRatingsItemItem = get_estimated_movie_ratings_item_item(neighborhoodMovieDict,average_movie_rating,movieUserRating,usersList,itemsList)
estimatedUserMovieRatingsItemItem = convert_movieTable_UserTable(estimatedMovieRatingsItemItem)
sortedEstimatedMovieRatingsItemItem = sortDictDict(estimatedUserMovieRatingsItemItem)

In [None]:
def save_dict(fileName,generated_dict):
    """Write `generated_dict` to `fileName` as JSON text, replacing any existing content."""
    with open(fileName, 'w') as out:
        serialized = json.dumps(generated_dict)
        out.write(serialized)
        
# Top ten recommendations per user from the user-user estimates, written out as JSON.
# filterFlag is left at its default (False), so userMinMaxDict is passed but not consulted.
topTenRecommendedUserUser = get_topTenMovies(sortedEstimatedMovieRatings,alreadyWatchedMovies,userMinMaxDict)
save_dict('pred_user_user.txt',topTenRecommendedUserUser)

# Same export for the item-item estimates.
topTenRecommendedItemItem = get_topTenMovies(sortedEstimatedMovieRatingsItemItem,alreadyWatchedMovies,userMinMaxDict)
save_dict('pred_item_item.txt',topTenRecommendedItemItem)

In [None]:

from surprise import SVD
from surprise import Dataset
from surprise.model_selection import cross_validate

# Load the movielens-100k dataset (download it if needed).
data = Dataset.load_builtin('ml-100k')

# Use the famous SVD algorithm (default constructor arguments).
algo = SVD()

# Run 5-fold cross-validation and print results.
# The call is the last expression of the cell, so its return value is also displayed.
# NOTE(review): no random seed is set in this cell, so the reported RMSE/MAE may vary
# between runs — confirm whether reproducible folds are wanted.
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

In [None]:
# Content-based recommendation
# Credits table; the 'movie_id', 'cast' and 'crew' columns are used further down.
df_credits = pd.read_csv("tmdb_5000_credits.csv")

# Movies table; the 'id', 'original_title', 'genres', 'keywords' and 'overview' columns are used further down.
df_movies = pd.read_csv("tmdb_5000_movies.csv")

In [None]:
# Join movies with their credits (movies.id == credits.movie_id) and keep only the columns
# that feed the content-based tags.
df_final = pd.merge(df_movies,df_credits,left_on='id',right_on='movie_id')[['id','original_title','genres','keywords','overview','cast','crew']]

In [None]:
import ast
# Scratch example: parse one hard-coded genres string and collect the 'name' of each entry.
# Side effect: this rebinds the module-level names `L1` and `i` (after the loop `i` is the
# last parsed dict).
L1=[]
for i in ast.literal_eval('[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'):
    L1.append(i['name'])
# Debug print, disabled.
#print(L1)

def convert(text):
    """Parse `text` as a Python literal (a sequence of dicts) and return each entry's 'name'."""
    return [entry['name'] for entry in ast.literal_eval(text)]

# Turn the stringified lists of dicts into plain lists of names.
df_final['genres']=df_final['genres'].apply(convert)
df_final['keywords']=df_final['keywords'].apply(convert)
# Split the overview text into words. A missing overview is read from CSV as NaN (a float,
# which has no .split), so it becomes an empty word list instead of raising AttributeError.
df_final['overview']=df_final['overview'].apply(lambda x : [] if (x is None or isinstance(x, float)) else x.split(' '))

In [None]:
def fetch_cast(text):
    """Parse `text` as a Python literal (a sequence of dicts) and return the 'name' of the first three entries."""
    names = []
    for member in ast.literal_eval(text):
        if len(names) == 3:
            break
        names.append(member['name'])
    return names

# Replace the stringified cast list with the names of (at most) its first three entries.
df_final['cast']=df_final['cast'].apply(fetch_cast)


def fetch_director(text):
    """Parse `text` as a Python literal (a sequence of dicts) and return a one-element list
    with the 'name' of the first entry whose 'job' is 'Director', or [] if there is none."""
    for member in ast.literal_eval(text):
        if member['job'] == 'Director':
            return [member['name']]
    return []
    
# Replace the stringified crew list with a list holding the first director's name (or an empty list).
df_final['crew']=df_final['crew'].apply(fetch_director)


# All five columns hold lists at this point, so `+` concatenates them row by row into one token list.
df_final['tags']=df_final['genres']+df_final['keywords']+df_final['overview']+df_final['cast']+df_final['crew']

def remove_space(L):
    """Return a new list with every space character removed from each string of `L`."""
    return [token.replace(' ', '') for token in L]

# Collapse multi-word tokens (e.g. full names) into single tokens by dropping their spaces...
df_final['tags']=df_final['tags'].apply(remove_space)
# ...then join each row's tokens into one space-separated string...
df_final['tags']=df_final['tags'].apply(lambda x:" ".join(x))
# ...and lower-case it.
df_final['tags']=df_final['tags'].apply(lambda x:x.lower())

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import pickle

# Bag-of-words over the tag strings: English stop words removed, vocabulary capped at 10000 terms.
cv=CountVectorizer(stop_words='english',max_features=10000)

# Fit the vectoriser and build the dense count matrix, then the pairwise cosine similarities.
X = cv.fit_transform(df_final['tags']).toarray()
similarity_matrix = cosine_similarity(X)

# Persist only after fitting: the original dumped `cv` before fit_transform, which saved an
# unfitted vectoriser. `with` also closes the files, which the bare open(...) calls never did.
with open('cv.pkl','wb') as cv_file:
    pickle.dump(cv, cv_file)
with open('similarity_matrix.pkl','wb') as similarity_file:
    pickle.dump(similarity_matrix, similarity_file)

# finding the index of the movie
df_final[df_final['original_title']=='Osama'].index[0]

In [None]:
# Scratch check: (row position, similarity) pairs for row 0 of the matrix, highest similarity first.
m = sorted(list(enumerate(similarity_matrix[0])),reverse=True,key=lambda x:x[1])

In [None]:
def recommend(movie_name, top_n=5):
    """Print the titles of the movies most similar to `movie_name`.

    Parameters
    ----------
    movie_name : str
        Exact value to look up in df_final['original_title'].
    top_n : int, optional
        How many titles to print (default 5, as before).

    Reads the module-level `df_final` and `similarity_matrix`. Prints a notice and
    returns without raising when the title is not present.
    """
    # Row positions (not index labels) of the matching titles, because similarity_matrix
    # and .iloc are both addressed by position.
    matches = (df_final['original_title'] == movie_name).to_numpy().nonzero()[0]
    if len(matches) == 0:
        # The original raised IndexError here via `.index[0]`.
        print("Movie not found: {!r}".format(movie_name))
        return

    movie_index = matches[0]

    L=sorted(list(enumerate(similarity_matrix[movie_index])),reverse=True,key=lambda x:x[1])

    # Skip the queried movie explicitly rather than assuming it is ranked first: a tie with
    # an identical-tag movie (or an all-zero tag vector) can put another row at position 0.
    printed = 0
    for position, _score in L:
        if printed >= top_n:
            break
        if position == movie_index:
            continue
        print(df_final.iloc[position]['original_title'])
        printed += 1

In [None]:
# Interactive lookup: read a title from the keyboard and print its recommendations.
# The text must match an 'original_title' value exactly.
val = input("Enter name of movie: ") 
recommend(val)

In [None]:
# Second interactive lookup; identical to the prompt cell before it and overwrites `val`.
val = input("Enter name of movie: ") 
recommend(val)