In [291]:
import pandas as pd
import numpy as np
import os
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import normalize
from pyspark import SparkContext
from pyspark.sql import SQLContext


In [292]:
# Reuse the active SparkContext when one exists (otherwise start one) and wrap
# it in an SQLContext, which the later cells use to build DataFrames from RDDs.
sc = SparkContext.getOrCreate()
sqlContext = SQLContext(sc)

# Data Preprocessing

## Movie

In [293]:
# read_csv treats the first line of u.genre as the header row, which is why the
# first column is addressed as 'unknown'. That header value itself is therefore
# NOT part of genre_list; only the rows below it are.
genre = pd.read_csv('Data/u.genre', sep="|")
genre_list = genre['unknown'].tolist()


In [294]:
# Load the movie table, labelling six metadata columns followed by one column
# per entry of genre_list, then discard the metadata that is not needed.
# NOTE(review): genre_list omits the 'unknown' header entry of u.genre. If
# u.item has only five metadata fields ahead of the genre flags, the labels
# 'Date'..'IMDB' are shifted by one and 'IMDB' actually carries the 'unknown'
# genre flag, which is dropped below. Confirm against the dataset README.
movieInfo = pd.read_csv(
    'Data/u.item',
    sep="|",
    encoding="iso-8859-1",
    names=['MovieId', 'Title', 'Date', 'RealeseDate', 'VideoRDate', 'IMDB'] + genre_list,
)
requiredInfo = movieInfo.drop(columns=['Date', 'RealeseDate', 'VideoRDate', 'IMDB'])


In [295]:
# MovieId -> Title lookup table: everything except the genre columns.
indexedMovies = requiredInfo.drop(columns=genre_list)


In [296]:
# Genre columns only (one row per movie); used later for movie-to-movie similarity.
columnPurpose = requiredInfo.drop(columns=['MovieId', 'Title'])

In [297]:
# Persist the intermediate movie tables so later sections can reload them.
# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing (no-op when it is already there).
os.makedirs("ProcessedData", exist_ok=True)
requiredInfo.to_csv("ProcessedData/requiredInfo.csv", index=False)
columnPurpose.to_csv("ProcessedData/movie_genre.csv", index=False)
indexedMovies.to_csv("ProcessedData/movieId_movies.csv", index=False)

## User-Ratings

In [298]:
def movie_rating_combiner(row):
    """Convert one tab-separated rating line into ``(user, ["movie-rating"])``.

    The line must contain exactly four tab-separated fields (user, movie,
    rating and a fourth field that is ignored); otherwise the unpacking raises
    ValueError. The rating is wrapped in a one-element list so that values for
    the same user can be concatenated with ``+``.
    """
    user_id, movie_id, score, _unused = row.split("\t")
    return (user_id, [movie_id + "-" + score])

def sortMoviesByRatings(row, top_n=3):
    """Return ``(user, "movie-rating movie-rating ...")`` for the best-rated movies.

    Parameters
    ----------
    row : tuple
        ``(user, movies)`` where ``movies`` is a list of ``"movieId-rating"``
        strings.
    top_n : int, optional
        How many of the highest-rated entries to keep (default 3).

    Returns
    -------
    tuple
        ``(user, joined)`` where ``joined`` is the top entries separated by a
        single space, highest rating first. Entries with equal ratings keep
        their original relative order.
    """
    user, movies = row[0], row[1]
    # Parse the full rating after the last '-' instead of only the last
    # character, and sort a copy so the caller's list is left untouched.
    ranked = sorted(movies, key=lambda entry: -int(entry.rsplit("-", 1)[1]))
    return (user, " ".join(ranked[:top_n]))

In [299]:
# One record per user: the single-item lists produced by movie_rating_combiner
# are concatenated into a list of every "movie-rating" string for that user.
ratings = (
    sc.textFile('Data/u.data')
    .map(movie_rating_combiner)
    .reduceByKey(lambda left, right: left + right)
)


In [300]:
# Reduce each user's rating list to a space-separated string of the top-rated
# movies. NOTE: this rebinds `ratings`, so the cell is not idempotent --
# re-run the cell that builds `ratings` before running this one again.
ratings = ratings.map(sortMoviesByRatings)

In [301]:
combinedratings = sqlContext.createDataFrame(ratings)
combinedratings.createOrReplaceTempView("ratings")

# The writer's default save mode raises when the target path already exists,
# which breaks a second run of the notebook; overwrite the previous export.
combined_ratings_dir = 'ProcessedData/combinedRatings'
combinedratings.coalesce(1).write.mode('overwrite').csv(combined_ratings_dir)

# coalesce(1) is expected to leave a single part file; give it a stable name.
# Done in Python (not `mv` via os.system) so a failure is raised, not ignored.
combined_part_files = [
    name for name in os.listdir(combined_ratings_dir)
    if name.endswith('.csv') and not name.startswith('.')
]
if len(combined_part_files) != 1:
    raise RuntimeError(
        "Expected exactly one CSV part file in %s, found %d"
        % (combined_ratings_dir, len(combined_part_files))
    )
os.replace(
    os.path.join(combined_ratings_dir, combined_part_files[0]),
    os.path.join(combined_ratings_dir, 'combinedRatings.csv'),
)

0

## Getting the average rating of the movie

In [302]:
def rating_tab_sep(row):
    """Map a tab-separated rating line to ``(movie, (rating, 1))``.

    The movie is the second field and the rating the third; the rating is
    returned as the original string. The trailing ``1`` is a count so that
    pairs for the same movie can be summed.
    """
    fields = row.split("\t")
    movie_id, score = fields[1], fields[2]
    return (movie_id, (score, 1))

def rating_sum(rating_row_1, rating_row_2):
    """Combine two ``(rating_total, count)`` pairs into one.

    Rating totals may be ints or numeric strings (they are passed through
    ``int``); counts are added as-is.
    """
    first_total, first_count = rating_row_1[0], rating_row_1[1]
    second_total, second_count = rating_row_2[0], rating_row_2[1]
    return (int(first_total) + int(second_total), first_count + second_count)

def find_avg(row):
    """Turn ``(movie, (rating_total, count))`` into ``(movie, "<mean>")``.

    The mean is returned as a string formatted with one decimal place.
    """
    movie_id, stats = row[0], row[1]
    mean_rating = int(stats[0]) / int(stats[1])
    return (movie_id, format(mean_rating, ".1f"))

In [303]:
# Per movie: sum the ratings and count them, then convert to a formatted mean.
data_copy = (
    sc.textFile('Data/u.data')
    .map(rating_tab_sep)
    .reduceByKey(rating_sum)
)
avg_rating = data_copy.map(find_avg)

In [304]:
# NOTE: this rebinds `avg_rating` from an RDD to a DataFrame; re-run the cell
# that builds the RDD before running this cell a second time.
avg_rating = sqlContext.createDataFrame(avg_rating)
avg_rating.createOrReplaceTempView("avg_rating")

# The writer's default save mode raises when the target path already exists,
# which breaks a second run of the notebook; overwrite the previous export.
avg_rating_dir = 'ProcessedData/avg_rating'
avg_rating.coalesce(1).write.mode('overwrite').csv(avg_rating_dir)

# coalesce(1) is expected to leave a single part file; give it a stable name.
# Done in Python (not `mv` via os.system) so a failure is raised, not ignored.
avg_rating_part_files = [
    name for name in os.listdir(avg_rating_dir)
    if name.endswith('.csv') and not name.startswith('.')
]
if len(avg_rating_part_files) != 1:
    raise RuntimeError(
        "Expected exactly one CSV part file in %s, found %d"
        % (avg_rating_dir, len(avg_rating_part_files))
    )
os.replace(
    os.path.join(avg_rating_dir, avg_rating_part_files[0]),
    os.path.join(avg_rating_dir, 'avg_rating.csv'),
)

0

# User Section

In [305]:
# Pipe-separated user table; column labels are supplied here via `names`.
users = pd.read_csv(
    'Data/u.user',
    sep="|",
    names=['UserId', 'Age', 'Gender', 'Occupation', 'Zip'],
)

In [306]:
def get_user_index_by_user_id(user_id):
    """Convert a 1-based user id into the matching 0-based row index (as ``int``)."""
    zero_based = user_id - 1
    return int(zero_based)

def get_user_suggestions_by_user_id(user_id):
    """Return the precomputed neighbour row for ``user_id``.

    Looks up ``age_occ_suggestion_matrix`` at the user's 0-based index; each
    entry of the returned row is a ``(user_index, similarity)`` pair.
    """
    return age_occ_suggestion_matrix[get_user_index_by_user_id(user_id)]
    

In [307]:
cv = CountVectorizer()

# Age similarity: 1 - 0.01 * |age difference|, weighted at 60% of the score.
age_count_matrix = 1-euclidean_distances(users['Age'].values.reshape(-1,1))*0.01
age_count_matrix*=0.6

# Occupation similarity: cosine similarity of the occupation token counts,
# weighted at the remaining 40%.
occupation_count_matrix = cv.fit_transform(users['Occupation'])
ocs = cosine_similarity(occupation_count_matrix)
ocs*=0.4

age_occ_matrix = np.add(ocs,age_count_matrix)

# For every user keep the five most similar OTHER users as (index, score)
# pairs. The user itself is excluded by index rather than by dropping the first
# sorted entry: another user with an identical score and a lower index sorts
# ahead of it, which would otherwise leave the user in its own neighbour list.
age_occ_suggestion_matrix = []

for user_index, rows in enumerate(age_occ_matrix):
    row = sorted(enumerate(rows), key=lambda x: -x[1])
    row = [pair for pair in row if pair[0] != user_index][:5]
    age_occ_suggestion_matrix.append(row)
age_occ_suggestion_matrix = np.array(age_occ_suggestion_matrix)

# Movie Section

In [308]:
# Reload the tables written by the preprocessing section, so this section can
# run from the files on disk. Note that this rebinds `indexedMovies`.
movie_genre = pd.read_csv("ProcessedData/movie_genre.csv")
indexedMovies = pd.read_csv("ProcessedData/movieId_movies.csv")


In [309]:
## Looking up a movie's MovieId from its title (and its title from its MovieId)
def get_movie_id(title):
    """Return the MovieId of the first ``indexedMovies`` row whose Title equals ``title``.

    Raises IndexError when no row matches.
    """
    title_matches = indexedMovies.loc[indexedMovies['Title'] == title]
    first_match = title_matches.values.tolist()[0]
    return first_match[0]

def get_movie_title(movieId):
    """Return the Title of the first ``indexedMovies`` row whose MovieId equals ``movieId``.

    Raises IndexError when no row matches.
    """
    id_matches = indexedMovies.loc[indexedMovies['MovieId'] == movieId]
    first_match = id_matches.values.tolist()[0]
    return first_match[1]

##Getting suggestions by title name
def get_suggestion_by_movie_title(title):
    """Return the suggestions for the movie called ``title``.

    The title is resolved to its MovieId, which is converted to a 0-based
    index by subtracting one.
    """
    return get_suggestion_by_movie_index(get_movie_id(title) - 1)
##Getting suggestions by movie ID
def get_suggestion_by_movie_id(movieId):
    """Return the suggestions for ``movieId`` (an int or a numeric string).

    The id is converted with ``int`` and shifted to a 0-based index.
    """
    zero_based_index = int(movieId) - 1
    return get_suggestion_by_movie_index(zero_based_index)

##Getting Titles based on the indexes from the suggestions 
def get_suggestion_by_movie_index(index):
    """Return the ``indexedMovies`` rows for the neighbours stored at ``enumerated[index]``.

    Each neighbour is an ``(index, similarity)`` pair; its MovieId is the
    index plus one. The result is a list with one row (as a plain list) per
    neighbour, in stored order. Raises IndexError if a neighbour has no
    matching row in ``indexedMovies``.
    """
    return [
        indexedMovies.loc[indexedMovies['MovieId'] == neighbour[0] + 1].values.tolist()[0]
        for neighbour in enumerated[index]
    ]

#Getting Average rating of a movie
def get_avg_rating_by_movie_id(movieId):
    """Return the stored average rating of ``movieId`` truncated to an ``int``.

    Reads the module-level ``average_rating`` table. Raises IndexError when
    the movie has no row there.
    NOTE(review): ``int()`` discards the fractional part (3.9 -> 3); confirm
    that this loss of precision is intended before relying on the scores.
    """
    is_movie = average_rating['MovieId'] == movieId
    matching = average_rating.loc[is_movie, ['Avg_Rating']].values
    return int(matching[0][0])

## User's top 3 rated movies

In [310]:
# Load the two CSVs exported by the Spark cells above. Column labels are passed
# via `names`, so pandas treats the first line of each file as data.
average_rating = pd.read_csv('ProcessedData/avg_rating/avg_rating.csv', names = ['MovieId','Avg_Rating'])
user_top_rated_movies = pd.read_csv('ProcessedData/combinedRatings/combinedRatings.csv', names = ['UserId','Top Three Rated Movies'])

In [311]:
# Getting movie recommendation from the userId from user's top 3 ratings
def get_movies_based_on_previous_rating(userId):
    """Return the concatenated suggestions for each of the user's top-rated movies.

    The user's stored ``"movie-rating movie-rating ..."`` string is split on
    spaces; the suggestions of each movie are appended in that order. Raises
    IndexError when ``userId`` is not in ``user_top_rated_movies``.
    """
    is_user = user_top_rated_movies['UserId'] == userId
    top_rated = user_top_rated_movies.loc[is_user, 'Top Three Rated Movies'].tolist()[0]

    suggestions = []
    for entry in top_rated.split(' '):
        movie_id, _rating = entry.strip().split('-')
        suggestions.extend(get_suggestion_by_movie_id(movie_id))
    return suggestions

def get_movies_based_on_previous_rating_with_weight(userId):
    """Score the movies suggested from the user's own top-rated movies.

    For every top-rated movie of ``userId`` and every movie suggested for it,
    the score is ``user_rating * suggested_movie_avg_rating / 5 * 0.6``.

    Returns a list of ``(movie_id, score)`` tuples, grouped by source movie in
    the stored order. Raises IndexError when ``userId`` is not in
    ``user_top_rated_movies``.
    """
    is_user = user_top_rated_movies['UserId'] == userId
    previous_ratings_value = user_top_rated_movies.loc[is_user, 'Top Three Rated Movies'].tolist()[0]

    returnList = []

    for entry in previous_ratings_value.split(' '):
        previous_movie, previous_movie_rating = entry.strip().split('-')

        # Fetch the suggestions per source movie instead of slicing a flattened
        # list in fixed blocks of five: a movie with fewer suggestions would
        # otherwise raise IndexError or pair suggestions with the wrong rating.
        for suggestion in get_suggestion_by_movie_id(previous_movie):
            m_id = suggestion[0]
            m_id_avg_rating = get_avg_rating_by_movie_id(m_id)
            returnList.append((m_id,((int(previous_movie_rating)*m_id_avg_rating)/5)*0.6))
    return returnList

## Finding Cosine Similarities between movies

In [318]:
# Pairwise cosine similarity between the rows of movie_genre (one row per
# movie, one column per genre); cs[i][j] compares movie row i with movie row j.
columnValues = movie_genre.values
cs = cosine_similarity(columnValues)

In [313]:
## Suggestion list for each movie by their indexes
## Suggestion list for each movie by their indexes
# enumerated[i] holds up to five (movie_index, similarity) pairs for movie i,
# most similar first, restricted to strictly positive similarity.
enumerated = []

for movie_index, index in enumerate(cs):
    row = sorted(enumerate(index), key=lambda x: -x[1])
    # Exclude the movie itself by index, not by dropping the first sorted
    # entry: a lower-index movie with an identical similarity sorts ahead of
    # it, which would otherwise leave the movie in its own suggestion list.
    row = [pair for pair in row if pair[1] > 0 and pair[0] != movie_index][:5]
    enumerated.append(row)

# Deliberately kept as a list of lists: rows can hold fewer than five pairs
# after the positive-similarity filter, and np.array() rejects such ragged
# input on current NumPy. Indexing and iteration behave the same for callers.

# Getting suggestions based on the recommended users


In [314]:
def movie_from_suggested_users(userId):
    """Score the top-rated movies of the users most similar to ``userId``.

    For every neighbour returned by ``get_user_suggestions_by_user_id`` and
    every one of that neighbour's top-rated movies, the score is
    ``neighbour_rating * similarity * movie_avg_rating / 5 * 0.4``.

    Returns a list of ``(movie_id, score)`` tuples sorted by descending score.
    Raises IndexError when a neighbour or a movie is missing from the lookup
    tables.
    """
    suggested_movie_list = []

    for user in get_user_suggestions_by_user_id(userId):
        # The neighbour table stores 0-based row indices, while
        # user_top_rated_movies is keyed by the 1-based UserId.
        su_Id = int(user[0]) + 1
        # Similarity is a fraction; int() would truncate nearly every weight
        # to 0 and zero out all neighbour-based scores.
        su_weight = float(user[1])

        su_suggested_movies = user_top_rated_movies.loc[user_top_rated_movies['UserId'] == su_Id, ['Top Three Rated Movies']].values.tolist()[0][0]
        for mRatings in su_suggested_movies.split(" "):
            mov_id, mov_r = mRatings.split("-")
            mov_id = int(mov_id)
            avg_movie_rating = get_avg_rating_by_movie_id(mov_id)

            suggested_movie_list.append((mov_id,(((int(mov_r)*su_weight*avg_movie_rating)/5)*0.4)))
    suggested_movie_list.sort(key = lambda x : -x[1])
    return (suggested_movie_list)

# Get Recommendation

In [315]:
def get_recommendation(userId):
    """Return ``(title, score)`` recommendations for ``userId``, best score first.

    Combines the scores derived from the user's own top-rated movies with the
    scores derived from similar users, sorts the combined list by descending
    score and replaces each movie id with its title. A movie suggested by both
    sources appears once per source.
    """
    scored = (
        get_movies_based_on_previous_rating_with_weight(userId)
        + movie_from_suggested_users(userId)
    )
    scored.sort(key=lambda pair: -pair[1])
    return [(get_movie_title(pair[0]), pair[1]) for pair in scored]

In [319]:
# get_recommendation(user_id)