In [2]:
import pandas as pd
import numpy as np
import os
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import normalize
from pyspark import SparkContext
from pyspark.sql import SQLContext


In [9]:
sc = sc = SparkContext.getOrCreate()
sqlContext = SQLContext(sc)

# Data Preprocessing

## Movie

In [21]:
genre = pd.read_csv('Data/u.genre', sep = "|")
genre.head(20)

genre_list = list(pd.Series(genre['unknown']))

In [22]:
movieInfo = pd.read_csv('Data/u.item', sep = "|", encoding = "iso-8859-1", names=['MovieId','Title','Date','RealeseDate', 'VideoRDate','IMDB']+genre_list)
requiredInfo = movieInfo.drop(['Date', 'RealeseDate','VideoRDate','IMDB'], axis =1)
requiredInfo.head(5)

Unnamed: 0,MovieId,Title,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0


In [23]:
indexedMovies = requiredInfo.drop(genre_list, axis=1)
indexedMovies.head(5)

Unnamed: 0,MovieId,Title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [24]:
columnPurpose = requiredInfo.drop(['MovieId','Title'],axis = 1)
columnPurpose.head(5)

Unnamed: 0,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0


In [25]:
requiredInfo.to_csv("ProcessedData/requiredInfo.csv", index=False)
columnPurpose.to_csv("ProcessedData/movie_genre.csv", index=False)
indexedMovies.to_csv("ProcessedData/movieId_movies.csv", index=False)

## User-Ratings

In [26]:
def mapper1(row):
    user, movie, rating, date = row.split("\t")
    return((user,["%s-%s" %(movie,rating)]))

def sortMoviesByRatings(row):
    movies = row[1]
    movies.sort(key = lambda x: -int(x[-1]))
    return (row[0]," ".join(movies[:3]))

In [27]:
ratings = sc.textFile('Data/u.data').map(mapper1)
ratings = ratings.reduceByKey(lambda x,y: x+y)


In [28]:
ratings = ratings.map(sortMoviesByRatings)
ratings.collect()

[('22', '128-5 258-5 510-5'),
 ('244', '154-5 89-5 652-5'),
 ('115', '8-5 127-5 234-5'),
 ('305', '427-5 483-5 50-5'),
 ('286', '1014-5 379-5 288-5'),
 ('303', '69-5 134-5 161-5'),
 ('122', '387-5 715-5 708-5'),
 ('234', '705-5 134-5 519-5'),
 ('119', '1153-5 237-5 222-5'),
 ('167', '1306-5 1126-5 133-5'),
 ('299', '127-5 216-5 462-5'),
 ('102', '195-4 307-4 89-4'),
 ('63', '100-5 1007-5 301-5'),
 ('160', '234-5 174-5 160-5'),
 ('50', '253-5 475-5 1084-5'),
 ('301', '79-5 202-5 174-5'),
 ('290', '143-5 50-5 71-5'),
 ('157', '150-5 127-5 273-5'),
 ('278', '603-5 525-5 22-5'),
 ('10', '611-5 100-5 488-5'),
 ('284', '301-5 347-5 272-5'),
 ('246', '201-5 425-5 68-5'),
 ('249', '241-5 746-5 11-5'),
 ('20', '87-5 148-5 496-5'),
 ('138', '26-5 523-5 483-5'),
 ('60', '427-5 60-5 430-5'),
 ('57', '304-5 79-5 744-5'),
 ('223', '969-5 237-5 216-5'),
 ('189', '520-5 1060-5 56-5'),
 ('243', '221-5 582-5 511-5'),
 ('241', '750-5 880-5 288-5'),
 ('222', '750-5 173-5 53-5'),
 ('8', '22-5 50-5 182-5'),

In [29]:
combinedratings = sqlContext.createDataFrame(ratings)
combinedratings.createOrReplaceTempView("ratings")

combinedratings.coalesce(1).write.csv('ProcessedData/combinedRatings/')
os.system('mv ./ProcessedData/combinedRatings/*.csv ./ProcessedData/combinedRatings/combinedRatings.csv')

0

# User Section

In [32]:
users = pd.read_csv('Data/u.user',sep ="|", names = ['UserId','Age','Gender','Occupation','Zip'])
users.head(5)

Unnamed: 0,UserId,Age,Gender,Occupation,Zip
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [33]:
def get_user_index_by_user_id(user_id):
    return int(user_id-1)

def get_user_suggestions_by_user_id(user_id):
    userIndex = get_user_index_by_user_id(user_id)
    suggestion_index = age_occ_suggestion_matrix[userIndex]
    print(suggestion_index)
    

In [34]:
cv = CountVectorizer()

age_count_matrix = 1-euclidean_distances(users['Age'].values.reshape(-1,1))*0.01
age_count_matrix*=0.6

occupation_count_matrix = cv.fit_transform(users['Occupation'])
ocs = cosine_similarity(occupation_count_matrix)
ocs*=0.4

age_occ_matrix = np.add(ocs,age_count_matrix)

age_occ_suggestion_matrix = []
for rows in age_occ_matrix:
    row = list(enumerate(rows))
    row.sort(key= lambda x: -x[1])
    row = sorted(row,key=lambda x: -x[1])[1:6]
    age_occ_suggestion_matrix.append(row)
age_occ_suggestion_matrix = np.array(age_occ_suggestion_matrix)

In [37]:
get_user_suggestions_by_user_id(1)

[[  3.   1.]
 [455.   1.]
 [716.   1.]
 [831.   1.]
 [888.   1.]]


# Movie Section

In [41]:
movie_genre = pd.read_csv("ProcessedData/movie_genre.csv")
indexedMovies = pd.read_csv("ProcessedData/movieId_movies.csv")

In [42]:
##Getting index based on titles of the movie
def get_movie_id(title):
    return indexedMovies.loc[indexedMovies['Title']==title].values.tolist()[0][0]

##Getting suggestions by title name
def get_suggestion_by_movie_title(title):
    movie_index = get_movie_id(title)-1
    return get_suggestion_by_movie_index(movie_index)
##Getting suggestions by movie ID
def get_suggestion_by_movie_id(movieId):
    movieId = int(movieId)
    return get_suggestion_by_movie_index(movieId-1)

##Getting Titles based on the indexes from the suggestions 
def get_suggestion_by_movie_index(index):
    row = enumerated[index]
    returnVal = []
    print("Given Movie", indexedMovies.loc[indexedMovies['MovieId'] == index+1].values.tolist()[0])
    for movies in row:
        returnVal.append(indexedMovies.loc[indexedMovies['MovieId'] == movies[0]+1].values.tolist()[0])
    return (returnVal)         

## User's top 3 rated movies

In [55]:
# Getting movie recommendation from the userId from user's top 3 ratings
def get_movies_based_on_previous_rating(userId):
    userRatings = combinedRatings.loc[combinedRatings['UserId'] == userId, 'Ratings'].tolist()[0]
    movies = userRatings.split(' ')
    returnList = []
    for movie in movies:
        movieId,ratings = movie.strip().split('-')
        returnList+=get_suggestion_by_movie_id(movieId)
    return returnList

In [56]:
combinedRatings = pd.read_csv('processedData/combinedRatings/combinedRatings.csv', names = ['UserId','Ratings'])
combinedRatings.head(5)

Unnamed: 0,UserId,Ratings
0,22,128-5 258-5 510-5
1,244,154-5 89-5 652-5
2,115,8-5 127-5 234-5
3,305,427-5 483-5 50-5
4,286,1014-5 379-5 288-5


## Finding Cosine Similarities between movies

In [57]:
columnValues = movie_genre.values
cs = cosine_similarity(columnValues)
cs

array([[1.        , 0.        , 0.        , ..., 0.        , 0.57735027,
        0.        ],
       [0.        , 1.        , 0.57735027, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.57735027, 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.70710678],
       [0.57735027, 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.70710678, 0.        ,
        1.        ]])

In [58]:
## Suggestion list for each movie by their indexes
enumerated = []

for index in cs:
    row = list(enumerate(index))
    row.sort(key = lambda x: -x[1])
    row = list(filter(lambda x: x[1]>0 , row))[1:6]
    enumerated.append(row)

enumerated = np.array(enumerated)

## Get Suggestions

In [59]:
suggestions = get_suggestion_by_movie_index(0)
for movies in suggestions:
    print(movies)

Given Movie [1, 'Toy Story (1995)']
[422, 'Aladdin and the King of Thieves (1996)']
[95, 'Aladdin (1992)']
[1219, 'Goofy Movie, A (1995)']
[63, 'Santa Clause, The (1994)']
[94, 'Home Alone (1990)']


In [60]:
get_movies_based_on_previous_rating(22)

Given Movie [128, 'Supercop (1992)']
Given Movie [258, 'Contact (1997)']
Given Movie [510, 'Magnificent Seven, The (1954)']


[[79, 'Fugitive, The (1993)'],
 [128, 'Supercop (1992)'],
 [144, 'Die Hard (1988)'],
 [147, 'Long Kiss Goodnight, The (1996)'],
 [226, 'Die Hard 2 (1990)'],
 [258, 'Contact (1997)'],
 [429, 'Day the Earth Stood Still, The (1951)'],
 [1006, 'Until the End of the World (Bis ans Ende der Welt) (1991)'],
 [239, 'Sneakers (1992)'],
 [270, 'Gattaca (1997)'],
 [840, 'Last Man Standing (1996)'],
 [177, 'Good, The Bad and The Ugly, The (1966)'],
 [339, 'Mad City (1997)'],
 [686, 'Perfect World, A (1993)'],
 [808, 'Program, The (1993)']]

In [61]:
get_movie_id("Aladdin and the King of Thieves (1996)")

422

In [62]:
get_suggestion_by_movie_title('Toy Story (1995)')

Given Movie [1, 'Toy Story (1995)']


[[422, 'Aladdin and the King of Thieves (1996)'],
 [95, 'Aladdin (1992)'],
 [1219, 'Goofy Movie, A (1995)'],
 [63, 'Santa Clause, The (1994)'],
 [94, 'Home Alone (1990)']]