In [2]:
import pandas as pd
import numpy as np
import os
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import normalize
from pyspark import SparkContext
from pyspark.sql import SQLContext


In [9]:
sc = sc = SparkContext.getOrCreate()
sqlContext = SQLContext(sc)

# Data Preprocessing

## Movie

In [64]:
genre = pd.read_csv('Data/u.genre', sep = "|")
genre_list = list(pd.Series(genre['unknown']))
genre.head(20)

Unnamed: 0,unknown,0
0,Action,1
1,Adventure,2
2,Animation,3
3,Children's,4
4,Comedy,5
5,Crime,6
6,Documentary,7
7,Drama,8
8,Fantasy,9
9,Film-Noir,10


In [22]:
movieInfo = pd.read_csv('Data/u.item', sep = "|", encoding = "iso-8859-1", names=['MovieId','Title','Date','RealeseDate', 'VideoRDate','IMDB']+genre_list)
requiredInfo = movieInfo.drop(['Date', 'RealeseDate','VideoRDate','IMDB'], axis =1)
requiredInfo.head(5)

Unnamed: 0,MovieId,Title,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0


In [23]:
indexedMovies = requiredInfo.drop(genre_list, axis=1)
indexedMovies.head(5)

Unnamed: 0,MovieId,Title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [24]:
columnPurpose = requiredInfo.drop(['MovieId','Title'],axis = 1)
columnPurpose.head(5)

Unnamed: 0,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0


In [25]:
requiredInfo.to_csv("ProcessedData/requiredInfo.csv", index=False)
columnPurpose.to_csv("ProcessedData/movie_genre.csv", index=False)
indexedMovies.to_csv("ProcessedData/movieId_movies.csv", index=False)

## User-Ratings

In [69]:
def movie_rating_combiner(row):
    user, movie, rating, date = row.split("\t")
    return((user,["%s-%s" %(movie,rating)]))

def sortMoviesByRatings(row):
    movies = row[1]
    movies.sort(key = lambda x: -int(x[-1]))
    return (row[0]," ".join(movies[:3]))

In [70]:
ratings = sc.textFile('Data/u.data').map(movie_rating_combiner)
ratings = ratings.reduceByKey(lambda x,y: x+y)


In [71]:
ratings = ratings.map(sortMoviesByRatings)
ratings.collect()

[('22', '128-5 258-5 510-5'),
 ('244', '154-5 89-5 652-5'),
 ('115', '8-5 127-5 234-5'),
 ('305', '427-5 483-5 50-5'),
 ('286', '1014-5 379-5 288-5'),
 ('303', '69-5 134-5 161-5'),
 ('122', '387-5 715-5 708-5'),
 ('234', '705-5 134-5 519-5'),
 ('119', '1153-5 237-5 222-5'),
 ('167', '1306-5 1126-5 133-5'),
 ('299', '127-5 216-5 462-5'),
 ('102', '195-4 307-4 89-4'),
 ('63', '100-5 1007-5 301-5'),
 ('160', '234-5 174-5 160-5'),
 ('50', '253-5 475-5 1084-5'),
 ('301', '79-5 202-5 174-5'),
 ('290', '143-5 50-5 71-5'),
 ('157', '150-5 127-5 273-5'),
 ('278', '603-5 525-5 22-5'),
 ('10', '611-5 100-5 488-5'),
 ('284', '301-5 347-5 272-5'),
 ('246', '201-5 425-5 68-5'),
 ('249', '241-5 746-5 11-5'),
 ('20', '87-5 148-5 496-5'),
 ('138', '26-5 523-5 483-5'),
 ('60', '427-5 60-5 430-5'),
 ('57', '304-5 79-5 744-5'),
 ('223', '969-5 237-5 216-5'),
 ('189', '520-5 1060-5 56-5'),
 ('243', '221-5 582-5 511-5'),
 ('241', '750-5 880-5 288-5'),
 ('222', '750-5 173-5 53-5'),
 ('8', '22-5 50-5 182-5'),

In [29]:
combinedratings = sqlContext.createDataFrame(ratings)
combinedratings.createOrReplaceTempView("ratings")

combinedratings.coalesce(1).write.csv('ProcessedData/combinedRatings/')
os.system('mv ./ProcessedData/combinedRatings/*.csv ./ProcessedData/combinedRatings/combinedRatings.csv')

0

## Getting the average rating of the movie

In [108]:
def rating_tab_sep(row):
    row = row.split("\t")
    return (row[1],(row[2],1))

def rating_sum(rating_row_1,rating_row_2):
    return(int(rating_row_1[0])+int(rating_row_2[0]),rating_row_1[1]+rating_row_2[1])

def find_avg(row):
    total_sum = int(row[1][0])
    no_of_ratings = int(row[1][1])
    return(row[0],"%.1f"%(total_sum/no_of_ratings))

In [109]:
data_copy = sc.textFile('Data/u.data').map(rating_tab_sep)
data_copy = data_copy.reduceByKey(rating_sum)
avg_rating = data_copy.map(find_avg)
avg_rating.collect()

[('346', '3.6'),
 ('474', '4.3'),
 ('265', '3.9'),
 ('465', '3.6'),
 ('451', '3.3'),
 ('86', '3.9'),
 ('257', '3.7'),
 ('222', '3.7'),
 ('40', '2.9'),
 ('29', '2.7'),
 ('387', '3.4'),
 ('274', '3.5'),
 ('486', '3.8'),
 ('144', '3.9'),
 ('1', '3.9'),
 ('277', '3.5'),
 ('234', '3.8'),
 ('246', '3.9'),
 ('98', '4.3'),
 ('88', '3.5'),
 ('1081', '2.8'),
 ('603', '4.4'),
 ('796', '3.1'),
 ('16', '3.2'),
 ('304', '3.5'),
 ('979', '3.2'),
 ('564', '2.0'),
 ('1137', '4.0'),
 ('241', '3.5'),
 ('4', '3.6'),
 ('100', '4.2'),
 ('679', '3.0'),
 ('143', '3.8'),
 ('515', '4.2'),
 ('20', '3.4'),
 ('219', '3.2'),
 ('526', '3.8'),
 ('26', '3.5'),
 ('512', '4.0'),
 ('1049', '2.5'),
 ('690', '3.5'),
 ('248', '3.7'),
 ('229', '3.1'),
 ('237', '3.7'),
 ('54', '3.2'),
 ('111', '3.5'),
 ('338', '2.7'),
 ('154', '3.7'),
 ('1153', '3.5'),
 ('209', '3.9'),
 ('208', '3.9'),
 ('685', '3.4'),
 ('328', '3.4'),
 ('496', '4.1'),
 ('132', '4.1'),
 ('174', '4.3'),
 ('96', '4.0'),
 ('307', '3.5'),
 ('648', '4.0'),
 ('21',

In [110]:
avg_rating = sqlContext.createDataFrame(avg_rating)
avg_rating.createOrReplaceTempView("avg_rating")

avg_rating.coalesce(1).write.csv('ProcessedData/avg_rating/')
os.system('mv ./ProcessedData/avg_rating/*.csv ./ProcessedData/avg_rating/avg_rating.csv')

0

# User Section

In [32]:
users = pd.read_csv('Data/u.user',sep ="|", names = ['UserId','Age','Gender','Occupation','Zip'])
users.head(5)

Unnamed: 0,UserId,Age,Gender,Occupation,Zip
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [123]:
def get_user_index_by_user_id(user_id):
    return int(user_id-1)

def get_user_suggestions_by_user_id(user_id):
    userIndex = get_user_index_by_user_id(user_id)
    suggestion_index = age_occ_suggestion_matrix[userIndex]
    return(suggestion_index)
    

In [34]:
cv = CountVectorizer()

age_count_matrix = 1-euclidean_distances(users['Age'].values.reshape(-1,1))*0.01
age_count_matrix*=0.6

occupation_count_matrix = cv.fit_transform(users['Occupation'])
ocs = cosine_similarity(occupation_count_matrix)
ocs*=0.4

age_occ_matrix = np.add(ocs,age_count_matrix)

age_occ_suggestion_matrix = []
for rows in age_occ_matrix:
    row = list(enumerate(rows))
    row.sort(key= lambda x: -x[1])
    row = sorted(row,key=lambda x: -x[1])[1:6]
    age_occ_suggestion_matrix.append(row)
age_occ_suggestion_matrix = np.array(age_occ_suggestion_matrix)

In [122]:
get_user_suggestions_by_user_id(1)

array([[  3.,   1.],
       [455.,   1.],
       [716.,   1.],
       [831.,   1.],
       [888.,   1.]])

In [131]:
# for x in range(len(age_occ_suggestion_matrix)):
#     print(x,age_occ_suggestion_matrix[x][0][0])


0 3.0
1 168.0
2 292.0
3 3.0
4 748.0
5 283.0
6 130.0
7 715.0
8 192.0
9 443.0
10 82.0
11 37.0
12 165.0
13 231.0
14 58.0
15 431.0
16 474.0
17 651.0
18 26.0
19 361.0
20 497.0
21 740.0
22 409.0
23 781.0
24 209.0
25 270.0
26 26.0
27 121.0
28 262.0
29 470.0
30 95.0
31 225.0
32 36.0
33 150.0
34 897.0
35 67.0
36 36.0
37 37.0
38 838.0
39 429.0
40 147.0
41 173.0
42 228.0
43 544.0
44 221.0
45 507.0
46 514.0
47 665.0
48 36.0
49 444.0
50 902.0
51 269.0
52 299.0
53 861.0
54 351.0
55 213.0
56 627.0
57 113.0
58 58.0
59 654.0
60 370.0
61 524.0
62 525.0
63 578.0
64 84.0
65 36.0
66 256.0
67 67.0
68 104.0
69 915.0
70 73.0
71 408.0
72 300.0
73 73.0
74 566.0
75 98.0
76 669.0
77 85.0
78 33.0
79 190.0
80 197.0
81 140.0
82 397.0
83 212.0
84 84.0
85 85.0
86 528.0
87 882.0
88 295.0
89 468.0
90 171.0
91 144.0
92 575.0
93 102.0
94 577.0
95 161.0
96 835.0
97 92.0
98 98.0
99 373.0
100 280.0
101 318.0
102 102.0
103 285.0
104 104.0
105 233.0
106 73.0
107 253.0
108 477.0
109 67.0
110 156.0
111 530.0
112 92.0
113 113.0
1

# Movie Section

In [41]:
movie_genre = pd.read_csv("ProcessedData/movie_genre.csv")
indexedMovies = pd.read_csv("ProcessedData/movieId_movies.csv")

In [42]:
##Getting index based on titles of the movie
def get_movie_id(title):
    return indexedMovies.loc[indexedMovies['Title']==title].values.tolist()[0][0]

##Getting suggestions by title name
def get_suggestion_by_movie_title(title):
    movie_index = get_movie_id(title)-1
    return get_suggestion_by_movie_index(movie_index)
##Getting suggestions by movie ID
def get_suggestion_by_movie_id(movieId):
    movieId = int(movieId)
    return get_suggestion_by_movie_index(movieId-1)

##Getting Titles based on the indexes from the suggestions 
def get_suggestion_by_movie_index(index):
    row = enumerated[index]
    returnVal = []
    print("Given Movie", indexedMovies.loc[indexedMovies['MovieId'] == index+1].values.tolist()[0])
    for movies in row:
        returnVal.append(indexedMovies.loc[indexedMovies['MovieId'] == movies[0]+1].values.tolist()[0])
    return (returnVal)         

## User's top 3 rated movies

In [55]:
# Getting movie recommendation from the userId from user's top 3 ratings
def get_movies_based_on_previous_rating(userId):
    userRatings = combinedRatings.loc[combinedRatings['UserId'] == userId, 'Ratings'].tolist()[0]
    movies = userRatings.split(' ')
    returnList = []
    for movie in movies:
        movieId,ratings = movie.strip().split('-')
        returnList+=get_suggestion_by_movie_id(movieId)
    return returnList

In [56]:
combinedRatings = pd.read_csv('processedData/combinedRatings/combinedRatings.csv', names = ['UserId','Ratings'])
combinedRatings.head(5)

Unnamed: 0,UserId,Ratings
0,22,128-5 258-5 510-5
1,244,154-5 89-5 652-5
2,115,8-5 127-5 234-5
3,305,427-5 483-5 50-5
4,286,1014-5 379-5 288-5


## Finding Cosine Similarities between movies

In [57]:
columnValues = movie_genre.values
cs = cosine_similarity(columnValues)
cs

array([[1.        , 0.        , 0.        , ..., 0.        , 0.57735027,
        0.        ],
       [0.        , 1.        , 0.57735027, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.57735027, 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.70710678],
       [0.57735027, 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.70710678, 0.        ,
        1.        ]])

In [58]:
## Suggestion list for each movie by their indexes
enumerated = []

for index in cs:
    row = list(enumerate(index))
    row.sort(key = lambda x: -x[1])
    row = list(filter(lambda x: x[1]>0 , row))[1:6]
    enumerated.append(row)

enumerated = np.array(enumerated)

# Getting suggestions based on the recommended users


In [159]:
average_rating = pd.read_csv('ProcessedData/avg_rating/avg_rating.csv', names = ['MovieId','Avg_Rating'])
average_rating.head(5)

user_top_rated_movies = pd.read_csv('ProcessedData/combinedRatings/combinedRatings.csv', names = ['UserId','Top Three Rated Movies'])
user_top_rated_movies.loc[user_top_rated_movies['UserId'] == 3, ['Top Three Rated Movies']].values.tolist()[0][0]

'328-5 321-5 320-5'

In [163]:
def movie_from_suggested_users(userId):
    originalUserId = userId
    suggestedUsers = get_user_suggestions_by_user_id(originalUserId)
    
    print("Suggested Users: ", suggestedUsers)
    suggested_movie_list = []
    
    for user in suggestedUsers:
        su_Id = user[0]
        su_weight = int(user[1])
        
        su_suggested_movies = user_top_rated_movies.loc[user_top_rated_movies['UserId'] == su_Id, ['Top Three Rated Movies']].values.tolist()[0][0]
        print(su_Id,su_suggested_movies)
        for mRatings in su_suggested_movies.split(" "):
            mov_id, mov_r = 
            suggested_movie_list.append((mRatings[0],int(mRatings[1]*su_weight)))
    suggested_movie_list.sort(key = lambda x : -x[1])
    return (suggested_movie_list)

In [164]:
movie_from_suggested_users(1)

Suggested Users:  [[  3.   1.]
 [455.   1.]
 [716.   1.]
 [831.   1.]
 [888.   1.]]
3.0 328-5 321-5 320-5
455.0 135-5 56-5 511-5
716.0 318-5 483-5 91-5
831.0 96-5 181-5 272-5
888.0 269-5 792-5 286-5


[('7', 9),
 ('4', 8),
 ('1', 8),
 ('2', 8),
 ('2', 7),
 ('5', 6),
 ('9', 6),
 ('2', 6),
 ('1', 3),
 ('3', 2),
 ('3', 2),
 ('3', 2),
 ('5', 1),
 ('3', 1),
 ('9', 1)]

## Get Suggestions -- Testing

In [59]:
suggestions = get_suggestion_by_movie_index(0)
for movies in suggestions:
    print(movies)

Given Movie [1, 'Toy Story (1995)']
[422, 'Aladdin and the King of Thieves (1996)']
[95, 'Aladdin (1992)']
[1219, 'Goofy Movie, A (1995)']
[63, 'Santa Clause, The (1994)']
[94, 'Home Alone (1990)']


In [60]:
get_movies_based_on_previous_rating(22)

Given Movie [128, 'Supercop (1992)']
Given Movie [258, 'Contact (1997)']
Given Movie [510, 'Magnificent Seven, The (1954)']


[[79, 'Fugitive, The (1993)'],
 [128, 'Supercop (1992)'],
 [144, 'Die Hard (1988)'],
 [147, 'Long Kiss Goodnight, The (1996)'],
 [226, 'Die Hard 2 (1990)'],
 [258, 'Contact (1997)'],
 [429, 'Day the Earth Stood Still, The (1951)'],
 [1006, 'Until the End of the World (Bis ans Ende der Welt) (1991)'],
 [239, 'Sneakers (1992)'],
 [270, 'Gattaca (1997)'],
 [840, 'Last Man Standing (1996)'],
 [177, 'Good, The Bad and The Ugly, The (1966)'],
 [339, 'Mad City (1997)'],
 [686, 'Perfect World, A (1993)'],
 [808, 'Program, The (1993)']]

In [61]:
get_movie_id("Aladdin and the King of Thieves (1996)")

422

In [62]:
get_suggestion_by_movie_title('Toy Story (1995)')

Given Movie [1, 'Toy Story (1995)']


[[422, 'Aladdin and the King of Thieves (1996)'],
 [95, 'Aladdin (1992)'],
 [1219, 'Goofy Movie, A (1995)'],
 [63, 'Santa Clause, The (1994)'],
 [94, 'Home Alone (1990)']]