In [1]:
#Imporing libraries
import os
import pandas as pd
import numpy as np
from functools import reduce
import statistics as stat
import itertools

In [2]:
import session_info
session_info.show()

In [3]:
#Loading ratings data
baseDir = 'D:\\Recommdar_system\\Assignment1\\ml-latest-small'
ratings_df = pd.read_csv(baseDir+"\\ratings.csv")

ratings_df.head(5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
# Droping 'timestamp' column
ratings_df.drop('timestamp', inplace=True, axis=1)

ratings_df.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [5]:
#Loading movies data
movies_df = pd.read_csv(baseDir+'\\movies.csv')
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
movies_df['year'] = movies_df.title.str.extract('(\(\d\d\d\d\))',expand=False)
movies_df['year']

0       (1995)
1       (1995)
2       (1995)
3       (1995)
4       (1995)
         ...  
9737    (2017)
9738    (2017)
9739    (2017)
9740    (2018)
9741    (1991)
Name: year, Length: 9742, dtype: object

In [7]:
movies_df['year'] = movies_df.year.str.extract('(\d\d\d\d)',expand=False)
movies_df['year']

0       1995
1       1995
2       1995
3       1995
4       1995
        ... 
9737    2017
9738    2017
9739    2017
9740    2018
9741    1991
Name: year, Length: 9742, dtype: object

In [8]:
movies_df['title']

0                                Toy Story (1995)
1                                  Jumanji (1995)
2                         Grumpier Old Men (1995)
3                        Waiting to Exhale (1995)
4              Father of the Bride Part II (1995)
                          ...                    
9737    Black Butler: Book of the Atlantic (2017)
9738                 No Game No Life: Zero (2017)
9739                                 Flint (2017)
9740          Bungo Stray Dogs: Dead Apple (2018)
9741          Andrew Dice Clay: Dice Rules (1991)
Name: title, Length: 9742, dtype: object

In [9]:
#Removing the years from the 'title' column
movies_df['title'] = movies_df.title.str.replace('(\(\d\d\d\d\))', '')
movies_df['title']

  movies_df['title'] = movies_df.title.str.replace('(\(\d\d\d\d\))', '')


0                                Toy Story 
1                                  Jumanji 
2                         Grumpier Old Men 
3                        Waiting to Exhale 
4              Father of the Bride Part II 
                       ...                 
9737    Black Butler: Book of the Atlantic 
9738                 No Game No Life: Zero 
9739                                 Flint 
9740          Bungo Stray Dogs: Dead Apple 
9741          Andrew Dice Clay: Dice Rules 
Name: title, Length: 9742, dtype: object

In [10]:
#Applying the strip function to remove ending whitespace 
movies_df['title'] = movies_df['title'].apply(lambda x: x.strip())
movies_df.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji,Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II,Comedy,1995


In [11]:
def extracted_movies(userId):
    inputMovies_temp = ratings_df.query('userId == @userId', inplace = False)
    return inputMovies_temp


def userSubsetGroup(userId):
    inputMovies_temp = extracted_movies(userId)
    users = ratings_df[ratings_df['movieId'].isin(inputMovies_temp['movieId'].tolist())]
    userSubsetGroup_temp = users.groupby(['userId'])
    userSubsetGroup_temp = sorted(userSubsetGroup_temp,  key=lambda x: len(x[1]) and len(x) > 4, reverse=True) #set th: at least 20 comm mov
    return userSubsetGroup_temp, inputMovies_temp


def pearsonCorrelation(inputMovies, similarUsersGroup):
    
    pearsonCorrelationDict = {}
    
    for name, group in similarUsersGroup:
        group = group.sort_values(by='movieId')
        in_Movies = inputMovies.sort_values(by='movieId')
        temp_df = in_Movies[in_Movies['movieId'].isin(group['movieId'].tolist())]
        tempGroupList = temp_df['rating'].tolist()
        tempRatingList = group['rating'].tolist()

        sXX = 0
        meanOfSelectedUserRating = stat.mean(tempGroupList)
        for i in tempGroupList:
            sXX = sXX + pow((i - meanOfSelectedUserRating),2)
            
        sYY = 0
        meanOfSimilarUserRating = stat.mean(tempRatingList)
        for j in tempRatingList:
            sYY = sYY + pow((j - meanOfSimilarUserRating),2)
            
        sXY = 0
        for i, j in zip(tempGroupList, tempRatingList):
            sXY = sXY+ ((i - meanOfSelectedUserRating ) * (j-meanOfSimilarUserRating))

        if sXX != 0 and sYY != 0:
            pearsonCorrelationDict[name] = sXY/np.sqrt(sXX*sYY)
        else:
            pearsonCorrelationDict[name] = 0                    
    
    topUsers = dict(sorted(pearsonCorrelationDict.items(), key=lambda item: item[1] , reverse= True))
    topUsers = {k:v for (k,v) in topUsers.items() if v > 0.7}
    return topUsers

def create_recommendation (inputMovies, pearsonCorrelationDict):
    topUsersSimilarityScore = pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')
    topUsersSimilarityScore.head()
    topUsersSimilarityScore.columns = ['similarityScore']
    topUsersSimilarityScore['userId'] = topUsersSimilarityScore.index
    topUsersSimilarityScore.index = range(len(topUsersSimilarityScore))
    topSimilarUsers = topUsersSimilarityScore.sort_values(by='similarityScore', ascending=False)
    topUsersRating = topSimilarUsers.merge(ratings_df, left_on='userId', right_on='userId', how='inner')
    topUsersRating.head()
    
    meanRb = topUsersRating.groupby('userId').mean()[['rating']]
    meanRb.columns = ['avgRating']
    meanRb['userId'] = meanRb.index
    meanRb.index = range(len(meanRb))
    topUsersRating = topUsersRating.merge(meanRb, left_on='userId', right_on='userId', how='inner')
    topUsersRating['weightedRatingScore'] = topUsersRating['similarityScore']*(topUsersRating['rating']-topUsersRating['avgRating'])
    meanRating_temp = topUsersRating.groupby('movieId').sum()[['weightedRatingScore']]
    meanRating_temp.columns = ['sum_weightedRatingScore']
    meanRating_temp['movieId'] = meanRating_temp.index
    

    recommendation_df = pd.DataFrame()
    meanRa = inputMovies['rating'].mean()
    recommendation_df['weighted average recommendation score'] = meanRa+(meanRating_temp['sum_weightedRatingScore']/topSimilarUsers['similarityScore'].sum())
    recommendation_df['movieId'] = recommendation_df.index
    recommendation_df = recommendation_df.sort_values(by='weighted average recommendation score', ascending=False)
    recommendation_df.index = [x for x in range(1,len(recommendation_df)+1)]
    recommendation_df = recommendation_df.rename(columns=  {'weighted average recommendation score': 'scores'})
    
    
    return recommendation_df


In [12]:
user_A = 50
userSubsetGroupA, userAMatrix = userSubsetGroup(user_A)
pearsonCorrelationDictA = pearsonCorrelation(userAMatrix, userSubsetGroupA)
recommended_Movies_A = create_recommendation(userAMatrix, pearsonCorrelationDictA)
recommended_Movies_A = recommended_Movies_A.merge(movies_df, left_on="movieId", right_on ="movieId", how="left")

  userSubsetGroup_temp = sorted(userSubsetGroup_temp,  key=lambda x: len(x[1]) and len(x) > 4, reverse=True) #set th: at least 20 comm mov


In [13]:
recommended_Movies_A.head(20)

Unnamed: 0,scores,movieId,title,genres,year
0,3.250241,296,Pulp Fiction,Comedy|Crime|Drama|Thriller,1994
1,3.056305,608,Fargo,Comedy|Crime|Drama|Thriller,1996
2,3.001023,318,"Shawshank Redemption, The",Crime|Drama,1994
3,2.983899,593,"Silence of the Lambs, The",Crime|Horror|Thriller,1991
4,2.974276,858,"Godfather, The",Crime|Drama,1972
5,2.939153,750,Dr. Strangelove or: How I Learned to Stop Worr...,Comedy|War,1964
6,2.938716,5952,"Lord of the Rings: The Two Towers, The",Adventure|Fantasy,2002
7,2.931235,1198,Raiders of the Lost Ark (Indiana Jones and the...,Action|Adventure,1981
8,2.929801,1208,Apocalypse Now,Action|Drama|War,1979
9,2.907345,110,Braveheart,Action|Drama|War,1995


In [14]:
userB = 100
userSubsetGroupB, userBMatrix = userSubsetGroup(userB)
pearsonCorrelationDictB = pearsonCorrelation(userBMatrix, userSubsetGroupB)
recommended_Movies_B = create_recommendation(userBMatrix, pearsonCorrelationDictB)
recommended_Movies_B = recommended_Movies_B.merge(movies_df, left_on="movieId", right_on ="movieId", how="left")
recommended_Movies_B.head(20)

  userSubsetGroup_temp = sorted(userSubsetGroup_temp,  key=lambda x: len(x[1]) and len(x) > 4, reverse=True) #set th: at least 20 comm mov


Unnamed: 0,scores,movieId,title,genres,year
0,4.186969,318,"Shawshank Redemption, The",Crime|Drama,1994
1,4.137272,858,"Godfather, The",Crime|Drama,1972
2,4.115649,1193,One Flew Over the Cuckoo's Nest,Drama,1975
3,4.102643,2329,American History X,Crime|Drama,1998
4,4.100809,356,Forrest Gump,Comedy|Drama|Romance|War,1994
5,4.082072,260,Star Wars: Episode IV - A New Hope,Action|Adventure|Sci-Fi,1977
6,4.08177,2300,"Producers, The",Comedy,1968
7,4.072598,48516,"Departed, The",Crime|Drama|Thriller,2006
8,4.071165,4993,"Lord of the Rings: The Fellowship of the Ring,...",Adventure|Fantasy,2001
9,4.068787,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995


In [15]:
recommended_Movies_B.head(20)

Unnamed: 0,scores,movieId,title,genres,year
0,4.186969,318,"Shawshank Redemption, The",Crime|Drama,1994
1,4.137272,858,"Godfather, The",Crime|Drama,1972
2,4.115649,1193,One Flew Over the Cuckoo's Nest,Drama,1975
3,4.102643,2329,American History X,Crime|Drama,1998
4,4.100809,356,Forrest Gump,Comedy|Drama|Romance|War,1994
5,4.082072,260,Star Wars: Episode IV - A New Hope,Action|Adventure|Sci-Fi,1977
6,4.08177,2300,"Producers, The",Comedy,1968
7,4.072598,48516,"Departed, The",Crime|Drama|Thriller,2006
8,4.071165,4993,"Lord of the Rings: The Fellowship of the Ring,...",Adventure|Fantasy,2001
9,4.068787,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995


In [16]:
user_C = 150
userSubsetGroupC, userCMatrix = userSubsetGroup(user_C)
pearsonCorrelationDictC = pearsonCorrelation(userCMatrix, userSubsetGroupC)
recommended_Movies_C = create_recommendation(userCMatrix, pearsonCorrelationDictC)
recommended_Movies_C = recommended_Movies_C.merge(movies_df, left_on="movieId", right_on ="movieId", how="left")


  userSubsetGroup_temp = sorted(userSubsetGroup_temp,  key=lambda x: len(x[1]) and len(x) > 4, reverse=True) #set th: at least 20 comm mov


In [17]:
recommended_Movies_C.head(20)

Unnamed: 0,scores,movieId,title,genres,year
0,4.171236,296,Pulp Fiction,Comedy|Crime|Drama|Thriller,1994
1,4.033804,32,Twelve Monkeys (a.k.a. 12 Monkeys),Mystery|Sci-Fi|Thriller,1995
2,3.967525,356,Forrest Gump,Comedy|Drama|Romance|War,1994
3,3.967224,318,"Shawshank Redemption, The",Crime|Drama,1994
4,3.943837,593,"Silence of the Lambs, The",Crime|Horror|Thriller,1991
5,3.928584,260,Star Wars: Episode IV - A New Hope,Action|Adventure|Sci-Fi,1977
6,3.909775,858,"Godfather, The",Crime|Drama,1972
7,3.901942,2571,"Matrix, The",Action|Sci-Fi|Thriller,1999
8,3.90065,50,"Usual Suspects, The",Crime|Mystery|Thriller,1995
9,3.886628,47,Seven (a.k.a. Se7en),Mystery|Thriller,1995


## Group Recommendation (Average Aggregation Method)

In [18]:
def recommendation_df_average_method(recommended_Movies_A, recommended_Movies_B, recommended_Movies_C):
    
    temp_recommended_Movies_A = recommended_Movies_A['movieId'].tolist()
    temp_recommended_Movies_B = recommended_Movies_B['movieId'].tolist()
    temp_recommended_Movies_C = recommended_Movies_C['movieId'].tolist()
    combined_movies = set(temp_recommended_Movies_A) & set(temp_recommended_Movies_B) & set(temp_recommended_Movies_C)

    recommendedForA = recommended_Movies_A[recommended_Movies_A['movieId'].isin(combined_movies)]
    recommendedForB = recommended_Movies_B[recommended_Movies_B['movieId'].isin(combined_movies)]
    recommendedForC = recommended_Movies_C[recommended_Movies_C['movieId'].isin(combined_movies)]

    temp_df = [recommendedForA, recommendedForB, recommendedForC]
    combined_result = pd.concat(temp_df)
    recommendation_df_average_method_temp = combined_result.groupby(['movieId'], as_index= False).mean().sort_values(by='scores', ascending=False)
    recommendation_df_average_method_temp.index = [x for x in range(1, len(recommendation_df_average_method_temp)+1)]
    recommendation_df_average_method = recommendation_df_average_method_temp.merge(movies_df, left_on="movieId", right_on ="movieId", how="left")
    return recommendation_df_average_method

In [19]:
recommendation_df_average_method_final = recommendation_df_average_method(recommended_Movies_A, recommended_Movies_B, recommended_Movies_C)
recommendation_df_average_method_final.head(20)

  recommendation_df_average_method_temp = combined_result.groupby(['movieId'], as_index= False).mean().sort_values(by='scores', ascending=False)


Unnamed: 0,movieId,scores,title,genres,year
0,296,3.796165,Pulp Fiction,Comedy|Crime|Drama|Thriller,1994
1,318,3.718405,"Shawshank Redemption, The",Crime|Drama,1994
2,858,3.673774,"Godfather, The",Crime|Drama,1972
3,593,3.66053,"Silence of the Lambs, The",Crime|Horror|Thriller,1991
4,356,3.628205,Forrest Gump,Comedy|Drama|Romance|War,1994
5,260,3.623502,Star Wars: Episode IV - A New Hope,Action|Adventure|Sci-Fi,1977
6,110,3.603532,Braveheart,Action|Drama|War,1995
7,608,3.602139,Fargo,Comedy|Crime|Drama|Thriller,1996
8,50,3.593797,"Usual Suspects, The",Crime|Mystery|Thriller,1995
9,2959,3.592921,Fight Club,Action|Crime|Drama|Thriller,1999


In [20]:
top20Av1 = recommendation_df_average_method_final.head(20)
recommended_Movies_onlyInUserA = list(set(recommended_Movies_A.head(20)["movieId"]) - set(top20Av1["movieId"]))
recommended_Movies_onlyInUserB = list(set(recommended_Movies_B.head(20)["movieId"]) - set(top20Av1["movieId"]))
recommended_Movies_onlyInUserC = list(set(recommended_Movies_C.head(20)["movieId"]) - set(top20Av1["movieId"]))

x = set.intersection(set(recommended_Movies_A.head(20)["movieId"]), set(recommended_Movies_B.head(20)["movieId"]), set(recommended_Movies_C.head(20)["movieId"]) )

y = set.intersection(set(recommended_Movies_onlyInUserA), set(recommended_Movies_onlyInUserB), set(recommended_Movies_onlyInUserC))


print("Movies that are listed in the top 20 movies for each user and in the group recommendation: ", x)
print("Movies that are listed in the top 20 movies for each user but not in the group recommendation: ", y)
print("User A's recommended movies that is not in group recommendation: ", recommended_Movies_onlyInUserA)
print("User B's recommended movies that is not in group recommendation: ", recommended_Movies_onlyInUserB)
print("User C's recommended movies that is not in group recommendation: ", recommended_Movies_onlyInUserC)


Movies that are listed in the top 20 movies for each user and in the group recommendation:  {4993, 110, 593, 858, 318}
Movies that are listed in the top 20 movies for each user but not in the group recommendation:  set()
User A's recommended movies that is not in group recommendation:  [1252, 555, 5902, 16, 1208, 3897, 923, 924]
User B's recommended movies that is not in group recommendation:  [1, 48516, 780, 1203, 2300, 3996, 733, 95]
User C's recommended movies that is not in group recommendation:  [1732, 1222, 2571, 1356, 1197, 1136, 7153, 1210]


In [21]:
def top_listed(group_rec, num, *users_rec):
    recommendation_all = {}
    
    if num == "ALL":
        recommendation_all["group"] = group_rec
        recommendation_all["userA"] = users_rec[0]
        recommendation_all["userB"] = users_rec[1]
        recommendation_all["userC"] = users_rec[2]        
    else:
        recommendation_all["group"] = group_rec.head(num)
        recommendation_all["userA"] = users_rec[0].head(num)
        recommendation_all["userB"] = users_rec[1].head(num)
        recommendation_all["userC"] = users_rec[2].head(num)
    
    return recommendation_all

def whyNot_atomic(movieId, group_rec, *users_rec):
    explanationDict = {}
    
    #if the movie-> I is in the csv file
    explanation1List = []
    allMovies = movies_df["movieId"].tolist()
    if movieId not in allMovies:
        strExp = f'MovieID: {movieId} does not exists in the csv file - Explanation: 1'
        explanation1List.append(strExp)
        explanationDict["Explanation1"] = explanation1List
        return explanationDict
    
    
    #if mocvie-> I is in all group reccommendation
    num = "ALL"
    listOfMovies = top_listed(group_rec, num, *users_rec)
    explanation2List = []
    for key, val in listOfMovies.items():
        
        if(movieId not in val["movieId"].tolist()):
            strExp = f'MovieID: {movieId} is not in {key} reccomendation - Explanation: 2'
            explanation2List.append(strExp)
    
    if(len(explanation2List) != 0):
        explanationDict["Explanation2"] = explanation2List
        return explanationDict

                       
        
    #If the movie is in each user's reccommendation (top 20)
    num = 20
    top20movies = top_listed(group_rec, num, *users_rec)
    explanation3List = []
    for key, val in top20movies.items():
        
        if(movieId not in val["movieId"].tolist()):
            strExp = f'MovieID: {movieId} is not in {key} top{num} reccomendation - Explanation: 3'
            explanation3List.append(strExp)
            
        
    if(len(explanation3List) != 0):
        explanationDict["Explanation3"] = explanation3List

        
            
    # If the movie is in each reccommendation (top 40)
    num = 40
    top40movies = top_listed(group_rec, num, *users_rec)
    explanation4List = []

    for key, val in top40movies.items():
            if(movieId not in val["movieId"].tolist()):
                strExp = f'MovieID: {movieId} is not in {key} top{num} reccomendation - Explanation: 4'
                explanation4List.append(strExp)
    if(len(explanation4List) != 0):
        explanationDict["Explanation4"] = explanation4List
    
    #if explanation2 and explanation3 satisfies then...
    if("Explanation3" in explanationDict and "Explanation4" in explanationDict):
 
        userA_temp = listOfMovies["userA"]
        userB_temp = listOfMovies["userB"]
        userC_temp = listOfMovies["userC"]
        
        indInGR = (group_rec[group_rec["movieId"] == movieId].index.values[0]) + 1
        indInA = (userA_temp[userA_temp["movieId"] == movieId].index.values[0]) + 1
        indInB = (userB_temp[userB_temp["movieId"] == movieId].index.values[0]) + 1
        indInC = (userC_temp[userC_temp["movieId"] == movieId].index.values[0]) + 1
        
        strExp = f'MovieID: {movieId} exists in the reccomendation but num is too low - Explanation: 5. {indInGR}th in the group recommendation. {indInA}th for userA recommendation, {indInB}th for userB recommendation, {indInC}th for userC recommendation, '
        explanationDict["Explanation5"] = strExp
                
    return explanationDict
    

#### Let's select a movie. For example, movie id: 924, name: "2001: A Space Odyssey"

In [22]:
result = whyNot_atomic(924, recommendation_df_average_method_final, recommended_Movies_A, recommended_Movies_B, recommended_Movies_C )
result

{'Explanation3': ['MovieID: 924 is not in group top20 reccomendation - Explanation: 3',
  'MovieID: 924 is not in userB top20 reccomendation - Explanation: 3',
  'MovieID: 924 is not in userC top20 reccomendation - Explanation: 3'],
 'Explanation4': ['MovieID: 924 is not in group top40 reccomendation - Explanation: 4',
  'MovieID: 924 is not in userB top40 reccomendation - Explanation: 4',
  'MovieID: 924 is not in userC top40 reccomendation - Explanation: 4'],
 'Explanation5': 'MovieID: 924 exists in the reccomendation but num is too low - Explanation: 5. 80th in the group recommendation. 19th for userA recommendation, 379th for userB recommendation, 270th for userC recommendation, '}

## Result Explanation
Movie Id:924 (2001: A Space Odyssey) 
* It did not appear in the top 20 group recommendation.
* It did not appear in top 40 group recommendation.
** 47th in group recommendation
* It did not appear in UserB and userC top 40 recommendation.
* It ranks in 
** 16th for User A.
** 81th for User B.
** 156th for User C.

In [23]:
recommendation_all = [recommended_Movies_A, recommended_Movies_B, recommended_Movies_C]

recommended_averageScore = []
for r in recommendation_all:
    a = r["scores"].tolist()
    avg = np.average(a)
    recommended_averageScore.append(avg)
    
selectedMovieScore = []
for r in recommendation_all:
    x = r[r["movieId"] == 1]["scores"]
    x = float(x)
    selectedMovieScore.append(x)
    
    
print("\nAverage score of each user's reccommendation [userA, userB, userC]")
print(recommended_averageScore)

print("\nAverage score for movie name: 2001: A Space Odyssey in each reccommendation [userA, userB, userC]")
print(selectedMovieScore)

score_diff = []
zip_object = zip(selectedMovieScore, recommended_averageScore)
for list1_i, list2_i in zip_object:
    score_diff.append(list1_i-list2_i)
    
print("\nDifference between movie name: 2001: A Space Odyssey reccommendation score and the average score of each movies's reccommendation score")
print(score_diff)
    


Average score of each user's reccommendation [userA, userB, userC]
[2.7806451612903227, 3.9459459459459456, 3.5769230769230766]

Average score for movie name: 2001: A Space Odyssey in each reccommendation [userA, userB, userC]
[2.7977373726480947, 4.06878724770278, 3.6673447528961107]

Difference between movie name: 2001: A Space Odyssey reccommendation score and the average score of each movies's reccommendation score
[0.017092211357772058, 0.12284130175683483, 0.09042167597303408]


## Summary
User B and User C did not like 2001: A Space Odyssey beacuse from result we can see that their ratings for 2001: A Space Odyssey is higher then average score

## Task 2: Group-Why not

In [24]:
def get_genres(movies):
    genres = movies["genres"].tolist()
    
    genres_list = []
    for i in genres:
        remove_separation = i.split("|")
        for s2 in remove_separation:
            genres_list.append(s2)
    
    unique_genres_list = list(set(genres_list))
    
    return genres_list, unique_genres_list

def genres_stat(genres_list, unique_gen):
    genres_dict = {}
    number = len(genres_list)
    for j in unique_gen:
        cnt = genres_list.count(j)
        genres_dict[j] = round(cnt/number, 3)
        
        
    genres_dict = dict(sorted(genres_dict.items(), key=lambda item: item[1], reverse= True))
        
    return genres_dict

def genres_rank(rec):
    genres_list, unique_genres_list = get_genres(rec)
    genres_dict = {}
    for k in unique_genres_list:
        genres_dict[k] = []
        
    for a, b in rec.iterrows():
        gList = b["genres"].split("|")
        for gen in gList:
            genres_dict[gen].append(b["movieId"])
 
    genres_dict = dict(sorted(genres_dict.items(), key=lambda item: item[1], reverse= True))
    
    dict_genres_final = {}
    
    for key, item in genres_dict.items():
        dict_genres_final[key] = []
        for mid in item:

            mid = float(rec[rec["movieId"] == mid]["scores"])
            mid = round(mid, 4)
            dict_genres_final[key].append(mid)
            
    for key, item in dict_genres_final.items():
        dict_genres_final[key] = np.average(item)
               
    dict_genres_final = dict(sorted(dict_genres_final.items(), key=lambda item: item[1], reverse= True))
    df = pd.DataFrame(list(dict_genres_final.items()),columns = ['genres','average_score'])
    return df

In [25]:
#All genres 
allGenres, uniqueGenres_all = get_genres(movies_df)
uniqueGenres_all

['Western',
 'Mystery',
 'Comedy',
 'War',
 'Adventure',
 '(no genres listed)',
 'Action',
 'Crime',
 'Film-Noir',
 'Romance',
 'Sci-Fi',
 'Documentary',
 'Horror',
 'Thriller',
 'Animation',
 'IMAX',
 'Children',
 'Musical',
 'Fantasy',
 'Drama']

In [26]:
top20_total_genre, unique_genre_top20 = get_genres(recommendation_df_average_method_final.head(20))
dictAgregate = genres_stat(top20_total_genre, unique_genre_top20 )
genresRankGR = genres_rank(recommendation_df_average_method_final) 
genresRankGR

Unnamed: 0,genres,average_score
0,Film-Noir,3.48086
1,Crime,3.46903
2,War,3.46851
3,Mystery,3.4616
4,Drama,3.455718
5,IMAX,3.454812
6,Thriller,3.453499
7,Western,3.448325
8,Adventure,3.447976
9,Documentary,3.4462


In [27]:
userA_genre, userA_unique_genre = get_genres(recommended_Movies_A.head(20))
dictAGen = genres_stat(userA_genre, userA_unique_genre )
userA_genre_score = genres_rank(recommended_Movies_A.head(20))
userA_genre_score

Unnamed: 0,genres,average_score
0,Comedy,3.031075
1,Thriller,2.990033
2,Horror,2.9839
3,Crime,2.974911
4,Drama,2.946333
5,War,2.925433
6,Action,2.922767
7,Fantasy,2.91655
8,Adventure,2.90885
9,Romance,2.8786


In [28]:
userB_genre, userB_unique_genre = get_genres(recommended_Movies_B.head(20))
dictBGen = genres_stat(userB_genre, userB_unique_genre )
userB_genre_score = genres_rank(recommended_Movies_B.head(20))
userB_genre_score

Unnamed: 0,genres,average_score
0,Crime,4.09225
1,Drama,4.0917
2,Comedy,4.0838
3,Romance,4.08105
4,War,4.0753
5,Sci-Fi,4.07405
6,Fantasy,4.07
7,Children,4.0688
8,Animation,4.0688
9,Adventure,4.068283


In [29]:
userC_genre, userC_unique_genre = get_genres(recommended_Movies_C.head(20))
dictCGen = genres_stat(userC_genre, userC_unique_genre )
userC_genre_score = genres_rank(recommended_Movies_C.head(20))
userC_genre_score

Unnamed: 0,genres,average_score
0,Horror,3.9438
1,Mystery,3.940333
2,Thriller,3.939
3,Crime,3.937171
4,Drama,3.9235
5,Comedy,3.92258
6,Sci-Fi,3.90692
7,Romance,3.8844
8,War,3.88
9,Action,3.856925


In [30]:
recommended_Movies_onlyInUserA = list(set(userA_unique_genre) - set(unique_genre_top20))
recommended_Movies_onlyInUserB = list(set(userB_unique_genre) - set(unique_genre_top20))
recommended_Movies_onlyInUserC = list(set(userC_unique_genre) - set(unique_genre_top20))

print("User A's genre recommended that is not in the group recommendation: ", recommended_Movies_onlyInUserA)
print("User B's genre recommended that is not in the group recommendation: ", recommended_Movies_onlyInUserB)
print("User C's genre recommended that is not in the group recommendation: ", recommended_Movies_onlyInUserC)


User A's genre recommended that is not in the group recommendation:  ['Film-Noir']
User B's genre recommended that is not in the group recommendation:  ['Children', 'Animation']
User C's genre recommended that is not in the group recommendation:  []


In [31]:

def genres_rating(recommendation):
    gen, unique_gen = get_genres(recommendation)
    dictGen = genres_stat(gen, unique_gen )
    genre_score = genres_rank(recommendation)
    return genre_score

def top_num(gen_rec, num, *userRecs):
    recommendation_all = {}
    if num == "ALL":
        recommendation_all["group"] = gen_rec
        recommendation_all["userA"] = userRecs[0]
        recommendation_all["userB"] = userRecs[1]
        recommendation_all["userC"] = userRecs[2]
        
    else:
        
        recommendation_all["group"] = gen_rec.head(num)
        recommendation_all["userA"] = userRecs[0].head(num)
        recommendation_all["userB"] = userRecs[1].head(num)
        recommendation_all["userC"] = userRecs[2].head(num)
    
    genreAvgList = {}
    for key, val in recommendation_all.items():
        genreAvgList[key] = genres_rating(val)
        
    return genreAvgList

def whyNot_group(genres, gen_rec, *userRecs):
    explanationDict = {}
    gen, unique_gen = get_genres(movies_df)
    explanation1List = []
    if (genres not in unique_gen):
        strExp = f'{genres} does not exists in the csv file - Explanation: 1'
        explanation1List.append(strExp)
        explanationDict["explanation1"] = explanation1List
        return explanationDict
    
    num = "ALL"
    listOfGenInRec = top_num(gen_rec, num, *userRecs)
    
    explanation2List = []
    for key, val in listOfGenInRec.items():
        
        if(genres not in val["genres"].tolist()):
            strExp = f'{genres} is not in {key} reccomendation - Explanation: 2'
            explanation2List.append(strExp)
    
    if(len(explanation2List) != 0):
        explanationDict["explanation2"] = explanation2List
        return explanationDict
    
    
    num = 20
    explanation3List = []
    genreAvgList_20 = top_num(gen_rec, num, *userRecs)
    
    for key, val in genreAvgList_20.items():
        
        if(genres not in val["genres"].tolist()):
            strExp = f'{genres} is not in {key} top{num} reccomendation -Explanation: 3 '
            explanation3List.append(strExp)
        
    if(len(explanation3List) != 0):
        explanationDict["explanation3"] = explanation3List
            

    num = 40
    explanation4List = []
    genreAvgList_40 = top_num(gen_rec, num, *userRecs)

    for key, val in genreAvgList_40.items():

            if(genres not in val["genres"].tolist()):
                strExp = f'{genres} is not in {key} top{num} reccomendation '
                explanation4List.append(strExp)
                
    if(len(explanation4List) != 0):
        explanationDict["explanation4"] = explanation4List
        
    return explanationDict

In [32]:
result_genre = whyNot_group("Classic", recommendation_df_average_method_final, recommended_Movies_A, recommended_Movies_B, recommended_Movies_C )
result_genre

{'explanation1': ['Classic does not exists in the csv file - Explanation: 1']}