# Cinema Recommender System 

Using the EasyGUI interface, we have built a small cinema recommender system that implements text-similarity techniques and simple search algorithms.

Importing necessary libraries 

In [2]:
from easygui import *
import easygui as gui 
import pandas as pd 
import numpy as np 
from fuzzywuzzy import process
import sys
#from wordcloud import WordCloud, STOPWORDS

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

We are using three datasets namely:
    tags.csv, movies.csv, ratings.csv 
    
    
Finally, we combine all three datasets into one.

In [3]:
# Reading the datasets.
# Columns are selected by name instead of by position, so a CSV whose column
# layout differs fails loudly rather than silently loading the wrong fields.

tags_data = pd.read_csv("tags.csv", usecols=["movieId", "tag"])            # tags attached to movies
movies_data = pd.read_csv("movies.csv")                                      # movie catalogue (movieId, title, genres)
ratings_data = pd.read_csv("ratings.csv", usecols=["movieId", "rating"])    # individual ratings per movie
    

In [4]:
# Preview the first rows of the tags table (movieId, tag).
tags_data.head()

Unnamed: 0,movieId,tag
0,60756,funny
1,60756,Highly quotable
2,60756,will ferrell
3,89774,Boxing story
4,89774,MMA


In [5]:
# Preview the first rows of the movie catalogue (movieId, title, genres).
movies_data.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
# Preview the first rows of the ratings table (movieId, rating).
ratings_data.head()

Unnamed: 0,movieId,rating
0,1,4.0
1,3,4.0
2,6,4.0
3,47,5.0
4,50,5.0


In [7]:
# Merge the movie catalogue with the individual ratings. Movies that have no
# rating at all receive the global mean rating so they are not lost.
ratings_merged = pd.merge(movies_data, ratings_data, on='movieId', how='outer')
ratings_merged['rating'] = ratings_merged['rating'].fillna(ratings_merged['rating'].mean())

# Average rating per movie, rounded to one decimal.
avg_rating = ratings_merged.groupby('movieId')['rating'].mean().reset_index().round(1)

# Attach the tags and collapse them into one '|'-separated string per movie.
tagged = pd.merge(avg_rating, tags_data, on='movieId', how='outer')
tagged['tag'] = tagged['tag'].fillna('').astype(str)
final_data = tagged.groupby(['movieId', 'rating'])['tag'].apply('|'.join).reset_index()

# Attach title and genres by movieId rather than by row position: positional
# assignment silently pairs movies with the wrong title as soon as one movieId
# is missing from (or extra to) the catalogue. The inner join keeps only movies
# that exist in the catalogue and yields a fresh 0..n-1 index.
catalogue = movies_data[['movieId', 'title', 'genres']].drop_duplicates(subset='movieId')
final_data = pd.merge(final_data, catalogue, on='movieId', how='inner')

In [8]:
# Preview the combined table (movieId, rating, tag, title, genres).
final_data.head()

Unnamed: 0,movieId,rating,tag,title,genres
0,1,3.9,pixar|pixar|fun,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,3.4,fantasy|magic board game|Robin Williams|game,Jumanji (1995),Adventure|Children|Fantasy
2,3,3.3,moldy|old,Grumpier Old Men (1995),Comedy|Romance
3,4,2.4,,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,3.1,pregnancy|remake,Father of the Bride Part II (1995),Comedy


In [9]:
# (rows, columns) of the combined table.
final_data.shape

(9742, 5)

The combined dataset has 9742 rows and 5 columns.

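    A linear kernel is simply the dot product between two feature vectors. It is typically used when the data is linearly separable, that is, when it can be separated by a single line (or hyperplane), and it is one of the most commonly used kernels. It works well when a data set has a large number of features. Text classification is a typical example, because every distinct term (word or n-gram) becomes its own feature, so the linear kernel is the usual choice for text. In this notebook the linear kernel is applied to TF-IDF vectors; since those vectors are normalised to unit length, the dot product of two vectors equals their cosine similarity.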

    Term frequency-inverse document frequency (TF-IDF) is a text vectorizer that transforms text into a usable numeric vector. It combines two concepts: term frequency (TF) and document frequency (DF).
    
    The term frequency is the number of occurrences of a specific term in a document. 
    Document frequency is the number of documents containing a specific term. 
    
    Inverse document frequency (IDF) is a weight applied to a term: it reduces the weight of a term whose occurrences are scattered throughout all the documents, because such a term does little to distinguish one document from another.

In [10]:
# Vectorise the genre strings as TF-IDF over word 1- to 3-grams, ignoring
# English stop words. min_df=1 (the library default) keeps every term that
# occurs at all, which is what min_df=0 was meant to do; an integer 0 is
# rejected by the parameter validation of recent scikit-learn releases.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=1, stop_words='english')

# A missing genre string would make the vectorizer raise, so treat it as empty text.
matrix = tf.fit_transform(final_data['genres'].fillna(''))

# TfidfVectorizer L2-normalises each row by default, so the plain dot product
# computed by linear_kernel is the cosine similarity between two movies.
cosine_similarities = linear_kernel(matrix, matrix)

movie_title = final_data['title']

# Lookup table: movie title -> row position in final_data / cosine_similarities.
# Duplicate titles are reduced to their first occurrence so that
# indices[title] always yields a single position instead of a Series.
indices = pd.Series(final_data.index, index=final_data['title'])
indices = indices[~indices.index.duplicated(keep='first')]

In [None]:
def choose():
    '''
    Show the main menu and open the screen that belongs to the pressed button.

    The menu offers genre search, tag search, rating filter, similar-movie
    lookup and a way back to the welcome screen. Any other return value of
    the button box causes the menu to be shown again.
    '''
    menu_image = "image_2.jpeg"
    prompt = "Select any option:"
    window_title = "Cinema Recommender"

    # Button label -> screen to open. The lambdas defer the name lookup until
    # a button is actually pressed.
    actions = {
        "Search movies by Genre": lambda: genre_option(),
        "Search movies by Tag": lambda: tag_option(),
        "Sort movies by Rating": lambda: rating_option(),
        "Search Similar Movies": lambda: similar_movies(),
        "Return to Main menu": lambda: home_page(),
    }
    labels = list(actions)

    # Keep asking until one of the known buttons is returned.
    pressed = buttonbox(prompt, window_title, labels, image=menu_image)
    while pressed not in actions:
        pressed = buttonbox(prompt, window_title, labels, image=menu_image)

    actions[pressed]()
              
def check_values(msg, title,fieldNames):
    '''
    Ask for the given fields in a multenterbox until none of them is blank.

    msg and title are shown on the first dialog; fieldNames lists the field
    labels. Whenever a field is left empty (or whitespace only) the dialog is
    shown again with a message naming the missing fields and the previous
    answers pre-filled. Returns the list of entered values, or None when the
    dialog returned None.
    '''
    answers = multenterbox(msg, title, fieldNames)

    # None ends the loop immediately; otherwise repeat until nothing is missing.
    while answers is not None:
        complaints = "".join(
            '"%s" is a required field.\n\n' % label
            for position, label in enumerate(fieldNames)
            if answers[position].strip() == ""
        )
        if complaints == "":
            break
        answers = multenterbox(complaints, title, fieldNames, answers)

    return answers

def create_genre_list(genre_input):
    '''
    Match the entered genre against the distinct genre strings of movies_data.

    Opens the genre result list when similarity_test finds at least one match,
    otherwise goes back to the genre input window.
    '''
    known_genres = movies_data['genres'].unique()
    matches = similarity_test(genre_input, known_genres)

    if matches != []:
        genre()
    else:
        genre_option()

def genre_option():
    '''
    Ask the user for a genre and hand it to the similarity search.

    The prompt lists every individual genre found in movies_data except the
    last one collected. The answer is stored in the global user_genre_input
    and passed to create_genre_list(); when check_values() returns None the
    main menu is shown instead.
    '''
    global user_genre_input

    # Individual genre names, each once, in order of first appearance.
    genre_names = list(dict.fromkeys(
        name
        for combo in movies_data['genres'].unique()
        for name in combo.split("|")
    ))

    prompt = f"Choose your favorite genre from the list below :\n {genre_names[ : -1]} \n You'll return to this window if the genre you're looking for isn't found."

    # check_values() re-asks until the field is filled in or returns None.
    answer = check_values(prompt, "Search by genre", ["Genre"])

    if answer is None:
        choose()
        return

    # Single-field form: the genre is the only element of the answer list.
    user_genre_input = answer[0]
    create_genre_list(user_genre_input)
        
        
def genre():
    '''
    Show every movie whose genre string was matched by the last similarity
    search, best rated first, keeping only movies rated >= 2.5.

    Reads the module-level ``final`` (list of (matched_text, score, ...)
    tuples produced by similarity_test), ``movies_data`` and ``final_data``.
    The listing is displayed in an easygui codebox, after which the main menu
    is shown again.
    '''
    # Only the matched text identifies movies; the score must not take part
    # in the lookup. All matches are used, not just the first one.
    matched_genres = [match[0] for match in final]

    # movieIds of the matching movies, each once.
    movie_ids = movies_data.loc[
        movies_data['genres'].isin(matched_genres), ['movieId']
    ].drop_duplicates(subset='movieId')

    # Attach rating and title, sort by rating and drop poorly rated movies.
    listing = pd.merge(movie_ids, final_data, on='movieId', how='left')
    listing = listing.sort_values(by='rating', ascending=False)
    listing = listing[listing['rating'] >= 2.5]

    # Build the text line by line. Converting a NumPy array with str() elides
    # long listings with '...' and adds quote noise, so rows are formatted here.
    rows = [f"{'Rating':<8}Title"]
    rows.extend(
        f"{rating:<8}{title}"
        for rating, title in zip(listing['rating'], listing['title'])
    )

    # displaying matched movies to user
    gui.codebox(msg='Movies filtered by genre:',
                text="\n".join(rows), title='Movie List')

    choose()

def create_tag_list(tag_input):
    '''
    Match the entered tag against the distinct tags of tags_data.

    Opens the tag result list when similarity_test finds at least one match,
    otherwise goes back to the tag input window.
    '''
    known_tags = tags_data['tag'].unique()
    matches = similarity_test(tag_input, known_tags)

    if matches != []:
        tag()
    else:
        tag_option()

def tag_option():
    '''
    Ask the user for a tag and hand it to the similarity search.

    The answer is stored in the global user_tag_input and passed to
    create_tag_list(); when check_values() returns None the main menu is
    shown instead.
    '''
    global user_tag_input

    prompt = "Enter your favourite tag from below list:\n Writing,fantasy,politics, old, fun \nIf tag not found you will be returned to this window"

    # check_values() re-asks until the field is filled in or returns None.
    answer = check_values(prompt, 'Search tag', ["Tag"])

    if answer is None:
        choose()
        return

    # Single-field form: the tag is the only element of the answer list.
    user_tag_input = answer[0]
    create_tag_list(user_tag_input)
        
def tag():
    '''
    Show every movie carrying a tag matched by the last similarity search,
    best rated first, keeping only movies rated >= 2.5.

    Reads the module-level ``final`` (list of (matched_text, score, ...)
    tuples produced by similarity_test), ``tags_data`` and ``final_data``.
    The listing is displayed in an easygui codebox, after which the main menu
    is shown again.
    '''
    # Only the matched text identifies movies; the score must not take part
    # in the lookup. All matches are used, not just the first one.
    matched_tags = [match[0] for match in final]

    # movieIds of the movies carrying one of the matched tags, each once.
    movie_ids = tags_data.loc[
        tags_data['tag'].isin(matched_tags), ['movieId']
    ].drop_duplicates(subset='movieId')

    # Attach rating and title, sort by rating and drop poorly rated movies.
    listing = pd.merge(movie_ids, final_data, on='movieId', how='left')
    listing = listing.sort_values(by='rating', ascending=False)
    listing = listing[listing['rating'] >= 2.5]

    # Build the text line by line. Converting a NumPy array with str() elides
    # long listings with '...' and adds quote noise, so rows are formatted here.
    rows = [f"{'Rating':<8}Title"]
    rows.extend(
        f"{rating:<8}{title}"
        for rating, title in zip(listing['rating'], listing['title'])
    )

    # displaying matched movies to user
    gui.codebox(msg='Movies filtered by tag :',
                text="\n".join(rows), title='Movie List')

    choose()

def rate(a,b):
    '''
    Show all movies whose average rating lies in the interval (a, b],
    best rated first.

    a -- exclusive lower bound of the rating
    b -- inclusive upper bound of the rating

    Reads the module-level ``final_data``. The listing is displayed in an
    easygui codebox, after which the main menu is shown again.
    '''
    in_range = final_data[(final_data['rating'] > a) & (final_data['rating'] <= b)]
    listing = in_range.sort_values(by='rating', ascending=False)

    # Build the text line by line. Converting a NumPy array with str() elides
    # long listings with '...' (rating ranges easily cover thousands of
    # movies) and adds quote noise, so rows are formatted here.
    rows = [f"{'Rating':<8}Title"]
    rows.extend(
        f"{rating:<8}{title}"
        for rating, title in zip(listing['rating'], listing['title'])
    )

    # displaying matched movies to user
    gui.codebox(msg='Movies filtered by rating:',
                text="\n".join(rows), title='Movie List')

    choose()
    
def rating_option():
    '''
    Let the user pick a rating band and show the movies inside it.

    Each band is passed to rate(low, high), which selects ratings in
    (low, high]. When choicebox() returns None the main menu is shown.
    '''
    msg = "Select below rating option:"
    title = "Search by Rating"

    # Choice label -> (exclusive lower bound, inclusive upper bound).
    # The lowest band starts at 0.0: with a lower bound of 1.0 every movie
    # rated 1.0 or less would be unreachable from any option.
    bands = {
        "Rating between 4 - 5 star:": (4.0, 5.0),
        "Rating between 3 - 4 star:": (3.0, 4.0),
        "Rating between 2 - 3 star:": (2.0, 3.0),
        "Rating Below 2 star:": (0.0, 2.0),
    }

    fieldValues = choicebox(msg, title, list(bands))

    if fieldValues in bands:
        low, high = bands[fieldValues]
        rate(low, high)
    elif fieldValues is None:
        choose()
        
def movie_recommend(original_title, top_n=10):
    '''
    Show the movies most similar to ``original_title``.

    original_title -- title to look up in the module-level ``indices`` table
    top_n          -- number of recommendations to list (default 10)

    Similarities come from the module-level ``cosine_similarities`` matrix and
    titles from ``movie_title``. The listing is displayed in an easygui
    codebox, after which the main menu is shown again.
    '''
    idx = indices[original_title]
    # A title that occurs more than once yields a Series; use its first entry.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # Remove the movie itself *before* taking the top entries. Slicing off the
    # first sorted element instead is wrong whenever another movie ties with a
    # similarity of 1.0 and sorts ahead of the selected movie.
    ranked = sorted(
        ((position, score)
         for position, score in enumerate(cosine_similarities[idx])
         if position != idx),
        key=lambda pair: pair[1],
        reverse=True,
    )
    movie_indices = [position for position, _score in ranked[:top_n]]

    title_list = movie_title.iloc[movie_indices]

    # One title per line below a column header.
    rows = ['Title']
    rows.extend(str(name) for name in title_list)

    gui.codebox(msg=f'Similar movies to {original_title} :',
                text="\n".join(rows), title='Movie List')

    choose()
        
def similar_movies():
    '''
    Offer ten randomly sampled movie titles and recommend movies similar to
    the one the user picks. When choicebox() returns None the main menu is
    shown instead.
    '''
    # Ten random titles from the combined table serve as the choice list.
    candidates = list(final_data["title"].sample(n=10))

    picked = choicebox("Select movies:", "To find Similar movies", candidates)

    if picked is None:
        choose()
    else:
        movie_recommend(picked)

        
def similarity_test(user_input, lst):
    '''
    Fuzzy-match ``user_input`` against the candidate strings in ``lst``.

    The candidates returned by fuzzywuzzy's process.extract() are kept only
    when their score is strictly above 90. The kept matches are stored in the
    global ``final`` (read by the result screens) and also returned.
    '''
    global final

    # Each candidate is a tuple whose second element is the similarity score.
    candidates = process.extract(user_input, lst)
    final = [candidate for candidate in candidates if candidate[1] > 90]

    return final

def close():
    '''
    Show the goodbye screen and terminate the program once "Quit" is pressed.

    Any other return value of the button box shows the screen again.
    Raises SystemExit via sys.exit().
    '''
    image = "thankyou.jpeg"
    msg = "Thank you for exploring the movies"
    title = "Bye Bye"
    choices = ["Quit"]

    # Re-show the dialog until the Quit button is pressed (a loop instead of
    # self-recursion, so repeated dismissals cannot grow the call stack).
    while buttonbox(msg, title, choices, image=image) != "Quit":
        pass

    # sys.exit() is the programmatic way to stop; the bare exit() name is an
    # interactive helper injected by site/IPython and is not always available.
    sys.exit()

        
def home_page():
    '''
    Show the welcome screen.

    "Lets go" opens the main menu and "Close" opens the goodbye screen; any
    other return value of the button box shows the welcome screen again.
    '''
    banner = "image_1.jpeg"
    greeting = "Hey guys! Are u bored! Wanna explore movies which u haven't heard of? Come, lets see some list of movies !!! "
    window_title = "Welcome to Cinema Recommender"
    buttons = ["Lets go","Close"]

    while True:
        pressed = buttonbox(greeting, window_title, buttons, image=banner)
        if pressed == "Lets go":
            choose()
            return
        if pressed == "Close":
            close()
            return
        # anything else: ask again
    
# Entry point: open the welcome screen when this code is executed as the main module.
if __name__ == '__main__':
    home_page()