In [20]:
# Imports and notebook-wide display configuration.
import pandas as pd
# Show every row of a displayed DataFrame (no truncation).
pd.set_option('display.max_rows', None)
import requests
# Show up to 30 columns before pandas truncates the displayed frame.
pd.options.display.max_columns = 30
from datetime import date
from IPython.display import display, HTML
import time
import string
# Inject CSS so elements with the "container" class use the full width.
display(HTML("<style>.container { width:100% !important; }</style>"))

In [21]:
#part I: fetching and preparing data

In [22]:
def get_data():
    '''Return the movie dataset used for recommendations, indexed by movie id.

    The user is asked for a TMDB api key. Answering "N" (or "n") loads the
    already existing file 'moviedb.xlsx' and returns it as read by pandas.
    Any other answer is validated with a single request to the TMDB "discover"
    endpoint; the prompt repeats until that request returns HTTP 200. After that
    the data is downloaded in three steps (ids -> movie details -> credits),
    cleaned and returned.

    Returns:
        pandas.DataFrame indexed by 'id'. On the API path the columns are
        poster_path, title, collection, vote_average, vote_count, genres,
        leading_actors, actors, director, screenplay, director_of_photography,
        release_date, runtime, original_language, budget, revenue, ROI.
        budget and revenue are expressed in whole millions.

    Raises:
        requests.exceptions.RequestException: when a request fails at transport
        level or exceeds the timeout.
    '''
    
    print(' Welcome! I am here to help you find your next movie.\n\
        \r Get familiar with readme file to learn details about the way I work\n\
        \r Movie recommendations are prepared using movie data (from TMDB: themoviedb.org)\n\
        \r containing information about 900> films having highest number of votes (ratings)\n\
        \r I may work with already existing file or use TMDB api to get the most up-to-date data.\n\
        \r Second option requires you to have your own api_key - it is fast and easy to get one, but\n\
        \r you need to create a profile on TMBD first.\n')

    #seconds; without a timeout a stalled connection would block the notebook forever
    request_timeout = 30

    while True:

        api_key = input('Insert your api_key to absorb new data (it might take a few minutes), press "N" to use already existing data.\n').strip()

        if api_key in ('N', 'n'):
            print('Loading data...')
            df = pd.read_excel('moviedb.xlsx', index_col = 'id')
            return df

        #one cheap request whose only purpose is to check that the key is accepted
        url = f'https://api.themoviedb.org/3/discover/movie?api_key={api_key}&sort_by=vote_count.desc&page=1&primary_release_date.lte=2023-03-01&with_watch_monetization_types=flatrate'
        r = requests.get(url, timeout = request_timeout)

        if r.status_code != 200:
            print('Something went wrong, try to insert your api key again. You can always use already existing data')
        else:
            print('I am preparing the data, please wait...')
            break


    def api_absorb_ids(api_key, pages = 50):
        '''Collect the ids of the movies with the highest vote_count from the TMDB discover endpoint.

        One request is made per result page (`pages` requests in total), each sorted by
        vote_count and limited to movies released up to today. Returns a Series of unique ids.'''

        #ISO format (YYYY-MM-DD); the previous '%Y-%d-%m' swapped day and month
        today = date.today().isoformat()
        movie_list = []

        for page in range(1, pages + 1):
            #the page number appears exactly once in the query string
            url = f'https://api.themoviedb.org/3/discover/movie?api_key={api_key}&sort_by=vote_count.desc&primary_release_date.lte={today}&with_watch_monetization_types=flatrate&page={page}'
            r = requests.get(url, timeout = request_timeout)
            movie_list.append(r.json())
            print(f'absorbing movies... {round((page/pages)*100)}%', end='\r')

        ids = pd.json_normalize(movie_list, record_path = 'results')['id']
        #the same movie on two pages would be downloaded twice and would double
        #the joined genre/crew strings built below
        return ids.drop_duplicates()


    df = api_absorb_ids(api_key)


    def api_get_details(df, api_key):
        '''Download the detail record of every movie id in `df` (one request per movie)
        and return a frame indexed by id with the nested genres flattened to "A/B/C".'''

        details = []
        total = len(df)

        for i in df:
            url = f'https://api.themoviedb.org/3/movie/{i}?api_key={api_key}'
            r = requests.get(url, timeout = request_timeout)
            details.append(r.json())
            print(f'absorbing details... {round((len(details)/total)*100)}%', end='\r')

        df = pd.json_normalize(details)
        df = df.set_index('id', drop = True)

        #column genres contains nested data: extract the genre names, join them per movie
        #("Action/Thriller" instead of "[{'id': 28, 'name': 'Action'}, ...]") and merge
        #the result back on the movie id (index)
        df_genres = pd.json_normalize(details, record_path = 'genres', record_prefix = 'g_', meta = 'id').groupby('id').agg({'g_name':'/'.join})
        df = df.drop(columns = 'genres')
        df = df.merge(df_genres, how = 'left', left_index = True, right_index = True)
        df = df.rename(columns = {'g_name': 'genres'})

        return df


    df = api_get_details(df, api_key)

    def api_get_artists(df, api_key):
        '''Download the credits of every movie in `df` (one request per movie) and merge
        new columns into it: one column per job in jobs_list taken from the 'crew' records,
        and 'actors' taken from the 'cast' records whose known_for_department is 'Acting'.
        Several names for the same movie are joined with "/".'''

        artists_list = []
        total = len(df.index)

        for i in df.index:
            url = f'https://api.themoviedb.org/3/movie/{i}/credits?api_key={api_key}&language=en-US'
            r = requests.get(url, timeout = request_timeout)
            artists_list.append(r.json())
            print(f'getting info about crews... {round((len(artists_list)/total)*100)}%', end='\r')

        crew = pd.json_normalize(artists_list, record_path = 'crew', record_prefix = 'c_', meta = 'id')
        jobs_list = ['Director','Screenplay','Director of Photography']

        #getting 'Director','Screenplay','Director of Photography' from crew
        for job in jobs_list:
            temp_df = crew[crew['c_job'] == job].groupby('id').agg({'c_name': '/'.join}).rename(columns={'c_name': job})
            df = df.merge(temp_df, how = 'left', left_index = True, right_index = True)

        #getting actors from cast
        cast = pd.json_normalize(artists_list, record_path = 'cast', record_prefix = 'c_', meta = 'id')
        cast = cast[cast.c_known_for_department == 'Acting'][['c_name', 'id']]
        cast = cast.groupby('id').agg({'c_name': '/'.join})
        df = df.merge(cast, how = 'left', left_index = True, right_index = True).rename(columns = {'c_name': 'actors'})

        return df

    df = api_get_artists(df, api_key)

    #creating ROI column - revenue divided by budget. ROI is going to be used while making recommendations
    df['ROI'] = round(df.revenue.div(df.budget),1)

    #column 'actors' may contain a lot of text, so a shortened 'leading_actors' column (first 50
    #characters + '...') is created for display. The full 'actors' column is still used for matching.
    df['leading_actors'] = df.actors.apply(lambda x: x[:50]+'...' if isinstance(x,str) else pd.NA)

    #keep only important columns and rename few of them
    df = df.loc[:,['poster_path','title','belongs_to_collection.name','vote_average','vote_count','genres','leading_actors','actors','Director','Screenplay','Director of Photography',
                   'release_date','runtime','original_language','budget','revenue','ROI']]
    df = df.rename(columns = {'Director':'director','Screenplay':'screenplay','Director of Photography':'director_of_photography', 'belongs_to_collection.name':'collection'})

    #there should be no duplicates, but just in case
    df = df.drop_duplicates()

    #cleaning: empty strings and zeros mean "unknown", turn them into NA and use nullable dtypes
    for column in df:
        dtype = df[column].dtype

        if dtype == 'O':
            df[column] = df[column].astype('string')
            df.loc[df[column] == "", column] = pd.NA
            df.loc[df[column] == " ", column] = pd.NA

        elif dtype == 'int64':
            #convert first: a plain int64 column cannot hold pd.NA
            df[column] = df[column].astype('Int64')
            df.loc[df[column] == 0, column] = pd.NA

        elif dtype == 'float64':
            df.loc[df[column] == 0, column] = pd.NA
            df[column] = df[column].astype('Float64')

    #'vote_average','genres', 'budget','revenue','ROI' are crucial columns for recommendations, let's drop movies with NAs in those
    df = df.dropna(subset = ['vote_average','genres', 'budget','revenue','ROI'])

    #budget and revenue in whole millions. The column names stay 'budget' and 'revenue':
    #the rename that used to follow discarded its result, so the names never changed.
    df['budget'] = (df.budget // 1000000).astype('Int64')
    df['revenue'] = (df.revenue // 1000000).astype('Int64')

    df['vote_average'] = round(df.vote_average,1)

    return df

In [23]:
#part II: creating functions used later during choosing right movie recommendations

In [24]:
def get_users_preferences(df):
    '''Ask the user for a favourite director and actor and derive favourite genres.

    Names are matched as literal substrings of the 'director' / 'actors' columns
    (no regular expressions, missing values never match). Every answer is tried
    as typed first and then in string.capwords() form, so "david fincher" finds
    "David Fincher" while "DiCaprio" is not turned into "Dicaprio" needlessly.
    Empty answers are rejected. The director question can be skipped with "N"
    (or "n"); the actor question cannot be skipped.

    Args:
        df: DataFrame with 'director', 'actors' and 'genres' columns, genres
            being "/"-separated strings.

    Returns:
        dict with keys
          'dire'          - matched director name, or 'N' when skipped,
          'act'           - matched actor name,
          'fav_genres'    - Index of genres ordered by how often they occur in the
                            movies of the director (of the actor when skipped),
          'genres_to_use' - number of leading genres to start with (at most 4).
    '''

    def _mentions(column, name):
        #literal and NA-safe: user input must not be interpreted as a regex and
        #rows without crew information simply do not match
        return df[column].str.contains(name, regex = False, na = False)

    def _find_spelling(column, typed):
        #return the spelling of `typed` that occurs in `column`, or None
        for candidate in (typed, string.capwords(typed)):
            if candidate and _mentions(column, candidate).any():
                return candidate
        return None

    #getting fav. director and making sure it is included in the dataset. Can be skipped by pressing "N"
    while True:
        dire = input('Enter the full name of your favorite director, for example "David Fincher" \n\
        \rIf you do not have a favorite director, press N \n ').strip()

        if dire in ('N', 'n'):
            dire = 'N' #the other functions recognise exactly 'N' as "no director"
            break

        found = _find_spelling('director', dire)
        if found is not None:
            dire = found
            break

        print('Director not found -it might not be inluded in the dataset. Check if you made a typo. Try again\n')

    #getting fav. actor and making sure it is included in the dataset. Not skippable.
    while True:
        act = input('Enter the full name of your favorite actor \n ').strip()

        found = _find_spelling('actors', act)
        if found is not None:
            act = found
            break

        print('Actor not found -it might not be inluded in the dataset. Check if you made a typo. Try again\n')

    #most common genres of the director's movies; if the director was skipped, of the actor's movies
    if dire == 'N':
        relevant = _mentions('actors', act)
    else:
        relevant = _mentions('director', dire)

    fav_genres = df.loc[relevant, 'genres'].str.split('/').explode().value_counts().index

    #start with at most 4 genres
    genres_to_use = min(len(fav_genres), 4)

    return {'dire': dire, 'act': act, 'fav_genres': fav_genres, 'genres_to_use': genres_to_use}



def create_genres_mask(df, fav_genres, genres_to_use):
    '''Build a boolean mask selecting movies that contain ALL of the first
    `genres_to_use` favourite genres.

    While preparing recommendations the number of genres may be reduced to make
    more recommendations possible - the mask is then simply created again.

    Args:
        df: DataFrame with a 'genres' column of "/"-separated genre names.
        fav_genres: ordered sequence of genre names, most important first.
        genres_to_use: how many leading genres must be present. Any number is
            accepted; 0 (or less) puts no restriction on genres, and a number
            larger than len(fav_genres) uses all of them.

    Returns:
        Boolean Series aligned with df.index. Movies without genre information
        never match a genre.
    '''

    #start from "everything matches" and narrow down one genre at a time
    mask_genres = pd.Series(True, index = df.index)

    for genre in fav_genres[:max(genres_to_use, 0)]:
        #genre names are plain text, so match them literally and treat NA as "no match"
        mask_genres = mask_genres & df.genres.str.contains(genre, regex = False, na = False)

    return mask_genres


def create_ROI_mask(df, act):
    '''Build a mask selecting movies whose ROI is at least the median ROI of the
    movies starring the favourite actor.

    While preparing recommendations this mask may be left out in order to get
    more recommendations. Read readme to learn more about the ROI indicator.

    Args:
        df: DataFrame with 'actors' and 'ROI' columns.
        act: actor name, matched as a literal substring of 'actors'.

    Returns:
        Boolean Series aligned with df.index.
    '''
    #literal, NA-safe match: movies without cast information do not count as the actor's movies
    stars_actor = df.actors.str.contains(act, regex = False, na = False)
    roi_threshold = df.loc[stars_actor, 'ROI'].median()

    mask_ROI = (df.ROI >= roi_threshold)
    return mask_ROI


def create_crew_mask(df, dire, act):
    '''Build a mask that excludes movies made by the favourite director or
    starring the favourite actor - the user probably knows those already.

    Args:
        df: DataFrame with 'actors' and 'director' columns.
        dire: director name, or 'N' when the user has no favourite director
            (then only the actor is considered).
        act: actor name.

    Returns:
        Boolean Series aligned with df.index; True means "keep the movie".
        Names are matched as literal substrings. A movie whose actors (or
        director, when one is given) are missing is excluded as well, which is
        how missing values already behaved for nullable string columns.
    '''

    #na = True before negating: unknown crew -> treated as a possible match -> excluded
    mask_crew = ~df.actors.str.contains(act, regex = False, na = True)

    if dire != 'N':
        mask_crew = mask_crew & ~df.director.str.contains(dire, regex = False, na = True)

    return mask_crew
    
    
def create_collection_mask(df, dire, act):
    '''Build a mask that excludes movies belonging to a collection the user
    probably knows already. For example: Nolan created "Batman Begins", so with
    Nolan as favourite director no movie from that movie's collection is recommended.

    Args:
        df: DataFrame with 'actors', 'director' and 'collection' columns.
        dire: director name, or 'N' when the user has no favourite director.
        act: actor name.

    Returns:
        Boolean Series aligned with df.index; True means "keep the movie".
        Movies that are not part of any collection are always kept.
    '''

    #movies of the favourite actor (and director); literal, NA-safe name matching
    known_movies = df.actors.str.contains(act, regex = False, na = False)

    if dire != 'N':
        known_movies = known_movies | df.director.str.contains(dire, regex = False, na = False)

    #collections those movies belong to; movies outside any collection contribute nothing
    collection_list = df.loc[known_movies, 'collection'].dropna().to_list()

    mask_collection = (~df.collection.isin(collection_list))
    return mask_collection


In [25]:
#part III: creating recommendations

In [26]:
def _display_recommendations(df_rec):
    '''Render one set of recommendations as an HTML table, best rated first.'''
    styled = (df_rec.sort_values(by = 'vote_average', ascending = False)
                    .drop(columns = 'actors') #only the shortened leading_actors column is shown
                    .style.set_sticky(axis = 1) #sticky freezes the headers
                    .format({'vote_average': "{:.1f}", 'ROI': "{:.1f}"}))
    #rendering through HTML brings the posters on
    display(HTML(styled.to_html(escape = False)))


def _ask_for_more():
    '''Ask until the user answers Y or N (upper or lower case); return 'Y' or 'N'.'''
    while True:
        answer = input('Press Y if you want me to find another recommendations Y/N \n').strip().upper()

        if answer in ('Y', 'N'):
            return answer

        print('Please press "Y" or "N" \n')


def movie_recommendation(df):
    '''Interactively recommend movies from `df` based on the user's preferences.

    The user's favourite director/actor are collected first. Recommendations are
    then produced in rounds. Every round has two stages:
      1. movies matching the favourite genres, crew, collection and ROI masks,
      2. the same without the ROI limitation.
    Only movies with vote_average >= 6.5 that were not shown before are listed.
    After each non-empty stage the user decides whether to continue; an empty
    stage continues automatically. When a round is exhausted the number of
    genres used is reduced by one and a new round starts. The function stops
    when the user answers "N" or when the last round (one genre) is exhausted.

    Args:
        df: DataFrame as returned by get_data(). It is not modified.

    Returns:
        None. Results are displayed in the notebook.
    '''

    #work on a copy: the caller's frame stays untouched, so calling this function
    #again does not wrap the poster path in <img> markup a second time
    df = df.copy()
    df['poster_path'] = '<img src="http://image.tmdb.org/t/p/w185/' + df.poster_path + '" style="max-height:124px;"/>'

    #ids of every movie already shown
    seen_ids = set()

    preferences = get_users_preferences(df)
    fav_genres = preferences['fav_genres']
    genres_to_use = preferences['genres_to_use']

    mask_genres = create_genres_mask(df, fav_genres, genres_to_use)
    mask_ROI = create_ROI_mask(df, preferences['act'])
    mask_crew = create_crew_mask(df, preferences['dire'], preferences['act'])
    mask_collection = create_collection_mask(df, preferences['dire'], preferences['act'])

    while True:

        #first stage keeps the ROI limitation, second stage drops it
        for use_ROI in (True, False):

            if use_ROI:
                print('Preparing recommendations...')
            else:
                print('Working on it...')
                time.sleep(1)

            selection = mask_genres & mask_crew & mask_collection & (~df.index.isin(seen_ids)) & (df.vote_average >= 6.5)
            if use_ROI:
                selection = selection & mask_ROI

            df_rec = df[selection]
            seen_ids.update(df_rec.index)

            if len(df_rec) > 0:
                _display_recommendations(df_rec)
                wants_more = _ask_for_more()
            else:
                #nothing to show at this stage, move on without asking
                wants_more = 'Y'

            if wants_more == 'N':
                print('Thank you for using me to find your next movie. Bye!')
                return

        #both stages are done for the current genres
        if genres_to_use <= 1:
            #that was the last set possible: based on only one genre and no ROI limitation
            print(f'No more recommendations available. You saw {len(seen_ids)} recommendations')
            return

        #reduce the number of genres and start the next round, with the ROI limitation again
        genres_to_use -= 1
        mask_genres = create_genres_mask(df, fav_genres, genres_to_use)

In [27]:
# Load the dataset (from the existing file or from the TMDB API, depending on
# the user's answer) and start the interactive recommendation session.
df = get_data()
movie_recommendation(df)

 Welcome! I am here to help you find your next movie.
 Get familiar with readme file to learn details about the way I work
 Movie recommendations are prepared using movie data (from TMDB: themoviedb.org)
 containing information about 900> films having highest number of votes (ratings)
 I may work with already existing file or use TMDB api to get the most up-to-date data.
 Second option requires you to have your own api_key - it is fast and easy to get one, but
 you need to create a profile on TMBD first.

Insert your api_key to absorb new data (it might take a few minutes), press "N" to use already existing data.
N
Loading data...
Enter the full name of your favorite director, for example "David Fincher" 
        
If you do not have a favorite director, press N 
 David Fincher
Enter the full name of your favorite actor 
 Al Pacino
Preparing recommendations...


Unnamed: 0_level_0,poster_path,title,collection,vote_average,vote_count,genres,leading_actors,director,screenplay,director_of_photography,release_date,runtime,original_language,budget,revenue,ROI
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
322,,Mystic River,,7.7,5720,Thriller/Crime/Drama/Mystery,Sean Penn/Tim Robbins/Kevin Bacon/Laurence Fishbur...,Clint Eastwood,Brian Helgeland,Tom Stern,2003-02-20,138,en,25,156,6.3
395834,,Wind River,Wind River Collection,7.4,4530,Crime/Drama/Mystery/Thriller,Jeremy Renner/Elizabeth Olsen/Gil Birmingham/Graha...,Taylor Sheridan,,Ben Richardson,2017-08-03,107,en,11,44,4.1


Press Y if you want me to find another recommendations Y/N 
Y
Working on it...


Unnamed: 0_level_0,poster_path,title,collection,vote_average,vote_count,genres,leading_actors,director,screenplay,director_of_photography,release_date,runtime,original_language,budget,revenue,ROI
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
7485,,Shooter,,7.1,4181,Action/Drama/Mystery/Thriller/Crime,Mark Wahlberg/Michael Peña/Danny Glover/Kate Mara/...,Antoine Fuqua,,Peter Menzies Jr.,2007-03-22,124,en,61,95,1.6


Press Y if you want me to find another recommendations Y/N 
Y
Preparing recommendations...


Unnamed: 0_level_0,poster_path,title,collection,vote_average,vote_count,genres,leading_actors,director,screenplay,director_of_photography,release_date,runtime,original_language,budget,revenue,ROI
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
670,,Oldboy,,8.3,7372,Drama/Thriller/Mystery/Action,Choi Min-sik/Yoo Ji-tae/Kang Hye-jung/Kim Byeong-o...,Park Chan-wook,Park Chan-wook/Hwang Jo-yoon/Lim Joon-hyung,Jeong Jeong-hun,2003-11-21,120,ko,3,15,5.1
11324,,Shutter Island,,8.2,21451,Drama/Thriller/Mystery,Leonardo DiCaprio/Mark Ruffalo/Ben Kingsley/Michel...,Martin Scorsese,Laeta Kalogridis,Robert Richardson,2010-02-14,138,en,80,294,3.7
745,,The Sixth Sense,,7.9,10319,Mystery/Thriller/Drama,Bruce Willis/Haley Joel Osment/Toni Collette/Olivi...,M. Night Shyamalan,,Tak Fujimoto,1999-08-06,107,en,40,672,16.8
2501,,The Bourne Identity,The Bourne Collection,7.5,8288,Action/Drama/Mystery/Thriller,Matt Damon/Franka Potente/Chris Cooper/Clive Owen/...,Doug Liman,Tony Gilroy/W. Blake Herron,Oliver Wood,2002-06-14,119,en,60,214,3.6
2503,,The Bourne Ultimatum,The Bourne Collection,7.4,6859,Action/Drama/Mystery/Thriller,Matt Damon/Julia Stiles/David Strathairn/Scott Gle...,Paul Greengrass,George Nolfi/Scott Z. Burns/Tony Gilroy,Oliver Wood,2007-08-03,115,en,70,442,6.3
251,,Ghost,,7.2,4907,Fantasy/Drama/Thriller/Mystery/Romance,Patrick Swayze/Demi Moore/Whoopi Goldberg/Tony Gol...,Jerry Zucker,,Adam Greenberg,1990-07-12,127,en,22,505,23.0
9741,,Unbreakable,,7.1,8308,Thriller/Drama/Mystery,Bruce Willis/Samuel L. Jackson/Robin Wright/Spence...,M. Night Shyamalan,,Eduardo Serra,2000-11-22,106,en,75,248,3.3
2675,,Signs,,6.7,4920,Drama/Thriller/Science Fiction/Mystery,Mel Gibson/Joaquin Phoenix/Rory Culkin/Abigail Bre...,M. Night Shyamalan,,Tak Fujimoto,2002-08-02,106,en,72,408,5.7


Press Y if you want me to find another recommendations Y/N 
N
Thank you for using me to find your next movie. Bye!
