# Generate Recommendations for Non-Users

1. Top rated movies within a filter: weighted average of # of reviews and average ratings - DONE
    - Genomes: only display if above X threshold? Or add relevance score into the weighted average?
2. Item-Item recommendation if they liked movie X - TO DO 

UI elements to do:
- Biographic information for the actor, director (IMDB) (?)
- Add spell check to actor names 

UI in the future: 
- Filtering within personalized recommendations (user ID field)
- Tab allowing users to input own ratings and get a recommendation out
- EDA 

#### To Run:
1. Convert notebook to py file
    - Run in command line: py -m jupyter nbconvert --to script streamlit_example.ipynb
2. Run streamlit app
    - Run in command line: streamlit run streamlit_example.py

In [231]:
import pandas as pd
import os
import numpy as np
from scipy.sparse import csc_matrix
from pandas.api.types import CategoricalDtype
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as datetime
import operator
import streamlit as st
import fastparquet

In [232]:
def load_data():
    """Read every input table the app needs from disk.

    Returns, in order: the processed movie attributes, the per-movie rating
    counts/averages, the MovieLens links table, the IMDb movies table, the
    genome tag names and the genome relevance scores.
    """
    # processed data from recommendation data exploration.ipynb:
    # movie attributes, then number of and average ratings by movie
    movie_attributes = pd.read_parquet('movies_processed.parq')
    rating_summary = pd.read_parquet('movies_ratings.parq')

    # data to get additional attributes for display
    movielens_links = pd.read_csv('data/ml-25m/links.csv')
    imdb_table = pd.read_csv('data/imdb/IMDb movies.csv')

    # genome tags for filtering
    genome_tags = pd.read_csv('data/ml-25m/genome-tags.csv')
    genome_scores = pd.read_csv('data/ml-25m/genome-scores.csv')

    return (movie_attributes, rating_summary, movielens_links,
            imdb_table, genome_tags, genome_scores)

## Weighted Average Ratings 
\# of ratings * average rating    
If just use average rating, many movies only reviewed once or twice. Want highly rated, frequently watched movies   
   
Note that this ranks poorly rated but frequently watched movies above well rated but infrequently watched movies

In [233]:
def weighted_avg(movie_ratings, df):
    """Attach rating statistics and a popularity-weighted score to each movie.

    Parameters
    ----------
    movie_ratings : DataFrame with columns 'movieId', 'avg' and 'cnt'.
        Left unmodified: the score is computed on a private copy.
    df : DataFrame of movie attributes with a 'movieId' column.

    Returns
    -------
    A new DataFrame: `df` plus the columns 'weighted_avg' (avg * cnt),
    'cnt' and 'avg', in that order.
    """
    # work on a copy so the caller's ratings table does not silently gain a column
    ratings = movie_ratings[['movieId', 'cnt', 'avg']].copy()
    ratings['weighted_avg'] = ratings['avg'] * ratings['cnt']

    # LEFT merge so keep movies with no ratings (weighted avg = 0)
    # -- can still recommend if fit specific filters
    df = pd.merge(df, ratings[['movieId', 'weighted_avg', 'cnt', 'avg']],
                  on='movieId', how='left')

    # replace nulls with 0
    # NOT avg: should display missing if missing
    zero_filled = ['weighted_avg', 'cnt']
    df[zero_filled] = df[zero_filled].fillna(0)

    return df

## Merge in additional IMDB attributes

In [234]:
def imdb_merge(imdb_movies, links, df):
    """Add IMDb description, language, duration and production company to `df`.

    Parameters
    ----------
    imdb_movies : DataFrame with 'imdb_title_id' (e.g. 'tt0114709') plus the
        'description', 'language', 'duration' and 'production_company'
        columns. Left unmodified.
    links : DataFrame mapping 'movieId' to a numeric 'imdbId'.
    df : DataFrame of movie attributes with a 'movieId' column.

    Returns
    -------
    A new DataFrame: `df` plus 'imdbId' and the four IMDb attribute columns.

    Raises
    ------
    AssertionError if the merges change the number of rows, i.e. a movie has
    no match (both merges are inner joins) or matches more than once.
    """
    n_before = len(df)

    # standardize IMDB IDs on a copy: strip the 'tt' prefix and make numeric,
    # without adding a column to the caller's table
    imdb_attrs = imdb_movies[['description', 'language', 'duration', 'production_company']].copy()
    numeric_ids = pd.to_numeric(imdb_movies['imdb_title_id'].str.split('tt').str[1])
    imdb_attrs.insert(0, 'imdbId', numeric_ids)

    # merge links to identify IMDB movies
    df = pd.merge(df, links[['movieId', 'imdbId']], on='movieId')
    # merge specific IMDB attributes; join key is explicit so an unrelated
    # shared column name can never become part of the key
    df = pd.merge(df, imdb_attrs, on='imdbId')

    # kept as an assert so the exception type is unchanged for callers
    assert n_before == len(df), (
        'IMDB merge changed the number of movies: %d before, %d after'
        % (n_before, len(df)))

    return df

## Merge in Genome Tags
Limit to tags with a relevance score > 0.75 for a movie    
__Extension__: incorporate the relevance score into the weighted average when deciding what to show

In [235]:
def genome_merge(tags, relevance, df, min_relevance=0.75):
    """Attach to each movie the list of genome tags that are relevant to it.

    Parameters
    ----------
    tags : DataFrame with columns 'tagId' and 'tag'.
    relevance : DataFrame with columns 'movieId', 'tagId' and 'relevance'.
    df : DataFrame of movie attributes with a 'movieId' column.
    min_relevance : a tag is kept for a movie only when its relevance score
        is strictly greater than this value. Defaults to 0.75.

    Returns
    -------
    A new DataFrame: `df` plus a 'tags' column holding a list of tag names
    per movie (an empty list when the movie has no sufficiently relevant tag).
    """
    # merge tags and relevance scores
    scored_tags = pd.merge(tags, relevance, on='tagId')

    # limit to tags above the relevance threshold and
    # get list of relevant tags per movie (indexed by movieId)
    relevant = scored_tags[scored_tags['relevance'] > min_relevance]
    tags_relevant = relevant.groupby('movieId')['tag'].apply(list).to_frame()
    tags_relevant.columns = ['tags']

    # merge with dataframe
    # LEFT merge because want to keep movie even if doesn't have any tags
    df = pd.merge(df, tags_relevant, left_on='movieId', right_index=True, how='left')
    # replace missing tags with an empty list
    df['tags'] = df['tags'].apply(lambda d: d if isinstance(d, list) else [])

    return df

## Downcase Actors, Directors so can match user input non case-sensitive  
Keep non-downcased version for displaying

In [236]:
def downcasing(df):
    """Add lower-cased copies of the actor and director name lists to `df`.

    `df` is modified in place (two new columns) and also returned. The
    original 'actors_lst' / 'director_lst' columns are kept for display.
    """
    def _lowered(names):
        return [name.lower() for name in names]

    # (source column, lower-cased column used for matching user input)
    for source_col, target_col in (('actors_lst', 'actors_downcased'),
                                   ('director_lst', 'directors_downcased')):
        df[target_col] = df[source_col].apply(_lowered)
    return df

## Get Unique Lists of Filter Options

In [237]:
def cat_list_expand(df, var):
    """Flatten the list-valued column `var` to one list entry per row.

    Only `var` and 'movieId' are kept; each movieId is repeated once for
    every entry in that movie's list. Returns a new DataFrame with the
    columns in the order [var, 'movieId'].
    """
    subset = df[[var, 'movieId']]
    # how many output rows each input row contributes
    entries_per_row = subset[var].str.len()

    # repeat every non-list column to line up with the flattened entries
    repeated = {}
    for col in subset.columns.drop(var):
        repeated[col] = np.repeat(subset[col].values, entries_per_row)

    # all list entries laid end to end, in row order
    flattened = np.concatenate(subset[var].values)

    long_form = pd.DataFrame(repeated).assign(**{var: flattened})
    return long_form[subset.columns]

In [238]:
def unique_lists(df):
    """Collect the sorted unique values available for each filter.

    Also adds a 'language_lst' column to `df` in place: the comma-separated
    'language' string split into a list (empty list when it is not a string).

    Returns, in order: unique genres, actors (downcased), directors
    (downcased), countries, languages and tags.
    """
    # language to list
    for source_col in ['language']:
        list_col = source_col + '_lst'
        split_values = df[source_col].str.split(', ')
        df[list_col] = split_values.apply(lambda v: v if isinstance(v, list) else [])

    def _sorted_unique(column):
        # one row per list entry, then unique values sorted alphabetically
        return np.sort(cat_list_expand(df, column)[column].unique())

    genres_unique = _sorted_unique('genres_all')
    actors_unique = _sorted_unique('actors_downcased')
    directors_unique = _sorted_unique('directors_downcased')
    countries_unique = _sorted_unique('country_lst')
    language_unique = _sorted_unique('language_lst')
    tags_unique = _sorted_unique('tags')

    return (genres_unique, actors_unique, directors_unique,
            countries_unique, language_unique, tags_unique)

## Set up DataFrame for Display

In [239]:
def display_dataframe(df):
    """Build the user-facing copy of the movie table.

    `df` is not modified. The copy gets a 'title' column (text of
    'title_eng' before the first '('), loses the id/raw columns, and has its
    remaining columns renamed BY POSITION and then reordered for display.
    """
    # strip year out of title; the new column is appended last
    shown = df.assign(title=df['title_eng'].str.split('(').str[0])

    # drop columns
    shown = shown.drop(columns=['movieId', 'title_eng', 'imdbId', 'language'])

    # rename: positional, so this relies on the incoming column order
    shown.columns = [
        'Year', 'Genres', 'Director(s)', 'Actors', 'Filming Countries',
        'weighted_avg', 'Number of Ratings', 'Average Rating', 'Description',
        'Duration (Minutes)', 'Production Company', 'Tags',
        'actors_downcased', 'directors_downcased', 'Language(s)', 'Title',
    ]

    # reorder: visible columns first, helper columns for sorting/matching last
    display_order = [
        'Title', 'Year', 'Description', 'Duration (Minutes)', 'Genres',
        'Actors', 'Director(s)', 'Production Company', 'Filming Countries',
        'Language(s)', 'Tags', 'Number of Ratings', 'Average Rating',
        'weighted_avg', 'actors_downcased', 'directors_downcased',
    ]
    return shown[display_order]

# Run above functions with caching
These functions will not change when user inputs to filtering change, so only run first time app is opened     
st.cache is not compatible with Jupyter notebooks. Comment it out when running in Jupyter. 

In [249]:
@st.cache
def cached_functions():
    """Run the full data-preparation pipeline behind Streamlit's cache.

    Returns, in order: the full movie DataFrame, its display-formatted copy,
    then the unique genres, actors, directors, countries, languages and tags
    used to populate the filters.
    """
    # read in data
    movies, ratings, links, imdb_movies, tags, relevance = load_data()

    # calculate weighted average for sorting
    movies = weighted_avg(ratings, movies)

    # merge in IMDB metadata and tags
    movies = imdb_merge(imdb_movies, links, movies)
    movies = genome_merge(tags, relevance, movies)

    # downcase actors and directors so user input matches regardless of case
    # (regular casing is kept for display)
    movies = downcasing(movies)

    # unique values of every filter: genres, actors, directors, countries,
    # languages, tags
    filter_options = unique_lists(movies)

    # format for display
    movies_display = display_dataframe(movies)

    return (movies, movies_display, *filter_options)

In [255]:
# prepared data and filter options; computed by the cached pipeline
(df, df_display,
 genres_unique, actors_unique, directors_unique,
 countries_unique, language_unique, tags_unique) = cached_functions()

## Display in Streamlit with filter options
Display:
- Title
- Year
- Description
- Duration
- Genres
- Actors
- Directors
- Production Company
- Country
- Language
- Genome Tags
- Number of ratings
- Average rating    
   
Filter by:
- Genres
- Actors
- Directors
- Country
- Language
- Genome Tags

Default table is highest rated movies without filters    
   
   
Extensions:
- AND/OR advanced search option? This might be difficult

In [None]:
# page heading followed by the usage notes shown above the filters
st.title('Top Rated Movie Recommendations')
st.header('Enter desired filters and select "Display Recommendations" \n')
for usage_note in (
        'Please note filters use AND logic',
        'If you wish to see overall top rated movies, select Display Recommendations without any filters'):
    st.write(usage_note)

In [191]:
# get user inputs: multiple selection possible per category
genre_input = st.multiselect('Select genre(s)', genres_unique)
country_input = st.multiselect('Select country(s)', countries_unique)
language_input = st.multiselect('Select language(s)', language_unique)
tag_input = st.multiselect('Select genome tags(s)', tags_unique)


def _parse_name_filter(raw_text, valid_names, label):
    """Turn a comma-separated text entry into a list of downcased names.

    Names are lower-cased and stripped of surrounding whitespace, and blank
    entries are ignored, so 'Tom Hanks,Meg Ryan ' and 'tom hanks, meg ryan'
    are treated the same. Every name missing from `valid_names` is reported
    on the page as 'Invalid <label>'; if any name is invalid the whole
    filter is dropped (an empty list is returned).
    """
    names = [name.strip() for name in raw_text.lower().split(',')]
    names = [name for name in names if name]
    unknown = [name for name in names if name not in valid_names]
    for name in unknown:
        st.write('Invalid ' + label, name)
    return [] if unknown else names


# actors, directors get text inputs
# Dropdowns too much for streamlit to handle
# allow multiple entries
actor_input = _parse_name_filter(
    st.text_input('Type actor(s) names separated by comma'), actors_unique, 'actor')
director_input = _parse_name_filter(
    st.text_input('Type director(s) names separated by comma'), directors_unique, 'director')

# display recommendations once hit button
if st.button('Display Recommendations'):
    # (display column, values the user asked for)
    selections = [
        ('Genres', genre_input),
        ('Filming Countries', country_input),
        ('Language(s)', language_input),
        ('Tags', tag_input),
        ('actors_downcased', actor_input),
        ('directors_downcased', director_input),
    ]

    # AND logic: a movie is kept only if it contains every selected value of
    # every filter. Filters with nothing selected match every movie, so they
    # are skipped instead of being evaluated row by row.
    keep = pd.Series(True, index=df_display.index)
    for column, wanted in selections:
        if wanted:
            keep &= df_display[column].map(set(wanted).issubset).astype(bool)

    # top 10 by weighted average, without the helper columns
    df_filtered = (
        df_display[keep]
        .sort_values('weighted_avg', ascending=False)
        .head(10)
        .drop(columns=['weighted_avg', 'actors_downcased', 'directors_downcased'])
    )

    # if no valid movies with combination of filters, notify. Else display dataframe
    if len(df_filtered) > 0:
        st.write(df_filtered)
    else:
        st.write('Found no movies that match your selections')