# Prepare data for recommendations display 
- Merge movie data with all meta-data for display + filtering
- Calculate weighted average of ratings
    - Primary sort key for non user filter recommendations
    - Secondary sort key for same cosine similarity in user-item and item-item recommendations
- Downcased versions of variables that the user may input (actor and director for filters, title for item-item)
- Order and rename columns for display - include non-display columns that are needed for setup at the end
- Save as parquet

In [1]:
import pandas as pd
import os
import re
import numpy as np
from scipy.sparse import csc_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as datetime
import operator
import fastparquet
import math

In [2]:
def load_data():
    """Read every input table used to build the recommendation display data.

    Returns a tuple ``(df, movie_ratings, links, imdb_movies, tags, relevance)``:
    the processed movie attributes (with a year-free ``title`` column added),
    the per-movie rating counts/averages, the MovieLens links table, the IMDb
    movie metadata, and the genome tag names and relevance scores.
    """

    def strip_year(title_eng):
        # Cut at the first "(" that is followed by a digit. Parentheses alone
        # are sometimes a valid part of the title, so "(" is not enough.
        return re.split('\\([0-9]', title_eng)[0].strip()

    ### processed data from recommendation data exploration.ipynb
    # movie attributes
    df = pd.read_parquet('movies_processed.parq')
    df['title'] = df['title_eng'].apply(strip_year)

    # number of ratings and average rating per movie
    movie_ratings = pd.read_parquet('movies_ratings.parq')

    ### extra attributes for display
    links = pd.read_csv('data/ml-25m/links.csv')
    imdb_movies = pd.read_csv('data/imdb/IMDb movies.csv')

    ### genome tags for filtering
    tags = pd.read_csv('data/ml-25m/genome-tags.csv')
    relevance = pd.read_csv('data/ml-25m/genome-scores.csv')

    return df, movie_ratings, links, imdb_movies, tags, relevance

## Weighted Average Ratings
- \# of ratings * average rating    
    - If just use average rating, many movies only reviewed once or twice. Want highly rated, frequently watched movies   
    - Note: this ranks poorly rated but frequently watched movies above well rated but infrequently watched movies
- Uses:
    - Primary sort key for non user filter recommendations
    - Secondary sort key for same cosine similarity in user-item and item-item recommendations

In [3]:
def weighted_avg(movie_ratings, df):
    """Attach rating statistics and a popularity-weighted score to each movie.

    Parameters
    ----------
    movie_ratings : DataFrame with columns ``movieId``, ``avg`` and ``cnt``.
        Left unmodified by this function.
    df : DataFrame of movie attributes with a ``movieId`` column.

    Returns
    -------
    A new DataFrame: ``df`` plus ``weighted_avg`` (``avg * cnt``), ``cnt`` and
    ``avg``. Movies without ratings get ``weighted_avg`` and ``cnt`` of 0;
    their ``avg`` stays missing so that it can be displayed as missing.
    """
    # Work on a narrow copy so the caller's ratings frame does not silently
    # gain a 'weighted_avg' column.
    ratings = movie_ratings[['movieId', 'cnt', 'avg']].copy()
    ratings['weighted_avg'] = ratings['avg'] * ratings['cnt']

    # LEFT merge so movies with no ratings are kept -- they can still be
    # recommended if they fit specific filters.
    df = pd.merge(df, ratings[['movieId', 'weighted_avg', 'cnt', 'avg']],
                  on='movieId', how='left')

    # Replace nulls with 0 for the sort keys only.
    # NOT avg: should display missing if missing.
    fill_cols = ['weighted_avg', 'cnt']
    df[fill_cols] = df[fill_cols].fillna(0)

    return df

## Merge in additional IMDB attributes

In [4]:
def imdb_merge(imdb_movies, links, df):
    """Add IMDb description, language, duration and production company to df.

    Parameters
    ----------
    imdb_movies : DataFrame with ``imdb_title_id`` (e.g. ``'tt0000009'``) and
        the columns ``description``, ``language``, ``duration`` and
        ``production_company``. Left unmodified by this function.
    links : DataFrame mapping ``movieId`` to a numeric ``imdbId``.
    df : DataFrame of movies with a ``movieId`` column.

    Returns
    -------
    A new DataFrame: ``df`` plus ``imdbId`` and the four IMDb attributes.

    Raises
    ------
    AssertionError
        If the merges change the number of rows (a movie has no link / no
        IMDb record, or matches more than one).
    """
    n_before = len(df)

    # Standardize IMDb IDs: drop the 'tt' prefix and convert to a number so
    # they are comparable with links.imdbId. Built as a separate frame so the
    # caller's imdb_movies is not modified.
    imdb_ids = pd.to_numeric(imdb_movies.imdb_title_id.str.split('tt').str[1])
    imdb_attrs = imdb_movies[
        ['description', 'language', 'duration', 'production_company']
    ].assign(imdbId=imdb_ids)

    # merge links to identify IMDb movies
    df = pd.merge(df, links[['movieId', 'imdbId']], on='movieId')
    # Merge the IMDb attributes on an explicit key, so the join does not
    # depend on which other column names happen to overlap.
    df = pd.merge(df, imdb_attrs, on='imdbId')

    assert n_before == len(df), (
        'IMDb merge changed the number of movies from '
        '{} to {}'.format(n_before, len(df))
    )

    return df

## Merge in Genome Tags
Limit to tags that are more than 75% relevant to a movie, for filtering  
__Extension__: incorporate the relevance score into the weighted average or recommendation score when deciding what to show

In [5]:
def genome_merge(tags, relevance, df):
    """Attach to each movie the list of genome tags with relevance > 0.75.

    ``tags`` maps ``tagId`` to ``tag``; ``relevance`` holds one ``relevance``
    score per (``movieId``, ``tagId``). The returned frame is ``df`` with a
    new ``tags`` column; movies with no qualifying tag get an empty list.
    """

    def as_list(value):
        # the left merge leaves a non-list placeholder for untagged movies
        return value if isinstance(value, list) else []

    # tag names joined with their per-movie relevance scores
    scored = pd.merge(tags, relevance, on='tagId')

    # keep only strongly relevant tags, then collect them per movie
    strong = scored[scored.relevance > 0.75]
    tags_relevant = strong.groupby('movieId').tag.apply(list).to_frame()
    tags_relevant.columns = ['tags']

    # LEFT merge: a movie is kept even if it has no tags
    merged = pd.merge(df, tags_relevant, left_on='movieId',
                      right_index=True, how='left')
    merged['tags'] = merged['tags'].apply(as_list)

    return merged

## Downcase User Input Variables so can match user input non case-sensitive  
Keep non-downcased version for displaying   
Filters:
- Actors
- Directors     
    
Item-Item input: 
- Titles

In [6]:
def downcasing(df):
    """Add lower-cased copies of the columns that are matched against user input.

    Adds ``actors_downcased`` and ``directors_downcased`` (lists of lower-cased
    names) and ``title_downcased`` to ``df`` in place and returns ``df``. The
    original-case columns are kept for display.
    """

    def lower_each(names):
        return [name.lower() for name in names]

    def lower_one(text):
        return text.lower()

    # list-valued filter columns: (new column, source column)
    list_columns = (
        ('actors_downcased', 'actors_lst'),
        ('directors_downcased', 'director_lst'),
    )
    for target, source in list_columns:
        df[target] = df[source].apply(lower_each)

    # title is a plain string (item-item input)
    df['title_downcased'] = df['title'].apply(lower_one)
    return df

## Create new variables

In [7]:
def new_vars(df):
    """Derive the display/filter columns ``avg``, ``decade`` and ``language_lst``.

    Modifies ``df`` in place and returns it:
    - ``avg`` is rounded to 1 decimal and converted to a string for display
    - ``decade`` is the year rounded down to a multiple of 10, as a string
    - ``language_lst`` is ``language`` split on ', ' (empty list if missing)
    """

    def decade_label(year):
        # string so it can be filtered with lists
        return str(int(math.floor(year / 10.0)) * 10)

    def as_list(value):
        return value if isinstance(value, list) else []

    # ratings for display: 1 decimal, as text
    df['avg'] = df['avg'].round(1).apply(str)

    # decade to filter by
    df['decade'] = df['year'].apply(decade_label)

    # language to list
    df['language_lst'] = df['language'].str.split(', ').apply(as_list)

    return df

## Set up DataFrame for Display

In [8]:
def display_dataframe(df):
    """Return a renamed, reordered copy of ``df`` ready for display.

    Drops ``imdbId`` and ``language``, renames the remaining 20 columns BY
    POSITION (so ``df`` must arrive in the column order produced by the
    preceding steps), and puts the display columns first, followed by the
    non-display columns needed for setup. ``df`` itself is not changed.
    """
    # new names, in the positional order of the columns left after the drop
    positional_names = [
        'movieId', 'title_year', 'Year', 'Genres', 'Director(s)', 'Actors',
        'Filming Countries', 'Title', 'weighted_avg', 'Number of Ratings',
        'Average Rating', 'Description', 'Duration (Minutes)',
        'Production Company', 'Tags', 'actors_downcased',
        'directors_downcased', 'title_downcased', 'decade', 'Language(s)',
    ]
    # display columns first, setup-only columns at the end
    final_order = [
        'Title', 'Year', 'Description', 'Duration (Minutes)', 'Genres',
        'Actors', 'Director(s)', 'Production Company', 'Filming Countries',
        'Language(s)', 'Tags', 'Number of Ratings', 'Average Rating',
        'weighted_avg', 'actors_downcased', 'directors_downcased',
        'title_downcased', 'movieId', 'title_year', 'decade',
    ]

    # drop() returns a new frame, so the rename below leaves df untouched
    trimmed = df.drop(columns=['imdbId', 'language'])
    trimmed.columns = positional_names

    return trimmed[final_order]

# Main Function

In [9]:
def main():
    """Run the full preparation pipeline and save the display dataframe.

    Loads the inputs, adds the weighted average, IMDb metadata, genome tags,
    downcased matching columns and derived variables, formats the result for
    display, writes it to 'recommendation_display.parq' and returns it.
    """
    # inputs
    movies, ratings, links, imdb_movies, tags, relevance = load_data()

    # weighted average used for sorting
    movies = weighted_avg(ratings, movies)

    # IMDb metadata and genome tags
    movies = imdb_merge(imdb_movies, links, movies)
    movies = genome_merge(tags, relevance, movies)

    # lower-cased copies for case-insensitive matching of user input
    # (regular casing is kept for display)
    movies = downcasing(movies)

    # derived variables
    movies = new_vars(movies)

    # final column names and order
    df_display = display_dataframe(movies)

    # persist as parquet
    df_display.to_parquet('recommendation_display.parq',
                          engine='fastparquet', compression='GZIP')

    return df_display

In [None]:
# Run the whole pipeline; main() also writes 'recommendation_display.parq'.
df = main()