In [1]:
#Import dependencies
import pandas as pd
import numpy as np

In [2]:
# Read the two source tables from the working directory

imdb, movies = pd.read_csv("imdb_data.csv"), pd.read_csv("movies.csv")

In [3]:
# Preview the first rows of each table: movies first, then imdb
display(movies.head(), imdb.head())

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


Unnamed: 0,movieId,title_cast,director,runtime,budget,plot_keywords
0,1,Tom Hanks|Tim Allen|Don Rickles|Jim Varney|Wal...,John Lasseter,81.0,"$30,000,000",toy|rivalry|cowboy|cgi animation
1,2,Robin Williams|Jonathan Hyde|Kirsten Dunst|Bra...,Jonathan Hensleigh,104.0,"$65,000,000",board game|adventurer|fight|game
2,3,Walter Matthau|Jack Lemmon|Sophia Loren|Ann-Ma...,Mark Steven Johnson,101.0,"$25,000,000",boat|lake|neighbor|rivalry
3,4,Whitney Houston|Angela Bassett|Loretta Devine|...,Terry McMillan,124.0,"$16,000,000",black american|husband wife relationship|betra...
4,5,Steve Martin|Diane Keaton|Martin Short|Kimberl...,Albert Hackett,106.0,"$30,000,000",fatherhood|doberman|dog|mansion


# Data preprocessing
Define a function that builds a metadata column for content-based filtering.

In [4]:
#Data preprocessing function

def word_bank_maker(movies_df=None, imdb_df=None, max_terms=5):
    """Build a per-movie ``word_bank`` text column for content-based filtering.

    The two frames are inner-joined on ``movieId``. Cast, director, plot
    keywords and genres are normalised and concatenated, in that order, into
    one space-separated string per movie that can be fed to a vectorizer.

    Parameters
    ----------
    movies_df : pandas.DataFrame, optional
        First frame of the merge. Defaults to the notebook-level ``movies``
        frame, looked up when the function is called.
    imdb_df : pandas.DataFrame, optional
        Second frame of the merge. Defaults to the notebook-level ``imdb``
        frame, looked up when the function is called.
    max_terms : int, default 5
        Number of leading cast members and of leading plot keywords to keep.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``movieId``. The ``title_cast``, ``director``,
        ``plot_keywords`` and ``genres`` columns are replaced by a single
        ``word_bank`` column; ``runtime`` and ``budget`` are dropped when
        present; every other merged column is kept.

    Notes
    -----
    The merged frame must contain ``title_cast``, ``director``,
    ``plot_keywords`` and ``genres``; the first three and ``genres`` are
    expected to be pipe-separated strings. Missing values contribute nothing
    to the word bank (they are not rendered as the token ``"nan"``).
    """
    # Resolve the defaults at call time so that re-loaded `movies` / `imdb`
    # frames are picked up without re-running this definition.
    if movies_df is None:
        movies_df = movies
    if imdb_df is None:
        imdb_df = imdb

    text_cols = ['title_cast', 'director', 'plot_keywords', 'genres']

    def _as_text(value):
        # A plain astype(str) would turn a missing value into the literal
        # word "nan", which every incomplete movie would then share.
        return '' if pd.isna(value) else str(value)

    def _squash(name):
        # "John Lasseter" -> "johnlasseter": one token per person.
        return "".join(name.lower().split())

    def _leading_terms(text):
        # "a|b|c|..." -> "a b c", keeping at most `max_terms` entries.
        return " ".join(text.split('|')[:max_terms])

    df = pd.merge(movies_df, imdb_df, on='movieId')
    df = df.drop(columns=['runtime', 'budget'], errors='ignore')

    for col in text_cols:
        df[col] = df[col].map(_as_text)

    # Names are squashed before splitting on '|' so that each cast member
    # stays a single token.
    df['director'] = df['director'].map(_squash)
    df['title_cast'] = df['title_cast'].map(_squash).map(_leading_terms)
    df['plot_keywords'] = df['plot_keywords'].map(_leading_terms)
    df['genres'] = df['genres'].map(lambda text: " ".join(text.lower().split('|')))

    # Vectorised concatenation; each field is followed by a single space.
    df['word_bank'] = (
        df['title_cast'] + " "
        + df['director'] + " "
        + df['plot_keywords'] + " "
        + df['genres'] + " "
    )

    return df.set_index('movieId').drop(columns=text_cols)

In [8]:
# Build the modelling frame from the two raw tables

df = word_bank_maker(movies_df=movies, imdb_df=imdb)

In [9]:
# Preview the first five rows of the processed frame
df.head(n=5)

Unnamed: 0_level_0,title,word_bank
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),tomhanks timallen donrickles jimvarney wallace...
2,Jumanji (1995),robinwilliams jonathanhyde kirstendunst bradle...
3,Grumpier Old Men (1995),waltermatthau jacklemmon sophialoren ann-margr...
4,Waiting to Exhale (1995),whitneyhouston angelabassett lorettadevine lel...
5,Father of the Bride Part II (1995),stevemartin dianekeaton martinshort kimberlywi...


### Content-Based Filtering

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [10]:
# Vectorize the word_bank column into token counts

vectorizer = CountVectorizer()
vect_matrix = vectorizer.fit_transform(df["word_bank"])

In [None]:
# Pairwise cosine similarity of every movie against every other movie
# (with a single argument the matrix is compared against itself)

cos_sim = cosine_similarity(vect_matrix)

In [None]:
def content_model(title, n=11, data=None, similarity=None):
    """Return the titles of the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Exact title to look up, e.g. ``'Toy Story (1995)'``. If several rows
        carry the same title, the first one is used as the query.
    n : int or None, default 11
        One more than the number of titles returned (the query movie takes
        one slot), so the default yields up to 10 recommendations. ``None``
        returns every other movie.
    data : pandas.DataFrame, optional
        Frame with a ``title`` column whose row order matches the rows of
        ``similarity``. Defaults to the notebook-level ``df``.
    similarity : array-like, optional
        Square matrix of pairwise similarity scores. Defaults to the
        notebook-level ``cos_sim``.

    Returns
    -------
    pandas.Series
        Recommended titles, most similar first, labelled by row position.

    Raises
    ------
    KeyError
        If ``title`` does not occur in the ``title`` column.
    """
    if data is None:
        data = df
    if similarity is None:
        similarity = cos_sim

    # Positional labels (0..len-1) line up with the rows of the matrix.
    titles_list = data['title'].reset_index(drop=True)

    matches = np.flatnonzero(titles_list.eq(title).fillna(False).to_numpy(dtype=bool))
    if matches.size == 0:
        raise KeyError(title)
    # Duplicate titles would otherwise select several matrix rows at once.
    title_index = int(matches[0])

    scores = np.asarray(similarity[title_index], dtype=float).ravel()

    # Stable descending order: ties keep their original row order.
    ranked = np.argsort(-scores, kind='stable')

    # Drop the query movie explicitly instead of assuming it sorts first;
    # a tie or an all-zero vector can place another movie ahead of it.
    ranked = ranked[ranked != title_index]

    limit = None if n is None else max(n - 1, 0)
    movie_index = ranked[:limit]

    return titles_list.iloc[movie_index]

### Test the model

In [None]:
# Recommendations for an animated family film
content_model(title='Toy Story (1995)')

In [None]:
# Recommendations for an adventure/fantasy film
content_model(title='Jumanji (1995)')

In [None]:
# Recommendations for a title outside the previewed rows
content_model(title='Ice Age (2002)')