In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the movie catalogue; the preview below shows the columns movieId, title and
# genres (genres are '|'-separated at this point).
movies_df = pd.read_csv('Data/Input/movies.csv')
print(movies_df.shape)
movies_df.head()

(27278, 3)


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
# Load the user tags; the preview below shows the columns userId, movieId, tag and timestamp.
userItem_df = pd.read_csv('Data/Input/tags.csv')
print(userItem_df.shape)
userItem_df.head()

(465564, 4)


Unnamed: 0,userId,movieId,tag,timestamp
0,18,4141,Mark Waters,1240597180
1,65,208,dark hero,1368150078
2,65,353,dark hero,1368150079
3,65,521,noir thriller,1368149983
4,65,592,dark hero,1368150078


In [4]:
# Left join: every tag row is kept, and title/genres are attached from movies_df
# (they are NaN for any movieId that has no match in movies_df).
df = pd.merge(userItem_df, movies_df, on='movieId', how='left')
df.shape

(465564, 6)

In [5]:
# Drop tag rows whose movieId had no match in movies_df (the left merge leaves their
# title/genres as NaN). Indexing with the boolean frame `~df.isna()` does not do this:
# it only masks individual cells and keeps every row, so the shape never changed.
df = df.dropna(subset=['title', 'genres'])
df.shape

(465564, 6)

In [6]:
# Keep one row (the first encountered) per (userId, movieId) pair.
df = df.drop_duplicates(subset=['userId', 'movieId'])

In [7]:
df.shape

(174844, 6)

In [8]:
df.head()

Unnamed: 0,userId,movieId,tag,timestamp,title,genres
0,18,4141,Mark Waters,1240597180,Head Over Heels (2001),Comedy|Romance
1,65,208,dark hero,1368150078,Waterworld (1995),Action|Adventure|Sci-Fi
2,65,353,dark hero,1368150079,"Crow, The (1994)",Action|Crime|Fantasy|Thriller
3,65,521,noir thriller,1368149983,Romeo Is Bleeding (1993),Crime|Thriller
4,65,592,dark hero,1368150078,Batman (1989),Action|Crime|Thriller


# Content-Based Filtering: Top-N using TF-IDF
Recommend other movies based on the genres of the selected movie.

In [9]:
def remove_bars(value):
    """Return ``value`` with every '|' separator replaced by a single space."""
    return value.replace('|', ' ')
# Turn 'A|B|C' into 'A B C' so the genres can be tokenised as words.
movies_df['genres'] = movies_df['genres'].apply(remove_bars)
# Keep only the first 10,000 movies.
# NOTE(review): presumably this bounds the cost of the similarity computation below - confirm.
# .copy() detaches the subset from the full frame, so re-running this cell does not
# assign into a slice (SettingWithCopyWarning).
movies_df = movies_df.iloc[:10000, :].copy()

In [10]:
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy
1,2,Jumanji (1995),Adventure Children Fantasy
2,3,Grumpier Old Men (1995),Comedy Romance
3,4,Waiting to Exhale (1995),Comedy Drama Romance
4,5,Father of the Bride Part II (1995),Comedy


In [11]:
def generate_tfidf_vector(title):
    """Return the movies whose genres are most similar to those of ``title``.

    A TF-IDF model is fitted on the module-level ``movies_df['genres']`` and every
    other movie is ranked by cosine similarity to the selected one.

    Note: a later cell defines another function with this same name but a different
    signature; once that cell has run, it replaces this definition.

    Parameters
    ----------
    title : str
        Exact value from ``movies_df['title']``, e.g. ``'Toy Story (1995)'``. If the
        title occurs more than once, the first occurrence is used.

    Returns
    -------
    pandas.Series
        Up to 20 titles, most similar first. The selected movie itself is never included.

    Raises
    ------
    KeyError
        If ``title`` does not occur in ``movies_df['title']``.
    """
    # min_df=1 keeps every term, exactly as min_df=0 did; newer scikit-learn versions
    # validate the parameter and only accept integers >= 1.
    # NOTE: the default token pattern splits hyphenated genres such as 'Sci-Fi' in two.
    tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 1), min_df=1, stop_words='english')
    tfidf_matrix = tf.fit_transform(movies_df['genres'])

    # Work with row positions so the lookup does not depend on the frame's index labels.
    matches = np.flatnonzero((movies_df['title'] == title).to_numpy())
    if matches.size == 0:
        raise KeyError(title)
    idx = int(matches[0])

    # TF-IDF rows are L2-normalised by default, so the dot product is the cosine similarity.
    # Only the selected movie's row is needed, not the full movie-by-movie matrix.
    sim_row = linear_kernel(tfidf_matrix[idx:idx + 1], tfidf_matrix).ravel()

    # Stable descending sort: ties keep their original order. The selected movie is removed
    # explicitly because it is not guaranteed to rank first: any earlier movie with
    # identical genres has the same score.
    ranked = np.argsort(-sim_row, kind='stable')
    ranked = ranked[ranked != idx][:20]
    return movies_df['title'].iloc[ranked]

In [12]:
generate_tfidf_vector('Toy Story (1995)').head(10)

2209                                       Antz (1998)
3027                                Toy Story 2 (1999)
3663    Adventures of Rocky and Bullwinkle, The (2000)
3922                  Emperor's New Groove, The (2000)
4790                             Monsters, Inc. (2001)
1949                        Black Cauldron, The (1985)
2032                     Lord of the Rings, The (1978)
3312             We're Back! A Dinosaur's Story (1993)
4271                  Atlantis: The Lost Empire (2001)
4424                      Land Before Time, The (1988)
Name: title, dtype: object

# Content Based Predictive Recommendation
Let us consider the case where we have a list of 3 movies that will be shown in cinemas. We want to recommend these movies, via push notification, to each of our users based on their past preferences. Moreover, keep in mind that we do not recommend a movie that the user has already watched.
For this we will not be adding new movies; we will use movies that are already available in the dataset.

Let's consider these three movies: Frozen (2013), John Wick (2014), Divergent (2014).

In [None]:
# Look up the genres stored for 'Divergent'. na=False treats missing titles as
# non-matches (a NaN in the mask makes boolean indexing raise); regex=False matches
# the text literally.
df.loc[df['title'].str.contains('Divergent', na=False, regex=False), 'genres'].values

In [13]:
# Same '|' -> ' ' clean-up as applied to movies_df, now on the merged user/tag frame.
df['genres'] = df['genres'].map(remove_bars)

In [14]:
def knowledge_base(gdf):
    """Join the distinct ``genres`` strings of ``gdf`` into one space-separated string.

    Distinctness is per whole genre string (in first-seen order), so an individual
    genre word can still appear several times in the result.
    """
    distinct_genres = gdf['genres'].drop_duplicates()
    return ' '.join(distinct_genres)
# One row per user: userId plus the combined genre text ('TYPE') built by knowledge_base.
kb = (
    df.groupby(by='userId')
      .apply(knowledge_base)
      .to_frame(name='TYPE')
      .reset_index()
)

In [15]:
# compare the vectorized input movie with all the movies watched by active users
def generate_tfidf_vector(all_users_df, movie_info_dict, verbose=True):
    """Score one candidate movie against the genre text of every row in ``all_users_df``.

    The movie is appended as an extra row, a TF-IDF model is fitted on the ``'TYPE'``
    column of the combined frame, and the cosine similarity between the movie's row and
    every row (including itself) is stored in a new column named after the movie title.

    Note: this definition replaces the single-argument ``generate_tfidf_vector`` defined
    in an earlier cell.

    Parameters
    ----------
    all_users_df : pandas.DataFrame
        Must contain a ``'TYPE'`` column of space-separated genre text. Not modified.
    movie_info_dict : dict
        Row to append; must provide ``'TYPE'`` (genre text) and ``'title'`` (used as the
        name of the similarity column).
    verbose : bool, default True
        Print the intermediate shapes, as the original implementation always did.

    Returns
    -------
    pandas.DataFrame
        ``all_users_df`` plus the appended movie row and the new similarity column.
        Callers are expected to filter the appended row out again.
    """
    # min_df=1 keeps every term, exactly as min_df=0 did; newer scikit-learn versions
    # validate the parameter and only accept integers >= 1.
    tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 1), min_df=1, stop_words='english')

    # DataFrame.append was removed in pandas 2.0; concatenating a one-row frame is the
    # documented replacement.
    movie_row = pd.DataFrame([movie_info_dict])
    temp_df = pd.concat([all_users_df, movie_row], ignore_index=True)
    if verbose:
        print(temp_df.shape)

    tfidf_matrix = tf.fit_transform(temp_df['TYPE'])
    if verbose:
        print('tfidf shape ', tfidf_matrix.shape)

    # The candidate movie is the last row; slice (rather than index) to keep it 2-D.
    n_rows = tfidf_matrix.shape[0]
    movie_vector = tfidf_matrix[n_rows - 1:n_rows]
    # TF-IDF rows are L2-normalised by default, so the dot product is the cosine similarity.
    cosine_sim = linear_kernel(movie_vector, tfidf_matrix)
    if verbose:
        print('movie matrix', movie_vector.shape)
        print('Cosine Similarity:', cosine_sim.shape)
        print(type(cosine_sim))

    temp_df[movie_info_dict['title']] = cosine_sim[0]
    return temp_df


In [19]:
# Build a frame holding the similarity between each user's genre profile and each
# of the provided movies.
movie_genre_list = [
    ('Frozen (2013)', 'Adventure Animation Comedy Fantasy Musical Romance'),
    ('John Wick (2014)', 'Action Thriller'),
    ('Divergent (2014)', 'Adventure Romance Sci-Fi'),
]
# movie_names is read by top_recommendations; test_list holds the placeholder userIds
# given to the movie rows that generate_tfidf_vector appends.
movie_names = [title for title, _ in movie_genre_list]
test_list = ['TEST' + str(index) for index in range(len(movie_genre_list))]
input_list = []  # unused in this cell; kept in case another cell still refers to it

rslt = kb
print("Unique users count:", rslt.userId.nunique())
for placeholder_id, (title, genres) in zip(test_list, movie_genre_list):
    input_data = {'userId': placeholder_id, 'title': title, 'TYPE': genres}
    rslt = generate_tfidf_vector(rslt, input_data)

# Remove the appended movie rows and the helper 'title' column in one step. This
# yields a new frame instead of dropping in place on a filtered slice.
rslt = rslt[~rslt['userId'].isin(test_list)].drop(columns='title')

Unique emails count: 7801
(7802, 3)
tfidf shape  (7802, 23)
movie matrix (1, 23)
Cosine Similarity: (1, 7802)
<class 'numpy.ndarray'>
(7803, 4)
tfidf shape  (7803, 23)
movie matrix (1, 23)
Cosine Similarity: (1, 7803)
<class 'numpy.ndarray'>
(7804, 5)
tfidf shape  (7804, 23)
movie matrix (1, 23)
Cosine Similarity: (1, 7804)
<class 'numpy.ndarray'>


In [20]:
# similarity of provided movies with each user knowledge base
rslt.head()

Unnamed: 0,userId,TYPE,Frozen (2013),John Wick (2014),Divergent (2014)
0,18,Comedy Romance,0.455748,0.000000,0.364806
1,65,Action Adventure Sci-Fi Action Crime Fantasy T...,0.339010,0.506057,0.409976
2,96,Adventure Animation Comedy Fantasy Musical Rom...,1.000000,0.000000,0.338917
3,121,Comedy Crime Drama Comedy Comedy Romance Actio...,0.318917,0.277217,0.137393
4,129,Comedy Children Comedy Comedy Romance Drama Ro...,0.383337,0.350766,0.376221
...,...,...,...,...,...
7796,138414,Crime Mystery Thriller Drama Romance Action Sc...,0.511855,0.412699,0.513972
7797,138436,Crime Mystery Thriller Documentary Action Chil...,0.374461,0.522194,0.440643
7798,138437,Animation Documentary,0.308093,0.000000,0.000000
7799,138446,Comedy Drama Fantasy Children Comedy Fantasy M...,0.604761,0.078166,0.173034


Now that we have the similarities, assign each user the movie names with the highest similarity.

In [31]:
def top_recommendations(row, names=None, threshold=0.20, max_results=4):
    """Pick the movies whose similarity score in ``row`` exceeds ``threshold``.

    Scores are looked up by movie name rather than by column position, so the result
    is the same whether or not extra columns (such as ``RECOMMENDED``) already exist.
    The previous positional slice ``row[2:-1]`` silently skipped the last movie column
    on a fresh run.

    Parameters
    ----------
    row : pandas.Series
        One row of the similarity frame; must contain a score under every name in ``names``.
    names : list of str, optional
        Score columns to consider. Defaults to the module-level ``movie_names``.
    threshold : float, default 0.20
        A movie is kept only if its score is strictly greater than this value.
    max_results : int, default 4
        Maximum number of movies returned (the original loop allowed up to four).

    Returns
    -------
    list of str or str
        Movie names ordered by descending score (ties keep the order of ``names``),
        or the string ``' '`` when no movie passes the threshold.
    """
    if names is None:
        names = movie_names

    scored = [(name, row[name]) for name in names]
    # list.sort is stable, so equal scores stay in the order given by `names`.
    scored.sort(key=lambda pair: pair[1], reverse=True)

    res_list = [name for name, score in scored if score > threshold][:max_results]
    if res_list:
        return res_list
    # Callers treat a single space as the "nothing to recommend" marker.
    return ' '


def assign_genre(movie_names):
    """Map each entry of ``movie_names`` to a one-item dict and return them as a list.

    A single-space entry becomes ``{"v": ' '}``; any other entry ``name`` becomes
    ``{name: info[name]}``.

    NOTE(review): ``info`` is not defined in the visible cells - presumably a
    title -> genres mapping; confirm it exists before calling this with real titles.
    """
    def describe(name):
        if name == ' ':
            return {"v": ' '}
        return {name: info[name]}

    return [describe(name) for name in movie_names]
  
# Attach the recommendations on a new frame: `rslt` is a filtered subset of an earlier
# frame, and writing into it through .loc can emit SettingWithCopyWarning.
rslt = rslt.assign(RECOMMENDED=rslt.apply(top_recommendations, axis=1))

In [32]:
rslt.head()

Unnamed: 0,userId,TYPE,Frozen (2013),John Wick (2014),Divergent (2014),RECOMMENDED
0,18,Comedy Romance,0.455748,0.0,0.364806,"[Frozen (2013), Divergent (2014)]"
1,65,Action Adventure Sci-Fi Action Crime Fantasy T...,0.33901,0.506057,0.409976,"[John Wick (2014), Divergent (2014), Frozen (2..."
2,96,Adventure Animation Comedy Fantasy Musical Rom...,1.0,0.0,0.338917,"[Frozen (2013), Divergent (2014)]"
3,121,Comedy Crime Drama Comedy Comedy Romance Actio...,0.318917,0.277217,0.137393,"[Frozen (2013), John Wick (2014)]"
4,129,Comedy Children Comedy Comedy Romance Drama Ro...,0.383337,0.350766,0.376221,"[Frozen (2013), Divergent (2014), John Wick (2..."
