In [1]:
import numpy as np 
import pandas as pd 
import re

In [2]:
movies  = pd.read_csv('/kaggle/input/recommendation-system/ml-25m/movies.csv')
tags = pd.read_csv('/kaggle/input/recommendation-system/ml-25m/tags.csv')
ratings = pd.read_csv('/kaggle/input/recommendation-system/ml-25m/ratings.csv')
genome_score = pd.read_csv('/kaggle/input/recommendation-system/ml-25m/genome-scores.csv')
genome_tags = pd.read_csv('/kaggle/input/recommendation-system/ml-25m/genome-tags.csv')


In [3]:
movies.head(3)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance


In [4]:
movies.shape

(62423, 3)

In [5]:
tags.head(3)

Unnamed: 0,userId,movieId,tag,timestamp
0,3,260,classic,1439472355
1,3,260,sci-fi,1439472256
2,4,1732,dark comedy,1573943598


In [6]:
ratings.head(3)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828


In [7]:
ratings.shape

(25000095, 4)

In [8]:
genome_tags.head(3)

Unnamed: 0,tagId,tag
0,1,007
1,2,007 (series)
2,3,18th century


In [9]:
genome_tags = genome_tags[~genome_tags['tag'].isin(['original', 'sequel', 'good sequel','sequels'])]


In [10]:
genome_tags

Unnamed: 0,tagId,tag
0,1,007
1,2,007 (series)
2,3,18th century
3,4,1920s
4,5,1930s
...,...,...
1123,1124,writing
1124,1125,wuxia
1125,1126,wwii
1126,1127,zombie


In [11]:
genome_score.head(3)

Unnamed: 0,movieId,tagId,relevance
0,1,1,0.02875
1,1,2,0.02375
2,1,3,0.0625


* Group the data by 'movieId' and apply a lambda function to get the top 2 tags for each movie
* Convert the result to a dataframe

In [12]:
merged = pd.merge(genome_score, genome_tags, on='tagId')

# Group the data by 'movieId' and apply a lambda function to get the top 2 tags for each movie
top_tags = merged.groupby('movieId').apply(lambda x: x.nlargest(5, 'relevance')['tag'].tolist())


top_tags_df = top_tags.reset_index(name='top_relevance')

In [13]:
top_tags_df

Unnamed: 0,movieId,top_relevance
0,1,"[toys, computer animation, pixar animation, ki..."
1,2,"[adventure, children, fantasy, kids, special e..."
2,3,"[comedy, gunfight, romance, destiny, great]"
3,4,"[women, chick flick, divorce, girlie movie, ro..."
4,5,"[father daughter relationship, pregnancy, midl..."
...,...,...
13811,205072,"[dumb but funny, friendship, runaway, great mo..."
13812,205076,"[girlie movie, light, feel-good, oscar (best w..."
13813,205383,"[chase, suspense, clever, drama, great ending]"
13814,205425,"[stand-up comedy, comedy, highly quotable, ver..."


In [14]:
movies = pd.merge(top_tags_df, movies[['movieId', 'title','genres']], on='movieId')
movies = movies.fillna(' ')


In [15]:
movies

Unnamed: 0,movieId,top_relevance,title,genres
0,1,"[toys, computer animation, pixar animation, ki...",Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,"[adventure, children, fantasy, kids, special e...",Jumanji (1995),Adventure|Children|Fantasy
2,3,"[comedy, gunfight, romance, destiny, great]",Grumpier Old Men (1995),Comedy|Romance
3,4,"[women, chick flick, divorce, girlie movie, ro...",Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,"[father daughter relationship, pregnancy, midl...",Father of the Bride Part II (1995),Comedy
...,...,...,...,...
13811,205072,"[dumb but funny, friendship, runaway, great mo...",Zombieland: Double Tap (2019),Action|Comedy|Horror
13812,205076,"[girlie movie, light, feel-good, oscar (best w...",Downton Abbey (2019),Drama
13813,205383,"[chase, suspense, clever, drama, great ending]",El Camino: A Breaking Bad Movie (2019),Crime|Drama|Thriller
13814,205425,"[stand-up comedy, comedy, highly quotable, ver...",Dave Chappelle: Sticks & Stones (2019),Comedy


In [16]:
movies['genres'] = movies['genres'].apply(lambda x: re.sub(r'\|', ' ', x).lower())
movies['top_relevance'] = movies['top_relevance'].apply(lambda x: ' '.join(x).replace('-',' '))

In [17]:
movies

Unnamed: 0,movieId,top_relevance,title,genres
0,1,toys computer animation pixar animation kids a...,Toy Story (1995),adventure animation children comedy fantasy
1,2,adventure children fantasy kids special effects,Jumanji (1995),adventure children fantasy
2,3,comedy gunfight romance destiny great,Grumpier Old Men (1995),comedy romance
3,4,women chick flick divorce girlie movie romantic,Waiting to Exhale (1995),comedy drama romance
4,5,father daughter relationship pregnancy midlife...,Father of the Bride Part II (1995),comedy
...,...,...,...,...
13811,205072,dumb but funny friendship runaway great movie ...,Zombieland: Double Tap (2019),action comedy horror
13812,205076,girlie movie light feel good oscar (best writi...,Downton Abbey (2019),drama
13813,205383,chase suspense clever drama great ending,El Camino: A Breaking Bad Movie (2019),crime drama thriller
13814,205425,stand up comedy comedy highly quotable very fu...,Dave Chappelle: Sticks & Stones (2019),comedy


In [18]:
movies[movies['title'] == 'Zombieland: Double Tap (2019)']

Unnamed: 0,movieId,top_relevance,title,genres
13811,205072,dumb but funny friendship runaway great movie ...,Zombieland: Double Tap (2019),action comedy horror


In [19]:
movies['combine_relevant'] = movies['top_relevance']+" "+movies['genres']

In [20]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

counter_vec = CountVectorizer(stop_words='english',max_features=1500)
counter_vec.fit(movies['combine_relevant'])

gerne_vec_tags = counter_vec.transform(movies['top_relevance'])*0.3 # change the importance
gerne_vec_geners = counter_vec.transform(movies['genres'])*1 

gerne_vec = gerne_vec_tags+gerne_vec_geners


cos_similar = cosine_similarity(gerne_vec,gerne_vec)



In [21]:
cos_similar.shape

(13816, 13816)

In [22]:
movie_title_series = pd.Series(movies.index,movies['title'])

def get_recommend(title,cosine_sim = cos_similar):
    movie_name = movie_title_series[title]
    
    
    sim_scores = list(enumerate(cosine_sim[movie_name]))


    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    sim_scores = sim_scores[1:11]

    movie_indices = [i[0] for i in sim_scores]

    return movies['title'].iloc[movie_indices]

In [23]:
get_recommend('Zombieland: Double Tap (2019)')

9966                                Zombieland (2009)
11768                               Zombeavers (2014)
12548    Scouts Guide to the Zombie Apocalypse (2015)
2934                  Buffy the Vampire Slayer (1992)
11957               Dead Snow 2: Red vs. Dead (2014) 
13476                               Game Night (2018)
6779       Citizen Toxie: The Toxic Avenger IV (2000)
6543                                    Versus (2000)
8682                                     Feast (2005)
1923                       Surf Nazis Must Die (1987)
Name: title, dtype: object

In [24]:
movies[movies['title']=='Father of the Bride Part II (1995)']

Unnamed: 0,movieId,top_relevance,title,genres,combine_relevant
4,5,father daughter relationship pregnancy midlife...,Father of the Bride Part II (1995),comedy,father daughter relationship pregnancy midlife...


In [25]:
user_list_movie=['Zombieland: Double Tap (2019)','Zombieland (2009)','Shaun of the Dead (2004)','Cockneys vs Zombies (2012)']

In [26]:
def get_recommend_by_user_list(user_list):
    feature_names = list(counter_vec.vocabulary_.keys())
    user_vec = np.zeros((1, len(feature_names)))
    for movie_name in user_list:
        vector_tmp = counter_vec.transform( movies[movies['title'] == movie_name]['combine_relevant'])
        user_vec+=vector_tmp.toarray()

    

    cosine = cosine_similarity(gerne_vec,user_vec)

    sim_scores = list(enumerate(cosine))
    

    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    sim_scores = sim_scores[0:len(user_list)+11]

    movie_indices = [i[0] for i in sim_scores]

    return movies['title'].iloc[movie_indices]
    
    

    

In [27]:
get_recommend_by_user_list(user_list_movie)

11258    Juan of the Dead (Juan de los Muertos) (2011)
11210                       Cockneys vs Zombies (2012)
9966                                 Zombieland (2009)
12548     Scouts Guide to the Zombie Apocalypse (2015)
7098                          Shaun of the Dead (2004)
5249               Chopper Chicks in Zombietown (1989)
4075          Return of the Living Dead Part II (1988)
10053                                  Doghouse (2009)
12563                                   Cooties (2015)
10499                     Tucker & Dale vs Evil (2010)
11957                Dead Snow 2: Red vs. Dead (2014) 
10898                             Revenant, The (2009)
2327                                 Idle Hands (1999)
13811                    Zombieland: Double Tap (2019)
13364                               Little Evil (2017)
Name: title, dtype: object