In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
#load csv file
file_path="Entertainment.csv"
data=pd.read_csv(file_path)
data.head()

Unnamed: 0,Id,Titles,Category,Reviews
0,6973,Toy Story (1995),"Drama, Romance, School, Supernatural",-8.98
1,6778,Jumanji (1995),"Action, Adventure, Drama, Fantasy, Magic, Mili...",8.88
2,9702,Grumpier Old Men (1995),"Action, Comedy, Historical, Parody, Samurai, S...",99.0
3,6769,Waiting to Exhale (1995),"Sci-Fi, Thriller",99.0
4,1123,Father of the Bride Part II (1995),"Action, Comedy, Historical, Parody, Samurai, S...",-0.44


In [3]:
#step1: Preprocess the "category" column using TF-IDF
tfidf=TfidfVectorizer(stop_words='english') #Remove common stopwords
tfidf_matrix=tfidf.fit_transform(data['Category']) #fit and transform the catgory data
tfidf,tfidf_matrix

(TfidfVectorizer(stop_words='english'),
 <51x34 sparse matrix of type '<class 'numpy.float64'>'
 	with 285 stored elements in Compressed Sparse Row format>)

In [4]:
#step 2: Compute the cosine similarity between titles
cosine_sim=cosine_similarity(tfidf_matrix,tfidf_matrix)
cosine_sim

array([[1.        , 0.09421367, 0.        , ..., 0.12767481, 0.16772551,
        0.31295101],
       [0.09421367, 1.        , 0.16662513, ..., 0.22332745, 0.        ,
        0.        ],
       [0.        , 0.16662513, 1.        , ..., 0.13383076, 0.        ,
        0.        ],
       ...,
       [0.12767481, 0.22332745, 0.13383076, ..., 1.        , 0.47083158,
        0.17020003],
       [0.16772551, 0.        , 0.        , ..., 0.47083158, 1.        ,
        0.64107498],
       [0.31295101, 0.        , 0.        , ..., 0.17020003, 0.64107498,
        1.        ]])

In [30]:
#step 3 : create function to recommend title based on similarity
def get_recommendations(title,cosine_sim=cosine_sim): 
    #get the index of the title that matches the input 
    idx= data[data['Titles']==title].index[0]

    #Get the pairwise similaritry scores of all titles with that tiles
    sim_scores=list(enumerate(cosine_sim[idx]))
    sim_scores
    #sort the titles based on the similarity scores in descending order
    sim_scores=sorted(sim_scores,key=lambda x: x[1],reverse=True)   
    #get the indeices of most similar titles
    sim_indices=[i[0] for i in sim_scores[1:6]]
    #Return the top 5 most similar titles
    return data['Titles'].iloc[sim_indices]

'''
    data['Titles']==title
This creates a boolean mask (a series of True and False values)
indicating which rows in the Titles column match the input title. For example, if the title is "Toy Story (1995)", this comparison results in something Like:
0  : True
1  :False
2  :False
..................
    Name: Titles, dtype: bool
    
    Why [0] is Needed:
                     
                     Even though the title should be unique, data[data["Titles'] = title).
                     index still returns an array (or List-Like object), because there is 
                     always a possibility (in general) that multiple rows might match the condition
                     (e.g., duplicate titles). By using [0],you are explicitly choosing the first matching index.

                     If you don't use [0],
                     the code will return the entire list of indices,
                     which can cause issues when you pass it to other 5 parts of the code that expect a single index
                     (Like when accessing the cosine similarity matrix)
    '''
    
    
    

'\n    data[\'Titles\']==title\nThis creates a boolean mask (a series of True and False values)\nindicating which rows in the Titles column match the input title. For example, if the title is "Toy Story (1995)", this comparison results in something Like:\n0  : True\n1  :False\n2  :False\n..................\n    Name: Titles, dtype: bool\n    \n    Why [0] is Needed:\n                     \n                     Even though the title should be unique, data[data["Titles\'] = title).\n                     index still returns an array (or List-Like object), because there is \n                     always a possibility (in general) that multiple rows might match the condition\n                     (e.g., duplicate titles). By using [0],you are explicitly choosing the first matching index.\n\n                     If you don\'t use [0],\n                     the code will return the entire list of indices,\n                     which can cause issues when you pass it to other 5 parts of the cod

In [44]:
#Test the recommendation system with an example
example="Toy Story (1995)"
recommended_titles =get_recommendations(example)

In [50]:
#print the recommendation
print(f"Recommendation for'{example_title}':")
for title in recommended_titles:
    print(title)

Recommendation for'Toy Story(1995)':
Othello (1995)
Sense and Sensibility (1995)
Dracula: Dead and Loving It (1995)
American President, The (1995)
When Night Is Falling (1995)


In [52]:
get_recommendations("Father of the Bride Part II (1995)")

4     Father of the Bride Part II (1995)
8                    Sudden Death (1995)
9                       GoldenEye (1995)
12                          Balto (1995)
41                     To Die For (1995)
Name: Titles, dtype: object