In [59]:
import pandas as pd
# Load the movie catalogue (columns: movieId, title, genres) and print it.
df = pd.read_csv(filepath_or_buffer=r'C:\Users\Desktop\Project\Python\movies.csv')
print(df)

       movieId                               title  \
0            1                    Toy Story (1995)   
1            2                      Jumanji (1995)   
2            3             Grumpier Old Men (1995)   
3            4            Waiting to Exhale (1995)   
4            5  Father of the Bride Part II (1995)   
...        ...                                 ...   
62418   209157                           We (2018)   
62419   209159           Window of the Soul (2001)   
62420   209163                    Bad Poems (2018)   
62421   209169                 A Girl Thing (2001)   
62422   209171      Women of Devil's Island (1962)   

                                            genres  
0      Adventure|Animation|Children|Comedy|Fantasy  
1                       Adventure|Children|Fantasy  
2                                   Comedy|Romance  
3                             Comedy|Drama|Romance  
4                                           Comedy  
...                              

In [60]:
# clean_title() uses re.sub() to replace every character in the title that is not a
# letter (uppercase or lowercase), a digit, or a space with the empty string "".
# The regular expression [^a-zA-Z0-9 ] matches any single character that is outside the
# ranges a-z, A-Z and 0-9 and is not a space.
import re

def clean_title(title):
    """Return ``title`` with everything except ASCII letters, digits and spaces removed."""
    disallowed = re.compile("[^a-zA-Z0-9 ]")
    return disallowed.sub("", title)
    

In [61]:
# Add a "Clean_Titles" column: the result of running clean_title() on every
# value of the "title" column.
df["Clean_Titles"] = df["title"].map(clean_title)
print(df)

       movieId                               title  \
0            1                    Toy Story (1995)   
1            2                      Jumanji (1995)   
2            3             Grumpier Old Men (1995)   
3            4            Waiting to Exhale (1995)   
4            5  Father of the Bride Part II (1995)   
...        ...                                 ...   
62418   209157                           We (2018)   
62419   209159           Window of the Soul (2001)   
62420   209163                    Bad Poems (2018)   
62421   209169                 A Girl Thing (2001)   
62422   209171      Women of Devil's Island (1962)   

                                            genres  \
0      Adventure|Animation|Children|Comedy|Fantasy   
1                       Adventure|Children|Fantasy   
2                                   Comedy|Romance   
3                             Comedy|Drama|Romance   
4                                           Comedy   
...                        

In [62]:
# From the scikit-learn machine learning library we use the TfidfVectorizer class, which turns the titles into TF-IDF vectors
from sklearn.feature_extraction.text import TfidfVectorizer
# Create the vectorizer whose output is used to calculate similarity scores.
# ngram_range=(1, 2) makes it look at single words as well as groups of two
# consecutive words, which makes the search more accurate.
vectorizer = TfidfVectorizer(ngram_range=(1, 2))

# fit_transform() fits the vectorizer to the cleaned titles and converts them to a
# TF-IDF matrix in a single step: each row is one movie title and each column is one
# feature (a unigram or bigram found in the titles).
# The matrix is sparse: most entries are zero because a given title contains only a
# few of all the words / word pairs.
tfidf = vectorizer.fit_transform(raw_documents=df["Clean_Titles"])
print(tfidf)

  (0, 138180)	0.5609151642422612
  (0, 153617)	0.5236464902527855
  (0, 763)	0.2947573407787223
  (0, 138134)	0.30818287987354687
  (0, 153609)	0.4788631896261391
  (1, 76516)	0.679914841526996
  (1, 76515)	0.6556226145512709
  (1, 763)	0.3284429867728573
  (2, 93339)	0.4587178998289233
  (2, 107075)	0.4026827592738571
  (2, 61532)	0.4587178998289233
  (2, 93306)	0.2658829644982531
  (2, 107020)	0.2945915056134832
  (2, 61531)	0.4587178998289233
  (2, 763)	0.22159051090518359
  (3, 47815)	0.4482553482876628
  (3, 151964)	0.4482553482876628
  (3, 161363)	0.4482553482876628
  (3, 47814)	0.4482553482876628
  (3, 151795)	0.1883000782500215
  (3, 161345)	0.33752574781287953
  (3, 763)	0.21653641961669368
  (4, 70008)	0.39452077294643884
  (4, 111066)	0.3080717668800027
  (4, 20729)	0.4091386155137103
  :	:
  (62419, 135163)	0.335141385640017
  (62419, 842)	0.21664294220561653
  (62419, 165119)	0.3842738783112516
  (62419, 106460)	0.1903879435867097
  (62419, 143735)	0.09431759289399541
  (6

In [63]:
# Imports: ipywidgets and IPython.display for the interactive search box, cosine_similarity from sklearn.metrics.pairwise, and numpy
import ipywidgets as widgets
from IPython.display import display
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def search(title, top_n=5):
    """Return the movies whose cleaned titles are most similar to ``title``.

    The query is cleaned with ``clean_title``, vectorized with the already
    fitted ``vectorizer`` and compared against every row of ``tfidf`` using
    cosine similarity.

    Parameters
    ----------
    title : str
        Free-text movie title typed by the user.
    top_n : int, default 5
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows of ``df``, ordered from most similar to least
        similar, so the first row is the best match.
    """
    title = clean_title(title)  # same cleaning that was applied to the dataset titles
    query_vec = vectorizer.transform([title])  # query as a TF-IDF vector

    # cosine_similarity returns one row per query; flatten it to a 1-D array
    # holding one score per dataset title.
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # Never ask for more rows than there are titles (argpartition would raise).
    k = min(top_n, len(similarity))
    if k <= 0:
        return df.iloc[[]]

    # argpartition only guarantees that the k largest scores end up in the last
    # k positions; their order relative to each other is unspecified. Sort those
    # k indices explicitly so the most similar title really comes first.
    top = np.argpartition(similarity, -k)[-k:]
    ranked = top[np.argsort(-similarity[top], kind="stable")]

    return df.iloc[ranked]

# movie_input: a text input widget (widgets.Text). `value` is its initial text
# ("Toy Story"), `description` is the label shown next to the field, and
# disabled=False leaves the field editable.
#
# movie_list: an Output widget (widgets.Output), used here as the area in which the
# search results are displayed.
movie_input = widgets.Text(value='Toy Story', description='Movie Title:', disabled=False)
movie_list = widgets.Output()

def on_type(data):
    """Refresh the search results when the text box value changes.

    ``data`` is the change notification delivered by ``observe``; its ``"new"``
    entry holds the text currently in the box. The previous output is always
    cleared; a new search is only shown once more than 4 characters are typed.
    """
    with movie_list:
        movie_list.clear_output()
        query = data["new"]
        if len(query) <= 4:
            return
        display(search(query))

# Register on_type as the observer for changes to the text box's 'value' trait.
movie_input.observe(on_type, names='value')


# Render the text box and the results output area.
display(movie_input, movie_list)


Text(value='Toy Story', description='Movie Title:')

Output()

In [64]:
# Load the user ratings (columns: userId, movieId, rating, timestamp) and print them.
ratings = pd.read_csv(filepath_or_buffer=r'C:\Users\Desktop\Project\Python\ratings.csv')
print(ratings)


          userId  movieId  rating   timestamp
0              1      296     5.0  1147880044
1              1      306     3.5  1147868817
2              1      307     5.0  1147868828
3              1      665     5.0  1147878820
4              1      899     3.5  1147868510
...          ...      ...     ...         ...
25000090  162541    50872     4.5  1240953372
25000091  162541    55768     2.5  1240951998
25000092  162541    56176     2.0  1240950697
25000093  162541    58559     4.0  1240953434
25000094  162541    63876     5.0  1240952515

[25000095 rows x 4 columns]


In [65]:
# Show the dtype of each column of the ratings table.
ratings.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

In [66]:
movie_ID = 1
# Users who liked the same movie as us: everyone who rated movie_ID
# with a rating strictly greater than 4.
similar_users = ratings.loc[
    (ratings["movieId"] == movie_ID) & (ratings["rating"] > 4),
    "userId",
].unique()

In [67]:
# The other movies those similar users liked: the movieId of every rating strictly
# greater than 4 that was given by a user in similar_users.
similar_user_other = ratings.loc[
    ratings["userId"].isin(similar_users) & (ratings["rating"] > 4),
    "movieId",
]
print(similar_user_other)

5101            1
5105           34
5111          110
5114          150
5127          260
            ...  
24998854    60069
24998861    67997
24998876    78499
24998884    81591
24998888    88129
Name: movieId, Length: 1358326, dtype: int64


In [68]:
# Turn the counts into the share of similar users who liked each movie, then keep
# only the movies liked by more than 10% of them.
similar_user_other = similar_user_other.value_counts().div(len(similar_users))
similar_user_other = similar_user_other[similar_user_other.gt(.10)]
print(similar_user_other)

movieId
1        1.000000
318      0.445607
260      0.403770
356      0.370215
296      0.367295
           ...   
953      0.103053
551      0.101195
1222     0.100876
745      0.100345
48780    0.100186
Name: count, Length: 113, dtype: float64


In [69]:
# Every rating strictly greater than 4, from any user, for a movie that is in the
# index of similar_user_other.
all_users = ratings.loc[
    ratings["movieId"].isin(similar_user_other.index) & (ratings["rating"] > 4)
]
all_users

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
29,1,4973,4.5,1147869080
48,1,7361,5.0,1147880055
72,2,110,5.0,1141416589
76,2,260,5.0,1141417172
...,...,...,...,...
25000062,162541,5618,4.5,1240953299
25000065,162541,5952,5.0,1240952617
25000078,162541,7153,5.0,1240952613
25000081,162541,7361,4.5,1240953484


In [70]:
# For each of these movies: number of ratings above 4 divided by the number of
# distinct users in all_users, i.e. the share of those users who liked the movie.
all_user_other = all_users["movieId"].value_counts().div(len(all_users["userId"].unique()))
all_user_other

movieId
318      0.342220
296      0.284674
2571     0.244033
356      0.235266
593      0.225909
           ...   
551      0.040918
50872    0.039111
745      0.037031
78499    0.035131
2355     0.025091
Name: count, Length: 113, dtype: float64

In [71]:
# Put the two shares side by side, one row per movie, to compare how much a similar
# person liked each movie with how much an average person did.
reco_percentages = pd.concat([similar_user_other, all_user_other], axis="columns")
reco_percentages.columns = [
    "similar person",
    "average person",
]
reco_percentages

Unnamed: 0_level_0,similar person,average person
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1.000000,0.124728
318,0.445607,0.342220
260,0.403770,0.222207
356,0.370215,0.235266
296,0.367295,0.284674
...,...,...
953,0.103053,0.045792
551,0.101195,0.040918
1222,0.100876,0.066877
745,0.100345,0.037031


In [73]:
# "Difference score" = average-person share / similar-person share. A low score means
# similar users like the movie far more often than users in general, so sort ascending.
# This step is needed for the cell to reproduce its output on a fresh kernel: no
# earlier cell in the notebook creates the "Difference score" column or sorts the table
# (the execution count jumps from 71 to 73).
reco_percentages["Difference score"] = (
    reco_percentages["average person"] / reco_percentages["similar person"]
)
reco_percentages = reco_percentages.sort_values("Difference score", ascending=True)

# Attach title and genres to the ten best-scoring movies.
reco_percentages.head(10).merge(df, left_index=True, right_on="movieId")

Unnamed: 0,similar person,average person,Difference score,movieId,title,genres,Clean_Titles
0,1.0,0.124728,0.124728,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
3021,0.280648,0.053706,0.191364,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
2264,0.110539,0.025091,0.226991,2355,"Bug's Life, A (1998)",Adventure|Animation|Children|Comedy,Bugs Life A 1998
14813,0.15296,0.035131,0.229672,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,Toy Story 3 2010
4780,0.235147,0.070811,0.301134,4886,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy,Monsters Inc 2001
580,0.216618,0.067513,0.311668,588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical,Aladdin 1992
6258,0.228139,0.072268,0.31677,6377,Finding Nemo (2003),Adventure|Animation|Children|Comedy,Finding Nemo 2003
587,0.1794,0.059977,0.33432,595,Beauty and the Beast (1991),Animation|Children|Fantasy|Musical|Romance|IMAX,Beauty and the Beast 1991
8246,0.203504,0.068453,0.336373,8961,"Incredibles, The (2004)",Action|Adventure|Animation|Children|Comedy,Incredibles The 2004
359,0.253411,0.085764,0.338437,364,"Lion King, The (1994)",Adventure|Animation|Children|Drama|Musical|IMAX,Lion King The 1994


In [82]:
import ipywidgets as widgets
from IPython.display import display

# Text box for the movie title (pre-filled with "Toy Story") and the output area
# that the recommendations are displayed in.
movie_name_input = widgets.Text(value='Toy Story', description='Movie Title:', disabled=False)
recommendation_list = widgets.Output()

def on_type(data):
    """Refresh the recommendations when the text box value changes.

    ``data`` is the change notification delivered by ``observe``; its ``"new"``
    entry holds the text currently in the box. The previous output is always
    cleared. Once more than 5 characters are typed, the title is looked up with
    ``search`` and the ``movieId`` of the first returned row is passed to
    ``find_similar_movies``; nothing is shown when the search returns no rows.

    NOTE: this reuses the name of the search-box callback defined earlier in
    the notebook and replaces it in the notebook namespace.
    """
    with recommendation_list:
        recommendation_list.clear_output()
        query = data["new"]
        if len(query) <= 5:
            return
        matches = search(query)
        if matches.empty:
            return
        first_match_id = matches.iloc[0]["movieId"]
        display(find_similar_movies(first_match_id))

# Register on_type as the observer for changes to the text box's 'value' trait.
movie_name_input.observe(on_type, names='value')

# Render the text box and the recommendations output area.
display(movie_name_input, recommendation_list)

def find_similar_movies(movie_ID, min_rating=4, min_share=.10, top_n=10):
    """Recommend movies liked by the users who also liked ``movie_ID``.

    Reads the module-level ``ratings`` table (userId, movieId, rating) and the
    ``df`` movie table (movieId, title, genres).

    Parameters
    ----------
    movie_ID : int
        ``movieId`` of the movie to base the recommendations on.
    min_rating : float, default 4
        A rating counts as "liked" when it is strictly greater than this value.
    min_share : float, default 0.10
        Keep only movies liked by strictly more than this share of the similar users.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``"Difference score"``, ``"title"`` and ``"genres"``, ordered by
        ascending score. The score is the share of all users who liked the movie
        divided by the share of similar users who liked it, so a lower score means
        the movie is liked disproportionately often by similar users.
    """
    # The "liked" mask is identical for all three filters below; compute it once
    # instead of re-scanning the rating column each time.
    liked = ratings["rating"] > min_rating

    # Users who liked the requested movie.
    similar_users = ratings.loc[(ratings["movieId"] == movie_ID) & liked, "userId"].unique()

    # Share of those users who liked each other movie, cut at min_share.
    similar_share = (
        ratings.loc[ratings["userId"].isin(similar_users) & liked, "movieId"].value_counts()
        / len(similar_users)
    )
    similar_share = similar_share[similar_share > min_share]

    # Share of users in general (anyone who liked at least one candidate) who liked
    # each candidate movie.
    candidate_likes = ratings[ratings["movieId"].isin(similar_share.index) & liked]
    average_share = (
        candidate_likes["movieId"].value_counts()
        / len(candidate_likes["userId"].unique())
    )

    reco_percentages = pd.concat([similar_share, average_share], axis=1)
    reco_percentages.columns = ['similar person', 'average person']

    reco_percentages["Difference score"] = (
        reco_percentages["average person"] / reco_percentages["similar person"]
    )
    reco_percentages = reco_percentages.sort_values("Difference score", ascending=True)

    # Attach title/genres and expose only the columns shown to the user.
    merged = reco_percentages.head(top_n).merge(df, left_index=True, right_on="movieId")
    return merged[["Difference score", "title", "genres"]]


Text(value='Toy Story', description='Movie Title:')

Output()