In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
ratings = pd.read_csv("/home/l/Documents/archive/ratings.csv")

In [3]:
movie_metadata = pd.read_csv("/home/l/Documents/archive/movies.csv", low_memory=False)
movie_metadata

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
86532,288967,State of Siege: Temple Attack (2021),Action|Drama
86533,288971,Ouija Japan (2021),Action|Horror
86534,288975,The Men Who Made the Movies: Howard Hawks (1973),Documentary
86535,288977,Skinford: Death Sentence (2023),Crime|Thriller


In [6]:
movies_short['id'] = pd.to_numeric(movies_short['id'], errors='coerce')

KeyError: 'id'

In [None]:
movies_short['id'] = movies_short['id'].astype(float)

In [35]:
movies_short.dtypes

id                float64
original_title     object
genres             object
dtype: object

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [9]:
vectorizer = TfidfVectorizer(ngram_range=(1,2))

tfidf = vectorizer.fit_transform(movies_short['original_title'])

In [10]:
from sklearn.metrics.pairwise import cosine_similarity

def search(original_title):
    query_vec = vectorizer.transform([original_title])
    similarity = cosine_similarity(query_vec, tfidf).flatten()
    indices = np.argpartition(similarity, -5)[-5:]
    results = movies_short.iloc[indices][::-1]
    return results



In [11]:
import ipywidgets as widgets
from IPython.display import display

movie_input = widgets.Text(
    value="Toy Story",
    description="Movie Title:",
    disabled = False
)
movie_list = widgets.Output()

def on_type(data):
    with movie_list:
        movie_list.clear_output()
        original_title = data["new"]
        if len(original_title)>5:
            display(search(original_title))

movie_input.observe(on_type, names='value')

display(movie_input, movie_list)

Text(value='Toy Story', description='Movie Title:')

Output()

In [12]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
...,...,...,...,...
25000090,162541,50872,4.5,1240953372
25000091,162541,55768,2.5,1240951998
25000092,162541,56176,2.0,1240950697
25000093,162541,58559,4.0,1240953434


In [13]:
movie_id = 862

similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] >4)]["userId"].unique()

In [14]:
similar_users

array([  3951,   6608,  10555,  13874,  23274,  24174,  24836,  25571,
        26747,  27403,  27918,  28885,  29708,  31002,  32505,  32579,
        33472,  36596,  36955,  45127,  45476,  45511,  48646,  54881,
        56963,  57904,  63653,  63987,  66214,  67730,  71282,  76392,
        76975,  82624,  85617,  88539,  93717,  96555,  98400,  98439,
        98961, 105033, 105946, 109165, 109646, 110199, 111230, 111562,
       113686, 115844, 119090, 121314, 121783, 124160, 126594, 127697,
       129818, 137780, 137934, 139457, 141119, 148162, 153175, 155513,
       156292, 156868, 157021, 158341])

In [15]:
similar_users_recomendations = ratings[(ratings["userId"].isin(similar_users)) & (ratings["rating"] > 4)]["movieId"]

In [16]:
similar_users_recomendations

579973       112
579977       318
579979       376
579983       497
579987       590
            ... 
24372276    1404
24372283    1466
24372287    1535
24372291    1639
24372292    5060
Name: movieId, Length: 7035, dtype: int64

In [17]:
similar_users_recomendations = similar_users_recomendations.value_counts() / len(similar_users)

In [18]:
similar_users_recomendations = similar_users_recomendations[similar_users_recomendations > .1]

In [19]:
similar_users_recomendations

movieId
862     1.000000
608     0.441176
318     0.397059
593     0.323529
296     0.308824
          ...   
1641    0.102941
1968    0.102941
1719    0.102941
627     0.102941
67      0.102941
Name: count, Length: 194, dtype: float64

In [20]:
all_users = ratings [(ratings["movieId"].isin(similar_users_recomendations.index)) & (ratings["rating"] > 4)]

In [21]:
all_users #all users who watched our movie

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
19,1,2692,5.0,1147869100
41,1,6711,5.0,1147868622
48,1,7361,5.0,1147880055
72,2,110,5.0,1141416589
...,...,...,...,...
25000018,162541,2858,5.0,1240950804
25000020,162541,2959,5.0,1240953488
25000022,162541,2997,5.0,1240952040
25000057,162541,4993,5.0,1240952610


In [22]:
all_users_recomendations = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

In [23]:
all_users_recomendations

movieId
318     0.343136
296     0.285435
2571    0.244686
356     0.235895
593     0.226513
          ...   
889     0.000246
1002    0.000239
630     0.000232
1430    0.000212
67      0.000199
Name: count, Length: 194, dtype: float64

In [24]:
recomendation_percentages = pd.concat([similar_users_recomendations, all_users_recomendations], axis = 1)


In [25]:
recomendation_percentages.columns = ["similar", "all"]

In [26]:
recomendation_percentages

Unnamed: 0_level_0,similar,all
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
862,1.000000,0.000452
608,0.441176,0.144683
318,0.397059,0.343136
593,0.323529,0.226513
296,0.308824,0.285435
...,...,...
1641,0.102941,0.021706
1968,0.102941,0.046253
1719,0.102941,0.008592
627,0.102941,0.001985


In [27]:
recomendation_percentages["score"] = recomendation_percentages["similar"] / recomendation_percentages["all"]

In [28]:
recomendation_percentages

Unnamed: 0_level_0,similar,all,score
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
862,1.000000,0.000452,2214.779412
608,0.441176,0.144683,3.049260
318,0.397059,0.343136,1.157147
593,0.323529,0.226513,1.428304
296,0.308824,0.285435,1.081938
...,...,...,...
1641,0.102941,0.021706,4.742568
1968,0.102941,0.046253,2.225589
1719,0.102941,0.008592,11.981032
627,0.102941,0.001985,51.851023


In [29]:
recomendation_percentages = recomendation_percentages.sort_values("score", ascending = False)

In [30]:
recomendation_percentages.head(200).merge(movies_short, left_on = "movieId", right_on = "id")

Unnamed: 0,similar,all,score,id,original_title,genres
0,1.000000,0.000452,2214.779412,862.0,Toy Story,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '..."
1,0.102941,0.000199,516.781863,67.0,Paradise Now,"[{'id': 53, 'name': 'Thriller'}, {'id': 80, 'n..."
2,0.117647,0.000232,506.235294,630.0,The Wizard of Oz,"[{'id': 12, 'name': 'Adventure'}, {'id': 10751..."
3,0.220588,0.000445,495.846137,860.0,WarGames,"[{'id': 53, 'name': 'Thriller'}, {'id': 878, '..."
4,0.102941,0.000212,484.482996,1430.0,Bowling for Columbine,"[{'id': 99, 'name': 'Documentary'}]"
...,...,...,...,...,...,...
103,0.102941,0.118615,0.867860,2762.0,Young and Innocent,"[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name..."
104,0.102941,0.131888,0.780519,2028.0,Say Anything...,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam..."
105,0.117647,0.161303,0.729356,110.0,Trois couleurs : Rouge,"[{'id': 18, 'name': 'Drama'}, {'id': 9648, 'na..."
106,0.102941,0.188015,0.547516,4993.0,5 Card Stud,"[{'id': 28, 'name': 'Action'}, {'id': 37, 'nam..."
