In [1]:
import pandas as pd
import numpy as np
import os
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from dotenv import load_dotenv
# Load settings from a local .env file into the process environment; the
# POSTGRES_* values read with os.getenv further down are expected to come from it.
load_dotenv()

True

In [3]:
from urllib.parse import quote

# Read every connection setting up front and fail fast with a readable message:
# an unset variable would otherwise be interpolated as the literal text "None"
# and only surface later as a confusing authentication/connection error.
_pg_settings = {
    name: os.getenv(name)
    for name in ('POSTGRES_USER', 'POSTGRES_PASSWORD', 'POSTGRES_DB')
}
_missing = sorted(name for name, value in _pg_settings.items() if value is None)
if _missing:
    raise RuntimeError(f"Missing required environment variable(s): {', '.join(_missing)}")

# Percent-encode the credentials so characters that are meaningful inside a URL
# (such as '@', ':', '/', '%') cannot corrupt the host/port/database parts.
# safe='' also encodes '/', and spaces become %20 rather than '+'.
db_uri = (
    f"postgresql://{quote(_pg_settings['POSTGRES_USER'], safe='')}"
    f":{quote(_pg_settings['POSTGRES_PASSWORD'], safe='')}"
    f"@localhost:5432/{_pg_settings['POSTGRES_DB']}"
)
engine = create_engine(db_uri)
# Session is the factory bound to the engine; session is the single instance
# shared by the query cells below.
Session = sessionmaker(bind=engine)
session = Session()

In [4]:

from server.api.models import Rating


def _rating_record(rating):
    """Flatten one Rating row into a plain dict, with both ids rendered as strings."""
    return {
        'movie_id': str(rating.movie_id),
        'user_id': str(rating.user_id),
        'rating': rating.rating,
    }


# Fetch every Rating row through the ORM session, then build the frame from
# one flat record per row.
ratings = session.query(Rating).all()
ratings_df = pd.DataFrame(list(map(_rating_record, ratings)))

In [5]:
# Preview the first rows: one record per rating with movie_id, user_id and rating.
ratings_df.head()

Unnamed: 0,movie_id,user_id,rating
0,37b3c9e1-a060-4721-af3c-c91cec194d86,d23d04cc-75b0-4f2a-ade3-2ebf7fc73dd2,3.0
1,26f5fb68-a562-4edc-b521-37e831b08a00,d23d04cc-75b0-4f2a-ade3-2ebf7fc73dd2,4.0
2,b6e65e63-fcbb-4eae-91e5-bb49ad43896d,d23d04cc-75b0-4f2a-ade3-2ebf7fc73dd2,3.5
3,ac5f16c9-8216-45cb-97ca-dd0a068c52e4,d23d04cc-75b0-4f2a-ade3-2ebf7fc73dd2,1.5
4,741a383f-a74a-41cc-913e-e61226a9cd9b,d23d04cc-75b0-4f2a-ade3-2ebf7fc73dd2,1.5


In [6]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse.linalg import svds

# User x movie rating matrix: one row per user_id, one column per movie_id.
# (user, movie) pairs that have no rating are filled with 0.
m = (
    ratings_df
    .set_index(['user_id', 'movie_id'])['rating']
    .unstack('movie_id')
    .fillna(0)
)

In [7]:
# Preview the user x movie matrix; cells without a rating hold the 0 fill value.
m.head()

movie_id,0000522c-570a-40cd-972f-532156788b21,000d30ae-c83d-4b75-83f0-38bd0c3d92da,00154f5a-3812-44dd-9e23-e5b6fa524bc4,002534cf-df8b-49dc-9428-252f59d0b4bc,002c4ca0-2e4b-42ff-8f60-c53a9b336892,00333325-e722-404c-b7ab-1904c3039450,0036d2d5-518e-4599-a799-5009b6e0e89a,0039df16-183e-4dc1-bb76-e79f90b952ab,003a8537-2bd3-45d0-8662-2aedeeecbf39,003e5ac1-b12f-4d7d-b1a2-751bbebe2af2,...,ffd33649-87bd-4376-9cbd-789b2be477d7,ffd63548-2767-4d36-9bb1-ad5be1736570,ffd66f90-c238-49e1-90f0-91a543ca184a,ffe1fb93-e1e3-432d-b567-7c5e0314c037,ffe4c098-9da0-4051-ad0f-02d86b24e78a,ffe6f06e-b1e8-41b5-b451-8bfae63f7bd3,fff5c47a-fc35-47b2-af1c-536c479ff55b,fff964a8-4dbb-40a0-ab9b-6f7b4b87dfab,fffc1582-57ce-4812-a552-4e5f21931c7c,fffef1a3-52c9-4b70-b6c5-347202482083
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0008339e-1634-4319-ad2e-d5f5c5fa7e6d,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
001647e1-7fcc-4edd-b4ee-3991c0c82be2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0025386e-a818-4bc0-896d-69bf9237c277,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3.0,3.0,0.0,0.0,0.0
002d477a-03fe-4c60-95be-7b981c2446d7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2.5,0.0,0.0,0.0
0050c09d-7c46-4d8f-94c8-b2b079546c41,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# Plain ndarray copy of the user x movie frame; this is what gets factorised.
m_np = m.to_numpy()

# Use at most 50 latent factors, and always one fewer than the smaller dimension.
k = min(50, *(dim - 1 for dim in m_np.shape))
U, sigma, Vt = svds(m_np, k=k)
# Promote the 1-D singular values to a diagonal matrix so they can be chained
# in the matrix product below.
sigma = np.diag(sigma)

# Rank-k reconstruction U . Sigma . Vt, relabelled with the original user and
# movie ids so it can be indexed the same way as m.
predicted_ratings = U @ sigma @ Vt
predicted_ratings_df = pd.DataFrame(predicted_ratings, index=m.index, columns=m.columns)

# Vt.T holds one k-dimensional row per movie column of m; compare every pair of
# movies by the cosine similarity of those rows.
item_similarity = cosine_similarity(Vt.T)
item_similarity_df = pd.DataFrame(item_similarity, columns=m.columns, index=m.columns)

In [9]:
def similar_movies(movie_id, n=8):
    """Return the ids of the ``n`` movies most similar to ``movie_id``.

    Similarities are read from the ``movie_id`` column of the module-level
    ``item_similarity_df``.

    Args:
        movie_id: Label of the query movie; must be a column of
            ``item_similarity_df``.
        n: Maximum number of ids to return (default 8). Values below 1 give
            an empty list.

    Returns:
        list: Up to ``n`` movie ids ordered from most to least similar. The
        query movie itself is never included.

    Raises:
        KeyError: If ``movie_id`` is not a column of ``item_similarity_df``.
    """
    scores = item_similarity_df[movie_id]
    # Remove the query movie by label *before* ranking. Relying on it being
    # inside the top n + 1 fails (ValueError on list.remove) when other movies
    # tie with or exceed its self-similarity, e.g. identical factor vectors or
    # floating-point rounding.
    candidates = scores.drop(labels=movie_id, errors="ignore")
    # A stable sort keeps tied scores in their original label order, so the
    # result is deterministic.
    ranked = candidates.sort_values(ascending=False, kind="mergesort")
    return ranked.head(max(n, 0)).index.tolist()

In [10]:
from server.api.models import Movie

# Look up the reference film by its exact title and take the first match.
godfather_movie = session.query(Movie).filter_by(title='The Godfather').first()
# The similarity frame is labelled with string ids, so stringify the id before
# asking for its nearest neighbours.
sim_movie_ids = similar_movies(movie_id=str(godfather_movie.id))
sim_movie_ids

['f33322e9-0575-4677-b7d4-c14e01aec398',
 '490389da-c953-4e14-9b5c-8736e79223a3',
 '8351b573-596a-4a90-b6c4-46c2e5e5443b',
 'be48509f-ad0a-451a-91e6-a06090e6a15b',
 '6a7b143c-82a5-4a95-bff3-129a5718b45f',
 '14954637-072d-47f9-b252-bfb389ece53c',
 '15eec116-bca0-4fd1-9f4e-bacba8138793',
 '5dbee8bc-1584-4de9-b00c-96d88681f9cb']

In [11]:
# An IN (...) filter gives no ordering guarantee, so the fetched rows are
# re-sorted by their position in sim_movie_ids. Titles then print from most to
# least similar instead of in whatever order the database returned them.
_rank_by_id = {movie_id: position for position, movie_id in enumerate(sim_movie_ids)}
sim_movies = sorted(
    session.query(Movie).filter(Movie.id.in_(sim_movie_ids)).all(),
    # Ids are compared in string form because sim_movie_ids holds strings; any
    # row whose id is somehow not in the ranking is pushed to the end.
    key=lambda movie: _rank_by_id.get(str(movie.id), len(_rank_by_id)),
)
for sim_movie in sim_movies:
    print(sim_movie.title)

Casino
Taxi Driver
Goodfellas
The Godfather: Part II
Donnie Brasco
The Godfather: Part III
Katok i skripka
War of the Worlds 2: The Next Wave


In [14]:
def recommend_for_user(user_id, n=10):
    """Return the ids of the top ``n`` predicted movies the user has not rated.

    Reads the module-level ``m`` (actual ratings, where 0 means unrated) and
    ``predicted_ratings_df`` (reconstructed scores).

    Args:
        user_id: Row label of the user in ``m``.
        n: Number of movie ids to return (default 10).

    Returns:
        list: Movie ids ordered by descending predicted score, excluding every
        movie whose actual rating in ``m`` is greater than 0.

    Raises:
        KeyError: If ``user_id`` is not in ``m.index``.
    """
    row_position = m.index.get_loc(user_id)

    # Movies this user already rated (any strictly positive entry in m).
    actual_row = m.loc[user_id]
    already_rated = actual_row.index[(actual_row > 0).to_numpy()]

    # Rank all predictions first, then mask out the rated ones, so the relative
    # order of the remaining movies is exactly that of the full ranking.
    ranked = predicted_ratings_df.iloc[row_position].sort_values(ascending=False)
    unseen_mask = ~ranked.index.isin(already_rated)
    return ranked.index[unseen_mask][:n].tolist()

In [15]:
# Example call with the default n=10; the id must be a row label of m,
# otherwise the lookup inside recommend_for_user raises KeyError.
recommend_for_user('6377859d-c152-4897-8544-eeb7c285f29d')

['4ff0d81a-c6c3-4626-84c3-7a1d0c763472',
 '0c7ac304-f08b-4c17-84e4-9ba45f1f79b3',
 '82262d8b-2430-4f12-b850-4c2f7abdf1b7',
 'fc1588b2-79b9-4742-8c58-9c6e9e65a346',
 '558fd160-40e3-4e21-9bec-f0cf7394080b',
 'c64806ce-476d-4118-9977-d1d8d573955e',
 'ac6a1876-75fb-400c-b2e0-28d850955e6c',
 '05bb30a4-6d8e-4419-b25c-b49fd22c4c91',
 '352c3f8b-0d44-48e2-9a61-b82bc8adf233',
 'b835b0cd-fb8e-4ad3-a618-1500c0b81d61']

In [16]:
import uuid

# Same recommendations as above, with each string id parsed into a uuid.UUID.
list(map(uuid.UUID, recommend_for_user('6377859d-c152-4897-8544-eeb7c285f29d')))

[UUID('4ff0d81a-c6c3-4626-84c3-7a1d0c763472'),
 UUID('0c7ac304-f08b-4c17-84e4-9ba45f1f79b3'),
 UUID('82262d8b-2430-4f12-b850-4c2f7abdf1b7'),
 UUID('fc1588b2-79b9-4742-8c58-9c6e9e65a346'),
 UUID('558fd160-40e3-4e21-9bec-f0cf7394080b'),
 UUID('c64806ce-476d-4118-9977-d1d8d573955e'),
 UUID('ac6a1876-75fb-400c-b2e0-28d850955e6c'),
 UUID('05bb30a4-6d8e-4419-b25c-b49fd22c4c91'),
 UUID('352c3f8b-0d44-48e2-9a61-b82bc8adf233'),
 UUID('b835b0cd-fb8e-4ad3-a618-1500c0b81d61')]