In [1]:
import os
import re
import unicodedata

import joblib
import numpy as np
import pandas as pd
import scipy.sparse as sp
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler


In [2]:
# Root folder that receives the exported CSVs and model artifacts.
# Set the REC_STREAMLIT_DIR environment variable to run this notebook on
# another machine; the original location remains the default.
streamlit_dir = os.environ.get(
    "REC_STREAMLIT_DIR",
    "C:\\Users\\haris\\OneDrive\\Desktop\\rec_streamlit",
)
data_dir = os.path.join(streamlit_dir, "data")
# makedirs creates missing parents, so this also creates streamlit_dir.
os.makedirs(data_dir, exist_ok=True)

In [3]:
# Raw inputs, read from the current working directory.
df_sample, extra_values = (
    pd.read_csv(csv_name)
    for csv_name in ("df_sample.csv", "extra_values.csv")
)

In [4]:
# Keep only rows belonging to the 2000 most frequent movieIds.
movie_counts = df_sample['movieId'].value_counts()
top_movies = movie_counts.head(2000).index
is_top_movie = df_sample['movieId'].isin(top_movies)
df_small = df_sample.loc[is_top_movie].copy()

In [5]:
# Drop rows whose tmdbId is absent from extra_values, then cast the join
# key to int on both frames (extra_values is modified in place).
valid_tmdb_ids = set(extra_values['tmdbId'])
has_metadata = df_small['tmdbId'].isin(valid_tmdb_ids)
df_small = df_small.loc[has_metadata].copy()
for frame in (df_small, extra_values):
    frame['tmdbId'] = frame['tmdbId'].astype(int)
extra_values_small = extra_values.copy()

In [6]:
# Write both reduced frames into streamlit_dir, without the index column.
for frame, csv_name in (
    (df_small, "df_small.csv"),
    (extra_values_small, "extra_values_small.csv"),
):
    frame.to_csv(os.path.join(streamlit_dir, csv_name), index=False)

In [7]:
# Reload the two frames from the CSV files written into streamlit_dir.
df_small_path = os.path.join(streamlit_dir, "df_small.csv")
extra_values_small_path = os.path.join(streamlit_dir, "extra_values_small.csv")
df_small = pd.read_csv(df_small_path)
extra_values_small = pd.read_csv(extra_values_small_path)

In [8]:
# Lookup dicts in both directions; when an id repeats, the first row wins.
first_row_per_movie = df_small.drop_duplicates('movieId')
first_row_per_tmdb = df_small.drop_duplicates('tmdbId')
movieId_to_tmdbId = first_row_per_movie.set_index('movieId')['tmdbId'].to_dict()
tmdbId_to_movieId = first_row_per_tmdb.set_index('tmdbId')['movieId'].to_dict()

In [9]:
# One genres value per tmdbId (first occurrence wins) so the left merge
# cannot multiply metadata rows when a tmdbId appears with several
# distinct genres values.
genres = df_small[['tmdbId', 'genres']].drop_duplicates('tmdbId')
# Drop any 'genres' column left by a previous run of this cell; otherwise
# the merge would emit genres_x / genres_y and the next line would fail.
extra_values_small = (
    extra_values_small
    .drop(columns=['genres'], errors='ignore')
    .merge(genres, on='tmdbId', how='left')
)
# Strip list punctuation ([, ], ') from the genres string.
extra_values_small['genres'] = extra_values_small['genres'].fillna("").astype(str).str.replace(r'[\[\]\']', '', regex=True)
extra_values_small['movieId'] = extra_values_small['tmdbId'].map(tmdbId_to_movieId)

In [10]:
def clean_text(text):
    """Normalize free text to lowercase ASCII letters, digits and single spaces.

    Accented letters are folded to their base letter first ("Amélie" ->
    "amelie") so they are not split into fragments by the character filter.
    Every remaining character outside ``a-z``, ``0-9`` and space becomes a
    space, and runs of whitespace collapse to one space.

    Args:
        text: Any value; it is converted with ``str()`` before cleaning.

    Returns:
        The cleaned string, with no leading or trailing whitespace.
    """
    # NFKD separates a base letter from its combining marks; dropping the
    # marks leaves the plain letter.
    decomposed = unicodedata.normalize("NFKD", str(text))
    folded = "".join(ch for ch in decomposed if not unicodedata.combining(ch))
    text = re.sub(r"[^a-z0-9 ]", " ", folded.lower())
    return re.sub(r"\s+", " ", text).strip()

def clean_genres(text):
    """Turn a list-like genres string into lowercase, space-separated text.

    Square brackets and both quote characters are removed, the result is
    lowercased, each comma becomes a space, and outer whitespace is
    stripped. Inner runs of spaces are left as they are.
    """
    without_brackets = re.sub(r'[\[\]\'\"]', '', str(text))
    spaced = without_brackets.lower().replace(',', ' ')
    return spaced.strip()

# Column -> cleaner, applied in this order; missing values become ''.
column_cleaners = {
    'title': clean_text,
    'description': clean_text,
    'genres': clean_genres,
    'director': clean_text,
    'actor1': clean_text,
    'actor2': clean_text,
    'actor3': clean_text,
}
for col, cleaner in column_cleaners.items():
    extra_values_small[col] = extra_values_small[col].fillna('').apply(cleaner)


In [11]:
def create_text_soup(row):
    return ' '.join([
        (row['title'] + ' ') * 3,
        (row['genres'] + ' ') * 3,
        (row['director'] + ' ') * 2,
        row['description'],
        (row['actor1'] + ' ') * 2,
        row['actor2'],
        row['actor3']
    ])

# Metadata rows whose tmdbId occurs in df_small, one row per tmdbId,
# re-indexed 0..n-1, plus the combined text used for TF-IDF.
shared_tmdb_ids = df_small['tmdbId'].unique()
in_ratings = extra_values_small['tmdbId'].isin(shared_tmdb_ids)
extra_values_filtered = (
    extra_values_small.loc[in_ratings]
    .drop_duplicates('tmdbId')
    .reset_index(drop=True)
)
extra_values_filtered['text_soup'] = extra_values_filtered.apply(create_text_soup, axis=1)

In [12]:
# Unigram + bigram TF-IDF over the text soup, capped at 10000 features.
tfidf_params = dict(
    ngram_range=(1, 2),
    max_features=10000,
    stop_words='english',
    strip_accents='unicode',
)
tfidf_vectorizer = TfidfVectorizer(**tfidf_params)
tfidf_matrix = tfidf_vectorizer.fit_transform(extra_values_filtered['text_soup'])
# tmdbId -> row position in tfidf_matrix.
row_positions = pd.Series(
    extra_values_filtered.index,
    index=extra_values_filtered['tmdbId'],
)
tfidf_index = row_positions.drop_duplicates()

In [13]:
def train_ease(df, lambda_=10.0, user_map=None, item_map=None, topk=100):
    """Fit an EASE item-item weight matrix from a user/movie interaction frame.

    Each interaction is weighted by ``1 / sqrt(popularity of its movie)``.
    The weights solve the ridge-regularized closed form
    ``P = (X^T X + lambda * I)^-1``, ``B[i, j] = -P[i, j] / P[j, j]`` with a
    zero diagonal, so that ``X @ B`` gives the predicted scores.

    Args:
        df: DataFrame with ``userId`` and ``movieId`` columns. An
            ``ease_weight`` column is added to it IN PLACE; a later cell of
            this notebook reads that column.
        lambda_: L2 regularization added to the diagonal of ``X^T X``.
        user_map: Optional ``userId -> row index`` dict; built from ``df``
            when omitted.
        item_map: Optional ``movieId -> column index`` dict; built from
            ``df`` when omitted.
        topk: If not None, each row of ``B`` with more than ``topk`` non-zero
            entries keeps only its ``topk`` largest values, and ``B`` is
            returned as a CSR matrix. If None, ``B`` is returned dense.

    Returns:
        ``(B, user_map, item_map, idx2item)`` where ``idx2item`` is the
        inverse of ``item_map``.
    """
    if user_map is None:
        user_map = {uid: idx for idx, uid in enumerate(df['userId'].unique())}
    if item_map is None:
        item_map = {mid: idx for idx, mid in enumerate(df['movieId'].unique())}
    idx2item = {idx: mid for mid, idx in item_map.items()}

    # Down-weight interactions with popular movies.
    item_pop = df['movieId'].value_counts()
    item_weights = 1 / np.sqrt(item_pop)
    df['ease_weight'] = df['movieId'].map(item_weights)

    rows = df['userId'].map(user_map)
    cols = df['movieId'].map(item_map)
    # Ids missing from a caller-supplied map come back as NaN; skip those
    # interactions instead of passing NaN indices to the sparse constructor.
    known = rows.notna() & cols.notna()
    X = sp.csr_matrix(
        (
            df.loc[known, 'ease_weight'].to_numpy(),
            (rows[known].astype(int).to_numpy(), cols[known].astype(int).to_numpy()),
        ),
        shape=(len(user_map), len(item_map)),
    )

    G = (X.T @ X).toarray()
    diag = np.diag_indices_from(G)
    G[diag] += lambda_
    P = np.linalg.inv(G)
    # Divide column j by -P[j, j]. Broadcasting the 1-D diagonal scales
    # columns; scaling rows instead would yield the transpose of B.
    B = P / (-np.diag(P))
    B[diag] = 0

    if topk is not None:
        for i in range(B.shape[0]):
            row = B[i]
            if np.count_nonzero(row) > topk:
                # Indices of the topk largest values (unordered).
                top_k_idx = np.argpartition(row, -topk)[-topk:]
                mask = np.ones_like(row, dtype=bool)
                mask[top_k_idx] = False
                B[i, mask] = 0.0
        B = csr_matrix(B)
    return B, user_map, item_map, idx2item

In [14]:
ease_B, ease_user_map, ease_item_map, ease_idx2item = train_ease(df_small, lambda_=10, topk=100)

# Rebuild the weighted user-item matrix with the same 1/sqrt(popularity)
# weights train_ease applies. They are computed here explicitly instead of
# being read from the 'ease_weight' column that train_ease adds to df_small
# as a side effect.
item_popularity = df_small['movieId'].value_counts()
rows = df_small['userId'].map(ease_user_map)
cols = df_small['movieId'].map(ease_item_map)
data = df_small['movieId'].map(1 / np.sqrt(item_popularity)).values
X_small = sp.csr_matrix((data, (rows, cols)), shape=(len(ease_user_map), len(ease_item_map)))

# Scores for every (user, item) pair, as a dense array.
ease_all_scores = X_small @ ease_B
if sp.issparse(ease_all_scores):
    ease_all_scores = ease_all_scores.toarray()

# NOTE(review): MinMaxScaler rescales each column (item) independently
# across users, not each user's row - confirm that is the intended
# normalization.
scaler = MinMaxScaler()
ease_all_scores = scaler.fit_transform(ease_all_scores)

In [15]:
def get_hybrid_recommendations(user_id, top_n=10, weight_content=0.6):
    """Rank unseen movies for a user by blending EASE and TF-IDF scores.

    Reads the notebook-level objects ``df_small``, ``ease_user_map``,
    ``ease_all_scores``, ``ease_idx2item``, ``movieId_to_tmdbId``,
    ``tfidf_index``, ``tfidf_matrix`` and ``extra_values_filtered``.

    The collaborative part contributes ``(1 - weight_content) * score`` for
    users present in ``ease_user_map``. The content part contributes
    ``weight_content`` times the mean cosine similarity to the movies the
    user rated 4.0 or higher. Movies the user already has a row for in
    ``df_small`` are never returned by either part.

    Args:
        user_id: The ``userId`` value to recommend for.
        top_n: Maximum number of recommendations to return.
        weight_content: Weight of the content score; the collaborative
            score gets ``1 - weight_content``.

    Returns:
        A list of up to ``top_n`` tmdbIds, best first. Empty when neither
        part produces any candidate for the user.
    """
    user_rows = df_small[df_small['userId'] == user_id]
    seen_movie_ids = set(user_rows['movieId'])
    seen_tmdb_ids = set(user_rows['tmdbId'])
    scores = {}

    # --- Collaborative (EASE) ---
    if user_id in ease_user_map:
        ease_scores = ease_all_scores[ease_user_map[user_id]]
        for idx, score in enumerate(ease_scores):
            movie_id = ease_idx2item[idx]
            if movie_id in seen_movie_ids:
                continue
            tmdb_id = movieId_to_tmdbId.get(movie_id)
            # Compare against None explicitly: a missing mapping is skipped,
            # a falsy-but-valid id is not.
            if tmdb_id is not None and tmdb_id not in seen_tmdb_ids:
                scores[tmdb_id] = (1 - weight_content) * score

    # --- Content (TF-IDF) ---
    liked_tmdb_ids = user_rows.loc[user_rows['rating'] >= 4.0, 'tmdbId']
    liked_indices = [tfidf_index.get(tmdb_id) for tmdb_id in liked_tmdb_ids if tmdb_id in tfidf_index]

    if liked_indices:
        # One call for all liked movies; the column mean is the average
        # similarity of each candidate to the user's liked set.
        tfidf_sim = cosine_similarity(tfidf_matrix[liked_indices], tfidf_matrix).mean(axis=0)
        candidate_tmdb_ids = extra_values_filtered['tmdbId'].tolist()
        for tmdb_id, score in zip(candidate_tmdb_ids, tfidf_sim):
            # Without this check every liked movie would be recommended back
            # to the user, since its similarity to itself is 1.0.
            if tmdb_id in seen_tmdb_ids:
                continue
            scores[tmdb_id] = scores.get(tmdb_id, 0) + weight_content * score

    ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    return [tmdb for tmdb, _ in ranked[:top_n]]


In [16]:
# Persist the fitted objects and lookup tables into data_dir.
# joblib is imported in the notebook's import cell.
pickled_artifacts = {
    "tfidf_vectorizer.pkl": tfidf_vectorizer,
    "ease_B.pkl": ease_B,
    "ease_user_map.pkl": ease_user_map,
    "ease_item_map.pkl": ease_item_map,
    "ease_idx2item.pkl": ease_idx2item,
    "movieId_to_tmdbId.pkl": movieId_to_tmdbId,
    "tfidf_index.pkl": tfidf_index,
}
for artifact_name, artifact in pickled_artifacts.items():
    joblib.dump(artifact, os.path.join(data_dir, artifact_name))

# The TF-IDF matrix is sparse, so it is stored in scipy's .npz format.
sp.save_npz(os.path.join(data_dir, "tfidf_matrix.npz"), tfidf_matrix)

df_small.to_csv(os.path.join(data_dir, "df_small.csv"), index=False)
extra_values_filtered.to_csv(os.path.join(data_dir, "extra_values_filtered.csv"), index=False)