In [86]:
import os
import pandas as pd
import numpy as np
import re
import scipy.sparse as sp
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
from difflib import get_close_matches
from tabulate import tabulate

In [87]:
save_dir = "C:\\Users\\haris\\OneDrive\\Desktop\\Movie_rec_full"
os.makedirs(save_dir, exist_ok=True)

In [88]:
df_full = pd.read_csv('df_sample.csv')
extra_values_full = pd.read_csv('extra_values.csv')

In [89]:
df_full.to_csv(os.path.join(save_dir, "df_full.csv"), index=False)
extra_values_full.to_csv(os.path.join(save_dir, "extra_values_full.csv"), index=False)

In [90]:
df_full = pd.read_csv(os.path.join(save_dir, "df_full.csv"))
extra_values_full = pd.read_csv(os.path.join(save_dir, "extra_values_full.csv"))

In [91]:
valid_tmdb_ids = set(extra_values_full['tmdbId'])
df_full = df_full[df_full['tmdbId'].isin(valid_tmdb_ids)].copy()
df_full['tmdbId'] = df_full['tmdbId'].astype(int)
extra_values_full['tmdbId'] = extra_values_full['tmdbId'].astype(int)

In [92]:
df_full = df_full[df_full['tmdbId'].isin(valid_tmdb_ids)].copy()

In [93]:
movieId_to_tmdbId = df_full.drop_duplicates('movieId').set_index('movieId')['tmdbId'].to_dict()
tmdbId_to_movieId = df_full.drop_duplicates('tmdbId').set_index('tmdbId')['movieId'].to_dict()

In [94]:
genres = df_full[['tmdbId', 'genres']].drop_duplicates()
extra_values_full = extra_values_full.merge(genres, on='tmdbId', how='left')
extra_values_full['genres'] = extra_values_full['genres'].fillna("").astype(str).str.replace(r'[\[\]\']', '', regex=True)
extra_values_full['movieId'] = extra_values_full['tmdbId'].map(tmdbId_to_movieId)

In [95]:
def clean_description(desc):
    desc = str(desc).lower()
    desc = re.sub(r"[^a-z0-9\s]", "", desc)
    desc = re.sub(r"\s+", " ", desc)
    return desc.strip()

def clean_title(text):
    text = str(text).lower()
    text = re.sub(r'[^a-z0-9 ]', '', text)
    return text.strip()

def clean_name(text):
    text = str(text).lower()
    text = re.sub(r'[^a-z0-9 ]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def clean_genres(genres_str):
    text = re.sub(r'[\[\]\'\"]', '', str(genres_str)).lower()
    text = text.replace(',', ' ')
    return text.strip()

In [96]:
# Apply cleaning
extra_values_full['title'] = extra_values_full['title'].fillna('').apply(clean_title)
extra_values_full['description'] = extra_values_full['description'].fillna('').apply(clean_description)
extra_values_full['genres'] = extra_values_full['genres'].fillna('').apply(clean_genres)
for col in ['director', 'actor1', 'actor2', 'actor3']:
    extra_values_full[col] = extra_values_full[col].fillna('').apply(clean_name)

In [97]:
def create_text_soup(row):
    return ' '.join([
        (row['title'] + ' ') * 3,
        (row['genres'] + ' ') * 3,
        (row['director'] + ' ') * 2,
        row['description'],
        (row['actor1'] + ' ') * 2,
        row['actor2'],
        row['actor3']
    ])


In [98]:
shared_tmdb_ids = df_full['tmdbId'].unique()
extra_values_filtered = extra_values_full[extra_values_full['tmdbId'].isin(shared_tmdb_ids)].drop_duplicates('tmdbId').reset_index(drop=True)
extra_values_filtered['text_soup'] = extra_values_filtered.apply(create_text_soup, axis=1)

tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=10000,
    stop_words='english',
    strip_accents='unicode'
)
tfidf_matrix = tfidf_vectorizer.fit_transform(extra_values_filtered['text_soup'])
tfidf_index = pd.Series(extra_values_filtered.index, index=extra_values_filtered['tmdbId']).drop_duplicates()


In [99]:
def get_tfidf_recommendations(tmdb_id, top_n=10):
    if tmdb_id not in tfidf_index:
        return []
    idx = tfidf_index[tmdb_id]
    query_vec = tfidf_matrix[idx]
    similarity = cosine_similarity(query_vec, tfidf_matrix).flatten()
    similar_indices = similarity.argsort()[::-1][1:top_n+1]
    return extra_values_filtered.iloc[similar_indices][['movieId', 'tmdbId', 'title', 'genres', 'director']]


In [100]:
# --- Test TF-IDF
toy_story_tmdbid = 862
print(get_tfidf_recommendations(toy_story_tmdbid))

      movieId  tmdbId                              title  \
1585     3114     863                        toy story 2   
7624   115879   82424          toy story toons small fry   
6799    78499   10193                        toy story 3   
8181   201588  301528                        toy story 4   
7610   115875   77887  toy story toons hawaiian vacation   
7241   120474  256835         toy story that time forgot   
6576   106022  213121                toy story of terror   
8655   166461  277834                              moana   
8853   213207  508439                             onward   
535      2294    8916                               antz   

                                                genres       director  
1585       adventure|animation|children|comedy|fantasy  john lasseter  
7624       adventure|animation|children|comedy|fantasy  angus maclane  
6799  adventure|animation|children|comedy|fantasy|imax    lee unkrich  
8181               adventure|animation|children|com

In [101]:
def train_ease(df, lambda_=10.0, user_map=None, item_map=None, topk=100):
    if user_map is None:
        user_map = {uid: idx for idx, uid in enumerate(df['userId'].unique())}
    if item_map is None:
        item_map = {mid: idx for idx, mid in enumerate(df['movieId'].unique())}
    idx2item = {v: k for k, v in item_map.items()}

    item_popularity = df['movieId'].value_counts()
    item_weights = 1 / np.sqrt(item_popularity)
    df['ease_weight'] = df['movieId'].map(item_weights)

    rows = df['userId'].map(user_map)
    cols = df['movieId'].map(item_map)
    data = df['ease_weight'].values

    X = sp.csr_matrix((data, (rows, cols)), shape=(len(user_map), len(item_map)))
    G = X.T @ X
    G = G.toarray()
    diag = np.diag_indices_from(G)
    G[diag] += lambda_
    P = np.linalg.inv(G)
    B = P / (-np.diag(P)[:, None])
    B[diag] = 0

    if topk is not None:
        for i in range(B.shape[0]):
            row = B[i]
            if np.count_nonzero(row) > topk:
                top_k_idx = np.argpartition(row, -topk)[-topk:]
                mask = np.ones_like(row, dtype=bool)
                mask[top_k_idx] = False
                B[i, mask] = 0.0
        B = csr_matrix(B)

    return B, user_map, item_map, idx2item

ease_B, ease_user_map, ease_item_map, ease_idx2item = train_ease(df_full, lambda_=10, topk=100)

rows = df_full['userId'].map(ease_user_map)
cols = df_full['movieId'].map(ease_item_map)
data = df_full['ease_weight'].values

X_full = sp.csr_matrix((data, (rows, cols)), shape=(len(ease_user_map), len(ease_item_map)))
ease_all_scores = X_full @ ease_B
if sp.issparse(ease_all_scores):
    ease_all_scores = ease_all_scores.toarray()

scaler = MinMaxScaler()
ease_all_scores = scaler.fit_transform(ease_all_scores)


In [102]:
def get_hybrid_recommendations(user_id, top_n=10, weight_content=0.6):
    scores = {}

    if user_id in ease_user_map:
        u_idx = ease_user_map[user_id]
        ease_scores = ease_all_scores[u_idx].copy()
        seen_movies = df_full[df_full['userId'] == user_id]['movieId']
        for mid in seen_movies:
            if mid in ease_item_map:
                ease_scores[ease_item_map[mid]] = 0.0
        for idx, score in enumerate(ease_scores):
            movie_id = ease_idx2item[idx]
            tmdb_id = movieId_to_tmdbId.get(movie_id)
            if tmdb_id:
                scores[tmdb_id] = (1 - weight_content) * score

    liked_tmdb_ids = df_full[(df_full['userId'] == user_id) & (df_full['rating'] >= 4.0)]['tmdbId']
    liked_indices = [tfidf_index.get(tmdb_id) for tmdb_id in liked_tmdb_ids if tmdb_id in tfidf_index]

    tfidf_sim = np.zeros(tfidf_matrix.shape[0])
    for idx in liked_indices:
        if idx is not None:
            tfidf_sim += cosine_similarity(tfidf_matrix[idx], tfidf_matrix).flatten()

    if liked_indices:
        tfidf_sim /= len(liked_indices)
        for idx, score in enumerate(tfidf_sim):
            tmdb_id = extra_values_filtered.iloc[idx]['tmdbId']
            scores[tmdb_id] = scores.get(tmdb_id, 0) + weight_content * score

    ranked = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    return [tmdb for tmdb, _ in ranked[:top_n]]

In [103]:
def find_closest_movie(title_input, df=extra_values_filtered):
    title_input = clean_title(title_input)
    titles = df['title'].tolist()
    matches = get_close_matches(title_input, titles, n=1, cutoff=0.5)
    if matches:
        match_title = matches[0]
        result = df[df['title'] == match_title]
        if not result.empty:
            return result.iloc[0]['tmdbId']
    return None


In [104]:
def run_recommender():
    response = input("Do you have a user ID? (yes/no): ").strip().lower()

    if response == "yes":
        try:
            user_id = int(input("Enter your user ID: ").strip())
            recs_tmdb = get_hybrid_recommendations(user_id, top_n=10)
            if recs_tmdb:
                display_df = extra_values_filtered[
                    extra_values_filtered['tmdbId'].isin(recs_tmdb)
                ][['title', 'genres', 'director']].reset_index(drop=True)
                print("\n Personalized Recommendations:")
                print(tabulate(display_df, headers='keys', tablefmt='fancy_grid'))
            else:
                print("No recommendations found for this user.")
        except ValueError:
            print("Invalid user ID format.")

    elif response == "no":
        selected_tmdb_ids = []
        while True:
            movie_title = input("Enter a movie you like (or type 'done' to finish): ").strip()
            if movie_title.lower() == "done":
                if selected_tmdb_ids:
                    break
                else:
                    print("Please enter at least one movie before finishing.")
                    continue

            tmdb_id = find_closest_movie(movie_title)
            if tmdb_id:
                selected_tmdb_ids.append(tmdb_id)
                movie_name = extra_values_filtered[extra_values_filtered['tmdbId'] == tmdb_id]['title'].values[0]
                print(f"Added: {movie_name}")
            else:
                print("Movie not found. Try again.")

        # Simulate collaborative filtering via EASE using a binary user-item vector
        liked_movie_ids = [tmdbId_to_movieId[tmdb_id] for tmdb_id in selected_tmdb_ids if tmdb_id in tmdbId_to_movieId]
        X_user = np.zeros((1, len(ease_item_map)))
        for movie_id in liked_movie_ids:
            if movie_id in ease_item_map:
                X_user[0, ease_item_map[movie_id]] = 1.0

        cf_scores = X_user @ ease_B
        if sp.issparse(cf_scores):
            cf_scores = cf_scores.toarray()
        cf_scores = MinMaxScaler().fit_transform(cf_scores.reshape(1, -1)).flatten()

        cf_score_map = {
            movieId_to_tmdbId.get(ease_idx2item[idx]): score
            for idx, score in enumerate(cf_scores)
            if movieId_to_tmdbId.get(ease_idx2item[idx]) is not None
        }

        # Content-based similarity via TF-IDF
        liked_indices = [tfidf_index.get(tmdb_id) for tmdb_id in selected_tmdb_ids if tmdb_id in tfidf_index]
        tfidf_sim = np.zeros(tfidf_matrix.shape[0])
        for idx in liked_indices:
            if idx is not None:
                tfidf_sim += cosine_similarity(tfidf_matrix[idx], tfidf_matrix).flatten()

        if liked_indices:
            tfidf_sim /= len(liked_indices)

        # Combine CF + content scores (hybrid)
        hybrid_scores = {}
        for i in range(len(extra_values_filtered)):
            tmdb_id = extra_values_filtered.iloc[i]['tmdbId']
            content_score = tfidf_sim[i] if i < len(tfidf_sim) else 0
            cf_score = cf_score_map.get(tmdb_id, 0)
            hybrid_score = 0.4 * cf_score + 0.6 * content_score  # weights: 40% CF, 60% content
            hybrid_scores[tmdb_id] = hybrid_score

        # Exclude movies the user already entered
        ranked = sorted(
            ((tmdb, score) for tmdb, score in hybrid_scores.items() if tmdb not in selected_tmdb_ids),
            key=lambda x: x[1],
            reverse=True
        )
        rec_tmdb_ids = [tmdb for tmdb, _ in ranked[:10]]
        recs_df = extra_values_filtered[extra_values_filtered['tmdbId'].isin(rec_tmdb_ids)]

        if not recs_df.empty:
            print("\n Hybrid Recommendations Based on Your Likes:")
            print(tabulate(recs_df[['title', 'genres', 'director']].reset_index(drop=True), headers='keys', tablefmt='fancy_grid'))
        else:
            print("No valid recommendations found.")

    else:
        print("Invalid input. Please respond with 'yes' or 'no'.")


In [108]:
run_recommender()


Do you have a user ID? (yes/no): yes
Enter your user ID: 10

 Personalized Recommendations:
╒════╤════════════════════════════════════════════╤═════════════════════════════════════════╤═════════════════╕
│    │ title                                      │ genres                                  │ director        │
╞════╪════════════════════════════════════════════╪═════════════════════════════════════════╪═════════════════╡
│  0 │ xmen origins wolverine                     │ action|sci-fi|thriller                  │ gavin hood      │
├────┼────────────────────────────────────────────┼─────────────────────────────────────────┼─────────────────┤
│  1 │ pirates of the caribbean dead mans chest   │ action|adventure|fantasy                │ gore verbinski  │
├────┼────────────────────────────────────────────┼─────────────────────────────────────────┼─────────────────┤
│  2 │ hitchhikers guide to the galaxy the        │ adventure|comedy|sci-fi                 │ garth jennings  │
├────┼──────

In [106]:
unique_users = df_full['userId'].unique()[:10]
print(" Unique User IDs:")
for uid in unique_users:
    print(uid)


 Unique User IDs:
36641.0
70379.0
88400.0
171266.0
77024.0
184662.0
123557.0
8032.0
25622.0
58077.0
