In [None]:
import os
import pickle

import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity


In [None]:
# Build a content-based user profile from a handful of preferred articles and
# score every article against it with cosine similarity. The scores are saved
# to ../results/similarity_scores.pkl for use in notebook 04.

# Load cleaned news data (DataFrame)
# NOTE: pickle-based loaders can execute arbitrary code embedded in the file;
# only load artifacts that were produced by this project's own notebooks.
news_df = pd.read_pickle("../data/cleaned_news_df.pkl")

# Load the TF-IDF matrix
tfidf_matrix = sparse.load_npz("../data/tfidf_matrix.npz")

# Load the TF-IDF vectorizer (same pickle trust caveat as above)
with open("../data/tfidf_vectorizer.pkl", "rb") as f:
    tfidf_vectorizer = pickle.load(f)

# Matrix rows are addressed by DataFrame row *position* below, so the two
# artifacts must have exactly one matrix row per DataFrame row.
if tfidf_matrix.shape[0] != len(news_df):
    raise ValueError(
        f"tfidf_matrix has {tfidf_matrix.shape[0]} rows but news_df has "
        f"{len(news_df)} rows; the artifacts are not aligned."
    )

# ✅ Select real article IDs that exist in your data
# View some actual IDs if unsure:
print("Available IDs:")
print(news_df['news_id'].head(10).tolist())

# Replace with real existing IDs
preferred_ids = ['N46478', 'N47901']  # example: must exist in news_df

# Filter the preferred articles
preferred_mask = news_df['news_id'].isin(preferred_ids)
preferred_articles = news_df[preferred_mask]

# Surface IDs that were requested but not found instead of silently
# building the profile from a subset.
found_ids = set(preferred_articles['news_id'])
missing_ids = [news_id for news_id in preferred_ids if news_id not in found_ids]
if missing_ids and not preferred_articles.empty:
    print(f"⚠️ preferred_ids not found in news_df (ignored): {missing_ids}")

if preferred_articles.empty:
    print("❌ No matching articles found for the provided preferred_ids.")
else:
    # Get row positions of preferred articles. Positions (not index labels)
    # are required because the sparse matrix is indexed 0..n-1, while
    # news_df.index may carry arbitrary labels after cleaning/filtering.
    preferred_indices = np.flatnonzero(preferred_mask.to_numpy()).tolist()

    # Build user profile vector (mean of preferred TF-IDF rows)
    user_profile_vector = tfidf_matrix[preferred_indices].mean(axis=0)

    # ✅ Ensure it's a proper array shape (1 row); the sparse mean may come
    # back as np.matrix, which np.asarray converts to a plain ndarray.
    user_profile_vector = np.asarray(user_profile_vector).reshape(1, -1)

    # Compute cosine similarity between user profile and all articles
    similarity_scores = cosine_similarity(user_profile_vector, tfidf_matrix)

    # Print similarity scores
    print("✅ Similarity Scores:")
    print(similarity_scores.flatten())

    # Save similarity scores for later use in 04
    scores_path = "../results/similarity_scores.pkl"
    # Create the output directory up front so the save cannot fail after
    # all the computation has already been done.
    os.makedirs(os.path.dirname(scores_path), exist_ok=True)
    with open(scores_path, "wb") as f:
        pickle.dump(similarity_scores, f)

    print("✅ similarity_scores.pkl saved.")


Available IDs:
['N55528', 'N19639', 'N61837', 'N53526', 'N38324', 'N2073', 'N49186', 'N59295', 'N24510', 'N39237']
✅ Similarity Scores:
[0.         0.         0.01972448 ... 0.         0.         0.        ]
✅ similarity_scores.pkl saved.
