In [17]:
# ✅ Imports
import pandas as pd
import numpy as np
import pickle
from sklearn.metrics.pairwise import cosine_similarity
import scipy.sparse as sp

In [18]:
# ===============================
# 📂 Step 1: Load Cleaned Data
# ===============================
# Load news articles with preprocessed content
df_news = pd.read_csv('clean_news.csv')


In [19]:
# Load the saved TF-IDF matrix and vectorizer
with open('tfidf_vectorizer.pkl', 'rb') as f:
    tfidf_vectorizer = pickle.load(f)


In [20]:
# Load or transform the text data to TF-IDF matrix
# If the TF-IDF matrix is already computed and saved
try:
    # Try to load the pre-computed TF-IDF matrix
    tfidf_matrix = sp.load_npz('tfidf_matrix.npz')
    print("Loaded pre-computed TF-IDF matrix")
except FileNotFoundError:
    # If not found, compute it from the clean content
    print("Computing TF-IDF matrix from clean content")
    tfidf_matrix = tfidf_vectorizer.transform(df_news['clean_content'])
    # Optionally save for future use
    sp.save_npz('tfidf_matrix.npz', tfidf_matrix)

print("TF-IDF matrix shape:", tfidf_matrix.shape)

Computing TF-IDF matrix from clean content
TF-IDF matrix shape: (51282, 8)


In [22]:
# ===============================
# 📂 Step 2: Load User Profile Vector
# ===============================
try:
    with open('user_profile_vector.pkl', 'rb') as f:
        user_profile_vector = pickle.load(f)
    
    # Make sure the user profile is in the correct format
    # It should be a 2D array or sparse matrix with shape (1, n_features)
    if isinstance(user_profile_vector, np.ndarray):
        # Convert to 2D array if it's 1D
        if user_profile_vector.ndim == 1:
            user_profile_vector = user_profile_vector.reshape(1, -1)
    elif isinstance(user_profile_vector, csr_matrix):
        # If it's already a sparse matrix, ensure it's 2D
        if user_profile_vector.shape[0] != 1:
            user_profile_vector = user_profile_vector.reshape(1, -1)
    else:
        raise TypeError("User profile vector must be a numpy array or sparse matrix")
    
    print("User profile vector shape:", user_profile_vector.shape)
except FileNotFoundError:
    print("The file 'user_profile_vector.pkl' was not found.")
except TypeError as e:
    print(f"TypeError: {e}")
except Exception as e:
    print(f"An error occurred: {e}")


User profile vector shape: (1, 8)


In [26]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix

try:
    # Convert user_profile_vector to numpy array
    if isinstance(user_profile_vector, np.matrix):
        user_profile_vector = np.asarray(user_profile_vector).reshape(1, -1)
    
    # Convert tfidf_matrix to dense array if it's sparse
    if isinstance(tfidf_matrix, csr_matrix):
        tfidf_matrix_dense = tfidf_matrix.toarray()
    else:
        tfidf_matrix_dense = np.asarray(tfidf_matrix)
    
    # Compute cosine similarity between user profile and all articles
    similarity_scores = cosine_similarity(user_profile_vector, tfidf_matrix_dense)
    
    # Flatten the result to 1D array
    similarity_scores = similarity_scores.flatten()
    
    print(f"Successfully computed similarity scores for {len(similarity_scores)} articles")
except Exception as e:
    print(f"Error computing cosine similarity: {e}")
    print("\nDebug information:")
    print(f"  - user_profile_vector type: {type(user_profile_vector)}")
    print(f"  - tfidf_matrix type: {type(tfidf_matrix)}")
    exit(1)



Successfully computed similarity scores for 51282 articles


In [28]:
# ===============================
# 📊 Step 4: Add Similarity Scores
# ===============================
# Add similarity scores to the news dataframe
df_news['similarity_score'] = similarity_scores

# Sort by score (highest first)
df_news_sorted = df_news.sort_values(by='similarity_score', ascending=False)

# Print column names to verify
print("Column names in df_news_sorted:", df_news_sorted.columns)

# Show top 5 most similar articles
try:
    print("\nTop 5 most similar articles:")
    print(df_news_sorted[['news_id', 'title', 'category', 'similarity_score']].head())
except KeyError as e:
    print(f"KeyError: {e}. Please check the column names in the dataframe.")


Column names in df_news_sorted: Index(['news_id', 'category', 'subcategory', 'clean_content',
       'similarity_score'],
      dtype='object')

Top 5 most similar articles:
KeyError: "['title'] not in index". Please check the column names in the dataframe.
