In [1]:
# ================================
# 📌 02 - User Profile Construction
# ================================

# ✅ Imports
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle


In [2]:
# ✅ Read the pre-cleaned news articles from the CSV in the working directory
df_news = pd.read_csv(filepath_or_buffer='clean_news.csv')

In [3]:
# Preview the first rows (heading on its own line, then the frame)
print("News Data Sample:", df_news.head(), sep="\n")


News Data Sample:
  news_id   category      subcategory  \
0  N55528  lifestyle  lifestyleroyals   
1  N19639     health       weightloss   
2  N61837       news        newsworld   
3  N53526     health           voices   
4  N38324     health          medical   

                                       clean_content  
0  brands queen elizabeth prince charles prince p...  
1  50 worst habits belly fat seemingly harmless h...  
2  cost trumps aid freeze trenches ukraines war l...  
3  nba wife heres affected mental health felt lik...  
4  get rid skin tags according dermatologist seem...  


In [5]:

from sklearn.feature_extraction.text import TfidfVectorizer
import pickle

# Training corpus: the cleaned article text from df_news.
# Empty text fields come back from read_csv as NaN, and TfidfVectorizer
# rejects NaN documents, so map them to empty strings first.
documents = df_news['clean_content'].fillna('').astype(str)

# Create TF-IDF model
# Fit on the real article corpus so the vocabulary and IDF weights describe
# the news data. A vectorizer fitted on toy sentences would know only a
# handful of tokens and turn almost every article into an all-zero vector.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

# Save the model
# NOTE(review): this overwrites any existing 'tfidf_vectorizer.pkl' in the
# working directory — confirm no earlier notebook's artifact must be kept.
with open('tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(tfidf_vectorizer, f)


In [6]:
# Re-transform all article content into vectors
# Missing text (NaN after the CSV round-trip) is replaced with '' because
# the vectorizer raises on NaN documents; such rows become all-zero vectors.
tfidf_matrix = tfidf_vectorizer.transform(df_news['clean_content'].fillna(''))

In [7]:
# ===============================
# 👤 Step 2: Simulate User Preferences
# ===============================
# Option 1: describe the user's interests as a list of news categories.
# Edit the entries below to simulate a different user.
user_preferred_categories = [
    "sports",
    "technology",
]

In [8]:
# Keep only the rows whose 'category' is one of the user's preferred categories
user_articles = df_news.loc[df_news['category'].isin(user_preferred_categories)]

In [10]:
# Option 2 (Alternative): build the selection from specific article IDs
# (e.g., articles the user clicked in the past).
# Note: this reassigns `user_articles`, so when this cell runs it replaces the
# category-based selection made earlier.
user_clicked_ids = ["N12345", "N67890"]
user_articles = df_news.loc[df_news['news_id'].isin(user_clicked_ids)]


In [11]:
# Report how many rows ended up in the user's selection
print(f"\nUser selected {user_articles.shape[0]} preferred articles.")


User selected 1 preferred articles.


In [12]:
# ===============================
# 📐 Step 3: Construct User Profile Vector
# ===============================
# Transform selected articles to TF-IDF using the already-fitted vectorizer.
# Missing text (NaN) is replaced with '' first, matching how the full corpus
# must be handled: the vectorizer raises on NaN documents.
user_vectors = tfidf_vectorizer.transform(user_articles['clean_content'].fillna(''))
