# In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Rank news articles by TF-IDF cosine similarity to a user profile and save
# the ranked table to disk.

# Load the cleaned news articles.
df = pd.read_csv('../data/cleaned_news.csv')

# Load the user profile text. The encoding is passed explicitly so decoding
# does not depend on the platform's locale default; pd.read_csv above already
# defaults to UTF-8. NOTE(review): assumes the profile file is UTF-8 — confirm.
with open('../data/user_profile.txt', 'r', encoding='utf-8') as f:
    user_profile = f.read()

# Empty content cells are read back from CSV as NaN, which TfidfVectorizer
# rejects as an invalid document. Replace them with empty strings on a
# separate Series so the dataframe that gets saved keeps its original values.
documents = df['cleaned_content'].fillna('').astype(str)

# Create TF-IDF vectorizer
vectorizer = TfidfVectorizer()

# Fit the vocabulary on the articles, then project the user profile into the
# same feature space so the two are directly comparable.
tfidf_matrix = vectorizer.fit_transform(documents)
user_vec = vectorizer.transform([user_profile])

# Compute cosine similarities: one score per article, flattened to 1-D so it
# can be assigned as a dataframe column.
similarities = cosine_similarity(user_vec, tfidf_matrix).flatten()

# Add similarity scores to the dataframe
df['similarity'] = similarities

# Sort by similarity, most similar articles first
df_sorted = df.sort_values(by='similarity', ascending=False)

# Save the sorted results (articles ranked by similarity)
df_sorted.to_csv('../data/sorted_similarities.csv', index=False)