# In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import normalize
import torch
from tqdm import tqdm
import faiss

class NewsRecommender:
    """
    Content-based news recommender built on sentence embeddings and FAISS.

    Each article is embedded from its title and abstract, the embeddings are
    L2-normalized, and they are stored in an inner-product index, so the
    similarity scores returned by the query methods are cosine similarities.
    """

    def __init__(self, model_name='multi-qa-MiniLM-L6-cos-v1', batch_size=32):
        """
        Initialize the news recommender system.

        Parameters:
        model_name (str): Name of the SentenceTransformer model to load
        batch_size (int): Number of texts encoded per batch in fit()
        """
        self.model = SentenceTransformer(model_name)
        self.batch_size = batch_size
        self.embeddings = None  # (n_articles, dim) float32 array, set by fit()
        self.news_df = None     # fitted articles re-indexed 0..n-1, set by fit()
        self.index = None       # FAISS inner-product index, set by fit()

    def _combine_text(self, row):
        """
        Combine title and abstract into the text that gets embedded.

        The title is repeated so it carries more weight than the abstract.
        Missing values (NaN/None) are skipped instead of being rendered as
        the literal string "nan".
        """
        title, abstract = row['title'], row['abstract']
        parts = (title, title, abstract)
        return " ".join(str(part) for part in parts if not pd.isna(part))

    def _require_fitted(self):
        """Raise RuntimeError if fit() has not been called yet."""
        if self.index is None or self.news_df is None or self.embeddings is None:
            raise RuntimeError("NewsRecommender is not fitted; call fit() first")

    def _position_of(self, article_id):
        """
        Return the row position (not the index label) of an article.

        Positions are what the embedding matrix and the FAISS index use, so
        the lookup must not depend on the DataFrame's index labels.

        Raises:
        IndexError: If article_id is not present in the fitted data
        """
        matches = np.flatnonzero((self.news_df['news_id'] == article_id).to_numpy())
        if matches.size == 0:
            raise IndexError(
                f"article_id {article_id!r} not found in the fitted news data"
            )
        return int(matches[0])

    def _search(self, query_vector, k):
        """
        Return the k nearest articles to query_vector, best match first.

        The result is a copy of the matching rows with an added
        'similarity_score' column. k is clamped to the number of articles.
        """
        k = min(k, len(self.news_df))
        if k <= 0:
            empty = self.news_df.iloc[[]].copy()
            empty['similarity_score'] = np.array([], dtype='float32')
            return empty

        query = np.ascontiguousarray(query_vector, dtype='float32')
        scores, indices = self.index.search(query, k)

        # FAISS pads with -1 when it finds fewer than k neighbours; iloc[-1]
        # would silently return the last row, so drop those slots.
        found = indices[0] >= 0
        neighbours = self.news_df.iloc[indices[0][found]].copy()
        neighbours['similarity_score'] = scores[0][found]
        return neighbours

    def fit(self, news_df):
        """
        Process the news articles and build the recommendation index

        Parameters:
        news_df (pd.DataFrame): DataFrame containing 'title' and 'abstract'
            (the query methods additionally read 'news_id' and 'category')

        Returns:
        NewsRecommender: self, to allow chaining

        Raises:
        ValueError: If news_df has no rows
        """
        if len(news_df) == 0:
            raise ValueError("news_df must contain at least one article")

        # Re-index 0..n-1 so that index labels, embedding rows and FAISS ids
        # all refer to the same article.
        self.news_df = news_df.reset_index(drop=True)

        # Prepare texts
        records = self.news_df[['title', 'abstract']].to_dict('records')
        texts = [self._combine_text(record) for record in records]

        # Generate embeddings in batches
        batches = []
        for start in tqdm(range(0, len(texts), self.batch_size), desc="Generating embeddings"):
            batch = texts[start:start + self.batch_size]
            with torch.no_grad():
                batches.append(self.model.encode(batch))

        # Normalize so inner product equals cosine similarity; keep float32
        # because that is the dtype the FAISS index is fed.
        self.embeddings = np.ascontiguousarray(
            normalize(np.vstack(batches)), dtype='float32'
        )

        # Build FAISS index for efficient similarity search
        self.index = faiss.IndexFlatIP(self.embeddings.shape[1])
        self.index.add(self.embeddings)

        return self

    def get_recommendations(self, article_id, n_recommendations=5, exclude_same_category=False):
        """
        Get article recommendations based on a source article.

        Parameters:
        article_id: ID of the source article
        n_recommendations: Number of recommendations to return
        exclude_same_category: Whether to exclude articles from the same category

        Returns:
        pd.DataFrame: Recommended articles with similarity scores, best first.
            May hold fewer than n_recommendations rows when not enough
            candidates survive the filtering.

        Raises:
        RuntimeError: If fit() has not been called
        IndexError: If article_id is not present in the fitted data
        """
        self._require_fitted()
        wanted = max(n_recommendations, 0)

        position = self._position_of(article_id)
        query_vector = self.embeddings[position].reshape(1, -1)

        # The source article is normally its own nearest neighbour and is
        # removed below, so ask for one extra; over-fetch further when
        # same-category articles will be filtered out as well.
        k = wanted * 3 + 1 if exclude_same_category else wanted + 1

        recommendations = self._search(query_vector, k)

        # Filter out the source article
        recommendations = recommendations[recommendations['news_id'] != article_id]

        if exclude_same_category:
            source_category = self.news_df.iloc[position]['category']
            recommendations = recommendations[recommendations['category'] != source_category]

        return recommendations.head(wanted).reset_index(drop=True)

    def get_recommendations_from_history(self, article_ids, n_recommendations=5):
        """
        Get recommendations based on a user's reading history.

        Parameters:
        article_ids: List of article IDs from user's history
        n_recommendations: Number of recommendations to return

        Returns:
        pd.DataFrame: Recommended articles with similarity scores, best first

        Raises:
        RuntimeError: If fit() has not been called
        ValueError: If article_ids is empty
        IndexError: If any ID is not present in the fitted data
        """
        self._require_fitted()
        article_ids = list(article_ids)
        if not article_ids:
            raise ValueError("article_ids must contain at least one article ID")
        wanted = max(n_recommendations, 0)

        # Get row positions for history articles
        history_positions = [self._position_of(id_) for id_ in article_ids]

        # Average the embeddings of history articles
        query_vector = self.embeddings[history_positions].mean(axis=0).reshape(1, -1)
        query_vector = normalize(query_vector)

        # History articles are likely to be among the hits and are removed
        # below, so fetch that many extra (clamped to the index size).
        recommendations = self._search(query_vector, wanted + len(article_ids))

        # Filter out articles from history
        recommendations = recommendations[~recommendations['news_id'].isin(article_ids)]

        return recommendations.head(wanted).reset_index(drop=True)

# Convenience factory: load the MIND news file and fit a recommender on it
def create_recommender(news_path):
    """
    Load a MIND news file and fit a NewsRecommender on it.

    The file is read as tab-separated text; because column names are
    supplied explicitly, its first line is treated as data, not as a header.

    Parameters:
    news_path (str): Path to the MIND dataset news file

    Returns:
    NewsRecommender: Recommender fitted on the loaded articles
    """
    # Column layout of the MIND news file, in file order
    mind_columns = [
        'news_id',
        'category',
        'subcategory',
        'title',
        'abstract',
        'url',
        'title_entities',
        'abstract_entities',
    ]
    articles = pd.read_csv(news_path, sep='\t', names=mind_columns)

    # fit() returns the recommender itself, so construction and training chain
    return NewsRecommender().fit(articles)

# In [None]:
# Build the recommender from the MIND news file (replace the placeholder path)
recommender = create_recommender(news_path="path/to/mind/news.tsv")

# Articles similar to one source article
recommendations = recommender.get_recommendations(
    article_id="N1234",
    n_recommendations=5,
)

# Articles similar to the average of a user's reading history
history_recommendations = recommender.get_recommendations_from_history(
    article_ids=["N1234", "N5678"],
    n_recommendations=5,
)