## Reading the articles data



In [2]:
import json
import logging
import os
import time
from datetime import datetime
from typing import Dict, List, Optional, Tuple

# Third-party
from anthropic import Anthropic
from dotenv import load_dotenv
from pymongo import MongoClient
from pymongo.errors import BulkWriteError, ConnectionFailure
from sentence_transformers import SentenceTransformer
from tqdm.autonotebook import tqdm, trange



In [3]:
import pandas as pd
# Load the articles from the CSV file, decoding it as UTF-8.
articles = pd.read_csv('fa_articles.csv', encoding='utf-8')
# Last expression of the cell: show the first rows as the cell output.
articles.head()

Unnamed: 0,content,title,artDate
0,Posle nesreće izazvane obrušavanjem nadstrešn...,"UKCV: Nema novih žrtava, troje povređenih i da...",2024-11-02
1,Urušavanje nadstrešnice Železničke stanice u ...,Arhitekta: Urušavanje dela Železničke stanice ...,2024-11-01
2,Centar za lokalnu samoupravu (CLS) saopštio j...,CLS: JKP Gradska čistoća duguje budžetu Beogra...,2024-11-01
3,"Zvaničnici Evropske unije i više država, među...",Zvaničnici EU i više država izrazili saučešće ...,2024-11-01
4,Pravni fakultet u Beogradu saopštio je da inf...,Pravni fakultet: Koleginica nagazila na utični...,2024-10-31


In [6]:
# Initialize Embedić model for Serbian language embeddings
# !pip install -U sentence-transformers
from sentence_transformers import SentenceTransformer
import json

### Initializing the embedding model for Serbian

In [7]:
# Load the 'djovak/embedic-large' sentence-transformers model. This module-level
# `model` is what generate_embeddings() and semantic_search() call `.encode()` on.
model = SentenceTransformer('djovak/embedic-large')

### A function that concatenates each article's title and content and passes the result to the embedding model

In [8]:
def generate_embeddings(df: pd.DataFrame, batch_size: int = 32) -> pd.DataFrame:
    """
    Generate embeddings for concatenated title and content using Embedić.

    Each row's title and content are joined with a single space and encoded with
    the module-level ``model`` in slices of ``batch_size`` rows. Missing titles
    or contents are treated as empty strings so that every row yields a text.

    Args:
        df: DataFrame with 'title' and 'content' columns
        batch_size: Number of texts to process at once (must be >= 1)

    Returns:
        Copy of ``df`` with an added 'embedding' column holding one list of
        floats per row. The input frame is not modified.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # Work on a copy so the caller's frame stays untouched
    df_emb = df.copy()

    print("Concatenating title and content...")
    # NaN + str yields NaN in pandas, so a row with a missing field would
    # otherwise hand a float NaN (not text) to the encoder.
    titles = df_emb['title'].fillna('').astype(str)
    contents = df_emb['content'].fillna('').astype(str)
    texts = (titles + " " + contents).tolist()

    print("Generating embeddings...")
    embeddings = []

    # Process in batches with progress bar
    for i in tqdm(range(0, len(texts), batch_size)):
        batch_text = texts[i:i + batch_size]
        batch_embeddings = model.encode(batch_text)
        embeddings.extend(batch_embeddings.tolist())

    df_emb['embedding'] = embeddings

    print(f"Generated embeddings for {len(df_emb)} articles")
    # An empty input produces no vectors, so there is no dimension to report
    if embeddings:
        print(f"Embedding dimension: {len(embeddings[0])}")

    return df_emb

In [None]:
# Embed every article; the result is a copy of `articles` with an added 'embedding' column.
df_with_embeddings = generate_embeddings(articles)

Concatenating title and content...
Generating embeddings...


  0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings for 20 articles
Embedding dimension: 1024


### Preview the generated embeddings

In [11]:
# Sanity check of the embedded frame: overall size and column names first
print("\nDataFrame shape:", df_with_embeddings.shape)
print("\nColumns:", list(df_with_embeddings.columns))

# Peek at the leading five components of the first article's vector
print("\nSample embedding (first 5 dimensions):")
print(df_with_embeddings['embedding'].iloc[0][:5])

# Every vector should have the same length; report that length as well
embedding_lengths = df_with_embeddings['embedding'].map(len)
print("\nAll embeddings have same dimension:", embedding_lengths.nunique() == 1)
print("Embedding dimension:", embedding_lengths.iloc[0])


DataFrame shape: (20, 4)

Columns: ['content', 'title', 'artDate', 'embedding']

Sample embedding (first 5 dimensions):
[0.036100856959819794, -0.03139951825141907, -0.02158968150615692, -0.02952212281525135, 0.06143483147025108]

All embeddings have same dimension: True
Embedding dimension: 1024


### Preparing the data for MongoDB

In [12]:
def prepare_for_mongodb(
    df: pd.DataFrame,
    embedding_model: str = 'djovak/embedic-large'
) -> List[Dict]:
    """
    Convert DataFrame rows to MongoDB documents.

    Args:
        df: DataFrame with 'title', 'content', 'artDate' and 'embedding' columns
        embedding_model: Name recorded in each document's metadata as the model
            that produced the embedding

    Returns:
        List of dictionaries ready for MongoDB insertion, one per row, in row order
    """
    documents = []

    for _, row in tqdm(df.iterrows(), total=len(df)):
        # Take the clock once per document so that a freshly created document
        # has identical created_at and last_updated values.
        timestamp = datetime.now()
        document = {
            'title': row['title'],
            'content': row['content'],
            'date': row['artDate'],  # stored exactly as it appears in the 'artDate' column
            'embedding': row['embedding'],
            'metadata': {
                'embedding_model': embedding_model,
                'created_at': timestamp,
                'last_updated': timestamp
            }
        }
        documents.append(document)

    return documents

In [13]:
# Build the MongoDB payload and report how many documents it contains
mongodb_documents = prepare_for_mongodb(df_with_embeddings)
print(f"\nPrepared {len(mongodb_documents)} documents for MongoDB insertion")
print("\nSample document structure (excluding embedding vector):")
# Shallow copy of the first document with the long vector swapped for a short placeholder
sample_doc = {
    **mongodb_documents[0],
    'embedding': f"<embedding vector with {len(mongodb_documents[0]['embedding'])} dimensions>",
}
print(json.dumps(sample_doc, default=str, indent=2))

  0%|          | 0/20 [00:00<?, ?it/s]


Prepared 20 documents for MongoDB insertion

Sample document structure (excluding embedding vector):
{
  "title": "UKCV: Nema novih \u017ertava, troje povre\u0111enih i dalje u te\u0161kom stanju",
  "content": " Posle nesre\u0107e izazvane obru\u0161avanjem nadstre\u0161nice na \u017delezni\u010dkoj stanici u Novom Sadu, novih \u017ertava nema, a troje povre\u0111enih je u te\u0161kom op\u0161tem stanju, potvr\u0111eno je za Tanjug u Univerzitetskom klini\u010dkom centru Vojvodine (UKCV). \u201eTroje povre\u0111enih koji su ju\u010de primljeni u UKCV i dalje su u te\u0161kom stanju, na intenzivnoj nezi\u201c, re\u010deno je iz Pres-slu\u017ebe Klini\u010dkog centra za Tanjug. Portparolka De\u010dje bolnice Andrea \u0110ureti\u0107 rekla je da ni sino\u0107 u tu bolnicu nisu primljene dve devoj\u010dice. Povezane vesti U Srbiji danas Dan \u017ealosti, u Novom Sadu trodnevna Vesti 08:16 27 Vesti 08:16 27 Zavr\u0161ena akcija spasavanja: 14 stradalih i troje te\u0161ko povre\u0111enih u

### Inserting data into the MongoDB database

In [14]:
# Set up logging: configure the root logger at INFO level and create the
# module logger that the MongoDB connection and search helpers write to.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [15]:
# Pull settings from a local .env file into the process environment
load_dotenv()

# MongoDB connection settings, all taken from environment variables
MONGODB_URI = os.environ.get("MONGODB_URI")
DB_NAME = os.environ.get("DB_NAME_1")
COLLECTION_NAME = os.environ.get("COLLECTION_NAME")

# Stop right away when any of the three settings is absent or empty
if not (MONGODB_URI and DB_NAME and COLLECTION_NAME):
    raise ValueError("Missing required environment variables")

In [18]:
def get_mongodb_connection():
    """
    Create and return MongoDB client, database and collection objects.

    Connects with the module-level MONGODB_URI, verifies the server is reachable
    with a ping, and selects DB_NAME / COLLECTION_NAME.

    Returns:
        tuple: (MongoClient, Database, Collection)

    Raises:
        ConnectionFailure: If the client cannot be created or the ping fails.
            The original exception is attached as ``__cause__``.
    """
    client = None
    try:
        client = MongoClient(MONGODB_URI)
        # Test connection
        client.admin.command('ping')
        logger.info("Successfully connected to MongoDB")

        db = client[DB_NAME]
        collection = db[COLLECTION_NAME]
        return client, db, collection

    except Exception as e:
        logger.error(f"Failed to connect to MongoDB. Error: {str(e)}")
        # Release the half-initialised client so a failed attempt does not
        # leave an open client behind.
        if client is not None:
            client.close()
        raise ConnectionFailure(f"MongoDB connection failed: {str(e)}") from e

# Open the connection; `client`, `db` and `collection` are used as globals by later cells.
client, db, collection = get_mongodb_connection()

INFO:__main__:Successfully connected to MongoDB


### Function for inserting data into MongoDB

In [19]:
def insert_documents_to_mongodb(documents: List[Dict], 
                              batch_size: int = 1000) -> Tuple[int, int]:
    """
    Write documents to the module-level MongoDB ``collection`` in batches.

    Each batch is sent as one unordered ``insert_many``. When a batch fails
    partially, the counts reported by the bulk-write error are used and a
    one-line summary for that batch is printed.

    Args:
        documents: List of documents to insert
        batch_size: Number of documents to insert in each batch

    Returns:
        Tuple of (successful insertions, failed insertions)
    """
    inserted_total = 0
    failed_total = 0

    print(f"Starting insertion of {len(documents)} documents...")

    for start in tqdm(range(0, len(documents), batch_size)):
        chunk = documents[start:start + batch_size]
        try:
            # Unordered: the server keeps going after an individual failure
            outcome = collection.insert_many(chunk, ordered=False)
        except BulkWriteError as bulk_error:
            # Partial failure: the error details say how many documents made it
            n_ok = bulk_error.details['nInserted']
            n_bad = len(chunk) - n_ok
            inserted_total += n_ok
            failed_total += n_bad
            print(f"Batch {start//batch_size + 1} had {n_ok} successful and "
                  f"{n_bad} failed insertions")
        else:
            inserted_total += len(outcome.inserted_ids)

    return inserted_total, failed_total

### Clearing the collection and inserting data

In [20]:
# WARNING: this removes every document currently in the collection before re-inserting.
collection.delete_many({})
# Insert documents
successful, failed = insert_documents_to_mongodb(mongodb_documents)
# Report the totals returned by the insert helper.
print(f"\nInsertion complete:")
print(f"Successfully inserted: {successful} documents")
print(f"Failed insertions: {failed} documents")

Starting insertion of 20 documents...


  0%|          | 0/1 [00:00<?, ?it/s]


Insertion complete:
Successfully inserted: 20 documents
Failed insertions: 0 documents


### Create a vector search index

In [21]:
# Vector index definition: a single 1024-dimensional vector field at path
# "embedding", compared with cosine similarity. The literal is not assigned to
# anything; it is evaluated only so that the cell displays it.
{
  "fields": [
    {
      "numDimensions": 1024,
      "path": "embedding",
      "similarity": "cosine",
      "type": "vector"
    }
  ]
}

{'fields': [{'numDimensions': 1024,
   'path': 'embedding',
   'similarity': 'cosine',
   'type': 'vector'}]}

In [22]:
def create_vector_index(
    connection_string: str,
    database_name: str,
    collection_name: str,
    index_name: str = "vector_index",
    num_dimensions: int = 1024,
    similarity: str = "cosine",
    path: str = "embedding"
) -> None:
    """
    Create a vector search index in MongoDB using PyMongo.

    The function opens its own client from ``connection_string`` and closes it
    when done, so it never touches a client shared with other cells. Errors are
    reported with a printed message rather than raised.

    Args:
        connection_string: MongoDB connection string
        database_name: Name of the database
        collection_name: Name of the collection
        index_name: Name of the vector index
        num_dimensions: Length of the stored embedding vectors
        similarity: Similarity function used by the index
        path: Document field that holds the embedding vector
    """
    # Define the index configuration
    index_config = {
        "name": index_name,
        # Marks the definition as a vector search index rather than a regular
        # search index.
        # NOTE(review): needs a PyMongo release whose SearchIndexModel accepts
        # `type` (4.7+) — confirm against the installed version.
        "type": "vectorSearch",
        "definition": {
            "fields": [
                {
                    "numDimensions": num_dimensions,
                    "path": path,
                    "similarity": similarity,
                    "type": "vector"
                }
            ]
        }
    }

    index_client = None
    try:
        # Dedicated connection built from the arguments. The previous
        # `client = client` style assignments made the names local and raised
        # UnboundLocalError before anything else could run.
        index_client = MongoClient(connection_string)
        target_collection = index_client[database_name][collection_name]

        # Create the vector search index
        target_collection.create_search_index(index_config)
        print(f"Successfully created vector index '{index_name}'")

    except Exception as e:
        print(f"Error creating vector index: {str(e)}")
    finally:
        # Only the client opened here is closed
        if index_client is not None:
            index_client.close()


In [23]:
# Create standard indexes for better query performance (currently disabled)
# NOTE(review): documents are inserted with a 'date' field, so an index on
# 'articleDate' would not match the stored field name — confirm before enabling.

#collection.create_index("articleDate")
#collection.create_index([("title", "text"), ("content", "text")])

# Verify the setup: document count and the indexes present on the collection
print("\nCollection Information:")
print(f"Total documents: {collection.count_documents({})}")
print(f"Indexes: {collection.index_information()}")


Collection Information:
Total documents: 20
Indexes: {'_id_': {'v': 2, 'key': [('_id', 1)]}, 'articleDate_1': {'v': 2, 'key': [('articleDate', 1)]}, 'title_text_content_text': {'v': 2, 'key': [('_fts', 'text'), ('_ftsx', 1)], 'weights': SON([('content', 1), ('title', 1)]), 'default_language': 'english', 'language_override': 'language', 'textIndexVersion': 3}}


### Semantic search with MongoDB

In [24]:
def semantic_search(
    collection,
    query: str,
    k: int = 5,
    index_name: str = "vector_index",
    num_candidates: int = 100
) -> List[Dict]:
    """
    Perform semantic search using Embedić vector similarity.

    The query is encoded with the module-level ``model`` and matched against the
    'embedding' field through a ``$vectorSearch`` aggregation stage.

    Args:
        collection: MongoDB collection
        query: Search query text
        k: Number of results to return
        index_name: Name of the vector search index to query
        num_candidates: Number of nearest-neighbour candidates considered by the
            search; raised to ``k`` automatically when it is smaller

    Returns:
        List of matching documents with 'title', 'content', 'date' and 'score'

    Raises:
        Exception: Any error from encoding or the aggregation is logged and re-raised.
    """
    try:
        # Generate embedding for query using Embedić
        query_embedding = model.encode(query).tolist()

        # Perform vector search
        pipeline = [
            {
                "$vectorSearch": {
                    "index": index_name,
                    "queryVector": query_embedding,
                    "path": "embedding",
                    # `limit` may not exceed `numCandidates`, so the candidate
                    # pool grows with k instead of failing for k > 100.
                    "numCandidates": max(num_candidates, k),
                    "limit": k
                }
            },
            {
                "$project": {
                    "title": 1,
                    "content": 1,
                    "date": 1,
                    "score": { "$meta": "vectorSearchScore" }
                }
            }
        ]

        results = list(collection.aggregate(pipeline))
        return results

    except Exception as e:
        logger.error(f"Search error: {str(e)}")
        raise

def display_search_results(query: str, results: List[Dict]):
    """
    Print the query followed by its ranked hits.

    For every hit the title, the score (three decimals) and the first
    200 characters of the content are printed, then a dashed rule.
    """
    heavy_rule = "=" * 80
    light_rule = "-" * 80

    print(f"\nQuery: {query}")
    print(heavy_rule)

    # Ranks are 1-based
    for rank, hit in enumerate(results, start=1):
        print(f"\n{rank}. {hit['title']}")
        print(f"Score: {hit['score']:.3f}")
        print(f"\nExcerpt: {hit['content'][:200]}...")
        print(light_rule)

# Run a few sample queries through the vector search and print the hits
queries = ["Novi Sad", "Lithium", "Freedom politics"]

for query in queries:
    results = semantic_search(collection, query, k=6)
    display_search_results(query, results)
    # Wide rule between the result lists of consecutive queries
    print(f"\n{'=' * 100}\n")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]


Query: Novi Sad

1. Vučević: Insistiraćemo da se pronađu odgovorni za nesreću u Novom Sadu
Score: 0.836

Excerpt: Ovo je jedan od najtežih dana u posleratnoj istoriji Novog Sada i užasna tragedija, rekao je premijer Srbije Miloš Vučević. „Ovo je jedan od najtežih dana u posleratnoj istoriji Novog Sada i užasna tr...
--------------------------------------------------------------------------------

2. Pašalić: U najkraćem roku utvrditi odgovornost za tragediju u Novom Sadu
Score: 0.834

Excerpt:  Zaštitnik građana Zoran Pašalić pozvao je nadležne da preduzmu sve potrebne mere da bi se u najkraćem roku utvrdila odgovornost za pogibiju 14 ljudi na koje se danas srušila betonska nadstrešnica Žel...
--------------------------------------------------------------------------------

3. Arhitekta: Urušavanje dela Železničke stanice znak nebrige i nemara prilikom gradnje
Score: 0.833

Excerpt:  Urušavanje nadstrešnice Železničke stanice u Novom Sadu prilikom čega je najmanje jedna osoba stradala

Batches:   0%|          | 0/1 [00:00<?, ?it/s]


Query: Lithium

1. Država traži kupce za svoj kapital u saobraćajnom preduzeću Lastra, poziva ih da predlože cenu
Score: 0.788

Excerpt: /yy_Apartment Ministarstvo privrede objavilo je javni poziv za prikupljanje pisama o zainteresovanosti za učestvovanje u postupku privatizacije saobraćajnog preduzeća Lastra iz Lazarevca, piše portal ...
--------------------------------------------------------------------------------

2. Pravni fakultet: Koleginica nagazila na utičnicu, upala neznatno u pod, komad maltera pao na drugu
Score: 0.782

Excerpt:  Pravni fakultet u Beogradu saopštio je da informacija da je jedna "studentkinja propala kroz plafon čitaonice", ne odgovara istini. (FOTO) Devojka propala kroz pod čitaonice na Pravnom fakultetu u Be...
--------------------------------------------------------------------------------

3. U Sarajevu naređena evakuacija zbog mogućeg klizišta, meštani odbili
Score: 0.780

Excerpt:  Sarajevo Direktor Civilne zaštite Kantona Sarajevo Dženan Brkanić izj

Batches:   0%|          | 0/1 [00:00<?, ?it/s]


Query: Freedom politics

1. Jerinić: U izmenama Krivičnog zakonika tri grupe problematičnih rešenja
Score: 0.789

Excerpt: Poslanica Zeleno-levog fronta Jelena Jerinić izjavila je da u izmenama Krivičnog zakonika i Zakonika o krivičnom postupku postoje tri grupe prolematičnih rešenja, a to su populističko pooštravanje kaz...
--------------------------------------------------------------------------------

2. Vladimir Pajić: Stari savski most je simbol borbe za normalnu i poštenu Srbiju
Score: 0.786

Excerpt:  Predstavnik Pokreta slobodnih građana (PSG) Vladimir Pajić ocenio je da je Stari savski most u Beogradu simbol borbe za normalnu, poštenu, požrtvovanu i slobodnu Srbiju i grad "koji postoji zbog ljud...
--------------------------------------------------------------------------------

3. Država traži kupce za svoj kapital u saobraćajnom preduzeću Lastra, poziva ih da predlože cenu
Score: 0.781

Excerpt: /yy_Apartment Ministarstvo privrede objavilo je javni poziv za prikupljanje pi

### K-Means clustering of the embeddings

In [26]:
!pip install scikit-learn

^C





[notice] A new release of pip is available: 23.2.1 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [27]:
from typing import Any
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

def analyze_article_topics(
    collection,
    min_k: int = 2,
    max_k: int = 10,
    embedding_field: str = "embedding"
) -> Dict[str, Any]:
    """
    Cluster the stored articles by their embeddings and summarise the clusters.

    Args:
        collection: MongoDB collection object
        min_k: Smallest number of clusters to evaluate
        max_k: Largest number of clusters to evaluate (inclusive)
        embedding_field: Name of the document field that holds the embedding

    Returns:
        Dictionary with the chosen k, the silhouette score per k, the cluster
        label of every document, per-cluster statistics, representative
        documents per cluster and a mapping from document id to its cluster

    Raises:
        ValueError: If no document with an embedding is found
    """
    # Only documents that actually carry an embedding are fetched
    query_filter = {embedding_field: {"$exists": True}}
    projection = {"title": 1, "content": 1, "date": 1, embedding_field: 1}
    documents = list(collection.find(query_filter, projection))

    if len(documents) == 0:
        raise ValueError("No documents found with embeddings")

    # One row per document, in the same order as `documents`
    embeddings = np.array([doc[embedding_field] for doc in documents])

    # Light-weight view of each document for reporting
    doc_summaries = []
    for doc in documents:
        doc_summaries.append({
            "id": str(doc["_id"]),
            "title": doc["title"],
            "date": doc["date"],
            "preview": doc["content"][:200] + "..."  # First 200 chars
        })

    # Choose k by silhouette score, then cluster once more with that k
    candidate_ks = range(min_k, max_k + 1)
    optimal_k, scores = get_optimal_k(embeddings, k_range=candidate_ks)
    labels, _ = cluster_documents(embeddings, k=optimal_k)

    central_docs = find_central_documents(embeddings, labels, doc_summaries)
    cluster_stats = calculate_cluster_stats(doc_summaries, labels)

    # Dates that are datetime objects are rendered as ISO strings; anything
    # else is passed through unchanged
    document_mapping = {}
    for doc, label in zip(documents, labels):
        date_value = doc["date"]
        if isinstance(date_value, datetime):
            date_value = date_value.isoformat()
        document_mapping[str(doc["_id"])] = {
            "cluster": int(label),
            "title": doc["title"],
            "date": date_value
        }

    return {
        "optimal_k": optimal_k,
        "silhouette_scores": scores,
        "cluster_assignments": [int(label) for label in labels],
        "cluster_stats": cluster_stats,
        "representative_documents": central_docs,
        "document_mapping": document_mapping
    }

# Keep the helper functions from before
def calculate_cluster_stats(doc_summaries: List[Dict], labels: np.ndarray) -> List[Dict]:
    """
    Compute size and date statistics for every cluster.

    Args:
        doc_summaries: One dict per document with at least 'title' and 'date';
            'date' is either an ISO-format string or a datetime
        labels: Cluster label of each document, aligned with ``doc_summaries``.
            Any sequence of integers is accepted, not only a NumPy array.

    Returns:
        One dict per cluster, ordered by cluster id, with the keys 'cluster_id',
        'size', 'earliest_date', 'latest_date' (ISO strings), 'date_range_days'
        and 'sample_titles' (up to five titles)
    """
    # Element-wise comparison below needs an array; a plain list would compare
    # as a whole and produce a single False.
    label_array = np.asarray(labels)

    cluster_stats = []
    unique_labels = sorted(set(label_array.tolist()))

    for cluster_id in unique_labels:
        cluster_mask = label_array == cluster_id
        cluster_docs = [doc for doc, is_in_cluster in zip(doc_summaries, cluster_mask) if is_in_cluster]

        # Normalise string dates to datetime so that min/max/subtraction work
        dates = [
            datetime.fromisoformat(doc["date"]) if isinstance(doc["date"], str)
            else doc["date"]
            for doc in cluster_docs
        ]
        earliest, latest = min(dates), max(dates)

        stats = {
            "cluster_id": int(cluster_id),
            "size": int(cluster_mask.sum()),
            "earliest_date": earliest.isoformat(),
            "latest_date": latest.isoformat(),
            "date_range_days": (latest - earliest).days,
            "sample_titles": [doc["title"] for doc in cluster_docs[:5]]
        }
        cluster_stats.append(stats)

    return cluster_stats

def cluster_documents(embeddings, k, random_state=42):
    """
    Fit K-Means with ``k`` clusters on the embeddings.

    Returns:
        Tuple of (cluster label per embedding, fitted KMeans estimator)
    """
    estimator = KMeans(n_clusters=k, random_state=random_state)
    estimator.fit(embeddings)
    return estimator.labels_, estimator

def find_central_documents(embeddings, labels, doc_summaries, n_per_cluster=3):
    """
    Pick, for every cluster, the documents closest to the cluster centroid.

    The centroid is the mean of the cluster's embeddings and closeness is the
    Euclidean distance to it.

    Returns:
        Dict mapping cluster id (int) to a list of at most ``n_per_cluster``
        document summaries, nearest first
    """
    summaries = np.array(doc_summaries)
    representatives = {}

    for label in np.unique(labels):
        members = labels == label
        points = embeddings[members]

        # Distance of every member to the mean of its cluster
        offsets = points - points.mean(axis=0)
        ranking = np.argsort(np.linalg.norm(offsets, axis=1))

        nearest = ranking[:n_per_cluster]
        representatives[int(label)] = summaries[members][nearest].tolist()

    return representatives

def get_optimal_k(embeddings, k_range=range(2, 11)):
    """
    Choose the number of clusters with the best silhouette score.

    K-Means (random_state=42) is fitted for every candidate k and scored with
    the silhouette coefficient. Candidates outside 2..n_samples-1 are skipped,
    because the silhouette score is undefined there and K-Means cannot form
    more clusters than there are samples.

    Args:
        embeddings: Array-like with one embedding per row
        k_range: Iterable of candidate cluster counts

    Returns:
        Tuple of (best k, dict mapping each evaluated k to its silhouette score)

    Raises:
        ValueError: If no candidate in ``k_range`` can be evaluated for the
            given number of samples
    """
    n_samples = len(embeddings)
    scores = {}

    for k in k_range:
        # Skip cluster counts that cannot be scored for this many samples
        if k < 2 or k > n_samples - 1:
            continue
        kmeans = KMeans(n_clusters=k, random_state=42)
        labels = kmeans.fit_predict(embeddings)
        scores[k] = silhouette_score(embeddings, labels)

    if not scores:
        raise ValueError(
            f"No valid cluster count in k_range for {n_samples} samples; "
            "k must satisfy 2 <= k <= n_samples - 1"
        )

    # On ties the first (smallest-position) candidate wins, as before
    optimal_k = max(scores, key=scores.get)
    return optimal_k, {int(k): float(score) for k, score in scores.items()}

### Getting information about the discovered clusters

In [28]:
# Get MongoDB connection using your existing function.
# Note: this re-binds the global `client`, `db` and `collection`; the client
# created by the earlier call is not closed here.
client, db, collection = get_mongodb_connection()

# Run the analysis, trying 2 to 10 clusters (inclusive)
results = analyze_article_topics(
    collection=collection,
    min_k=2,
    max_k=10
)

# Print results
print(f"Optimal number of clusters: {results['optimal_k']}")

# Print cluster statistics: size, date span and sample titles per cluster
for stats in results['cluster_stats']:
    print(f"\nCluster {stats['cluster_id']}:")
    print(f"Size: {stats['size']} articles")
    print(f"Date range: {stats['earliest_date']} to {stats['latest_date']}")
    print("Sample titles:")
    for title in stats['sample_titles']:
        print(f"- {title}")

# Print representative documents (those nearest to each cluster centroid)
for cluster_id, docs in results['representative_documents'].items():
    print(f"\nRepresentative documents for Cluster {cluster_id}:")
    for doc in docs:
        print(f"- {doc['title']}")

INFO:__main__:Successfully connected to MongoDB


Optimal number of clusters: 8

Cluster 0:
Size: 6 articles
Date range: 2024-11-01T00:00:00 to 2024-11-02T00:00:00
Sample titles:
- UKCV: Nema novih žrtava, troje povređenih i dalje u teškom stanju
- Arhitekta: Urušavanje dela Železničke stanice znak nebrige i nemara prilikom gradnje
- Zvaničnici EU i više država izrazili saučešće građanima Srbije povodom nesreće u Novom Sadu
- Pašalić: U najkraćem roku utvrditi odgovornost za tragediju u Novom Sadu
- Direktorka KC Vojvodine: Tri osobe u veoma teškom stanju, sve su životno ugrožene

Cluster 1:
Size: 3 articles
Date range: 2024-10-31T00:00:00 to 2024-11-02T00:00:00
Sample titles:
- Uhapšen švajcarski državljanin zbog pokazivanja simbola Velike Albanije u Beogradu
- U Sarajevu naređena evakuacija zbog mogućeg klizišta, meštani odbili
- Međunarodna poternica za okorelim ubicom u Crnoj Gori: Uključene i policije Srbije i Crne Gore (VIDEO)

Cluster 2:
Size: 2 articles
Date range: 2024-10-31T00:00:00 to 2024-10-31T00:00:00
Sample titles:
- Mi

### Naming the clusters with Claude API

In [29]:
# Initialize Claude client; the API key is read from the ANTHROPIC_API_KEY
# environment variable (os.getenv returns None when it is not set).
anthropic = Anthropic(api_key=os.getenv('ANTHROPIC_API_KEY'))

In [30]:
def generate_cluster_names(
    results: Dict,
    anthropic_client: Anthropic,
    use_excerpts: bool = False,
    max_retries: int = 3,
    retry_delay: int = 2
) -> Dict[int, str]:
    """
    Ask Claude for a short English name for every cluster.

    For each cluster the representative documents are listed in a prompt
    (titles only, or titles with excerpts). An answer that contains one of the
    letters č, ć, š, ž, đ is rejected and the request is retried. When all
    attempts fail the cluster is named "Cluster <id>".

    Args:
        results: Clustering results with a 'representative_documents' mapping
        anthropic_client: Initialized Anthropic client
        use_excerpts: If True, include each document's 'preview' next to its title
        max_retries: Maximum number of attempts per cluster
        retry_delay: Seconds to wait between attempts

    Returns:
        Dictionary mapping cluster IDs to generated names
    """
    serbian_letters = ['č', 'ć', 'š', 'ž', 'đ']
    names_by_cluster = {}

    for cluster_id, docs in results['representative_documents'].items():
        # Describe the cluster's documents for the prompt
        if use_excerpts:
            entries = []
            for i, doc in enumerate(docs):
                entries.append(
                    f"Document {i+1}:\nTitle: {doc['title']}\nExcerpt: {doc['preview']}"
                )
        else:
            entries = [f"- {doc['title']}" for doc in docs]
        content = "\n".join(entries)

        # Prompt with strong emphasis on an English-only answer
        prompt = f"""You are an international news categorization expert. The documents below are Serbian news articles.
Your task is to provide a short (2-4 words) ENGLISH LANGUAGE descriptive name for this thematic cluster.

For example:
- If articles are about "Finansijski zakoni", name it "Financial Legislation"
- If articles are about "Ekološki protesti", name it "Environmental Protests"
- If articles are about "Politička kriza", name it "Political Crisis"

Documents from cluster:
{content}

IMPORTANT: Respond ONLY with the English language cluster name, no Serbian words allowed.
Example good responses: "Economic Reform", "Infrastructure Development", "Criminal Investigation"
Example bad responses: "Finansijski zakoni", "Ekološki protesti", "Politička kriza" """

        # Keep asking until an acceptable name arrives or the attempts run out
        attempt = 0
        while attempt < max_retries:
            try:
                response = anthropic_client.messages.create(
                    model="claude-3-opus-20240229",
                    max_tokens=30,
                    temperature=0.2,
                    messages=[{
                        "role": "user",
                        "content": prompt
                    }]
                )

                cluster_name = response.content[0].text.strip()
                # Reject answers that still contain Serbian diacritics
                lowered = cluster_name.lower()
                if any(letter in lowered for letter in serbian_letters):
                    raise ValueError("Response contains Serbian characters")
                names_by_cluster[cluster_id] = cluster_name
                break

            except Exception as e:
                if attempt == max_retries - 1:
                    # Out of attempts: fall back to a generic name
                    print(f"Failed to get name for cluster {cluster_id}: {str(e)}")
                    names_by_cluster[cluster_id] = f"Cluster {cluster_id}"
                else:
                    time.sleep(retry_delay)
            attempt += 1

    return names_by_cluster

# Use the function: name every cluster found by the analysis above.
# Any exception raised inside the block is printed instead of propagated.
try:
    cluster_names = generate_cluster_names(results, anthropic)
    
    # Print results
    print("\nGenerated Cluster Names:")
    print("=" * 50)
    for cluster_id, name in cluster_names.items():
        print(f"\nCluster {cluster_id}: {name}")
        print("-" * 30)
        # Print the first two representative titles for reference
        print("Sample titles:")
        for doc in results['representative_documents'][cluster_id][:2]:
            print(f"- {doc['title']}")

except Exception as e:
    print(f"Error generating cluster names: {str(e)}")

INFO:httpx:HTTP Request: POST https://api.anthropic.com/v1/messages "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.anthropic.com/v1/messages "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.anthropic.com/v1/messages "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.anthropic.com/v1/messages "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.anthropic.com/v1/messages "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.anthropic.com/v1/messages "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.anthropic.com/v1/messages "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.anthropic.com/v1/messages "HTTP/1.1 200 OK"



Generated Cluster Names:

Cluster 0: Hospital Tragedy
------------------------------
Sample titles:
- Pašalić: U najkraćem roku utvrditi odgovornost za tragediju u Novom Sadu
- UKCV: Nema novih žrtava, troje povređenih i dalje u teškom stanju

Cluster 1: Crime and Security
------------------------------
Sample titles:
- Međunarodna poternica za okorelim ubicom u Crnoj Gori: Uključene i policije Srbije i Crne Gore (VIDEO)
- U Sarajevu naređena evakuacija zbog mogućeg klizišta, meštani odbili

Cluster 2: Criminal Code Amendments
------------------------------
Sample titles:
- Ministarstvo pravde će precizirati članove Krivičnog zakonika na koje su stigle primedbe
- Jerinić: U izmenama Krivičnog zakonika tri grupe problematičnih rešenja

Cluster 3: Savski Bridge Protests
------------------------------
Sample titles:
- Aktivisti kod Starog savskog mosta: Spremni smo na sve, ovo je borba za Beograd, odbranićemo most
- Vladimir Pajić: Stari savski most je simbol borbe za normalnu i poštenu 

In [31]:
# Display the mapping from cluster id to generated name as the cell output.
cluster_names

{0: 'Hospital Tragedy',
 1: 'Crime and Security',
 2: 'Criminal Code Amendments',
 3: 'Savski Bridge Protests',
 4: 'Workplace Accident',
 5: 'Yugoslav Nostalgia',
 6: 'Public Enterprises',
 7: 'Police Recruitment'}

### Create summaries in English with Claude

In [37]:
def generate_news_report(
    query: str,
    collection,
    model: SentenceTransformer,
    anthropic_client: Anthropic,
    language: str = "English",
    top_k: int = 10,
    max_retries: int = 3,
    retry_delay: int = 2
) -> str:
    """
    Perform semantic search and generate a news report based on top results.
    Translates the query to Serbian before searching and generates report in specified language.

    Steps: (1) Claude translates the query to Serbian (Latin script) and the
    translation is printed, (2) the translation is embedded with ``model`` and
    used for a ``$vectorSearch`` on the collection, (3) Claude writes a report
    from the first 500 characters of every article found.

    Args:
        query: Search query in English
        collection: MongoDB collection
        model: Sentence transformer model for embeddings
        anthropic_client: Anthropic client
        language: Output language for the report
        top_k: Number of top articles to consider
        max_retries: Maximum number of attempts for the report request (>= 1)
        retry_delay: Delay between retries in seconds

    Returns:
        Generated news report

    Raises:
        ValueError: If ``max_retries`` is smaller than 1, or the search returns
            no articles.
        Exception: If the translation fails, or the report request fails on
            every attempt. The underlying error is attached as ``__cause__``.
    """
    if max_retries < 1:
        # Without this guard the retry loop would not run and None would be returned
        raise ValueError(f"max_retries must be >= 1, got {max_retries}")

    # Translate query to Serbian using Claude
    # NOTE(review): `language` is documented as the output language but is also
    # used here as the language of the query — confirm this is intended.
    translation_prompt = f"Translate the following {language} text to Serbian latin. Provide only the translation, nothing else: '{query}'"

    try:
        translation_response = anthropic_client.messages.create(
            model="claude-3-opus-20240229",
            max_tokens=100,
            temperature=0,
            messages=[{
                "role": "user",
                "content": translation_prompt
            }]
        )

        serbian_query = translation_response.content[0].text.strip()
        print(serbian_query)
    except Exception as e:
        raise Exception(f"Translation failed: {str(e)}") from e

    # Generate embedding for Serbian query
    query_embedding = model.encode(serbian_query).tolist()

    # Perform vector search
    pipeline = [
        {
            "$vectorSearch": {
                "index": "vector_index",
                "queryVector": query_embedding,
                "path": "embedding",
                # `limit` may not exceed `numCandidates`
                "numCandidates": max(100, top_k),
                "limit": top_k
            }
        },
        {
            "$project": {
                "title": 1,
                "content": 1,
                "date": 1,
                "score": { "$meta": "vectorSearchScore" }
            }
        }
    ]

    results = list(collection.aggregate(pipeline))

    # A report written from zero articles would have no source material
    if not results:
        raise ValueError(f"No articles found for query '{query}'")

    # Prepare content for Claude
    articles_text = "\n\n".join([
        f"Article {i+1}:\nDate: {doc['date']}\nTitle: {doc['title']}\nContent: {doc['content'][:500]}..."
        for i, doc in enumerate(results)
    ])

    # Create prompt for report generation; the article count is the number
    # actually returned, which can be lower than top_k
    prompt = f"""You are an expert journalist and news analyst. Based on the following {len(results)} most relevant Serbian news articles about "{query}" (translated to Serbian as "{serbian_query}"), 
create a concise, well-structured news report in {language}. The report should:
- Be around 250-300 words
- Start with a clear headline
- Include key facts, dates, and relevant context
- Maintain journalistic neutrality
- Focus on the most newsworthy aspects
- Include a brief conclusion or outlook
Here are the articles:
{articles_text}
Please write the report in a professional journalistic style."""

    # Get response from Claude with retries
    for attempt in range(max_retries):
        try:
            response = anthropic_client.messages.create(
                model="claude-3-opus-20240229",
                max_tokens=1000,
                temperature=0.3,
                messages=[{
                    "role": "user",
                    "content": prompt
                }]
            )

            report = response.content[0].text.strip()
            return report

        except Exception as e:
            # Give up after the last attempt, otherwise wait and try again
            if attempt == max_retries - 1:
                raise Exception(f"Failed to generate report: {str(e)}") from e
            time.sleep(retry_delay)
            continue

### Example usage

In [38]:
# Example usage: search for an English query and print the generated report.
# Any exception raised inside the block is printed instead of propagated.
try:
    # Example query
    query = "Investments in mining"

    # Uses the global collection, embedding model and Anthropic client from earlier cells
    report = generate_news_report(
        query=query,
        collection=collection,
        model=model,
        anthropic_client=anthropic,
        top_k=10,
        language="English"

    )

    print("\nGenerated News Report:")
    print("=" * 80)
    print(report)

except Exception as e:
    print(f"Error generating report: {str(e)}")

INFO:httpx:HTTP Request: POST https://api.anthropic.com/v1/messages "HTTP/1.1 200 OK"


Investicije u rudarstvo


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:httpx:HTTP Request: POST https://api.anthropic.com/v1/messages "HTTP/1.1 200 OK"



Generated News Report:
Headline: Serbia Seeks Investors for Privatization of Transport Company Lastra

The Serbian Ministry of Economy has announced a public call for letters of interest in the privatization process of the transport company Lastra from Lazarevac. The majority shareholder of Lastra is the transport company Strela from Obrenovac, owning just over 77% of the capital, while the Serbian government, together with the Republic Pension and Disability Insurance Fund and the National Employment Service, holds the remaining 23% of shares.

In other news, the Ministry of Justice has stated that it will clarify certain provisions of the Criminal Code that have received objections during the public debate. These provisions particularly relate to the introduction of a new criminal offense regarding the publication of materials that advise the commission of a criminal offense, the deletion of extortion of statements, and the article prescribing the criminal offense of abuse related t