# Imports

In [None]:
import re
import nltk
import json
import faiss
import requests
import numpy as np
from nltk.corpus import stopwords
from transformers import pipeline
from dataclasses import dataclass
from typing import List, Dict, Any
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from gensim.utils import simple_preprocess
from sklearn.preprocessing import LabelEncoder
from sklearn.datasets import fetch_20newsgroups
from gensim.corpora.dictionary import Dictionary
from sentence_transformers import SentenceTransformer
from gensim.models.coherencemodel import CoherenceModel
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.cluster import normalized_mutual_info_score

In [None]:
# Download required NLTK data
# 'stopwords' backs the stopwords.words('english') calls in the preprocessing
# functions further down.
nltk.download('stopwords')
# 'punkt' / 'punkt_tab' are presumably the tokenizer resources that
# word_tokenize needs (which one is used depends on the NLTK version) — confirm.
nltk.download('punkt')
# NOTE(review): WordNetLemmatizer is imported but not used in the visible code,
# so this download may be unnecessary — confirm before removing.
nltk.download('wordnet')
nltk.download('punkt_tab')

In [None]:
# Address of the LLM server as "host:port". The value below is a literal
# placeholder (a plain string, not an f-string) and nothing in the visible
# code fills it in, so it has to be replaced by hand before the requests
# below can succeed.
ip_port = "{ip}:{port}"

In [None]:
# Define the URL for the model-pull request sent in the next cell
# (presumably the Ollama-style /api/pull endpoint — confirm against the server).
url = f"http://{ip_port}/api/pull"

In [None]:
# Ask the server to make the model available before any generation request.
payload = {
    "model": "deepseek-r1:32b"
}

# Make a POST request
response = requests.post(url, json=payload)

# Check the response. A failed pull is reported as well, so a wrong address
# or a missing model is visible here rather than in a later cell.
if response.status_code == 200:
    print("Response:", response)
else:
    print(f"Model pull failed with HTTP {response.status_code}: {response.text}")

In [None]:
# Re-point the module-level `url` at the text-generation endpoint.
# NOTE: EnhancedLLMProcessor.__init__ reads this global for its api_url, so it
# must hold the /api/generate address before the processor is constructed.
url = f"http://{ip_port}/api/generate"

# Test the connection to the llm

In [None]:
# Smoke-test prompt. Sampling parameters are nested under "options": the
# Ollama generate endpoint ignores a top-level "temperature" key.
# "stream" is left at its default so the next cell can parse the
# line-delimited JSON chunks.
payload = {
    "model": "deepseek-r1:32b",
    "prompt": "what is systematic review?",
    "options": {"temperature": 0.7}
}

# Make a POST request
response = requests.post(url, json=payload)

# Print the response, reporting failures too instead of staying silent
if response.status_code == 200:
    print("Response:", response)
else:
    print(f"Generation request failed with HTTP {response.status_code}: {response.text}")

In [None]:
# Reassemble the streamed answer: the body is one JSON object per line and
# each object may carry a fragment of the text under "response".
chunks = []

for line in response.text.splitlines():
    try:
        data = json.loads(line)  # One JSON document per line
    except json.JSONDecodeError:
        print("Unable to parse line:", line)
        continue
    if "response" in data:  # Lines without a text fragment are skipped
        chunks.append(data["response"])

full_text = "".join(chunks)
full_text

# Class for llm response

In [None]:
@dataclass
class LLMResponse:
    """Outcome of a single LLM call.

    Only ``content`` is required. The remaining fields default to ``None``,
    except ``metadata``, which is replaced by a fresh empty dict when it is
    left as ``None`` so that instances never share one dict.
    """
    content: str                    # raw text returned by the model
    score: float = None             # numeric grade, when one was extracted
    topic: str = None               # primary topic, when one was extracted
    confidence: float = None        # confidence attached to the topic
    error: str = None               # error message when the call failed
    metadata: Dict[str, Any] = None  # any extra parsed information

    def __post_init__(self):
        # Normalise a missing metadata mapping to a per-instance empty dict.
        self.metadata = {} if self.metadata is None else self.metadata

# Class for document

In [None]:
@dataclass
class Document:
    """A stored text together with its embedding vector.

    Instances are created by RAGSystem.add_documents and returned by
    RAGSystem.retrieve_relevant_docs.
    """
    # The document text.
    content: str
    # Embedding vector for `content`; RAGSystem stacks these to build its
    # FAISS index.
    embedding: np.ndarray = None
    # Optional extra information. Unlike LLMResponse, a missing value stays
    # None here (there is no __post_init__), and the visible code never sets it.
    metadata: Dict[str, Any] = None

# Retrieval-augmented generation class

In [None]:
class RAGSystem:
    """Minimal retrieval store: embeds documents and searches them with FAISS."""

    def __init__(self, embedding_model="all-MiniLM-L6-v2"):
        """
        Initializes the RAG system:
        Loads the specified embedding model.
        Creates an empty document store and sets the FAISS index to None.
        """
        self.embedding_model = SentenceTransformer(embedding_model)
        self.document_store = []
        self.index = None

    def add_documents(self, documents: List[str]):
        """
        Adds documents to the system and prepares them for retrieval.

        Each text is embedded with the SentenceTransformer model and wrapped in
        a Document (content + float32 NumPy embedding) that is appended to
        document_store; the FAISS index is then rebuilt.

        An empty input is a no-op: there is nothing to embed and the index is
        left as it was.
        """
        documents = list(documents)
        if not documents:
            return

        # Request NumPy output directly. Converting a tensor with .numpy()
        # fails when the model runs on a GPU device.
        embeddings = np.asarray(
            self.embedding_model.encode(documents, convert_to_numpy=True),
            dtype="float32",
        )
        for doc, emb in zip(documents, embeddings):
            self.document_store.append(Document(content=doc, embedding=emb))
        self._update_index()

    def _update_index(self):
        """
        Rebuilds the FAISS index from the embeddings of all stored documents.

        With an empty store the index is reset to None instead of calling
        np.vstack on an empty list (which raises ValueError).
        """
        if not self.document_store:
            self.index = None
            return

        embeddings = np.vstack(
            [doc.embedding for doc in self.document_store]
        ).astype("float32")
        self.index = faiss.IndexFlatL2(embeddings.shape[1])
        self.index.add(embeddings)

    def retrieve_relevant_docs(self, query: str, k: int = 3):
        """
        Retrieves up to k stored documents closest to the query (L2 distance).

        Returns an empty list when nothing has been indexed yet or k <= 0.
        k is capped at the number of stored documents: FAISS pads missing
        neighbours with index -1, which would otherwise silently select the
        last stored document.
        """
        if self.index is None or not self.document_store or k <= 0:
            return []

        k = min(k, len(self.document_store))
        query_embedding = np.asarray(
            self.embedding_model.encode([query])[0], dtype="float32"
        )
        _, indices = self.index.search(query_embedding.reshape(1, -1), k)
        # Defensive filter in case the index still reports padding entries.
        return [self.document_store[i] for i in indices[0] if i >= 0]

# Function to find certain pattern in text

In [None]:
def find_pattern_safely(pattern, text, default=None):
    """Return the stripped first capture group of `pattern` in `text`.

    The search is case-insensitive and multi-line. `default` is returned when
    `text` is empty, when nothing matches, or when anything goes wrong while
    matching (invalid pattern, pattern without a capture group, group that
    did not participate in the match, ...).
    """
    if text:
        try:
            found = re.search(pattern, text, flags=re.IGNORECASE | re.MULTILINE)
            return found.group(1).strip() if found else default
        except Exception:
            # Best-effort helper: any failure falls back to the default.
            return default
    return default

# Function to clean topic from text

In [None]:
def clean_topic(topic):
    """Normalise a topic label extracted from LLM output.

    Removes a leading list marker ("1. " or "- "), drops tokens of the form
    "<digits>millisecond", rewrites " and " as " & " and collapses runs of
    whitespace.

    Returns:
        The cleaned label, or None when the input is empty or fewer than
        three characters remain.
    """
    if not topic:
        return None

    # Strip first: the list-marker patterns are anchored at ^, so a leading
    # space (typical after splitting "a, 2. b" on commas) would otherwise
    # stop them from matching.
    topic = topic.strip()
    topic = re.sub(r'^\d+\.\s*', '', topic)
    topic = re.sub(r'^-\s*', '', topic)
    topic = re.sub(r'\b\d+millisecond\b', '', topic)
    topic = re.sub(r'\s+and\s+', ' & ', topic)
    # split/join also trims, so no separate strip is needed afterwards.
    topic = ' '.join(topic.split())

    return topic if len(topic) >= 3 else None

# Function to clean and preprocess text

In [None]:
def preprocess_text(text):
    """Lower-case, strip punctuation, tokenize and filter a single text.

    Tokens are dropped when they are English stop words (NLTK list plus a few
    extra words), have fewer than three characters, or are purely numeric.

    Returns:
        The remaining tokens joined by single spaces, or the literal string
        "placeholder" when nothing is left.
    """
    text = text.lower()
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\w\s]', '', text)

    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    stop_words.update({'would', 'could', 'should', 'said', 'like', 'also'})

    # The text is lower-cased above, so the former "keep short upper-case
    # tokens" branch and the per-token .lower() had no effect and are removed.
    filtered_tokens = [
        token
        for token in tokens
        if token not in stop_words
        and len(token) > 2
        and not token.isnumeric()
        and not all(c in '0123456789.-' for c in token)
    ]

    cleaned_text = " ".join(filtered_tokens)
    return cleaned_text if cleaned_text.strip() else "placeholder"

# Function to clean and preprocess group of documents

In [None]:
def preprocess_documents(documents):
    """Tokenize and filter every document in `documents`.

    Each string is lower-cased, stripped of punctuation (hyphens are kept),
    tokenized, and filtered: English stop words (NLTK list plus a few extra
    words), tokens shorter than three characters and tokens made only of
    digits, dots and hyphens are dropped.

    Returns:
        One token list per input document, in order. A document that is not a
        string, yields no tokens, or fails to process is represented by
        ['placeholder'] so the output stays aligned with the input.
    """
    processed_docs = []
    # Built once, on the first string document, instead of once per document.
    # It is created inside the try block so a missing NLTK corpus is still
    # reported per document, as before.
    stop_words = None

    for doc in documents:
        try:
            if not isinstance(doc, str):
                processed_docs.append(['placeholder'])
                continue

            # Basic cleaning
            doc = doc.lower()
            doc = re.sub(r'\s+', ' ', doc)
            doc = re.sub(r'[^\w\s-]', '', doc)

            # Tokenize
            tokens = word_tokenize(doc)

            if stop_words is None:
                stop_words = set(stopwords.words('english'))
                stop_words.update({'would', 'could', 'should', 'said', 'like', 'also'})

            # Filter tokens (already lower-case because doc was lower-cased)
            filtered_tokens = [
                token
                for token in tokens
                if token not in stop_words
                and len(token) > 2
                and not token.isnumeric()
                and not all(c in '0123456789.-' for c in token)
            ]

            # Keep one entry per document even when nothing survives filtering
            processed_docs.append(filtered_tokens or ['placeholder'])

        except Exception as e:
            print(f"Error preprocessing document: {e}")
            processed_docs.append(['placeholder'])  # Add placeholder on error

    return processed_docs

# Function that performs LDA analysis on documents and returns the topics

In [None]:
def perform_lda_analysis(documents, n_topics=5):
    """Fit an LDA topic model on `documents` and label each document.

    Args:
        documents: non-empty list of texts; anything else gives the empty result.
        n_topics: number of LDA components.

    Returns:
        A dict with
          "assigned_topics": index of the highest-weight topic per document,
          "topics_keywords": topic index -> its ten highest-weight terms.
        Both are empty when the input is unusable or when any step fails
        (the error is printed, not raised).
    """
    if not (documents and isinstance(documents, list)):
        return {"assigned_topics": [], "topics_keywords": {}}

    try:
        bow_vectorizer = CountVectorizer(
            stop_words='english',
            max_df=0.95,
            min_df=2,
            token_pattern=r'(?u)\b\w+\b'
        )
        doc_term_matrix = bow_vectorizer.fit_transform(documents)

        lda = LatentDirichletAllocation(
            n_components=n_topics,
            random_state=42,
            max_iter=20,
            learning_method='batch'
        )
        lda.fit(doc_term_matrix)

        vocabulary = bow_vectorizer.get_feature_names_out()
        # argsort is ascending: take the last ten and reverse for best-first.
        keywords_by_topic = {
            topic_idx: [vocabulary[i] for i in weights.argsort()[-10:][::-1]]
            for topic_idx, weights in enumerate(lda.components_)
        }

        doc_topic_weights = lda.transform(doc_term_matrix)
        dominant_topics = np.argmax(doc_topic_weights, axis=1)

        return {
            "assigned_topics": dominant_topics.tolist(),
            "topics_keywords": keywords_by_topic
        }

    except Exception as e:
        print(f"Error in LDA analysis: {str(e)}")
        return {"assigned_topics": [], "topics_keywords": {}}

# Function that calculates a coherence score for each group of documents

In [None]:
def calculate_coherence_scores(groups, dictionary, measure="c_v"):
    """Compute one coherence score per group of tokenized documents.

    Args:
        groups: iterable of groups, each a list of token lists.
        dictionary: gensim Dictionary used to map tokens to ids.
        measure: coherence measure passed to CoherenceModel.

    Returns:
        A list of scores aligned with `groups`; a group whose computation
        fails contributes 0.0 (the error is printed).
    """
    scores = []
    for group in groups:
        try:
            # One "topic" per document: the ids of every term of that document
            # that the dictionary knows (doc2bow yields (id, count) pairs).
            topics = [
                [term_id for term_id, _count in dictionary.doc2bow(doc)]
                for doc in group
            ]
            score = CoherenceModel(
                topics=topics,
                texts=group,
                dictionary=dictionary,
                coherence=measure
            ).get_coherence()
        except Exception as e:
            print(f"Error calculating coherence for group: {e}")
            score = 0.0
        scores.append(score)

    return scores

In [None]:
class EnhancedLLMProcessor:
    """Sends summarize / topic / grade prompts to the LLM and parses the replies."""

    def __init__(self, api_key: str, model: str = "deepseek-r1:32b",
                 api_url: str = None, timeout: float = None):
        """
        Args:
            api_key: accepted for backward compatibility; it is not used.
            model: model name sent with every request.
            api_url: generation endpoint. When omitted, the module-level
                `url` is used, which is what this class did unconditionally
                before.
            timeout: seconds passed to requests.post; None (default) waits
                indefinitely, matching the previous behaviour.
        """
        self.model = model
        self.api_url = api_url if api_url is not None else url
        self.timeout = timeout
        self.prompts = {
            "summarize": """You are a technical document analyzer specializing in extracting key information from texts.

TEXT TO ANALYZE:
{text}

RELEVANT CONTEXT:
{context}

Provide your analysis in this EXACT format:
MAIN_TOPIC: [primary technical/scientific field]
KEY_TERMS: [list only the most relevant technical terms, comma-separated]
SUMMARY: [2-3 concise, technical sentences capturing the essence]""",

            "topic": """You are an expert topic classifier focusing on technical and academic content.

Document for classification:
{text}

Analyze this document following these steps:
1. Identify primary technical domain
2. Extract key technical terminology
3. Recognize methodological approaches
4. Note any cross-domain elements

Provide classification in this EXACT format:
PRIMARY_TOPIC: [single specific technical field]
SUBTOPICS: [3-4 related technical areas]
TECHNICAL_INDICATORS: [key technical terms that influenced classification]
CROSS_DOMAIN_ELEMENTS: [any interdisciplinary aspects]
CONFIDENCE: [0-1 score with brief justification]""",

            # The output-format section at the end is required by
            # _extract_score_from_response, which looks for these labels.
            "grade": """You are a specialized content coherence evaluator.

TARGET GROUP:
{documents}

COMPARISON GROUPS:
{other_groups}

Evaluation Criteria:
1. INTERNAL COHERENCE (50%)
- How consistently do the documents align in topic and terminology?
- Do they share a common technical vocabulary?
- Is there thematic continuity?

2. EXTERNAL DISTINCTIVENESS (50%)
- How clearly separated is this group from others?
- Are there unique technical markers?
- Is there minimal topic overlap with other groups?

IMPORTANT SCORING GUIDELINES:
- Groups that are inconsistent or contain off-topic entries should receive a COHERENCE_SCORE below 5.
- If group topics overlap significantly with other groups, DISTINCTIVENESS_SCORE should be below 5.
- Use the full range of 1–10.
- Be strict in low-quality cases.

Examples:
  - Mixed documents with weak cohesion → COHERENCE_SCORE: 3
  - Unclear group separation → DISTINCTIVENESS_SCORE: 4
  - Strong thematic continuity → COHERENCE_SCORE: 9

Provide your evaluation in this EXACT format:
COHERENCE_SCORE: [integer 1-10]
DISTINCTIVENESS_SCORE: [integer 1-10]
FINAL_SCORE: [integer 1-10, equal-weighted combination of the two scores above]
JUSTIFICATION: [1-2 sentences]"""
        }

    def process_text(self, text: str, task: str, additional_context: Dict = None) -> "LLMResponse":
        """Run one task ("summarize", "topic" or "grade") against the LLM.

        Args:
            text: value substituted for {text} in the prompt.
            task: key into self.prompts.
            additional_context: extra/override values for the prompt
                placeholders (context, documents, other_groups, ...).

        Returns:
            An LLMResponse. For "grade" its `score` is set; for "topic" its
            `topic`, `confidence` and `metadata` are set. Any failure (unknown
            task, HTTP error, malformed reply) is printed and returned as an
            LLMResponse with empty content and `error` set; nothing is raised.
        """
        try:
            prompt_template = self.prompts.get(task)
            if not prompt_template:
                raise ValueError(f"Unknown task: {task}")

            context = {
                "text": text,
                "context": "",
                "documents": "",
                "other_groups": ""
            }
            if additional_context:
                context.update(additional_context)

            prompt = prompt_template.format(**context)

            payload = {
                "model": self.model,
                "prompt": prompt,
                "stream": False,
                # Sampling parameters must be nested under "options"; the
                # generate endpoint ignores them at the top level.
                "options": {
                    "temperature": 0.2,
                    "top_p": 0.2
                }
            }

            response = requests.post(self.api_url, json=payload, timeout=self.timeout)
            response.raise_for_status()
            content = response.json()["response"].strip()

            if task == "grade":
                score = self._extract_score_from_response(content)
                return LLMResponse(content=content, score=score, metadata={})
            elif task == "topic":
                topic_info = self._extract_topic_info(content)
                return LLMResponse(
                    content=content,
                    topic=topic_info["primary_topic"],
                    confidence=topic_info["confidence"],
                    metadata=topic_info
                )

            return LLMResponse(content=content)

        except Exception as e:
            # Boundary handler: callers rely on always getting an LLMResponse.
            print(f"Error in processing {task}: {e}")
            return LLMResponse(content="", error=str(e))

    @staticmethod
    def _labelled_number(label, text):
        """Return the number following 'LABEL:' in text, or None if absent.

        Tolerates markdown emphasis around the label or value
        (e.g. '**FINAL_SCORE:** 7'). Whole numbers come back as int.
        """
        match = re.search(
            re.escape(label) + r'\s*\**\s*:\s*\**\s*(\d+(?:\.\d+)?)', text
        )
        if not match:
            return None
        value = float(match.group(1))
        return int(value) if value.is_integer() else value

    def _extract_score_from_response(self, response_text):
        """Extract the grade from a "grade" reply.

        Uses FINAL_SCORE when present. Otherwise, if both COHERENCE_SCORE and
        DISTINCTIVENESS_SCORE are present, returns their mean (the prompt
        weights them 50/50). Falls back to the neutral value 5.
        """
        try:
            final = self._labelled_number('FINAL_SCORE', response_text)
            if final is not None:
                return final

            coherence = self._labelled_number('COHERENCE_SCORE', response_text)
            distinct = self._labelled_number('DISTINCTIVENESS_SCORE', response_text)
            if coherence is not None and distinct is not None:
                mean = (coherence + distinct) / 2
                return int(mean) if float(mean).is_integer() else mean
            return 5
        except (TypeError, ValueError):
            return 5

    @staticmethod
    def _split_list(text):
        """Split a comma-separated field into stripped, non-empty items."""
        return [item.strip() for item in text.split(',') if item.strip()]

    def _extract_topic_info(self, response_text):
        """Parse the labelled fields of a "topic" reply into a dict.

        Missing fields yield "unknown" (primary topic), an empty list (list
        fields) or 0.5 (confidence). Confidence accepts 0, 1 and decimals in
        between; anything else counts as missing.
        """
        try:
            return {
                "primary_topic": self._find(r'PRIMARY_TOPIC:\s*([^\n]+)', response_text, "unknown"),
                "subtopics": self._split_list(
                    self._find(r'SUBTOPICS:\s*([^\n]+)', response_text, "")),
                "technical_indicators": self._split_list(
                    self._find(r'TECHNICAL_INDICATORS:\s*([^\n]+)', response_text, "")),
                "cross_domain": self._split_list(
                    self._find(r'CROSS_DOMAIN_ELEMENTS:\s*([^\n]+)', response_text, "")),
                # The lookahead rejects out-of-range values such as "1.7" or
                # "10" but still allows a sentence-ending dot ("0.9.").
                "confidence": float(self._find(
                    r'CONFIDENCE:\s*(0(?:\.\d+)?|1(?:\.0+)?)(?!\.?\d)',
                    response_text, "0.5"))
            }
        except (TypeError, ValueError):
            return {
                "primary_topic": "unknown",
                "subtopics": [],
                "technical_indicators": [],
                "cross_domain": [],
                "confidence": 0.5
            }

    def _find(self, pattern, text, default):
        """Return the stripped first capture group of pattern in text, else default."""
        match = re.search(pattern, text)
        return match.group(1).strip() if match else default

    def _calculate_group_coherence(self, documents):
        """Calculate internal coherence of a group of documents.

        Returns 0.0 (after printing the error) when the computation fails.
        """
        try:
            processed_docs = preprocess_documents(documents)
            dictionary = Dictionary(processed_docs)
            coherence_scores = calculate_coherence_scores([processed_docs], dictionary)
            return coherence_scores[0] if coherence_scores else 0.0
        except Exception as e:
            print(f"Error calculating group coherence: {e}")
            return 0.0

In [None]:
# Shared processor instance used by the functions below.
# NOTE(review): `url` is passed in the constructor's first positional slot,
# which is named `api_key`; the constructor takes its endpoint from the
# module-level `url` rather than from this argument.
llm_processor = EnhancedLLMProcessor(url)

# Function that assigns topics to a group of documents when no topic is given

In [None]:
def assign_topic_to_group(documents, n_topics=5):
    """Suggest topics for a group of documents using LDA and the LLM.

    LDA is run over the whole group. Each document is additionally sent to the
    LLM "topic" task, and the labels found in its reply (primary topic,
    subtopics, multi-word technical indicators, cross-domain elements) are
    cleaned, de-duplicated and merged when they look alike.

    Returns:
        A dict with "lda_results", "llm_topics" and "combined_analysis"
        (the LDA assignments/keywords plus the LLM-suggested topics).
    """
    lda_results = perform_lda_analysis(documents, n_topics)

    # Comma-separated reply fields: (pattern, keep only multi-word entries?)
    list_fields = (
        (r'SUBTOPICS:\s*([^\n]+)', False),
        (r'TECHNICAL_INDICATORS:\s*([^\n]+)', True),
        (r'CROSS_DOMAIN_ELEMENTS:\s*([^\n]+)', False),
    )

    llm_topics = set()  # a set de-duplicates labels across documents
    try:
        for doc in documents:
            response = llm_processor.process_text(doc, "topic")
            if not (response and not response.error and response.content):
                continue
            reply = response.content

            # The primary topic is a single label
            primary = find_pattern_safely(r'PRIMARY_TOPIC:\s*([^\n]+)', reply)
            if primary:
                cleaned = clean_topic(primary)
                if cleaned:
                    llm_topics.add(cleaned)

            # The remaining fields each hold several labels
            for pattern, multiword_only in list_fields:
                listed = find_pattern_safely(pattern, reply)
                if not listed:
                    continue
                for piece in listed.split(','):
                    cleaned = clean_topic(piece)
                    if not cleaned:
                        continue
                    if multiword_only and len(cleaned.split()) <= 1:
                        continue
                    llm_topics.add(cleaned)

    except Exception as e:
        print(f"Error in LLM topic analysis: {str(e)}")

    # Sorted so that the grouping below is deterministic
    ordered_topics = sorted(llm_topics)

    # Greedy grouping: each not-yet-used topic absorbs every later unused
    # topic that contains it, is contained in it, or shares >= 2 words with it.
    grouped_topics = []
    consumed = set()
    for topic in ordered_topics:
        if topic in consumed:
            continue
        consumed.add(topic)

        cluster = [topic]
        topic_lc = topic.lower()
        topic_words = set(topic_lc.split())

        for other in ordered_topics:
            if other in consumed:
                continue
            other_lc = other.lower()
            shared_words = topic_words & set(other_lc.split())
            if topic_lc in other_lc or other_lc in topic_lc or len(shared_words) >= 2:
                cluster.append(other)
                consumed.add(other)

        # The shortest label represents the cluster (the topic itself when
        # the cluster has a single member).
        grouped_topics.append(min(cluster, key=len))

    return {
        "lda_results": lda_results,
        "llm_topics": grouped_topics,
        "combined_analysis": {
            "assigned_topics": lda_results["assigned_topics"],
            "topics_keywords": lda_results["topics_keywords"],
            "llm_suggested_topics": grouped_topics
        }
    }

# Function that takes a dataset, category groups, and a documents-per-category limit, and returns a balanced dataset for the experiment

In [None]:
def get_balanced_dataset(dataset, category_groups, docs_per_category=3):
    """Select and preprocess the same number of documents for each category.

    Args:
        dataset: object exposing `data`, `target` and `target_names`
            (indexed as dataset.target_names[dataset.target[i]]).
        category_groups: iterable of category-name lists, one per group.
        docs_per_category: maximum number of documents taken per category
            (the first ones in dataset order).

    Returns:
        (group_docs, category_counts): one list of preprocessed texts per
        group, and a dict mapping category name to the number of documents
        taken. A per-group summary is printed along the way.
    """
    group_docs = []
    category_counts = {}

    for group_categories in category_groups:
        docs_in_group = []

        for category in group_categories:
            matching = [
                idx for idx, label in enumerate(dataset.target)
                if dataset.target_names[label] == category
            ]
            selected = [
                preprocess_text(dataset.data[idx])
                for idx in matching[:docs_per_category]
            ]
            docs_in_group += selected
            category_counts[category] = len(selected)

        group_docs.append(docs_in_group)
        group_total = len(docs_in_group)

        print(f"\nGroup with categories {group_categories}:")
        print(f"Total documents: {group_total}")
        for category in group_categories:
            print(f"  - {category}: {category_counts[category]} documents")

    return group_docs, category_counts

# Function that evaluates the topics and document groups several times for more consistent and reliable results

In [None]:
def evaluate_multiple_times(group1, group2, group3, topics=None, num_iterations=3):
    """Grade three document groups repeatedly with the LLM and once by coherence.

    Each document is first summarized by the LLM. For every iteration and every
    group, the "grade" task compares the group's summaries with those of the
    other two groups. A coherence score per group is computed once from the
    original (not summarized) documents.

    Args:
        group1, group2, group3: lists of document texts.
        topics: one label per group, in order; defaults to
            ['Technology', 'Scientific', 'Social/Political'].
        num_iterations: number of LLM grading rounds.

    Returns:
        {'llm_scores': {...}, 'coherence_scores': {...}}, each holding
        'scores' (raw values per topic), 'avg' and 'std' (population standard
        deviation); avg/std are 0.0 for a topic without scores.
    """
    if topics is None:
        topics = ['Technology', 'Scientific', 'Social/Political']

    scores = {
        'llm_scores': {topic: [] for topic in topics},
        'coherence_scores': {topic: [] for topic in topics}
    }

    all_groups = [group1, group2, group3]

    print("Summarizing documents...")
    summarized_groups = [
        [llm_processor.process_text(doc, "summarize").content for doc in group]
        for group in all_groups
    ]

    # Preprocess all groups once
    tokenized_groups = [preprocess_documents(group) for group in all_groups]

    # Create dictionary from all documents
    all_docs_tokenized = [tokens for group in tokenized_groups for tokens in group]
    dictionary = Dictionary(all_docs_tokenized)
    dictionary.filter_extremes(no_below=2, no_above=0.95)  # Filter extreme terms

    for i in range(num_iterations):
        print(f"\nIteration {i + 1}/{num_iterations}")
        print("Performing LLM evaluation...")

        for j, (group, topic) in enumerate(zip(summarized_groups, topics)):
            other_groups = [g for k, g in enumerate(summarized_groups) if k != j]

            # Call LLM and store response
            response = llm_processor.process_text("", "grade", {
                "documents": "\n".join(group),
                "other_groups": "\n".join(["\n".join(g) for g in other_groups])
            })

            # 🔍 Print raw LLM output for inspection
            print(f"\n🔍 Raw LLM Grade Response for topic '{topic}':\n{response.content}\n")

            # A failed call comes back with score=None; recording it would
            # make the averaging below raise TypeError, so it is skipped.
            if response.score is None:
                print(f"No score for topic '{topic}' in this iteration (error: {response.error}); skipping.")
                continue

            # Store score
            scores['llm_scores'][topic].append(response.score)

    print("Calculating coherence scores...")
    coherence_scores = calculate_coherence_scores(tokenized_groups, dictionary)
    for topic, score in zip(topics, coherence_scores):
        scores['coherence_scores'][topic] = [score]  # Single coherence score per group

    # Calculate results
    results = {}
    for score_type in ['llm_scores', 'coherence_scores']:
        averages = {}
        deviations = {}
        for topic, topic_scores in scores[score_type].items():
            if topic_scores:
                # Mean computed once per topic rather than once per element.
                mean = sum(topic_scores) / len(topic_scores)
                variance = sum((x - mean) ** 2 for x in topic_scores) / len(topic_scores)
                averages[topic] = mean
                deviations[topic] = variance ** 0.5
            else:
                averages[topic] = 0.0
                deviations[topic] = 0.0

        results[score_type] = {
            'scores': scores[score_type],
            'avg': averages,
            'std': deviations
        }
    return results