In [1]:
import os
from groq import Groq
import re
import yaml
import string
import unicodedata
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import defaultdict

from scipy.linalg import eigh
from sklearn.metrics import roc_auc_score

import torch
import nltk
import spacy
from textblob import TextBlob
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize, sent_tokenize
from transformers import AutoTokenizer, AutoModelForSequenceClassification

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Download required NLTK data
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

try:
    nltk.data.find('corpora/wordnet')
except LookupError:
    nltk.download('wordnet')

try:
    nltk.data.find('taggers/averaged_perceptron_tagger')
except LookupError:
    nltk.download('averaged_perceptron_tagger')

In [3]:
params_path = os.path.join(os.getcwd(), 'config.yaml')
config = None
with open(params_path) as f:
    config = yaml.safe_load(f)

In [1]:
client = Groq(
    api_key = config['api']['groq']
)

chat_completion = client.chat.completions.create(
    messages=[
        {
            'role': 'user',
            'content': 'Who is the greatest scientist of all time? Only give the name.',
        }
    ],
    model = 'llama-3.1-8b-instant',
    max_completion_tokens=1024,
)

print(chat_completion.choices[0].message.content)

NameError: name 'Groq' is not defined

In [5]:
print(chat_completion.choices[0].message.content)

Isaac Newton


In [6]:
responces = []

# Create new groq client
client = Groq(
        api_key = config['api']['groq']
    )

for _ in range(30):
    # Generate chat
    chat_completion = client.chat.completions.create(
        messages=[
            {
                'role': 'user',
                'content': 'Who is the greatest scientist of all time?',
            }
        ],
        model = 'llama-3.1-8b-instant',
        max_completion_tokens=1024,
    )

    # Add response to list 
    responces.append(chat_completion.choices[0].message.content)

print(responces)

['It\'s difficult to identify a single greatest scientist of all time, as this is often a matter of personal opinion and can be influenced by various factors such as the field of science, historical context, and personal biases. However, I can provide you with some of the most influential scientists in various fields, who are often considered among the greatest:\n\n1. **Albert Einstein (Physics)**: Revolutionized our understanding of space, time, and gravity. His theories of special relativity (1905) and general relativity (1915) are still widely accepted and used today.\n2. **Isaac Newton (Physics)**: Laid the foundation for classical mechanics with his laws of motion (1687) and universal gravitation. His work had a profound impact on the scientific revolution of the 17th century.\n3. **Galileo Galilei (Physics)**: Pioneered the scientific method and made significant contributions to the study of motion, astronomy, and physics. His observations of the heavens challenged the geocentric

In [7]:
with open("my_sentences.txt", "a") as file:
    for sentence in responces:
        file.write('\n'+'--'*50)
        file.write(sentence)

In [8]:
len(responces)

30

In [9]:
class UncertaintyQuantifier:
    def __init__(self):
        self.word_sets = []
        self.eigenvalues = None
    
    def get_similarity_matrix(self, similarity_matrix):
        return similarity_matrix
    
    def get_eigenvalues(self):
        return self.eigenvalues

    def preprocess_sentence(self, sentence):
        words = re.findall(r'\b\w+\b', sentence.lower())
        return set(words)
    
    def jaccard_similarity(self, response1, response2):
        intersection = len(response1.intersection(response2))
        union = len(response1.union(response2))
        
        # Handle empty sets
        if union == 0:
            return 0.0
        
        return intersection / union
    
    def compute_similarity_matrix(self, responses):
        n = len(responses)
        similarity_matrix = np.zeros((n, n))
        
        # Preprocess all responses
        self.word_sets = [self.preprocess_sentence(sentence) for sentence in responses]
        print(self.word_sets)
        
        # Calculate pairwise similarities
        for i in range(n):
            for j in range(i, n):
                if i == j:
                    similarity_matrix[i, j] = 1.0  # Self-similarity is 1
                else:
                    sim = self.jaccard_similarity(self.word_sets[i], self.word_sets[j])
                    similarity_matrix[i, j] = similarity_matrix[j, i] = sim
        
        return similarity_matrix
    
    def num_semantic_sets(self, similarity_matrix, threshold = 0.5):
        n = len(similarity_matrix)
        if n <= 1:
            return n
        
        groups = list(range(n))  # Initially each response is its own group
        
        for i in range(n):
            for j in range(i + 1, n):
                # precomputed similarity from matrix
                jaccard = similarity_matrix[i, j]
                nli_scores = {
                        'entailment': jaccard,
                        'contradiction': 1 - jaccard
                }
                
                # Check bidirectional entailment
                if (nli_scores['entailment'] > threshold and 
                    nli_scores['entailment'] > nli_scores['contradiction']):
                    
                    min_group = min(groups[i], groups[j])
                    max_group = max(groups[i], groups[j])
                    groups = [min_group if g == max_group else g for g in groups]
        
        return len(set(groups))
    
    def eigenvalue_uncertainty(self, similarity_matrix):
        n = similarity_matrix.shape[0]
        if n <= 1:
            return 0.0
        
        # Compute degree matrix
        degree_matrix = np.diag(similarity_matrix.sum(axis=1))
        
        # Avoid division by zero
        degree_sqrt_inv = np.zeros_like(degree_matrix)
        for i in range(n):
            if degree_matrix[i, i] > 1e-10:
                degree_sqrt_inv[i, i] = 1.0 / np.sqrt(degree_matrix[i, i])
        
        # Compute Laplacian
        laplacian = np.eye(n) - degree_sqrt_inv @ similarity_matrix @ degree_sqrt_inv
        
        # Compute eigenvalues
        self.eigenvalues = np.linalg.eigvals(laplacian)
        self.eigenvalues = np.real(self.eigenvalues)
        self.eigenvalues = np.sort(self.eigenvalues)
        
        # Sum of (1 - lambda_k) for lambda_k <= 1
        uncertainty = sum(max(0, 1 - lam) for lam in self.eigenvalues if lam <= 1)
        return uncertainty
    
    def degree_based_measures(self, similarity_matrix):
        n = similarity_matrix.shape[0]
        
        # Compute degree for each node
        degrees = similarity_matrix.sum(axis=1)
        
        # Normalize degrees
        max_degree = n - 1 if n > 1 else 1
        normalized_degrees = degrees / max_degree
        
        # Uncertainty: average distance from maximum connectivity
        uncertainty = np.mean(1 - normalized_degrees)
        
        # Confidence: individual degrees (higher degree = higher confidence)
        confidence_scores = normalized_degrees
        
        return uncertainty, confidence_scores
    
    def eccentricity_measures(self, similarity_matrix, k = 2):
        n = similarity_matrix.shape[0]
        if n <= 1:
            return 0.0, np.array([1.0] * n)
        # Compute degree matrix
        degree_matrix = np.diag(similarity_matrix.sum(axis=1))
        
        # Avoid division by zero
        degree_sqrt_inv = np.zeros_like(degree_matrix)
        for i in range(n):
            if degree_matrix[i, i] > 1e-10:
                degree_sqrt_inv[i, i] = 1.0 / np.sqrt(degree_matrix[i, i])
        
        # Compute normalized Laplacian
        laplacian = np.eye(n) - degree_sqrt_inv @ similarity_matrix @ degree_sqrt_inv
        
        # Get k smallest eigenvectors
        k = min(k, n-1) if n > 1 else 1
        eigenvals, eigenvecs = eigh(laplacian)
        
        # Use first k eigenvectors for embedding
        embedding = eigenvecs[:, :k]
        
        # Center the embeddings
        centroid = np.mean(embedding, axis=0)
        centered_embedding = embedding - centroid
        
        # Calculate distances from center
        distances = np.linalg.norm(centered_embedding, axis=1)
        
        # Uncertainty: average distance from center
        uncertainty = np.mean(distances)
        
        # Confidence: negative distance (closer to center = higher confidence)
        max_dist = np.max(distances) if np.max(distances) > 0 else 1
        confidence_scores = 1 - (distances / max_dist)
        
        return uncertainty, confidence_scores

In [10]:
uq = UncertaintyQuantifier()

similar_matrix = uq.compute_similarity_matrix(responses=responces)


[{'galileo', 'considered', '1861', 'world', 'various', 'his', 'medicine', 'isaac', 'as', 'curie', '17th', 'model', 'relativity', 'prize', 'progress', 'to', 'existence', 'fundamental', 'cryptography', 'black', 'challenged', 'biases', 'geocentric', 'rabies', 'difficult', 'mechanics', 'cosmos', 'marie', 'profound', 'field', 'was', 'work', 'woman', 'into', '1641', 'single', 'changed', 'biological', 'influenced', 'be', 'provide', 'systems', '6', 'motion', 'this', 'are', 'universe', 'century', 'selection', 'nobel', 'subjective', 'albert', 'introduced', 'microbiology', 'particularly', 'influential', 'observations', 'anthrax', 'breakthroughs', 'today', '1865', 'special', 'ultimately', 'a', 'discovered', 'consider', 'used', 'factors', 'natural', 'method', 'there', 'for', 'inheritance', 'more', 'radium', 'with', '1', 'classical', 'exhaustive', 'in', 'her', 'galilei', 'groundbreaking', 'mathematical', 'development', 'among', '5', 'some', 'scientific', 'intelligence', 'biology', 'you', 'understand

In [11]:
group = uq.num_semantic_sets(similar_matrix)
print(group)

27


In [12]:
un_egiv = uq.eigenvalue_uncertainty(similar_matrix)
print(un_egiv)

2.458827377179198


In [13]:
deg_un, deg_conf = uq.degree_based_measures(similar_matrix)
print(deg_un, deg_conf)

0.5775500543780876 [0.36352459 0.36671241 0.43120783 0.43619253 0.41595039 0.41396406
 0.37173243 0.46053468 0.44478769 0.42552225 0.45316291 0.41283776
 0.40771333 0.42228015 0.42213526 0.42416689 0.41586053 0.45369137
 0.44761822 0.40876533 0.43081671 0.45450492 0.43381444 0.45521169
 0.44289069 0.43232759 0.38112097 0.40879619 0.44514237 0.39051217]


In [14]:
ecc_un, ecc_conf = uq.eccentricity_measures(similar_matrix, k=4)
print(ecc_un, ecc_conf)

0.2818005346641185 [0.20081721 0.         0.78838842 0.76138925 0.56437104 0.60472473
 0.34442149 0.73013095 0.84127759 0.76524667 0.76671206 0.74088967
 0.70469618 0.61786903 0.70683701 0.7122308  0.55968161 0.69958615
 0.62701521 0.62072323 0.77620572 0.75725026 0.6870704  0.80817186
 0.75828659 0.78720239 0.63794068 0.57169098 0.73216144 0.53340671]


In [15]:
class TextPreprocessor:
    
    def __init__(self, language = 'english'):
        self.language = language
        self.stop_words = set(stopwords.words(language))
        self.lemmatizer = WordNetLemmatizer()
        self.stemmer = PorterStemmer()
        
        # Try to load spaCy model for advanced processing
        self.nlp = None
        try:
            self.nlp = spacy.load('en_core_web_sm')
        except OSError:
            print("Install with: python -m spacy download en_core_web_sm")
    
    def clean_text(self, text):
        if not text or not isinstance(text, str):
            return ""
        
        # Remove extra whitespace and normalize
        text = re.sub(r'\s+', ' ', text.strip())
        
        # Remove or replace special characters and symbols
        text = re.sub(r'[^\w\s\.\!\?\,\:\;\-\'\"]', ' ', text)
        
        # Fix common encoding issues
        text = text.encode('ascii', 'ignore').decode('ascii')
        
        # Normalize unicode characters
        text = unicodedata.normalize('NFKD', text)
        
        return text
    
    def normalize_text(self, text, 
                      lowercase = True,
                      remove_punctuation = False,
                      remove_numbers = False,
                      expand_contractions = True):
        if not text:
            return ""
        
        # Expand contractions
        if expand_contractions:
            text = self._expand_contractions(text)
        
        # Convert to lowercase
        if lowercase:
            text = text.lower()
        
        # Remove numbers
        if remove_numbers:
            text = re.sub(r'\d+', '', text)
        
        # Remove punctuation
        if remove_punctuation:
            text = text.translate(str.maketrans('', '', string.punctuation))
        
        # Clean up extra spaces
        text = re.sub(r'\s+', ' ', text.strip())
        
        return text
    
    def _expand_contractions(self, text):
        contractions = {
            "ain't": "are not", "aren't": "are not", "can't": "cannot",
            "couldn't": "could not", "didn't": "did not", "doesn't": "does not",
            "don't": "do not", "hadn't": "had not", "hasn't": "has not",
            "haven't": "have not", "he'd": "he would", "he'll": "he will",
            "he's": "he is", "i'd": "i would", "i'll": "i will", "i'm": "i am",
            "i've": "i have", "isn't": "is not", "it'd": "it would",
            "it'll": "it will", "it's": "it is", "let's": "let us",
            "shouldn't": "should not", "that's": "that is", "there's": "there is",
            "they'd": "they would", "they'll": "they will", "they're": "they are",
            "they've": "they have", "we'd": "we would", "we're": "we are",
            "we've": "we have", "weren't": "were not", "what's": "what is",
            "where's": "where is", "who's": "who is", "won't": "will not",
            "wouldn't": "would not", "you'd": "you would", "you'll": "you will",
            "you're": "you are", "you've": "you have"
        }
        
        for contraction, expansion in contractions.items():
            text = re.sub(r'\b' + contraction + r'\b', expansion, text, flags=re.IGNORECASE)
        
        return text
    
    def remove_stopwords(self, text, custom_stopwords = None):
        if not text:
            return ""
        
        tokens = word_tokenize(text)
        stop_words = self.stop_words.copy()
        
        if custom_stopwords:
            stop_words.update(custom_stopwords)
        
        filtered_tokens = [token for token in tokens if token.lower() not in stop_words]
        
        return ' '.join(filtered_tokens)
    
    def lemmatize_text(self, text):
        if not text:
            return ""
        
        tokens = word_tokenize(text)
        lemmatized = [self.lemmatizer.lemmatize(token.lower()) for token in tokens]
        
        return ' '.join(lemmatized)
    
    def stem_text(self, text):
        if not text:
            return ""
        
        tokens = word_tokenize(text)
        stemmed = [self.stemmer.stem(token.lower()) for token in tokens]
        
        return ' '.join(stemmed)
    
    def extract_entities(self, text):
        entities = defaultdict(list)
        
        if self.nlp is None:
            return dict(entities)
        
        doc = self.nlp(text)
        for ent in doc.ents:
            entities[ent.label_].append(ent.text)
        
        return dict(entities)
    
    def correct_spelling(self, text):
        if not text:
            return ""
        
        try:
            blob = TextBlob(text)
            return str(blob.correct())
        except:
            return text
    
    def segment_sentences(self, text):
        if not text:
            return []
        
        return sent_tokenize(text)
    
    def preprocess(self, text, 
                   clean = True,
                   normalize = True,
                   remove_stopwords = False,
                   lemmatize = False,
                   stem = False,
                   correct_spelling = False,
                   custom_stopwords = None,
                   preserve_entities = False):
        if not text or not isinstance(text, str):
            return ""
        
        original_entities = {}
        if preserve_entities:
            original_entities = self.extract_entities(text)
        
        # Step 1: Clean text
        if clean:
            text = self.clean_text(text)
        
        # Step 2: Spelling correction (before other processing)
        if correct_spelling:
            text = self.correct_spelling(text)
        
        # Step 3: Normalize text
        if normalize:
            text = self.normalize_text(text, 
                                     lowercase=True,
                                     remove_punctuation=False,
                                     remove_numbers=False,
                                     expand_contractions=True)
        
        # Step 4: Remove stopwords
        if remove_stopwords:
            text = self.remove_stopwords(text, custom_stopwords)
        
        # Step 5: Lemmatize or stem (mutually exclusive)
        if lemmatize and not stem:
            text = self.lemmatize_text(text)
        elif stem and not lemmatize:
            text = self.stem_text(text)
        elif lemmatize and stem:
            print("Warning: Both lemmatize and stem are True. Using lemmatization only.")
            text = self.lemmatize_text(text)
        
        # Step 6: Final cleanup
        text = re.sub(r'\s+', ' ', text.strip())
        
        return text

In [16]:
class NLITextSimilarity:
    
    def __init__(self, model_name = "microsoft/deberta-v2-xlarge-mnli", 
                 enable_preprocessing = True, 
                 preprocessing_config = None):
        self.model_name = model_name
        self.enable_preprocessing = enable_preprocessing
        
        # Default preprocessing configuration
        self.preprocessing_config = {
            'clean': True,
            'normalize': True,
            'remove_stopwords': False,
            'lemmatize': False,
            'stem': False,
            'correct_spelling': False,
            'preserve_entities': False
        }
        
        # Update with user config
        if preprocessing_config:
            self.preprocessing_config.update(preprocessing_config)
        
        # Initialize preprocessor
        if self.enable_preprocessing:
            self.preprocessor = TextPreprocessor()
        
        print(f"Loading model: {model_name}")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name)
        self.model.eval()
        
        # Label mapping for MNLI models
        self.label_mapping = {
            'CONTRADICTION': 0,
            'NEUTRAL': 1, 
            'ENTAILMENT': 2
        }
    
    def preprocess_text(self, text, custom_config = None):
        if not self.enable_preprocessing:
            return text
        
        config = self.preprocessing_config.copy()
        if custom_config:
            config.update(custom_config)
        
        return self.preprocessor.preprocess(text, **config)
        
    def get_nli_score(self, premise, hypothesis, 
                     preprocess = None):
        # Apply preprocessing if enabled
        if preprocess is None:
            preprocess = self.enable_preprocessing
        
        if preprocess and self.enable_preprocessing:
            premise = self.preprocess_text(premise)
            hypothesis = self.preprocess_text(hypothesis)
        
        # Tokenize the input
        inputs = self.tokenizer(premise, hypothesis, 
                              return_tensors="pt", 
                              truncation=True, 
                              max_length=512,
                              padding=True)
        
        # Get model predictions
        with torch.no_grad():
            outputs = self.model(**inputs)
            probabilities = torch.softmax(outputs.logits, dim=-1)
            
        # Convert to numpy and extract probabilities
        probs = probabilities.squeeze().numpy()
        
        return {
            'contradiction': float(probs[0]),
            'neutral': float(probs[1]),
            'entailment': float(probs[2])
        }
    
    def bidirectional_similarity(self, text1, text2, 
                                preprocess = None):
        # Get NLI scores in both directions
        scores_1_to_2 = self.get_nli_score(text1, text2, preprocess)
        scores_2_to_1 = self.get_nli_score(text2, text1, preprocess)
        
        # Average the entailment probabilities
        similarity = (scores_1_to_2['entailment'] + scores_2_to_1['entailment']) / 2
        
        return similarity
    
    def compute_similarity_matrix(self, responses, preprocess = None):
        n = len(responses)
        similarity_matrix = np.zeros((n, n))
        
        # Preprocess all responses
        self.word_sets = [self.preprocess_sentence(sentence) for sentence in responses]
        print(self.word_sets)
        
        # Calculate pairwise similarities
        for i in range(n):
            for j in range(i, n):
                if i == j:
                    similarity_matrix[i, j] = 1.0  # Self-similarity is 1
                else:
                    scores_1_to_2 = self.get_nli_score(responces[i], responces[j], preprocess)
                    scores_2_to_1 = self.get_nli_score(responces[j], responces[i], preprocess)
                    similarity_matrix[i, j] = similarity_matrix[j, i] = (scores_1_to_2['entailment'] + scores_2_to_1['entailment']) / 2
        
        return similarity_matrix

In [17]:
nli_sim_model_1 = NLITextSimilarity(model_name='cross-encoder/nli-roberta-base')
sim_matrix = nli_sim_model_1.compute_similarity_matrix(responses=responces)

Loading model: cross-encoder/nli-roberta-base


RuntimeError: Failed to import transformers.models.roberta.modeling_roberta because of the following error (look up to see its traceback):
Traceback (most recent call last):
  File "c:\Users\Hobhav Fulzele\myenv\lib\site-packages\tensorflow\python\pywrap_tensorflow.py", line 73, in <module>
    from tensorflow.python._pywrap_tensorflow_internal import *
ImportError: DLL load failed while importing _pywrap_tensorflow_internal: A dynamic link library (DLL) initialization routine failed.


Failed to load the native TensorFlow runtime.
See https://www.tensorflow.org/install/errors for some common causes and solutions.
If you need help, create an issue at https://github.com/tensorflow/tensorflow/issues and include the entire stack trace above this error message.