In [1]:
from embeddings import EmbeddingsResponder
from entity_extraction import Extractor
from factual import FactualResponder
from data_repository import DataRepository
from intent_classifier import IntentClassifier

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
data_repository = DataRepository()



In [3]:
intent_classifier = IntentClassifier(data_repository)
extractor = Extractor(data_repository)
embeddings = EmbeddingsResponder(data_repository, extractor, intent_classifier)
factual = FactualResponder(data_repository, extractor, intent_classifier)

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [4]:
label_list = list(data_repository.get_rel2lbl().values())
print(label_list)

['cast member', 'MPAA film rating', 'FSK film rating', 'BBFC rating', 'RARS rating', 'Kijkwijzer rating', 'ClassInd rating', 'INCAA film rating', 'IMDb ID', 'KMRB film rating', 'FPB rating', 'KAVI rating', 'Medierådet rating', 'RCQ classification', 'OFLC classification', 'IFCO rating', 'applies to jurisdiction', 'field of work', 'conferred by', 'crew member(s)', 'native language', 'director / manager', 'relative', 'film editor', 'medical condition', 'product or material produced', 'occupation', 'student of', 'employer', 'from narrative universe', 'founded by', 'political ideology', 'home world', 'place of burial', 'manner of death', 'publisher', 'facet of', 'owned by', 'depicted by', 'located in the administrative territorial entity', 'partner in business or sport', 'described by source', 'participant in', 'winner', 'movement', 'genre', 'replaces', 'replaced by', 'operator', 'capital of', 'named after', 'partially coincident with', 'convicted of', 'religion', 'nominated for', 'language

In [5]:
# Create an embeddings model over this list such that we can ientify user intent based on the embeddings of a given sentence.

from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')

In [6]:
embeddings = model.encode(label_list)

In [7]:
# use nltk to remove stop words and lemmatize the labels
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [10]:
import nltk 

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

query="Who is the director of the movie?"
query_tokens = word_tokenize(query)
query_tokens = [lemmatizer.lemmatize(token) for token in query_tokens if token not in stop_words]
query_tokens = [token for token in query_tokens if token.isalnum()]
query = ' '.join(query_tokens)
print(query)
query_embedding = model.encode([query])
print(query_embedding)


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/omkaringale/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/omkaringale/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/omkaringale/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/Users/omkaringale/nltk_data'
    - '/opt/miniconda3/envs/atai/nltk_data'
    - '/opt/miniconda3/envs/atai/share/nltk_data'
    - '/opt/miniconda3/envs/atai/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


In [14]:
from sentence_transformers import SentenceTransformer
import numpy as np
from typing import List, Tuple

class TextClassifier:
    def __init__(self, categories: List[str], model_name: str = 'all-MiniLM-L6-v2'):
        """
        Initialize the classifier with categories and load the embedding model.
        
        Args:
            categories: List of category labels
            model_name: Name of the sentence-transformers model to use
        """
        self.categories = categories
        self.model = SentenceTransformer(model_name)
        # Pre-compute embeddings for all categories
        self.category_embeddings = self.model.encode(categories)
        
    def classify(self, query: str, top_k: int = 1) -> List[Tuple[str, float]]:
        """
        Classify the input query and return top_k most similar categories with scores.
        
        Args:
            query: Input text to classify
            top_k: Number of top matches to return
            
        Returns:
            List of tuples containing (category, similarity_score)
        """
        # Get embedding for the query
        query_embedding = self.model.encode([query])[0]
        
        # Calculate cosine similarity with all category embeddings
        similarities = np.dot(self.category_embeddings, query_embedding) / (
            np.linalg.norm(self.category_embeddings, axis=1) * np.linalg.norm(query_embedding)
        )
        
        # Get top_k matches
        top_indices = np.argsort(similarities)[::-1][:top_k]
        
        return [(self.categories[idx], similarities[idx]) for idx in top_indices]

# Example usage
if __name__ == "__main__":
    # Define categories
    categories = label_list
    
    # Initialize classifier
    classifier = TextClassifier(categories)
    
    # Example queries
    test_queries = [
        "who is the director of",
        "when was it released",
        "how much did it make",
        "who stars in",
        "what type of movie is",
        "who is the author of this book",
        "did this movie win any nominations"
    ]
    
    # Test classification
    for query in test_queries:
        results = classifier.classify(query, top_k=2)
        print(f"\nQuery: {query}")
        for category, score in results:
            print(f"Category: {category}, Score: {score:.4f}")


Query: who is the director of
Category: director, Score: 0.7266
Category: art director, Score: 0.6899

Query: when was it released
Category: publication date, Score: 0.3693
Category: original film format, Score: 0.3171

Query: how much did it make
Category: production company, Score: 0.3031
Category: production designer, Score: 0.2906

Query: who stars in
Category: cast member, Score: 0.5475
Category: IMDb ID, Score: 0.5046

Query: what type of movie is
Category: original film format, Score: 0.5142
Category: film editor, Score: 0.4918

Query: who is the author of this book
Category: author, Score: 0.6468
Category: notable work, Score: 0.4575

Query: did this movie win any nominations
Category: nominated for, Score: 0.6339
Category: EIRIN film rating, Score: 0.4111
