In [None]:
from embeddings import EmbeddingsResponder
from entity_extraction import Extractor
from factual import FactualResponder
from data_repository import DataRepository
from intent_classifier import IntentClassifier, EmbeddingBasedIntentClassifier, MLPBasedIntentClassifier

In [None]:
data_repository = DataRepository() 
intent_classifier_emb = EmbeddingBasedIntentClassifier(data_repository)
intent_classifier_mlp = MLPBasedIntentClassifier(data_repository)
extractor = Extractor(data_repository)
embeddings = EmbeddingsResponder(data_repository, extractor, intent_classifier=intent_classifier_mlp, emb_intent_classifier=intent_classifier_emb)
factual = FactualResponder(data_repository, extractor, emb_intent_classifier=intent_classifier_emb, mlp_intent_classifier=intent_classifier_mlp)

In [None]:
intent_classifier_emb.classify_query("How much did E.T. the Extraterrestrial make at the box office?")

In [None]:
extractor.get_guaranteed_entities("How much did E.T. the Extra-Terrestrial make?")

In [None]:
l = data_repository.get_ner_entities_list()

In [None]:
# fuzzy match a query on the list of entities
from fuzzywuzzy import process
temp_l = extractor.get_guaranteed_entities("How much did E.T. the Extra-Terrestrial make?")
query = "How much did E.T. the Extraterrestrial make at the box office?"
matches = process.extract(query, temp_l, limit=5)

In [None]:
temp_l = extractor.get_guaranteed_entities("How much did E.T. the Extra-Terrestrial which stars Rober Downey Jr and earned a lot make?")

In [None]:
temp_l

In [None]:
matches = process.extract(query, temp_l, limit=5)

In [None]:
matches

In [None]:
query = "How much did 'E.T. the Extraterrestrial' which stars Robert Downey Jr and earned a lot make?"
temp = extractor.get_guaranteed_entities(query)

In [None]:
temp = extractor.extract_ner(query)

In [None]:
temp

In [None]:
# use fuzzy matching to check if identified entities in temp were present in the query 

from fuzzywuzzy import fuzz

for entity in temp:
    print(entity, fuzz.partial_ratio(entity, query))

# check if the entities are present in the query
for entity in temp:
    print(entity, entity in query)


In [1]:
import transformers

model = transformers.pipeline('ner', model='piadelapaz/bert-finetuned-ner-movies', tokenizer='piadelapaz/bert-finetuned-ner-movies')

  from .autonotebook import tqdm as notebook_tqdm
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [2]:
# get entities from the query
query = "How much did E.T. the Extraterrestrial make at the box office?"
entities = model(query)

In [3]:
# print all identified entities
for entity in entities:
    print(entity['entity'], entity['word'])

B-MOVIE e
I-MOVIE .
I-MOVIE t
I-MOVIE .
I-MOVIE the
I-MOVIE extra
I-MOVIE ##ter
I-MOVIE ##rest
I-MOVIE ##rial
I-MOVIE at
I-MOVIE the
I-MOVIE box
I-MOVIE office


In [4]:
from transformers import cached_path, TRANSFORMERS_CACHE
import os

# Print cache directory
print(TRANSFORMERS_CACHE)

# List cached models
for model in os.listdir(TRANSFORMERS_CACHE):
    print(model)

ImportError: cannot import name 'cached_path' from 'transformers' (/opt/miniconda3/envs/atai/lib/python3.10/site-packages/transformers/__init__.py)

In [5]:
from huggingface_hub import scan_cache_dir

# Get cache info
cache_info = scan_cache_dir()

# List models
for repo in cache_info.repos:
    print(f"Repo: {repo.repo_id}")
    print(f"Size: {repo.size_on_disk / 1024**2:.2f}MB")

Repo: openai-community/gpt2
Size: 0.00MB
Repo: sshleifer/distilbart-cnn-12-6
Size: 0.00MB
Repo: roberta-base
Size: 478.30MB
Repo: polyglot_ner
Size: 0.03MB
Repo: bert-base-uncased
Size: 1352.01MB
Repo: bert-base-german-cased
Size: 0.71MB
Repo: sentence-transformers/all-MiniLM-L6-v2
Size: 87.34MB
Repo: distilbert-base-uncased
Size: 256.22MB
Repo: dslim/bert-base-NER
Size: 413.42MB
Repo: piadelapaz/bert-finetuned-ner-movies
Size: 415.64MB
Repo: gpt2
Size: 525.44MB
Repo: facebook/bart-large-mnli
Size: 1556.54MB
Repo: distilbert-base-uncased-distilled-squad
Size: 253.84MB
Repo: t5-small
Size: 232.91MB
Repo: sentence-transformers/multi-qa-mpnet-base-dot-v1
Size: 418.36MB
Repo: dbmdz/bert-large-cased-finetuned-conll03-english
Size: 1272.79MB
Repo: joeddav/xlm-roberta-large-xnli
Size: 0.00MB
Repo: distilbert-base-uncased-finetuned-sst-2-english
Size: 255.66MB
Repo: sentence-transformers/all-mpnet-base-v2
Size: 0.01MB
