In [34]:
!pip install sacremoses # for Herbert

Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sacremoses
Successfully installed sacremoses-0.1.1


In [36]:
from transformers import BertTokenizer, BertModel, DistilBertTokenizer, DistilBertModel, HerbertTokenizer, RobertaModel
import torch
class bert_similarity:
  def __init__(self, pretrained_name, tokenizer_class, model_class, tokenizer_name=None):
    self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
    self.pretrained_name = pretrained_name
    self.tokenizer_name = tokenizer_name
    if self.tokenizer_name is None:
      self.tokenizer_name = self.pretrained_name
    self.tokenizer = tokenizer_class.from_pretrained(self.tokenizer_name)
    self.model = model_class.from_pretrained(self.pretrained_name).to(self.device)
    self.__similarity = torch.nn.CosineSimilarity(dim=0, eps=1e-6)

  def embedding(self, text):
    encoded_input = self.tokenizer(text, return_tensors='pt').to(self.device)
    output = self.model(**encoded_input)
    return output['last_hidden_state'][0,-1,:]

  def similarity(self, text1, text2):
    embed1 = self.embedding(text1)
    embed2 = self.embedding(text2)
    sim = self.__similarity(embed1, embed2)
    return sim

In [35]:
sim = bert_similarity("bert-base-uncased", BertTokenizer, BertModel)
sim.similarity("Replace me by any text you'd like.", "Replace me by any text you'd hate.")

tensor(0.9866, device='cuda:0', grad_fn=<SumBackward1>)

In [6]:
%%timeit -n 1000
sim.similarity("Replace me by any text you'd like.", "Replace me by any text you'd hate.")

35.1 ms ± 4.33 ms per loop (mean ± std. dev. of 7 runs, 1000 loops each)


0.01s = 10ms na pare, bert ~35.1ms, 35.1 ms ± 4.33 ms per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [36]:
sim2 = bert_similarity("distilbert-base-uncased", DistilBertTokenizer, DistilBertModel)
sim2.similarity("Replace me by any text you'd like.", "Replace me by any text you'd hate.")

tensor(0.9849, device='cuda:0', grad_fn=<SumBackward1>)

In [None]:
%%timeit -n 1000
sim2.similarity("Replace me by any text you'd like.", "Replace me by any text you'd hate.")

17.9 ms ± 1.19 ms per loop (mean ± std. dev. of 7 runs, 1000 loops each)

In [37]:
sim3 = bert_similarity("allegro/herbert-klej-cased-v1", HerbertTokenizer, RobertaModel, tokenizer_name="allegro/herbert-klej-cased-tokenizer-v1")

tokenizer_config.json:   0%|          | 0.00/341 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/591k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/300 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'XLMTokenizer'. 
The class this function is called from is 'HerbertTokenizer'.


config.json:   0%|          | 0.00/547 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/500M [00:00<?, ?B/s]

In [41]:
%%timeit -n 100
sim3.similarity("Kto ma lepszą sztukę, ma lepszy rząd – to jasne.", "Nie musisz, więc nie idź, do tej głupiej szkoły.")

257 ms ± 12.8 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [11]:
!pip install hugchat

Collecting hugchat
  Downloading hugchat-0.3.8-py3-none-any.whl (30 kB)
Collecting requests-toolbelt (from hugchat)
  Downloading requests_toolbelt-1.0.0-py2.py3-none-any.whl (54 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.5/54.5 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: requests-toolbelt, hugchat
Successfully installed hugchat-0.3.8 requests-toolbelt-1.0.0


In [15]:
from hugchat import hugchat
from hugchat.login import Login

email = ""
password = ""
# Log in to huggingface and grant authorization to huggingchat
sign = Login(email, password)
cookies = sign.login()

# Save cookies to the local directory
cookie_path_dir = "./cookies_snapshot"
sign.saveCookiesToDir(cookie_path_dir)

# Load cookies when you restart your program:
# sign = login(email, None)
# cookies = sign.loadCookiesFromDir(cookie_path_dir) # This will detect if the JSON file exists, return cookies if it does and raise an Exception if it's not.

# Create a ChatBot
chatbot = hugchat.ChatBot(cookies=cookies.get_dict())  # or cookie_path="usercookies/<email>.json"

# non stream response
query_result = chatbot.query("Hi!")
print(query_result) # or query_result.text or query_result["text"]

# stream response
for resp in chatbot.query(
    "Hello",
    stream=True
):
    print(resp)

# Use web search (new feature)
query_result = chatbot.query("Hi!", web_search=True)
print(query_result) # or query_result.text or query_result["text"]
for source in query_result.web_search_sources:
    print(source.link)
    print(source.title)
    print(source.hostname)

 Hello! It's nice to meet you. Is there something I can help you with or would you like to chat for a bit?
{'type': 'stream', 'token': ' Hi'}
{'type': 'stream', 'token': '!'}
{'type': 'stream', 'token': ' How'}
{'type': 'stream', 'token': ' are'}
{'type': 'stream', 'token': ' you'}
{'type': 'stream', 'token': ' today'}
{'type': 'stream', 'token': '?'}
{'type': 'stream', 'token': ' Is'}
{'type': 'stream', 'token': ' there'}
{'type': 'stream', 'token': ' anything'}
{'type': 'stream', 'token': ' you'}
{'type': 'stream', 'token': "'"}
{'type': 'stream', 'token': 'd'}
{'type': 'stream', 'token': ' like'}
{'type': 'stream', 'token': ' to'}
{'type': 'stream', 'token': ' talk'}
{'type': 'stream', 'token': ' about'}
{'type': 'stream', 'token': ' or'}
{'type': 'stream', 'token': ' ask'}
{'type': 'stream', 'token': ' me'}
{'type': 'stream', 'token': '?'}
{'type': 'stream', 'token': ' I'}
{'type': 'stream', 'token': "'"}
{'type': 'stream', 'token': 'm'}
{'type': 'stream', 'token': ' here'}
{'type'

In [21]:
import random
x = [random.random() for i in range(10)]

In [24]:
%%timeit
for i in x:
  query_result = chatbot.query("Hi!"+str(i))

48.6 µs ± 13.6 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


35.1 ms ± 4.33 ms per loop (mean ± std. dev. of 7 runs, 1000 loops each)
