In [36]:
from transformers import pipeline, AutoConfig, DistilBertForQuestionAnswering, AutoTokenizer

In [22]:
### What do I want to do?

## Use a transformer model to show how you can "train" your own chat bot

## Need a transformer "pre-trained" for question answering. 


## It would be nice to be able to pass in our own data set


In [23]:
## `pipeline` is a factory from the Hugging Face `transformers` library that instantiates a
## ready-to-use inference object for a given task. It accepts any model found on the Hugging Face
## "hub". That is a lot of models - selection can be overwhelming. Fortunately, there are filtering
## options and stars to indicate popular repos. The model used here is one of the top-rated for
## 'question-answering' NLP solutions.
question_answerer = pipeline("question-answering", model='distilbert-base-cased-distilled-squad')

## Despite its name, `answer` is the *context* passage the model extracts its answer from.
## Adjacent string literals are joined with no separator, so every piece must carry its own
## trailing punctuation/space (the text previously ended in "single womenetc").
answer = ("Because it is a wonderful city filled with possibility. "
          "It contains lots of restaurants, bars, single women, "
          "etc")

## The pipeline returns a dict; the keys read below are 'answer', 'score', 'start' and 'end'.
result = question_answerer(question="Why should Chris stay in Chicago?", context=answer)

print(f"Answer: '{result['answer']}', score: {round(result['score'], 4)}, start: {result['start']}, end: {result['end']}")


Answer: 'it is a wonderful city filled with possibility', score: 0.3579, start: 8, end: 54


In [None]:
## How can I train a model with a data set?
# Because the model is already pre-trained, it can arrive at an answer out of the box. However, we may want to fine-tune it on our own data to improve its answers.


In [None]:
## Using a pipeline vs using a "tokenizer" and "model" class

# A pipeline can be passed a model name, OR it can be passed a model object and a tokenizer object. The second form lets you "pre-load" the model and tokenizer yourself.


## What is torch providing us? Why do we need it? Is it a hugging face dependency?


In [37]:
# Load the tokenizer and the question-answering model explicitly. `pipeline` does this
# for you when it is given a model name, but loading them here means both objects are
# already in memory and can be handed to a pipeline afterwards.
# Open question: it sounds like a custom configuration can be supplied as well?
checkpoint = 'distilbert-base-cased-distilled-squad'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = DistilBertForQuestionAnswering.from_pretrained(checkpoint)

In [28]:
# Fetch the checkpoint's configuration, overriding `attention_dropout` through a keyword
# argument, then print it to inspect the resulting settings.
# NOTE(review): `bert_config` is only printed; no visible cell passes it to a model - confirm intent.
config_overrides = {'attention_dropout': 0.2}
bert_config = AutoConfig.from_pretrained('distilbert-base-cased-distilled-squad', **config_overrides)
print(bert_config)

DistilBertConfig {
  "_name_or_path": "distilbert-base-cased-distilled-squad",
  "activation": "gelu",
  "architectures": [
    "DistilBertForQuestionAnswering"
  ],
  "attention_dropout": 0.2,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": true,
  "tie_weights_": true,
  "transformers_version": "4.26.1",
  "vocab_size": 28996
}



In [38]:
## Build the question-answering pipeline from the already-loaded `model` and `tokenizer`
## objects instead of a model name.
## TODO: decide whether the "attention mask" returned by the tokenizer is worth mentioning here.
generator = pipeline(
    task='question-answering',
    model=model,
    tokenizer=tokenizer,
)


In [39]:
## Ask the same question again, this time through the pipeline built from the pre-loaded
## model and tokenizer. `answer` (the context passage) is reused from the first
## question-answering cell rather than copy-pasted here, so both runs are guaranteed to
## see exactly the same input and their results are directly comparable.
result = generator(question="Why should Chris stay in Chicago?", context=answer)

print(f"Answer: '{result['answer']}', score: {round(result['score'], 4)}, start: {result['start']}, end: {result['end']}")


Answer: 'it is a wonderful city filled with possibility', score: 0.3579, start: 8, end: 54


In [None]:
# Received the same answer when running the pipeline built from the pre-loaded `model`. How else could I tell whether the model's answer is incorrect?