<a href="https://colab.research.google.com/github/M-T00/NLP_Codes/blob/example/Examples_For_Transformers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers
from transformers import pipeline



In [None]:
# Simple example: "sentiment-analysis".
# The pipeline has to be created before it is called; calling it first raises
# NameError on a fresh kernel (or silently reuses a `classifier` left over from
# another cell). The checkpoint is named explicitly so the cell does not depend
# on the task's implicit default model.
classifier = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
)
result = classifier("I hope you get better")
print(result)

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


[{'label': 'POSITIVE', 'score': 0.9410645365715027}]


In [None]:
# Simple example: "text-generation" using the distilgpt2 checkpoint.
# The prompt and the generation options are named separately so they are easy to tweak.
prompt = "I will go to the"
generation_options = {"max_length": 20, "num_return_sequences": 3}

generator = pipeline("text-generation", model="distilgpt2")

# Ask for three continuations of the prompt and show them.
result = generator(prompt, **generation_options)
print(result)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'I will go to the office of the Secretary of State within three weeks of the inauguration."\n\n'}, {'generated_text': 'I will go to the store next week and find the deal you are looking for and who will be'}, {'generated_text': 'I will go to the future to create a permanent state where I will continue to support the Constitution.'}]


In [None]:
# Simple example: "zero-shot-classification" (returns a score per candidate label).
# The checkpoint is named explicitly instead of relying on the task's implicit
# default, which the library warns against; this is the same model the default
# resolves to in the recorded run.
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

result = classifier(
    "We introduce a new language representation model called BERT, which stands for Bidirectional Encoder Representations from Transformers.",
    candidate_labels=["NLP", "education", "news"],
)
print(result)

No model was supplied, defaulted to facebook/bart-large-mnli and revision c626438 (https://huggingface.co/facebook/bart-large-mnli).
Using a pipeline without specifying a model name and revision in production is not recommended.


(…)bart-large-mnli/resolve/main/config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

(…)-mnli/resolve/main/tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

(…)/bart-large-mnli/resolve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

(…)/bart-large-mnli/resolve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

(…)t-large-mnli/resolve/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

{'sequence': 'We introduce a new language representation model called BERT, which stands for Bidirectional Encoder Representations from Transformers.', 'labels': ['news', 'NLP', 'education'], 'scores': [0.9738209247589111, 0.017573373392224312, 0.008605724200606346]}


In [None]:
# Semi-pro example: "sentiment-analysis" with the tokenizer's steps shown one by one.
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_name = "distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Pipeline assembled from the explicitly loaded model and tokenizer.
classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

sequence = "I hope you get better"

# Calling the tokenizer directly gives the full encoding;
# the three calls after it walk through text -> tokens -> ids -> text.
res_tokenizer = tokenizer(sequence)
res_tokens = tokenizer.tokenize(sequence)
res_ids = tokenizer.convert_tokens_to_ids(res_tokens)
res_decoded_string = tokenizer.decode(res_ids)

# One result per line, in the order the steps were computed.
print(res_tokenizer, res_tokens, res_ids, res_decoded_string, sep="\n")

{'input_ids': [101, 1045, 3246, 2017, 2131, 2488, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}
['i', 'hope', 'you', 'get', 'better']
[1045, 3246, 2017, 2131, 2488]
i hope you get better


In [None]:
# Combine the pipeline with manual PyTorch inference (only PyTorch is used in this cell).
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_name = "distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

x_train = [
    "We introduce a new language representation model called BERT, which stands for Bidirectional Encoder Representations from Transformers.",
    "I hope you get better",
]

# High-level path: the pipeline handles tokenization, the forward pass and labelling.
result = classifier(x_train)
print(result)

# Manual path, step 1: tokenize both sentences into one padded batch of PyTorch tensors.
batch = tokenizer(
    x_train,
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors="pt",
)
print(batch)

# Manual path, step 2: forward pass with gradient tracking disabled,
# then logits -> softmax scores -> index of the highest score per sentence.
with torch.no_grad():
    outputs = model(**batch)
    predictions = F.softmax(outputs.logits, dim=1)
    labels = predictions.argmax(dim=1)

print(outputs, predictions, labels, sep="\n")

[{'label': 'NEGATIVE', 'score': 0.8124731183052063}, {'label': 'POSITIVE', 'score': 0.9410645365715027}]
{'input_ids': tensor([[  101,  2057,  8970,  1037,  2047,  2653,  6630,  2944,  2170, 14324,
          1010,  2029,  4832,  2005,  7226,  7442,  7542,  2389,  4372, 16044,
          2099, 15066,  2013, 19081,  1012,   102],
        [  101,  1045,  3246,  2017,  2131,  2488,   102,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1],
        [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0]])}
SequenceClassifierOutput(loss=None, logits=tensor([[ 0.7855, -0.6807],
        [-1.3518,  1.4188]]), hidden_states=None, attentions=None)
tensor([[0.8125, 0.1875],
        [0.0589, 0.9411]])
tensor([0, 1])
