In [1]:
from transformers import pipeline

# SENTIMENT ANALYSIS

In [5]:
sentiment_classifier = pipeline("sentiment-analysis")

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cpu


In [6]:
sentiment_classifier("I want to become a good AI Engineer")

[{'label': 'POSITIVE', 'score': 0.9580889344215393}]

In [7]:
# NAMED ENTITY RECOGNITION

In [8]:
ner = pipeline("ner", model = "dslim/bert-base-NER")

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


In [18]:
ner("My name is Guntash Kaur")

[{'entity': 'B-PER',
  'score': 0.99925154,
  'index': 4,
  'word': 'Gun',
  'start': 11,
  'end': 14},
 {'entity': 'I-PER',
  'score': 0.80489093,
  'index': 5,
  'word': '##tas',
  'start': 14,
  'end': 17},
 {'entity': 'I-PER',
  'score': 0.9592025,
  'index': 6,
  'word': '##h',
  'start': 17,
  'end': 18},
 {'entity': 'I-PER',
  'score': 0.9992207,
  'index': 7,
  'word': 'Ka',
  'start': 19,
  'end': 21},
 {'entity': 'I-PER',
  'score': 0.6212945,
  'index': 8,
  'word': '##ur',
  'start': 21,
  'end': 23}]

# PRE-TRAINED TOKENIZER 

In [20]:
from transformers import AutoTokenizer

In [50]:
model = "bert-base-uncased"

In [51]:
tokenizer = AutoTokenizer.from_pretrained(model)

In [52]:
sentence = "I am so excited to be learning about large language model"

In [53]:
input_ids = tokenizer(sentence)

In [54]:
print(input_ids)

{'input_ids': [101, 1045, 2572, 2061, 7568, 2000, 2022, 4083, 2055, 2312, 2653, 2944, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [55]:
tokens = tokenizer.tokenize(sentence)

In [56]:
print(tokens)

['i', 'am', 'so', 'excited', 'to', 'be', 'learning', 'about', 'large', 'language', 'model']


In [57]:
token_ids = tokenizer.convert_tokens_to_ids(tokens)

In [58]:
print(token_ids)

[1045, 2572, 2061, 7568, 2000, 2022, 4083, 2055, 2312, 2653, 2944]


In [59]:
decode_ids = tokenizer.decode(token_ids)

In [60]:
print(decode_ids)

i am so excited to be learning about large language model


In [61]:
tokenizer.decode(101)

'[CLS]'

In [62]:
tokenizer.decode(102)

'[SEP]'

In [63]:
model2 = "xlnet-base-cased"

In [64]:
tokenizer2 = AutoTokenizer.from_pretrained(model2)

In [65]:
input_ids = tokenizer2(sentence)

In [66]:
print(input_ids)

{'input_ids': [35, 569, 102, 5564, 22, 39, 1899, 75, 392, 1243, 1342, 4, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [67]:
tokens = tokenizer2.tokenize(sentence)

In [68]:
print(tokens)

['▁I', '▁am', '▁so', '▁excited', '▁to', '▁be', '▁learning', '▁about', '▁large', '▁language', '▁model']


In [70]:
token_ids = tokenizer2.convert_tokens_to_ids(tokens)

In [71]:
print(token_ids)

[35, 569, 102, 5564, 22, 39, 1899, 75, 392, 1243, 1342]


In [73]:
tokenizer2.decode(4)

'<sep>'

# HUGGING FACE PYTORCH/ TENSORFLOW

In [91]:
from transformers import AutoModelForSequenceClassification

In [92]:
import torch

In [93]:
print(sentence)

I am so excited to be learning about large language model


In [94]:
print(input_ids)

{'input_ids': [35, 569, 102, 5564, 22, 39, 1899, 75, 392, 1243, 1342, 4, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [95]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")

In [96]:
input_ids_pt = tokenizer(sentence, return_tensors = "pt")

In [97]:
print(input_ids_pt)

{'input_ids': tensor([[ 101, 1045, 2572, 2061, 7568, 2000, 2022, 4083, 2055, 2312, 2653, 2944,
          102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


In [98]:
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")

In [99]:
with torch.no_grad():
    logits = model(**input_ids_pt).logits


In [100]:
predicted_class_id = logits.argmax().item()

In [101]:
model.config.id2label[predicted_class_id]

'POSITIVE'