## Transformers

- ### Quick Tour

In [2]:
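# requirements.txt
# torch
# transformers
# datasets      (imported in the speech-recognition cell)
# tensorflow    (needed for the TFAutoModel... section)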

In [3]:
import warnings
warnings.filterwarnings("ignore")

### transformers pipeline sentiment analysis

In [4]:
from transformers import pipeline

# Name the checkpoint explicitly instead of relying on the task default.
# This is the library's documented default for "sentiment-analysis"
# (binary POSITIVE / NEGATIVE labels), so results stay the same, but the
# notebook no longer changes behaviour if the default is ever swapped.
classifier = pipeline(
    "sentiment-analysis",
    model="distilbert/distilbert-base-uncased-finetuned-sst-2-english",
)

2024-12-01 20:20:21.062630: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-12-01 20:20:21.285701: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-12-01 20:20:21.423137: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1733084421.656818   38800 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1733084421.713977   38800 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-01 20:20:22.297818: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU ins

In [5]:
# Classify a review that mixes praise with a complaint; the cell displays
# the list of {'label', 'score'} dicts returned by the pipeline.
course_review = "This course is good but lots of things are wors like the don't provide notes and resources."
classifier(course_review)

[{'label': 'NEGATIVE', 'score': 0.9978156089782715}]

In [6]:
# Classify a second mixed-sentiment sentence; the pipeline's return value
# is shown as the cell output.
student_remark = "john is good student but he smokes and drink at the age of 13."
classifier(student_remark)

[{'label': 'NEGATIVE', 'score': 0.9474365711212158}]

In [7]:
# Classify several sentences in one call. The pipeline returns one result
# dict per input, in the same order, so zip() pairs each sentence with it.
sentences = [
    "we are happy to see the transformes library.",
    "we hope you don't hate it",
]
results = classifier(sentences)
for sentence, result in zip(sentences, results):
    # 'score' is the confidence of the predicted label (whatever that label
    # is), not specifically a "positive" score.
    print("sentence: {!r} -> label: {} (score: {})".format(sentence, result['label'], result['score']))

sentence is: POSITIVE and it's positive score is: 0.9998522996902466
sentence is: POSITIVE and it's positive score is: 0.8319636583328247


## Automatic speech recognition: running a pipeline over an entire dataset

In [8]:
from transformers import pipeline
import torch

# Speech-to-text pipeline built on the wav2vec2 checkpoint named below.
speech_recognizer = pipeline(
    task="automatic-speech-recognition",
    model="facebook/wav2vec2-base-960h",
)

# Dataset helpers used by the disabled example that follows.
from datasets import load_dataset, Audio

# Disabled example: transcribe the first four clips of MInDS-14 (en-US).
# The audio column is re-cast so its sampling rate matches the one the
# pipeline's feature extractor expects.
#
# dataset = load_dataset("PolyAI/minds14", name="en-US", split="train")
# dataset = dataset.cast_column("audio", Audio(sampling_rate=speech_recognizer.feature_extractor.sampling_rate))
#
# result = speech_recognizer(dataset[:4]["audio"])
# print([d["text"] for d in result])

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Tokenizer in pipeline

In [9]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Checkpoint shared by the model and its tokenizer.
model_name = "nlptown/bert-base-multilingual-uncased-sentiment"

# Sequence-classification model loaded from the checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(model_name)
# Matching tokenizer: turns raw text into the token ids the model consumes.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Build a pipeline from the explicit model/tokenizer pair instead of a default.
classifier1 = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

# Print the prediction for each review, one per line.
tour_reviews = (
    "This was good tour for me but i will give less rate for this tour guidence.",
    "This was really great tour for me and i have really enjoyed with this tour guidence.",
)
for tour_review in tour_reviews:
    print(classifier1(tour_review))


[{'label': '3 stars', 'score': 0.5468131899833679}]
[{'label': '5 stars', 'score': 0.8103119134902954}]


In [10]:
## AutoTokenizer: inspect what the tokenizer produces for a single sentence

model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_name)

encoding = tokenizer("We are happy to see the transforms library.")
# Print each (field name, values) pair of the encoding as a tuple.
for field_name, field_values in encoding.items():
    print((field_name, field_values))

('input_ids', [101, 11312, 10320, 19308, 10114, 11811, 10103, 79917, 10107, 13299, 119, 102])
('token_type_ids', [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
('attention_mask', [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])


In [11]:
## PyTorch: tokenize a batch and get the encoding back as torch tensors
pt_sentences = ["we are happy to see the transformes library."]
pt_tokenizer_options = {
    "padding": True,         # pad shorter sequences in the batch
    "truncation": True,      # cut sequences longer than max_length
    "max_length": 512,
    "return_tensors": "pt",  # "pt" -> PyTorch tensors ("tf" -> TensorFlow)
}
pt_batch = tokenizer(pt_sentences, **pt_tokenizer_options)


In [12]:
from transformers import AutoModelForSequenceClassification

# Load the PyTorch sequence-classification model for the checkpoint.
model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
pt_model = AutoModelForSequenceClassification.from_pretrained(
    model_name
)

# pt_batch is a mapping of tensors; ** unpacks it into keyword arguments.
pt_output = pt_model(
    **pt_batch
)

# Displayed as the cell result; .logits holds the raw scores that the
# softmax cell reads.
pt_output

SequenceClassifierOutput(loss=None, logits=tensor([[-2.6372, -2.4231, -0.1310,  2.0224,  2.3549]],
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [None]:
## Convert the raw logits into probabilities with softmax

from torch import nn

# The logits printed above are 2-D (one row per input sentence), so the
# last axis is the per-label axis.
pt_prediction = nn.functional.softmax(pt_output.logits, dim=-1)
print(pt_prediction)

 

tensor([[0.0037, 0.0046, 0.0459, 0.3950, 0.5508]], grad_fn=<SoftmaxBackward0>)


: 

## tensorflow

In [None]:
from transformers import TFAutoModelForSequenceClassification

# Same checkpoint as the PyTorch section, loaded through the TF model class.
model_name = "nlptown/bert-base-multilingual-uncased-sentiment"

tokenizer = AutoTokenizer.from_pretrained(model_name)

tf_sentences = ["we are happy to see the transformes library."]
tf_batch = tokenizer(
    tf_sentences,
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors="tf",  # "tf" -> TensorFlow tensors ("pt" would give PyTorch)
)

tf_model = TFAutoModelForSequenceClassification.from_pretrained(model_name)

# The TF model is called with the whole batch encoding as a single argument.
tf_output = tf_model(tf_batch)




2024-12-01 20:20:45.834036: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)
2024-12-01 20:20:47.498710: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 325260288 exceeds 10% of free system memory.
2024-12-01 20:20:50.377371: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 325260288 exceeds 10% of free system memory.
