# Pipelines
The `pipeline` helper wraps a pretrained model so that a task can be performed directly, using either a model you specify or the task's default model.

In [None]:
from transformers import pipeline

In [None]:
# Sentiment-analysis pipeline (default model): classifies the sentiment of a text.
classifier = pipeline("sentiment-analysis")

In [None]:
# Classify two sentences that swap "good" and "bad", then show both results,
# one per line.
classifier_res1 = classifier("If something is bad then it is good!")
classifier_res2 = classifier("If something is good then it is bad!")

print(classifier_res1, classifier_res2, sep="\n")

In [None]:
# Summarization pipeline backed by the task's default model.
summarizer = pipeline("summarization")

In [None]:
# Read the poem through a context manager so the file handle is always closed,
# even if reading fails. The encoding is pinned so the result does not depend on
# the platform default (assumes the text file is saved as UTF-8 - adjust if not).
with open("./data/chapter_1/Still_I_Rise.txt", encoding="utf-8") as f:
    still_i_rise = f.read()

# Last expression of the cell: the notebook displays the generated summary.
summarizer(still_i_rise)

In [None]:
# Zero-shot classification pipeline: scores a text against labels supplied at call time.
zero_shot_classifier = pipeline(task="zero-shot-classification")

In [None]:
# Classify one sentence against three candidate labels.
zero_shot_classifier(
    "This is a course about the Transformers library",
    candidate_labels=[
        "education",
        "politics",
        "business",
    ],
)

In [None]:
# Text-generation pipeline backed by the task's default model.
generator = pipeline(task="text-generation")

In [None]:
# Generate two continuations of the prompt, each limited to 15 newly generated
# tokens (the limit is counted in tokens, not words).
# Only max_new_tokens is passed: also setting max_length is redundant because
# max_new_tokens takes precedence, and transformers warns when both are given.
generator_results = generator(
    "This laptop is the",
    num_return_sequences=2,
    max_new_tokens=15,
    truncation=True,
)

for generator_res in generator_results:
    print(generator_res)

In [None]:
# Fill-in-the-blank (fill-mask) pipeline. Unlike the cells above, a specific model
# is requested here instead of relying on the task default.
# To browse available models, pick the task on the left and a model on the right:
# https://huggingface.co/models?pipeline_tag=fill-mask&sort=trending
unmasker = pipeline("fill-mask", model="google-bert/bert-base-uncased")

In [None]:
# top_k controls how many candidate fillers for [MASK] are returned.
possibilities = unmasker(
    "This is the [MASK] thing in this world.",
    top_k=4,
)
for candidate in possibilities:
    print(candidate)

In [None]:
# Named entity recognition (NER) links spans of the input text to the entities they
# correspond to, such as persons, locations, or organizations.
# aggregation_strategy="simple" tells the pipeline to regroup the parts of the
# sentence that correspond to the same entity, e.g. for a text with "Hugging Face"
# the NER needs to recognise "Hugging" and "Face" as a single organization even
# though the name consists of multiple words.
# It replaces the deprecated grouped_entities=True flag.
ner = pipeline("ner", aggregation_strategy="simple")

In [None]:
# Run NER on a sample sentence and print each recognised entity on its own line.
recognitions = ner(
    "My name is Sylvain and I work at Hugging Face in Brooklyn."
)
for entity in recognitions:
    print(entity)

In [None]:
# The "ner" task used for part-of-speech (POS) tagging. POS tagging labels each word
# in a text with its grammatical category (noun, verb, adjective, adverb, ...),
# taking into account both the word's definition and its context in the sentence.
# Model card: https://huggingface.co/QCRI/bert-base-multilingual-cased-pos-english
ner_pos = pipeline(
    "ner",
    model="QCRI/bert-base-multilingual-cased-pos-english",
)

In [None]:
# Tag a sample sentence and print every entry the pipeline returns.
recognitions_pos = ner_pos(
    "Leo is a cat and he lives with his owner in a flat in London."
)

for tagged_token in recognitions_pos:
    print(tagged_token)

In [None]:
# The question-answering pipeline answers questions from a given context. So,
# it does NOT generate a new answer; it simply extracts the info from the given context.
question_answerer = pipeline("question-answering")

In [None]:
# Ask a question whose answer is contained in the supplied context.
question_answerer(
    context="My name is Sylvain and I work at Hugging Face in Brooklyn",
    question="Where do I work?",
)

In [None]:
# French-to-English translation pipeline using this model:
# https://huggingface.co/Helsinki-NLP/opus-mt-fr-en
translator = pipeline(
    task="translation",
    model="Helsinki-NLP/opus-mt-fr-en",
)

In [None]:
# Translate a short French phrase; the notebook displays the pipeline's output.
translator("je m'appelle")

In [None]:
# Image-classification pipeline that labels the object shown in an image.
# Model card: https://huggingface.co/google/vit-base-patch16-224
image_classifier = pipeline("image-classification", model="google/vit-base-patch16-224")

In [None]:
# Classify a local image and print each prediction.
results = image_classifier("./data/chapter_1/Ok_I_Pull_Up.jpg")

# Author's note: it should be a capybara, but the model was probably not trained
# on many capybara images.
for prediction in results:
    print(prediction)

In [None]:
# Speech-to-text: pipeline that transcribes audio into text.
transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-large-v3")

>Note: you will also need to have ffmpeg installed on your system for the **automatic-speech-recognition** pipeline to work. See the ffmpeg download page:<br />
https://ffmpeg.org/download.html

In [None]:
# Transcribe a local MP3 file; the notebook displays the pipeline's output.
transcriber("./data/chapter_1/please_please.mp3")

# Tokenisers
Tokenisers are mainly used to convert raw input text into a machine-readable input. There are three main types of tokenisation methods:
- Word Based Tokenisation
- Character Based Tokenisation
- Subword Based Tokenisation

Even though both word-based and character-based tokenisation are very intuitive, they each come with their own limitations. In word-based tokenisation, a very large vocabulary leads to high memory and time costs, reducing the vocabulary can lead to a large number of out-of-vocabulary tokens, and meaning is lost across very similar words (for example, let, let's). In character-based tokenisation, the sequences become very long and an individual character token carries less meaning than a word token (e.g., 't' compared to 'take'). Therefore, a middle ground between the two techniques is preferred, known as Subword Based Tokenisation. <br />
Subword tokenization algorithms rely on the principle that frequently used words should not be split into smaller subwords, but rare words should be decomposed into meaningful subwords. For instance, "annoyingly" might be considered a rare word and could be decomposed into "annoying" and "ly". Both "annoying" and "ly" as stand-alone subwords would appear more frequently while at the same time the meaning of "annoyingly" is kept by the composite meaning of "annoying" and "ly". Subword tokenization allows the model to have a reasonable vocabulary size while being able to learn meaningful context-independent representations. In addition, subword tokenization enables the model to process words it has never seen before, by decomposing them into known subwords.

In [None]:
# subword based tokenisation
from transformers import BertTokenizer

In [None]:
# Load the tokenizer that belongs to the google-bert/bert-base-uncased checkpoint.
tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")
# Split a sample sentence into tokens; the notebook displays the resulting list.
tokenizer.tokenize("Now I know about tokenisation!")

> Note: *##* means that this token should be attached to the previous one.

After tokenisation, the embedding conversion process takes place, which converts the tokens into numbers.

# Bias And Limitations

Fine-tuning a pretrained model always carries over the bias of the data it was originally trained on, as well as the limitations of its architecture, so fine-tuning the model on your own data won't make this intrinsic bias disappear.

In [None]:
from transformers import pipeline

In [None]:
# Fill-mask pipeline used to probe the model's bias. The fully qualified model id
# (google-bert/bert-base-uncased) is used so this cell names the checkpoint the
# same way as the rest of the notebook.
unmasker = pipeline("fill-mask", model="google-bert/bert-base-uncased")

In [None]:
# Compare the top-5 occupations the model predicts for a man and for a woman;
# only the predicted token strings are printed, one list per prompt.
for prompt in ("This man works as a [MASK].", "This woman works as a [MASK]."):
    result = unmasker(prompt, top_k=5)
    print([r["token_str"] for r in result])

# The End!