This notebook was created to learn and practice the Transformers library from Hugging Face.

Feel free to comment and share your ideas.

Please upvote if this notebook helped you. Thanks!

In [None]:
import transformers

# Restrict the transformers logger to error-level messages so that warnings
# do not clutter the cell outputs below.
transformers.logging.set_verbosity_error()

# Pre-built pipelines

In [None]:
# The transformers library ships pre-built pipelines for various NLP and CV tasks
from transformers.pipelines import PIPELINE_REGISTRY

# Ask the pipeline registry for every supported task and print one name per line
pip_list = PIPELINE_REGISTRY.get_supported_tasks()
for task_name in pip_list:
    print(task_name)

In [None]:
# Look up the registry entry of one specific task and show its 'default' entry
# (pt - PyTorch, tf - TensorFlow)
print("\nDefault Model for Sentiment Analysis: ")
sentiment_task_spec = PIPELINE_REGISTRY.check_task('sentiment-analysis')[1]
print(sentiment_task_spec.get('default'))

# Sentiment Analysis

In [None]:
from transformers import pipeline
import os

# Build the sentiment-analysis pipeline with the library's default checkpoint.
# The checkpoint is downloaded from Hugging Face on first use and cached on disk;
# later runs reuse the cached copy. The first download can take a while,
# depending on network bandwidth.
sentiment_classifier = pipeline(task="sentiment-analysis")

#Cache usually available at : <<user-home>>/.cache/huggingface/hub
#unless it was relocated through the HF_HUB_CACHE, HF_HOME or XDG_CACHE_HOME
#environment variables.

def hf_hub_cache_dir(env=None):
    """Return the directory where downloaded Hugging Face checkpoints are cached.

    Resolution order: ``HF_HUB_CACHE``, then ``<HF_HOME>/hub``, then
    ``<XDG_CACHE_HOME>/huggingface/hub``, and finally
    ``~/.cache/huggingface/hub``.

    Parameters
    ----------
    env : mapping, optional
        Environment to read the variables from. Defaults to ``os.environ``.

    Returns
    -------
    str
        Path of the hub cache directory (it is not checked for existence).
    """
    env = os.environ if env is None else env

    explicit_cache = env.get("HF_HUB_CACHE")
    if explicit_cache:
        return os.path.expanduser(explicit_cache)

    hf_home = env.get("HF_HOME")
    if not hf_home:
        cache_root = env.get("XDG_CACHE_HOME") or os.path.join(
            os.path.expanduser("~"), ".cache")
        hf_home = os.path.join(cache_root, "huggingface")
    return os.path.join(os.path.expanduser(hf_home), "hub")


cache_dir = hf_hub_cache_dir()
print("Huggingface Cache directory is : ", cache_dir)

#Contents of cache directory (empty list when nothing has been cached there yet)
os.listdir(cache_dir) if os.path.isdir(cache_dir) else []

In [None]:
# Classify a first sentence with the pipeline and show the raw result
positive_text = "This is a great course"
sentiment_results = sentiment_classifier(positive_text)
print(sentiment_results)

In [None]:
# Second example: classify another sentence with the same pipeline
negative_text = "The download speed is really bad"
sentiment_results = sentiment_classifier(negative_text)
print(sentiment_results)

In [None]:
# Rebuild the sentiment pipeline with an explicitly chosen checkpoint
# instead of the task default
tweet_sentiment_checkpoint = "finiteautomata/bertweet-base-sentiment-analysis"
sentiment_classifier = pipeline("sentiment-analysis",
                                model=tweet_sentiment_checkpoint)

sentiment_result = sentiment_classifier("This is a great course")
print(sentiment_result)

# List the cache directory again after loading this checkpoint
os.listdir(cache_dir)

# Named Entity Recognition

In [None]:
# Named entities to look for in text: person, company, location, date,
# custom, order_number, etc.
from transformers import pipeline

# Adjacent string literals are concatenated into one sentence pair
input_text = (
    "Sam went to California on the 23rd of August. "
    "There, he visited Google headquarters with John Smith and bought a cap for $23"
)

# NER pipeline with the library's default checkpoint
basic_ner = pipeline(task="ner")

# Last expression of the cell: the predictions are displayed as the cell output
basic_ner(input_text)

In [None]:
# Print the model object wrapped by the basic_ner pipeline to inspect its architecture
print(basic_ner.model)

In [None]:
# Print the configuration attached to the model behind the basic_ner pipeline
print(basic_ner.model.config)

In [None]:
# Using a Custom Model and tokenizer
from transformers import AutoTokenizer, TFAutoModelForTokenClassification

# Single source of truth for the checkpoint shared by tokenizer and model
ner_checkpoint = "Jean-Baptiste/camembert-ner-with-dates"

# from_pt is an option for loading model weights; it does not apply to
# tokenizers, so it is not passed here.
tokenizer = AutoTokenizer.from_pretrained(ner_checkpoint)

# from_pt=True asks the TensorFlow model class to load the weights from a
# PyTorch checkpoint.
model = TFAutoModelForTokenClassification.from_pretrained(ner_checkpoint,
                                                          from_pt=True)

# Mapping from class index to entity label predicted by this checkpoint
print(model.config.id2label)

In [None]:
# Prediction with the custom model/tokenizer pair; aggregation_strategy="simple"
# is passed so token-level predictions are merged into entity groups
enhanced_ner = pipeline(
    task='ner',
    tokenizer=tokenizer,
    model=model,
    aggregation_strategy="simple",
)
enhanced_ner(input_text)

# Question answering

In [None]:
# Question answering with the pipeline API and the pre-trained checkpoint
# deepset/minilm-uncased-squad2

from transformers import pipeline

# Passage the model will search for answers
context="""
Earth is the third planet from the Sun and the only astronomical object 
known to harbor life. While large volumes of water can be found 
throughout the Solar System, only Earth sustains liquid surface water. 
About 71% of Earth's surface is made up of the ocean, dwarfing 
Earth's polar ice, lakes, and rivers. The remaining 29% of Earth's 
surface is land, consisting of continents and islands. 
Earth's surface layer is formed of several slowly moving tectonic plates, 
interacting to produce mountain ranges, volcanoes, and earthquakes. 
Earth's liquid outer core generates the magnetic field that shapes Earth's 
magnetosphere, deflecting destructive solar winds.
"""

# Activate the pipeline
qa_checkpoint = "deepset/minilm-uncased-squad2"
quan_pipeline = pipeline("question-answering", model=qa_checkpoint)

# Ask a question whose answer is contained in the context
land_question = "How much of earth is land?"
answer = quan_pipeline(question=land_question, context=context)
print(answer)

In [None]:
# This question yields a low confidence answer; for a real task we could keep
# only the answers whose confidence is high
print("\nAnother question :")
mountain_answer = quan_pipeline(question="How are mountain ranges created?",
                                context=context)
print(mountain_answer)

In [None]:
!pip install evaluate

In [None]:
# The SQuAD metric evaluates QA models by comparing predicted answers with
# the correct answer using multiple metrics

from evaluate import load
squad_metric = load("squad_v2")

# Context and question are left out because the evaluation does not need them.
# The example only showcases how the score depends on the match between the
# prediction and the correct answer.

correct_answer = "Paris"

predicted_answers = [
    "Paris",
    "London",
    "Paris is one of the best cities in the world",
]

cum_predictions = []
cum_references = []

# Score every predicted answer against the correct answer, one at a time
for answer_id, answer_text in enumerate(predicted_answers):

    # Prediction in the input format expected by the metric
    prediction = {'prediction_text': answer_text,
                  'id': str(answer_id),
                  'no_answer_probability': 0.}

    # Reference answer in the input format expected by the metric
    reference = {'answers': {'answer_start': [1],
                             'text': [correct_answer]},
                 'id': str(answer_id)}

    # Keep both for the cumulative evaluation after the loop
    cum_predictions.append(prediction)
    cum_references.append(reference)

    # Evaluate this single prediction/reference pair
    results = squad_metric.compute(predictions=[prediction],
                                   references=[reference])
    print("F1 is", results.get('f1'),
          " for answer :", answer_text)

# Cumulative results over all answers (count of answers, etc.)
cum_results = squad_metric.compute(predictions=cum_predictions,
                                   references=cum_references)
print("\n Cumulative Results : \n",cum_results)

# Text summarization

In [None]:
# Passage to summarize, written as a multi-line literal for readability
verbose_text ="""
Earth is the third planet from the Sun and the only astronomical object 
known to harbor life. 
While large volumes of water can be found 
throughout the Solar System, only Earth sustains liquid surface water. 
About 71% of Earth's surface is made up of the ocean, dwarfing 
Earth's polar ice, lakes, and rivers. 
The remaining 29% of Earth's 
surface is land, consisting of continents and islands. 
Earth's surface layer is formed of several slowly moving tectonic plates, 
interacting to produce mountain ranges, volcanoes, and earthquakes. 
Earth's liquid outer core generates the magnetic field that shapes Earth's 
magnetosphere, deflecting destructive solar winds.
"""

# Collapse the literal into a single line. Splitting on any whitespace and
# re-joining with one space does not depend on each line ending with a trailing
# space (which editors often strip), unlike deleting the newline characters,
# which would fuse the last word of a line with the first word of the next.
verbose_text = " ".join(verbose_text.split())

In [None]:
# Select the summarization pipeline
from transformers import pipeline

# No model is named, so the library's default summarization checkpoint is used.
# NOTE(review): the variable names say "extractive" - confirm that the default
# checkpoint really performs extractive (not abstractive) summarization.
summary_length_limits = {"min_length": 10, "max_length": 100}
extractive_summarizer = pipeline("summarization", **summary_length_limits)

# Summarize the passage
extractive_summary = extractive_summarizer(verbose_text)

# The pipeline returns a list with one dict per input text
print(extractive_summary[0].get("summary_text"))


In [None]:
# Print the configuration object of the model behind the summarization pipeline
# (the label says "Checkpoint used", the value shown is the model config).
print("Checkpoint used: ", extractive_summarizer.model.config)

In [None]:
!pip install evaluate

In [None]:
!pip install rouge_score

In [None]:
# Evaluate with ROUGE (Recall-Oriented Understudy for Gisting Evaluation)
import evaluate

rouge_evaluator = evaluate.load("rouge")

# Exact match: prediction and reference are the very same sentence
identical_sentence = "This is the same string"
reference_text = [identical_sentence]
predict_text = [identical_sentence]

eval_results = rouge_evaluator.compute(references=reference_text,
                                       predictions=predict_text)
print("Results for Exact match",eval_results)

In [None]:
# No match: prediction and reference are unrelated sentences
reference_text = ["This is the different string"]
predict_text = ["Google can predict warm weather"]

eval_results = rouge_evaluator.compute(references=reference_text,
                                       predictions=predict_text)
print("\nResults for no match", eval_results)

In [None]:
# Score the generated summary against the full source text
generated_summary = extractive_summary[0].get("summary_text")
eval_results = rouge_evaluator.compute(predictions=[generated_summary],
                                       references=[verbose_text])

print("\nResults for Summary generated", eval_results)

# Natural Language Generation

In [None]:
# Content creation
from transformers import pipeline, set_seed

gpt2_checkpoint = "gpt2"
text_generator = pipeline("text-generation", model=gpt2_checkpoint)

# Seed the random number generators after the pipeline is built and before
# generating, so the generated samples are reproducible
set_seed(1)

# Adjacent string literals are concatenated into a single prompt
input_text = (
    "Natural Language Processing is a "
    "growing domain in machine learning"
)

# Ask for three alternative continuations of the prompt
synthetic_text = text_generator(input_text,
                                max_new_tokens=50,
                                num_return_sequences=3)

for generated_sample in synthetic_text:
    print(generated_sample.get("generated_text") ,"\n-----------------")

# Bot conversation

In [None]:
# Chatbot conversation example
# NOTE(review): the "conversational" pipeline and the Conversation class may
# no longer exist in recent transformers releases - confirm against the
# installed version.
from transformers import Conversation

chatbot_checkpoint = "facebook/blenderbot_small-90M"
conversational_pipeline = pipeline(task="conversational",
                                   model=chatbot_checkpoint)

# Show the configuration of the model behind the pipeline
print(conversational_pipeline.model.config)

In [None]:
#Sample inputs
first_input="Do you have any hobbies?"
second_input = "I like to watch movies"
third_input = "action movies"

# Label of each exchange paired with the user input that opens it
exchanges = [("First", first_input),
             ("Second", second_input),
             ("Third", third_input)]

#Create a context; the conversation starts with the first user input
bot_conversation = Conversation(first_input)

for turn, (label, user_input) in enumerate(exchanges):
    print(f"\n{label} Exchange: \n--------------------")

    # The first input is already part of the conversation; later inputs
    # are appended before the bot is asked to reply.
    if turn > 0:
        bot_conversation.add_user_input(user_input)
    conversational_pipeline(bot_conversation)

    # Index both histories with the same turn number so each user input is
    # shown with the reply it triggered (the third exchange previously
    # printed the second reply again).
    print(" User Input:", bot_conversation.past_user_inputs[turn])
    print(" Bot Output:", bot_conversation.generated_responses[turn])

print("\nAccessing All Responses: ")
print(bot_conversation)

# Translation

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# The same t5-base checkpoint provides both the model and its tokenizer
t5_checkpoint = "t5-base"
model = AutoModelForSeq2SeqLM.from_pretrained(t5_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(t5_checkpoint)

source_english="Acme is a technology company based in New York and Paris"


def _encode_and_generate(task_prefix):
    """Tokenize the prefixed source sentence and generate output token ids.

    Returns the tokenizer output and the generated ids, in that order.
    """
    encoded = tokenizer(task_prefix + source_english, return_tensors="pt")
    generated = model.generate(encoded["input_ids"], max_length=40)
    return encoded, generated


def _decode_first(generated):
    """Decode the first generated sequence, skipping special tokens."""
    return tokenizer.decode(generated[0], skip_special_tokens=True)


# The translation direction is selected by the text prefix of the prompt
inputs_german, outputs_german = _encode_and_generate(
    "translate English to German: ")
print("German Translation: ", _decode_first(outputs_german))

inputs_french, outputs_french = _encode_and_generate(
    "translate English to French: ")
print("French Translation: ", _decode_first(outputs_french))

# Further steps

In this notebook I have demonstrated the various pre-built pipelines in the Transformers library.

The next step is to practice Transformers in more depth and to create an AI project using transfer learning with Hugging Face.