## Huggingface Playground
Pipelines, Models, Tokenizer
Model Hub & Fine-tuning

In [1]:
!pip install torch
!pip install transformers



In [2]:
from transformers import pipeline

Sentiment Analysis

In [3]:
classifier = pipeline("sentiment-analysis")
sentiment = classifier("I've eagerly anticipated the opportunity to explore a HuggingFace exercise notebook for as long as I can remember. I'm thrilled that I've finally had the chance to experience it.")
print(sentiment)

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


[{'label': 'POSITIVE', 'score': 0.9993579983711243}]


# Text Generation
GPT2 Model (Default)

In [4]:
generator = pipeline("text-generation")

sentences = generator(
    "I just finished to bake a pizza, for the",
    max_length = 30,
    num_return_sequences = 3,
)

print(sentences)

No model was supplied, defaulted to gpt2 and revision 6c0e608 (https://huggingface.co/gpt2).
Using a pipeline without specifying a model name and revision in production is not recommended.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'I just finished to bake a pizza, for the umpteenth time," said Michael Fiergill, 39, who owns the New York pizz'}, {'generated_text': 'I just finished to bake a pizza, for the first time today.\nI love making pizza. I love it that much. I love it when'}, {'generated_text': 'I just finished to bake a pizza, for the price I was hoping someone would help. It turned out it really did.\n\nI started by'}]


#Tokenizer

In [5]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

sentence = "I've eagerly anticipated the opportunity to explore a HuggingFace exercise notebook for as long as I can remember. I'm thrilled that I've finally had the chance to experience it."


# Specify the model name. Here, I am using 'distilbert-base-uncased-finetuned-sst-2-english',
# which is a version of DistilBERT that is fine-tuned for sentiment analysis (Default Model).
model_name = "distilbert-base-uncased-finetuned-sst-2-english"

# Load the pre-trained model for sequence classification from Hugging Face
# Using the from_pretrained function
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Load the tokenizer corresponding to the model.
# This tokenizer will correctly process the input sentence for the model.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Create a pipeline for sentiment analysis. This pipeline will handle the process of
# tokenizing the input sentence, feeding it into the model, and interpreting the output.
classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

# Use the classifier to analyze the sentiment of the sentence.
res = classifier(sentence)
print(res)

[{'label': 'POSITIVE', 'score': 0.9993579983711243}]


 #Hugging Face Transformers library along with PyTorch for sentiment analysis

In [6]:
import torch
import torch.nn.functional as F


In [7]:



# Specify the model name. Here, I am using 'distilbert-base-uncased-finetuned-sst-2-english',
# which is a version of DistilBERT that is fine-tuned for sentiment analysis (Default Model).
model_name = "distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

X_train = ["I am very happy today", "The moon is beautiful today"]

res = classifier(X_train)

print(res)

[{'label': 'POSITIVE', 'score': 0.9998797178268433}, {'label': 'POSITIVE', 'score': 0.9998810291290283}]


In [8]:
batch = tokenizer(X_train, padding=True, truncation=True, max_length=512,return_tensors="pt")
print(batch)

with torch.no_grad():
  outputs = model(**batch)
  print(outputs)
  predictions = F.softmax(outputs.logits, dim=1)
  print(predictions)
  labels = torch.argmax(outputs.logits, dim=1)
  print(labels)



{'input_ids': tensor([[ 101, 1045, 2572, 2200, 3407, 2651,  102],
        [ 101, 1996, 4231, 2003, 3376, 2651,  102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1]])}
SequenceClassifierOutput(loss=None, logits=tensor([[-4.3394,  4.6865],
        [-4.3431,  4.6939]]), hidden_states=None, attentions=None)
tensor([[1.2024e-04, 9.9988e-01],
        [1.1891e-04, 9.9988e-01]])
tensor([1, 1])


 #Hugging Face Transformers library along with PyTorch for Text-Generation (Language Modeling)

In [9]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

model_name = "gpt2"
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Text generation setup
prompt = "Once upon a time"
max_length = 100

# Encode context the generation is conditioned on
input_ids = tokenizer.encode(prompt, return_tensors='pt')
attention_mask = torch.ones(input_ids.shape, dtype=torch.long)  # Create an attention mask

# Generate text
with torch.no_grad():
    outputs = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=max_length,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id  # Explicitly set the pad token ID
    )
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(generated_text)


Once upon a time, the world was a place of great beauty and great danger. The world was a place of great danger, and the world was a place of great danger. The world was a place of great danger, and the world was a place of great danger. The world was a place of great danger, and the world was a place of great danger. The world was a place of great danger, and the world was a place of great danger. The world was a place of great


In [3]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
dataset = load_dataset("emotion")

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True)
# Tokenize the dataset using the tokenizer using BertTokenizer.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

train_dataset = tokenized_datasets['train'].shuffle(seed=50).select(range(1000))  # Reduce dataset size for my PlayGround
eval_dataset = tokenized_datasets['validation'].shuffle(seed=50).select(range(500))

# Load a pre-trained BERT model, which I will fine-tune.

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=6)  # 6 labels in the 'emotion' dataset


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
# Set up the training configurations.

training_args = TrainingArguments(
    output_dir='./results',          # Output directory for model checkpoints
    num_train_epochs=3,              # Total number of training epochs
    per_device_train_batch_size=8,  # Batch size per device during training
    per_device_eval_batch_size=32,   # Batch size for evaluation
    warmup_steps=500,                # Number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # Strength of weight decay
    logging_dir='./logs',            # Directory for storing logs
    logging_steps=10,
    evaluation_strategy="epoch",     # Evaluation is done at the end of each epoch
)

#Create a Trainer to handle the training.
trainer = Trainer(
    model=model,                         # The instantiated Transformers model to be trained
    args=training_args,                  # Training arguments
    train_dataset=train_dataset,         # Training dataset
    eval_dataset=eval_dataset            # Evaluation dataset
)


In [None]:
trainer.train()


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss
