# Importing Libraries

In [1]:
from transformers import pipeline
import torch
import torch.nn.functional as F


# Using Hugging Face's ready-to-use pipeline

In [2]:
# Build the pipeline without naming a model, so the library falls back to its
# default sentiment checkpoint (it reports which one in the cell output).
classifier = pipeline('sentiment-analysis')

demo_sentences = [
    "Darghouthi is an awesome developer",
    "I hate the fact that I love you",
]
result = classifier(demo_sentences)
result

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english)


Downloading:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/255M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

[{'label': 'POSITIVE', 'score': 0.9998455047607422},
 {'label': 'NEGATIVE', 'score': 0.9907469749450684}]

# Using a DistilBERT model fine-tuned for English sentiment analysis

In [3]:
# Same task as before, but with the checkpoint named explicitly instead of
# relying on the pipeline default.
model_name = "distilbert-base-uncased-finetuned-sst-2-english"
bertClassifier = pipeline(task="sentiment-analysis", model=model_name)

example_texts = [
    "Darghouthi is an awesome developer",
    "I hate the fact that I love you",
]
results = bertClassifier(example_texts)
results

[{'label': 'POSITIVE', 'score': 0.9998455047607422},
 {'label': 'NEGATIVE', 'score': 0.9907469749450684}]

# Adjusting the tokenizer and classifier

In [4]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load the tokenizer and the model as separate objects so that each can be
# inspected or swapped before being handed to the pipeline.
tokenizer = AutoTokenizer.from_pretrained(model_name)
bertAutoClassifier = AutoModelForSequenceClassification.from_pretrained(model_name)

autoClassifier = pipeline(
    "sentiment-analysis",
    model=bertAutoClassifier,
    tokenizer=tokenizer,
)

results = autoClassifier(
    [
        "Darghouthi is an awesome developer",
        "I hate the fact that I love you",
    ]
)
results

[{'label': 'POSITIVE', 'score': 0.9998455047607422},
 {'label': 'NEGATIVE', 'score': 0.9907469749450684}]

In [5]:
sentence = "Darghouthi is an awesome developer"

# Step-by-step view: split the sentence into word pieces, then look up the
# vocabulary id of each piece.
tokens = tokenizer.tokenize(sentence)
token_ids = tokenizer.convert_tokens_to_ids(tokens)

# Calling the tokenizer directly returns the full encoding. Despite the
# variable name it holds both "input_ids" and "attention_mask", and the ids
# gain one extra id at each end compared with `token_ids`.
input_ids = tokenizer(sentence)

print(f"tokens: {tokens}")
print(f"token_ids: {token_ids}")
print(f"input_ids: {input_ids}")

tokens: ['dar', '##gh', '##outh', '##i', 'is', 'an', 'awesome', 'developer']
token_ids: [18243, 5603, 17167, 2072, 2003, 2019, 12476, 9722]
input_ids: {'input_ids': [101, 18243, 5603, 17167, 2072, 2003, 2019, 12476, 9722, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


# Running the classifier manually on a custom batch (inference only, no fine-tuning)

In [6]:
X_train = [
    "Darghouthi is an awesome developer",
    "I hate the fact that I love you",
    "the food here is a delicacy",
]

# Encode the sentences as one padded batch of PyTorch tensors.
batch = tokenizer(
    X_train,
    padding=True,
    truncation=True,
    max_length=256,
    return_tensors="pt",
)

# Inference only, so gradient tracking is switched off for the forward pass.
with torch.no_grad():
    output = bertAutoClassifier(**batch)
print(output)

# Softmax over the class dimension turns raw logits into probabilities.
predictions = F.softmax(output.logits, dim=1)
print(predictions)

# Index of the most probable class for each sentence.
label_ids = predictions.argmax(dim=1)
print(label_ids)

# Map class indices to the label names stored in the model config.
id2label = bertAutoClassifier.config.id2label
labels = [id2label[class_id] for class_id in label_ids.tolist()]
print(labels)

SequenceClassifierOutput(loss=None, logits=tensor([[-4.2225,  4.5533],
        [ 2.5521, -2.1214],
        [-1.9875,  2.0248]]), hidden_states=None, attentions=None)
tensor([[1.5441e-04, 9.9985e-01],
        [9.9075e-01, 9.2530e-03],
        [1.7770e-02, 9.8223e-01]])
tensor([1, 0, 1])
['POSITIVE', 'NEGATIVE', 'POSITIVE']


# Sentiment classification on German sentences

In [7]:
german_model_name = "oliverguhr/german-sentiment-bert"

# Use dedicated names for the German tokenizer and model. Rebinding the global
# `tokenizer` here would silently pair the German vocabulary with the English
# model if an earlier cell were re-run afterwards.
german_tokenizer = AutoTokenizer.from_pretrained(german_model_name)
german_model = AutoModelForSequenceClassification.from_pretrained(german_model_name)

german_sentences = [
    "Passau ist eine sehr schöne Stadt.",
    "die Pizza schmeckt nicht gut",
    "Deutschland hat die perfekte Work-Life-Balance",
]
german_batch = german_tokenizer(
    german_sentences,
    padding=True,
    truncation=True,
    max_length=256,
    return_tensors="pt",
)

# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    output = german_model(**german_batch)
    print(output)

    # Softmax over the class dimension turns raw logits into probabilities.
    predictions = F.softmax(output.logits, dim=1)
    print(predictions)

    # Index of the most probable class for each sentence.
    label_ids = torch.argmax(predictions, dim=1)
    print(label_ids)

    # Map class indices to the label names stored in the model config.
    labels = [german_model.config.id2label[label_id] for label_id in label_ids.tolist()]
    print(labels)

Downloading:   0%|          | 0.00/161 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/249k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/416M [00:00<?, ?B/s]

SequenceClassifierOutput(loss=None, logits=tensor([[ 3.5039, -0.4604, -3.4561],
        [ 0.2645,  3.1876, -5.1879],
        [ 0.1335, -0.8306,  1.3824]]), hidden_states=None, attentions=None)
tensor([[9.8046e-01, 1.8611e-02, 9.3052e-04],
        [5.1010e-02, 9.4877e-01, 2.1864e-04],
        [2.0543e-01, 7.8338e-02, 7.1623e-01]])
tensor([0, 1, 2])
['positive', 'negative', 'neutral']


# IMDB reviews Sentiment analysis

## 1- Imports

In [8]:
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer


## 2- Data pre-processing

In [9]:

imdb = load_dataset("imdb")

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


def preprocess_function(examples):
    """Tokenize the ``"text"`` field of ``examples`` with truncation enabled.

    Passed to ``imdb.map(..., batched=True)`` below, so it is called with a
    batch of examples rather than with a single row.

    Args:
        examples: Mapping that exposes the raw review text under ``"text"``.

    Returns:
        The encoding produced by the module-level ``tokenizer``.
    """
    return tokenizer(examples["text"], truncation=True)


tokenized_imdb = imdb.map(preprocess_function, batched=True)

# Collator that hands batches back as TensorFlow tensors; it is used as the
# `collate_fn` of the tf.data pipelines built in the training section.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

Downloading builder script:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

Downloading and preparing dataset imdb/plain_text (download: 80.23 MiB, generated: 127.02 MiB, post-processed: Unknown size, total: 207.25 MiB) to /home/darghouthi/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1...


Downloading data:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Dataset imdb downloaded and prepared to /home/darghouthi/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]



  0%|          | 0/25 [00:00<?, ?ba/s]

  0%|          | 0/25 [00:00<?, ?ba/s]

  0%|          | 0/50 [00:00<?, ?ba/s]

## 3- Training

In [15]:
import tensorflow as tf
from transformers import TFAutoModelForSequenceClassification, create_optimizer

# Single source of truth for the training hyper-parameters. The datasets, the
# learning-rate schedule and `fit` all read these, so they cannot drift apart.
batch_size = 1
num_epochs = 2

tf_train_set = tokenized_imdb["train"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "label"],
    shuffle=True,
    batch_size=batch_size,
    collate_fn=data_collator,
)

# Validation data is left unshuffled so predictions line up with the
# original order of imdb["test"].
tf_validation_set = tokenized_imdb["test"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "label"],
    shuffle=False,
    batch_size=batch_size,
    collate_fn=data_collator,
)

# The schedule needs the total number of optimizer steps over the whole run.
batches_per_epoch = len(tokenized_imdb["train"]) // batch_size
total_train_steps = int(batches_per_epoch * num_epochs)
optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_warmup_steps=0,
    num_train_steps=total_train_steps,
)

model = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

# NOTE(review): no loss is passed to compile(); presumably the model falls back
# to its built-in loss using the "label" column -- confirm against the
# installed transformers version.
model.compile(optimizer=optimizer)
model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=num_epochs)

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_projector', 'vocab_layer_norm', 'vocab_transform', 'activation_13']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier', 'dropout_119', 'classifier']
You should probably TRAIN this model on a down-stream task to be able to use 

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f456c04b820>

In [43]:
# Score every example of the validation pipeline with the trained model.
predictions = model.predict(x=tf_validation_set)



In [44]:
# Show the raw prediction object; per the output it carries per-example `loss`
# values and a two-column `logits` array.
predictions


TFSequenceClassifierOutput(loss=array([0.00117674, 0.01438309, 0.0067858 , ..., 0.7950314 , 0.05493376,
       0.01522068], dtype=float32), logits=array([[ 3.326938  , -3.4174528 ],
       [ 2.1119304 , -2.12257   ],
       [ 2.4903588 , -2.4991648 ],
       ...,
       [ 0.06064893, -0.13369226],
       [-1.4517492 ,  1.4222863 ],
       [-2.1142852 ,  2.0631912 ]], dtype=float32), hidden_states=None, attentions=None)

In [45]:
import numpy as np

# Predicted class = position of the largest logit in each row.
preds = predictions.logits.argmax(axis=-1)
preds

array([0, 0, 0, ..., 0, 1, 1])

In [46]:
from datasets import load_metric

# The GLUE/MRPC metric is borrowed here; per the output it reports accuracy
# and F1 for the predictions.
# NOTE(review): `load_metric` is reportedly deprecated in newer `datasets`
# releases -- confirm against the installed version.
metric = load_metric("glue", "mrpc")

test_labels = imdb["test"]["label"]
metric.compute(predictions=preds, references=test_labels)

{'accuracy': 0.93368, 'f1': 0.933424349502088}