<a href="https://colab.research.google.com/github/Galina-Blokh/distilbert_bynary_classification/blob/main/hugging_face_binary_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Imports, Instalations and Constants

In [1]:
!pip install transformers

import pandas as pd
import tensorflow as tf
import transformers
from transformers import DistilBertTokenizer
from transformers import TFDistilBertForSequenceClassification

pd.set_option('display.max_colwidth', None)
MODEL_NAME = 'distilbert-base-uncased-finetuned-sst-2-english'# This model is a fine-tune checkpoint of DistilBERT-base-uncased, fine-tuned on SST-2.
BATCH_SIZE = 16
N_EPOCHS = 3 # we can put more, because evaluation of the model shows big difference in loss with accuracy 1.0

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/98/87/ef312eef26f5cecd8b17ae9654cdd8d1fae1eb6dbd87257d6d73c128a4d0/transformers-4.3.2-py3-none-any.whl (1.8MB)
[K     |████████████████████████████████| 1.8MB 16.4MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 53.9MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/fd/5b/44baae602e0a30bcc53fbdbc60bd940c15e143d252d658dfdefce736ece5/tokenizers-0.10.1-cp36-cp36m-manylinux2010_x86_64.whl (3.2MB)
[K     |████████████████████████████████| 3.2MB 52.7MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.43-cp36-none-any.whl size=893261 sha256=ebf87ab09a

## A common data set (with source text, preprocesses text, new features, and labels) before text-to-sequence transformation

We will take a column with not preprocecced text data for pure experiment with Hugging Face distilbert model

In [2]:

train = pd.read_pickle('https://github.com/Galina-Blokh/ai_assignment_aidock/blob/refator/data/new_train_data_clean.pkl?raw=true')
test = pd.read_pickle('https://github.com/Galina-Blokh/ai_assignment_aidock/blob/refator/data/new_test_data_clean.pkl?raw=true')

X_train =train.paragraph
X_test =test.paragraph
y_train = train.label
y_test = test.label 

pd.DataFrame(X_train)[:3]

Unnamed: 0,paragraph
0,[b'Preheat the oven to 400 degrees F. Line a large baking sheet with parchment paper.']
1,"[b'1 (3-inch) piece of kombu 4 cups water 3 tablespoons wakame dried seaweed \xc2\xbc cup white miso paste \xe2\x85\x93 cup chopped scallions 6 ounces silken tofu, cubed tamari, to taste']"
2,[b'Pour the batter into your baking pan and bake for 25-30 minutes or until a toothpick comes out nearly clean.']


## check the shapes and split proportion 

In [3]:
X_train.shape, X_test.shape, y_train.shape

((3898,), (973,), (3898,))

In [4]:
print('The proportion in y_train\n',y_train.value_counts(normalize=True).mul(100))
print('The proportion in y_test\n',y_test.value_counts(normalize=True).mul(100))

The proportion in y_train
 0    79.63058
1    20.36942
Name: label, dtype: float64
The proportion in y_test
 0    79.650565
1    20.349435
Name: label, dtype: float64


## Preprocess

### Decode byte arrays into string representation. 

In [5]:
X_train = X_train.apply(lambda x: str(x[0], 'utf-8'))
X_test = X_test.apply(lambda x:  str(x[0], 'utf-8'))
X_train[:3]

0                                                                                       Preheat the oven to 400 degrees F. Line a large baking sheet with parchment paper.
1    1 (3-inch) piece of kombu 4 cups water 3 tablespoons wakame dried seaweed ¼ cup white miso paste ⅓ cup chopped scallions 6 ounces silken tofu, cubed tamari, to taste
2                                                             Pour the batter into your baking pan and bake for 25-30 minutes or until a toothpick comes out nearly clean.
Name: paragraph, dtype: object

### Max sentence length

In [6]:
MAX_LEN = X_train.apply(lambda s: len([x for x in s.split()])).max()
MAX_LEN

242

## Encode with  DistilBertTokenizer

In [7]:
#define a tokenizer object
tokenizer = DistilBertTokenizer.from_pretrained(MODEL_NAME)

#tokenize the text
train_encodings = tokenizer(list(X_train.values), max_length=MAX_LEN, truncation=True, padding=True)
test_encodings = tokenizer(list(X_test.values), max_length=MAX_LEN, truncation=True, padding=True)

#print the first paragraph and it transformation
print(f'First paragraph: \'{X_train[:1]}\'')
print(f'Input ids: {train_encodings["input_ids"][0]}')
print(f'Attention mask: {train_encodings["attention_mask"][0]}')


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=48.0, style=ProgressStyle(description_w…


First paragraph: '0    Preheat the oven to 400 degrees F. Line a large baking sheet with parchment paper.
Name: paragraph, dtype: object'
Input ids: [101, 3653, 20192, 2102, 1996, 17428, 2000, 4278, 5445, 1042, 1012, 2240, 1037, 2312, 21522, 7123, 2007, 22433, 3259, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Attention mask: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [9]:
train_encodings

transformers.tokenization_utils_base.BatchEncoding

###  Turn our labels and encodings into a tf.Dataset object

In [8]:
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings),
                                                    list(y_train.values)))

test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings),
                                                    list(y_test.values)))

In [9]:
train_dataset

<TensorSliceDataset shapes: ({input_ids: (242,), attention_mask: (242,)}, ()), types: ({input_ids: tf.int32, attention_mask: tf.int32}, tf.int32)>

## Fine-tuning with native TensorFlow


In [10]:
model = TFDistilBertForSequenceClassification.from_pretrained(MODEL_NAME)

optimizerr = tf.keras.optimizers.Adam(learning_rate=5e-5)
losss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) #Computes the crossentropy loss between the labels and predictions.
model.compile(optimizer=optimizerr,
              loss=losss,
              metrics=['accuracy'])

model.fit(train_dataset.shuffle(len(X_train)).batch(BATCH_SIZE), 
          epochs=N_EPOCHS,
          batch_size=BATCH_SIZE)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=629.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=267949840.0, style=ProgressStyle(descri…




All model checkpoint layers were used when initializing TFDistilBertForSequenceClassification.

All the layers of TFDistilBertForSequenceClassification were initialized from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.


Epoch 1/3
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: <cyfunction Socket.send at 0x7f071c0b02a0> is not a module, class, method, function, traceback, frame, or code object
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: <cyfunction Socket.send at 0x7f071c0b02a0> is not a module, class, method, function, traceback, frame, or code object

Cause: while/else statement not yet supported
Cause: while/else statement not yet supported
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f05e8271a58>

## Model Evaluation

In [12]:
model.evaluate(test_dataset.shuffle(len(X_test)).batch(BATCH_SIZE), return_dict=True, batch_size=BATCH_SIZE)



{'accuracy': 1.0, 'loss': 1.5778896340634674e-05}

## Predict on the different text examples

In [13]:
def predict_proba(text_list, model, tokenizer):
  """
  To get array with predicted probabilities for 0 - instructions, 1- ingredients classes 
  for each paragraph in the list of strings
  :param text_list: list[str]
  :param model: transformers.models.distilbert.modeling_tf_distilbert.TFDistilBertForSequenceClassification
  :param tokenizer: transformers.models.distilbert.tokenization_distilbert.DistilBertTokenizer
  :return res: numpy.ndarray
  """
     
  encodings = tokenizer(text_list, max_length=MAX_LEN, truncation=True, padding=True)
  dataset = tf.data.Dataset.from_tensor_slices((dict(encodings))) 
  preds = model.predict(dataset.batch(1)).logits
  res = tf.nn.softmax(preds, axis=1).numpy()
    
  return res

We take a txt file [here](https://github.com/Galina-Blokh/ai_assignment_aidock/blob/refator/data/test_links.txt). This file contains links to the recipe pages which our model didn't saw yet. Assuming you scraped data from the first [url](https://www.loveandlemons.com/green-bean-salad-recipe/). The data you feed into your model for prediction will be looking like in the cell below. (*A list with one first string of ingredients and following three strings with instructions.)

In [25]:
strings_list =["""
                  1 pound green beans, trimmed
                  ½ head radicchio, sliced into strips
                  Scant ¼ cup thinly sliced red onion
                  Honey Mustard Dressing, for drizzling
                  2 ounces goat cheese
                  2 tablespoons chopped walnuts
                  2 tablespoons sliced almonds
                  ¼ cup tarragon
                  Flaky sea salt
                  """,
                  """
                  Bring a large pot of salted water to a boil and set a bowl of ice water nearby.
                  Drop the green beans into the boiling water and blanch for 2 minutes.
                    Remove the beans and immediately immerse in the ice water long enough 
                    to cool completely, about 15 seconds. Drain and place on paper towels to dry.
                  """,
                  """
                  Transfer the beans to a bowl and toss with the radicchio, onion, 
                  and a few spoonfuls of the dressing.
                  """,
                  """
                  Arrange on a platter and top with small dollops of goat cheese, the walnuts, 
                  almonds, and tarragon. Drizzle with more dressing, season to taste with flaky 
                  salt, and serve.
                  """]
predict_proba(strings_list, model, tokenizer)

array([[1.63417135e-05, 9.99983668e-01],
       [9.99986053e-01, 1.39580325e-05],
       [9.99986053e-01, 1.39833473e-05],
       [9.99988914e-01, 1.11078716e-05]], dtype=float32)

The result of the predictive function gives an array of arrays. Each inner array contains probability for 0 and 1 classes (i.e. for instructions and ingredients labels). We got a pretty accurate model!

Even if you'll do a single paragraph as an input, you'll get a very accurate model's answer (data from [second line in .txt document](https://github.com/Galina-Blokh/ai_assignment_aidock/blob/refator/data/test_links.txt) - recipe page [url](https://www.loveandlemons.com/any-vegetable-vinegar-pickles/))

In [15]:
string1 = ["""
            any vegetables you like (I used cucumbers, broccoli, cauliflower, onions and radishes)
            fresh or dried spices (I used peppercorns, cumin, coriander, mustard seeds, & caraway)
            1 cup any kind of vinegar (I used white wine vinegar)
            1 cup filtered water
            1 tablespoon kosher or any non-iodized salt
            optional: 1 teaspoon sugar
            """]
predict_proba(string1, model, tokenizer)



array([[2.2927354e-05, 9.9997711e-01]], dtype=float32)

In [16]:
string2 = ['Wash and cut up your vegetables and pack them into a clean jar.']

predict_proba(string2, model, tokenizer)



array([[9.9998724e-01, 1.2748814e-05]], dtype=float32)

In [17]:
string3 = ['Add between ¼ - ½ teaspoon of whole dried spices.']

predict_proba(string3, model, tokenizer)

array([[9.9998546e-01, 1.4511723e-05]], dtype=float32)

In [18]:
string4 = ['Combine vinegar, filtered water and salt in a medium saucepan and bring to a boil.']

predict_proba(string4, model, tokenizer)

array([[9.9998701e-01, 1.3023363e-05]], dtype=float32)

In [19]:
string5 = ['Put your just boiled brine over the vegetables in the jar.']

predict_proba(string5, model, tokenizer)

array([[9.9998689e-01, 1.3064862e-05]], dtype=float32)

In [20]:
string6 = ['Wipe any vinegar spills from the rim with a clean towel and put on the lid.']

predict_proba(string6, model, tokenizer)

array([[9.9998605e-01, 1.3964517e-05]], dtype=float32)

In [21]:
string7 = ['Hide the jar in the back of the friedge for at least a week. Two weeks is better, three is best.']

predict_proba(string7, model, tokenizer)

array([[9.9998832e-01, 1.1716199e-05]], dtype=float32)

In [22]:
string8 = ['Keep them in the fridge for up to 6 months.']

predict_proba(string8, model, tokenizer)

array([[9.9998403e-01, 1.5982067e-05]], dtype=float32)

## Well, now you know all steps of how to fine-tune the Hugging Face DistilBert model with Tensorflow API

## The end