In [1]:
!pip install transformers accelerate
!pip install transformers datasets evaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.30.1-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m54.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.20.3-py3-none-any.whl (227 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.6/227.6 kB[0m [31m23.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m23.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [3

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datasets import load_dataset, get_dataset_config_names, Dataset
from wordcloud import WordCloud
import evaluate

In [4]:
# Load the train/test splits and expose the `score` column under the name
# `label`. The rename is chained onto the read (no in-place mutation), so
# re-running this cell always yields the same frames.
df_train = pd.read_csv("train.csv").rename(columns={"score": "label"})
df_test = pd.read_csv("test.csv").rename(columns={"score": "label"})

# Preview a few test rows; the fixed seed makes the preview identical on every run.
df_test.sample(10, random_state=42)

Unnamed: 0.1,Unnamed: 0,review_id,product_id,reviewer_id,notes,body,title,lang,product,opinion,bodylen,titlelen,comment,label
2104,2104,fr_0850243,product_fr_0873705,reviewer_fr_0785696,1,PRODUIT PAS FINI DE TRES MAUVAISE QUALITE,MAUVAISE QUALITE,fr,home,non satisfied,41,16,mauvaise qualite produit pas fini de tres mauv...,0
690,690,en_0603101,product_en_0330971,reviewer_en_0518437,4,"I love this product, it fits my truck great!!",The custom fit,en,automotive,satisfied,45,14,the custom fit i love this product it fits my ...,2
589,589,en_0522131,product_en_0826040,reviewer_en_0554742,1,"This DVD does not play, I've tried several DVD...","Don't buy from this seller, DVD doesn't work",en,other,non satisfied,213,44,dont buy from this seller dvd doesnt work this...,0
1018,1018,fr_0769458,product_fr_0037159,reviewer_fr_0920770,1,Le produit c’est casser en 2 jour les diamants...,Déçu,fr,jewelry,non satisfied,90,4,déçu le produit c’est casser en 2 jour les dia...,0
1381,1381,fr_0520057,product_fr_0452258,reviewer_fr_0556082,5,"2 bouchons, un seul d'utilisé avec un lien pou...",parfait,fr,camera,satisfied,65,7,parfait 2 bouchons un seul dutilisé avec un li...,2
219,219,fr_0348549,product_fr_0343064,reviewer_fr_0514992,3,Semble solide un peu juste en eclairage par ap...,Éclairage,fr,automotive,mixed,62,9,éclairage semble solide un peu juste en eclair...,1
563,563,fr_0927651,product_fr_0224608,reviewer_fr_0460185,5,Très beau coffret qui réunit les trois films o...,Excellent,fr,other,satisfied,81,9,excellent très beau coffret qui réunit les tro...,2
1,1,en_0815926,product_en_0890680,reviewer_en_0412863,1,Smells pretty good but I won't be using it bec...,Fell apart in shipping,en,grocery,non satisfied,168,22,fell apart in shipping smells pretty good but ...,0
2015,2015,en_0386153,product_en_0438301,reviewer_en_0414333,3,It is soft and a good quality BUT it’s too big...,Runs big,en,home,satisfied,239,8,runs big it is soft and a good quality but it’...,2
1262,1262,fr_0237129,product_fr_0240762,reviewer_fr_0292250,4,"Petit prix , bien emballés et protégés tiennen...",bon achat bonne autonomie des batteries,fr,camera,satisfied,131,39,bon achat bonne autonomie des batteries petit ...,2


In [5]:
# Keep only the text column and its label, and wrap each split as a
# Hugging Face `Dataset` in a single step.
train_set = Dataset.from_pandas(df_train[["comment", "label"]])
test_set = Dataset.from_pandas(df_test[["comment", "label"]])

# Display the test split summary (features and row count).
test_set

Dataset({
    features: ['comment', 'label'],
    num_rows: 3000
})

In [6]:
from transformers import AutoTokenizer

# Tokenizer for the multilingual DistilBERT checkpoint that is fine-tuned
# later in this notebook.
tokenizer_checkpoint = "distilbert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/466 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

In [7]:
def preprocess_function(element):
    """Tokenize the ``comment`` entry of ``element``.

    Args:
        element: Mapping that contains a ``"comment"`` entry. The notebook
            applies this function through ``Dataset.map(..., batched=True)``.

    Returns:
        The result of calling the module-level ``tokenizer`` on the comment
        text(s) with ``truncation=True``.
    """
    texts = element["comment"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

In [8]:
# Tokenize both splits with the same batched mapping (train first, then test).
tokenized_train_set, tokenized_test_set = (
    split.map(preprocess_function, batched=True) for split in (train_set, test_set)
)

Map:   0%|          | 0/12000 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [9]:
from transformers import DataCollatorWithPadding

# Collator used when batches are assembled: pads the tokenized examples with
# this tokenizer and returns TensorFlow tensors.
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    return_tensors="tf",
)

In [10]:
# Class index -> readable name. The inverse lookup is derived from it so the
# two mappings always agree.
id2label = dict(enumerate(["non satisfied", "mixed", "satisfied"]))
label2id = {name: idx for idx, name in id2label.items()}

In [11]:
from transformers import create_optimizer
import tensorflow as tf

# Training hyperparameters.
# `num_epochs` was 5 here, which contradicted the two epochs actually trained
# by `model.fit` and the value used by the later optimizer cell. It is aligned
# to 2 so every place that describes the training budget agrees.
batch_size = 16
num_epochs = 2

# Number of optimizer steps over the whole run; the learning-rate schedule is
# built for exactly this many steps.
batches_per_epoch = len(tokenized_train_set) // batch_size
total_train_steps = int(batches_per_epoch * num_epochs)

# NOTE(review): a later cell rebuilds `optimizer`/`schedule` with the same
# settings before `model.compile` runs — consider keeping only one of the two.
optimizer, schedule = create_optimizer(init_lr=2e-5, num_warmup_steps=0, num_train_steps=total_train_steps)

In [12]:
from transformers import TFAutoModelForSequenceClassification

# Load the pretrained multilingual DistilBERT checkpoint as a TensorFlow
# sequence classifier with three output classes and the label-name mappings
# attached to its config.
model = TFAutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-multilingual-cased",
    num_labels=3,
    id2label=id2label,
    label2id=label2id,
)

Downloading model.safetensors:   0%|          | 0.00/542M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

In [13]:
# Convert the tokenized splits into batched TensorFlow datasets.
# Both calls use the shared `batch_size` variable (previously a hard-coded 16)
# so the number of batches per epoch always matches the step count the
# optimizer's learning-rate schedule was computed from.
tf_train_set = model.prepare_tf_dataset(
    tokenized_train_set,
    shuffle=True,  # reshuffle training examples
    batch_size=batch_size,
    collate_fn=data_collator,
)

tf_test_set = model.prepare_tf_dataset(
    tokenized_test_set,
    shuffle=False,  # keep evaluation order fixed
    batch_size=batch_size,
    collate_fn=data_collator,
)

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [14]:
# Load the "accuracy" metric through the `evaluate` library; `compute_metrics`
# calls its `.compute()` method with predictions and reference labels.
accuracy = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [15]:
def compute_metrics(eval_pred):
    """Score a ``(predictions, labels)`` pair with the accuracy metric.

    Args:
        eval_pred: Two-item sequence. The first item holds per-class scores
            (one row per example); the second holds the reference labels.

    Returns:
        Whatever ``accuracy.compute`` returns for the arg-max class of each
        row compared against the reference labels.
    """
    class_scores, references = eval_pred
    # Pick the highest-scoring class index for each example.
    predicted_ids = np.argmax(class_scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

In [16]:
from transformers import create_optimizer
import tensorflow as tf

# Hyperparameters for the run; the optimizer built here is the one handed to
# `model.compile` in the next cell.
batch_size, num_epochs = 16, 2

# Total number of optimizer steps across all epochs.
batches_per_epoch = len(tokenized_train_set) // batch_size
total_train_steps = int(batches_per_epoch * num_epochs)

# Optimizer plus its learning-rate schedule, sized to the step budget above
# with no warm-up steps.
optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_warmup_steps=0,
    num_train_steps=total_train_steps,
)

In [17]:
# Compile with the optimizer built above. No `loss` argument is passed.
# NOTE(review): Transformers TF models are documented to fall back to their
# built-in task loss when none is given — confirm for the installed version.
model.compile(optimizer=optimizer)

In [18]:
# Interactive Hugging Face Hub login (prompts for an access token).
# Presumably required so the push-to-hub callback created below can upload
# the model — verify a token with write permission is used.
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|
    
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) Y
Token is valid (permission: write).
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in your terminal in case you want to set the '

In [19]:
from transformers.keras_callbacks import KerasMetricCallback, PushToHubCallback

# Computes `compute_metrics` on the test dataset during training.
metric_callback = KerasMetricCallback(
    metric_fn=compute_metrics,
    eval_dataset=tf_test_set,
)

# Saves to the local "my_awesome_model" directory and pushes the model,
# together with the tokenizer, to the Hub.
push_to_hub_callback = PushToHubCallback(output_dir="my_awesome_model", tokenizer=tokenizer)

callbacks = [metric_callback, push_to_hub_callback]

Cloning https://huggingface.co/Nelver28/my_awesome_model into local empty directory.


In [None]:
# Train using `num_epochs` rather than a literal: the optimizer's learning-rate
# schedule was sized from `num_epochs`, so the epoch count passed to `fit` must
# come from the same variable to keep the schedule and the run in sync.
model.fit(x=tf_train_set, validation_data=tf_test_set, epochs=num_epochs, callbacks=callbacks)

Epoch 1/2
173/750 [=====>........................] - ETA: 2:22:51 - loss: 0.9821