<a href="https://colab.research.google.com/github/MNourMoslem/Simple-Text-Classifier/blob/master/TextClassifier_HuggingFace.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Install The Libraries

In [None]:
!pip install transformers datasets evaluate

Collecting datasets
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl.metadata (9.3 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-2.21.0-py3-none-any.whl (527 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.3/527.3 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m

## Log In To Hugging Face

In [None]:
from huggingface_hub import notebook_login

# Show the Hugging Face login widget. Authentication is needed because the
# training configuration below sets push_to_hub = True.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Import The Model, The Tokenizer and The IMDB Dataset


In [None]:

from datasets import load_dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Pretrained checkpoint that is fine-tuned for binary sentiment classification.
checkpoint = "distilbert/distilbert-base-uncased"

# Class index <-> readable label. Both mappings are handed to the model so that
# predictions are reported as "Negative"/"Positive".
id2label = {0: "Negative", 1: "Positive"}
label2id = {name: index for index, name in id2label.items()}

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(
  checkpoint,
  num_labels=len(id2label),
  id2label=id2label,
  label2id=label2id,
)

# Load the IMDB train and test splits merged into a single dataset; it is
# re-split in the preprocessing step.
imdb = load_dataset("imdb", split="train+test")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Preprocess The Data

In [None]:
# Hold out 20% of the merged IMDB reviews for evaluation. The fixed seed makes the
# split reproducible across runs, and binding the result to a new name leaves `imdb`
# untouched so this cell can be re-run without reloading the dataset.
imdb_splits = imdb.train_test_split(test_size = 0.2, seed = 42)

def preprocess_data(example):
  """Tokenize the ``text`` field of a batch of examples.

  Args:
    example: Batch passed by ``map(..., batched = True)``; only its ``'text'``
      entry is read.

  Returns:
    The tokenizer output for the batch, with truncation enabled. No padding is
    requested here.
  """
  return tokenizer(example['text'], truncation = True)

# Tokenized copy of both splits ("train" and "test"), consumed by the Trainer.
t_imdb = imdb_splits.map(preprocess_data, batched = True)

Map:   0%|          | 0/40000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

## Import Accuracy Metric

In [None]:
import evaluate

# Accuracy metric object; compute_metrics calls its .compute(predictions=..., references=...).
accuracy = evaluate.load('accuracy')

In [None]:
import numpy as np

def compute_metrics(result):
  """Compute accuracy for one evaluation pass.

  Args:
    result: A ``(predictions, labels)`` pair. ``predictions`` is either an array
      of per-class scores with the class dimension last, or a tuple whose first
      element is that array (for models that return extra outputs).

  Returns:
    The result of ``accuracy.compute`` for the arg-max class predictions
    against ``labels``.
  """
  preds, labels = result
  # Models that return additional outputs deliver a tuple; the scores come first.
  if isinstance(preds, tuple):
    preds = preds[0]
  # Pick the highest-scoring class along the last (class) axis.
  preds = np.argmax(preds, axis = -1)
  return accuracy.compute(predictions = preds, references = labels)

## Import Data Collator

In [None]:
from transformers import DataCollatorWithPadding

# Collator built around the same tokenizer; passed to the Trainer as `data_collator`.
# NOTE(review): presumably pads each batch at collation time, which would explain why
# preprocessing only truncates — confirm against the transformers docs.
data_collator = DataCollatorWithPadding(tokenizer = tokenizer)

## Set and Implement Trainer and TrainingArguments

In [None]:
from transformers import Trainer, TrainingArguments

# Training configuration for the fine-tuning run.
train_args = TrainingArguments(
  # Output location; also the repository name used when pushing to the Hub.
  output_dir="textclassifier-distilbert-imdb",
  overwrite_output_dir=True,
  push_to_hub=True,
  # Optimisation hyperparameters.
  num_train_epochs=2,
  learning_rate=1e-4,
  weight_decay=0.1,
  per_device_train_batch_size=16,
  per_device_eval_batch_size=16,
  # Evaluate and checkpoint once per epoch, then restore the best checkpoint.
  # NOTE(review): no metric_for_best_model is set, so "best" should fall back to the
  # library default (evaluation loss) rather than accuracy — confirm this is intended.
  eval_strategy="epoch",
  save_strategy="epoch",
  load_best_model_at_end=True,
)

# Wire the model, tokenized splits, collator and metric function together.
trainer = Trainer(
  model=model,
  args=train_args,
  train_dataset=t_imdb["train"],
  eval_dataset=t_imdb["test"],
  tokenizer=tokenizer,
  data_collator=data_collator,
  compute_metrics=compute_metrics,
)

## Train The Model and Push to The Hub

In [None]:
# Run fine-tuning; the returned TrainOutput (global step, loss, runtime metrics)
# is displayed as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2167,0.190629,0.9277
2,0.1013,0.241202,0.9337


TrainOutput(global_step=5000, training_loss=0.19267502212524415, metrics={'train_runtime': 3963.1065, 'train_samples_per_second': 20.186, 'train_steps_per_second': 1.262, 'total_flos': 1.0471696889580288e+16, 'train_loss': 0.19267502212524415, 'epoch': 2.0})

In [None]:
# Upload the trained model to the Hugging Face Hub; the returned CommitInfo
# (commit URL, message, oid) is displayed as the cell result.
trainer.push_to_hub()

CommitInfo(commit_url='https://huggingface.co/MNMoslem/textclassifier-distilbert-imdb/commit/7a5c792fbb4896fd525e8e33b7a39fad4f8a186c', commit_message='End of training', commit_description='', oid='7a5c792fbb4896fd525e8e33b7a39fad4f8a186c', pr_url=None, pr_revision=None, pr_num=None)

## Test The Result

In [None]:
import torch
from transformers import pipeline

model_path = "MNMoslem/textclassifier-distilbert-imdb" # your own model path

# Place the model explicitly: device 0 is the first CUDA GPU, -1 is the CPU.
# Without a `device` argument the pipeline stays on CPU even when a GPU is available.
pipe = pipeline(
  'text-classification',
  model = model_path,
  device = 0 if torch.cuda.is_available() else -1,
)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [None]:
# Two short reviews, one clearly positive and one clearly negative, as a smoke test.
text = ['I love this movie', 'i hate your movie']

# Returns one {'label', 'score'} dict per input, displayed as the cell result.
pipe(text)

[{'label': 'Positive', 'score': 0.9968656897544861},
 {'label': 'Negative', 'score': 0.9947574138641357}]