In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding, pipeline
from datasets import load_dataset
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = 0 if torch.cuda.is_available() else "cpu"  # to avoid a `pipeline` bug

In [3]:
def convert_labels(example):
    example["label"] = 1 if example["label"] > 2 else 0
    return example

In [4]:

id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {"NEGATIVE": 0, "POSITIVE": 1}

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased", model_max_length=256)
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id)
device = 0 if torch.cuda.is_available() else "cpu"

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.weight', 'pre_classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
ds = load_dataset("yelp_review_full", split="train")
ds = ds.filter(lambda x: x["label"] != 2, batched=False)
ds = ds.filter(lambda x: len(x["text"]) > 50, batched=False)
ds = ds.filter(lambda x: len(x["text"]) < 150, batched=False)
ds = ds.map(convert_labels)

val = load_dataset("yelp_review_full", split="test")
val = val.filter(lambda x: x["label"] != 2, batched=False)
val = val.filter(lambda x: len(x["text"]) > 50, batched=False)
val = val.filter(lambda x: len(x["text"]) < 150, batched=False)
val = val.map(convert_labels)

Filter: 100%|██████████| 520000/520000 [00:07<00:00, 73638.44 examples/s]
Filter: 100%|██████████| 512324/512324 [00:06<00:00, 82867.27 examples/s]
Map: 100%|██████████| 43983/43983 [00:04<00:00, 10633.40 examples/s]
Filter: 100%|██████████| 40000/40000 [00:00<00:00, 68371.22 examples/s]
Filter: 100%|██████████| 39419/39419 [00:00<00:00, 91665.78 examples/s]
Map: 100%|██████████| 3366/3366 [00:00<00:00, 11035.30 examples/s]


In [6]:
def tokenization(example):
    return tokenizer(example["text"], truncation=True, max_length=256)

train_tokenized = ds.map(tokenization, batched=True)
val_tokenized = val.map(tokenization, batched=True)
train_tokenized.set_format(type="torch", columns=['label', 'text', 'input_ids', 'attention_mask'])
val_tokenized.set_format(type="torch", columns=['label', 'text', 'input_ids', 'attention_mask'])

Map:   0%|          | 0/43983 [00:00<?, ? examples/s]

Map: 100%|██████████| 43983/43983 [00:02<00:00, 19828.54 examples/s]
Map: 100%|██████████| 3366/3366 [00:00<00:00, 20905.47 examples/s]


In [7]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

training_args = TrainingArguments(
    output_dir="../results",
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    save_strategy="epoch",
    evaluation_strategy="epoch",
    logging_dir="../results/logs",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=val_tokenized,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Fine-tune the model
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mjungliana[0m ([33mpiksle[0m). Use [1m`wandb login --relogin`[0m to force relogin


  0%|          | 0/13745 [00:00<?, ?it/s]You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  4%|▎         | 502/13745 [00:43<18:35, 11.87it/s]

{'loss': 0.2295, 'learning_rate': 4.81811567842852e-05, 'epoch': 0.18}


  7%|▋         | 1002/13745 [01:24<17:28, 12.15it/s]

{'loss': 0.1938, 'learning_rate': 4.6362313568570395e-05, 'epoch': 0.36}


 11%|█         | 1502/13745 [02:06<17:29, 11.67it/s]

{'loss': 0.1683, 'learning_rate': 4.4543470352855585e-05, 'epoch': 0.55}


 15%|█▍        | 2002/13745 [02:47<16:45, 11.67it/s]

{'loss': 0.1638, 'learning_rate': 4.272462713714078e-05, 'epoch': 0.73}


 18%|█▊        | 2502/13745 [03:29<16:00, 11.70it/s]

{'loss': 0.1589, 'learning_rate': 4.090578392142598e-05, 'epoch': 0.91}


                                                    
 20%|██        | 2749/13745 [03:53<15:26, 11.86it/s]

{'eval_loss': 0.15708579123020172, 'eval_runtime': 4.1953, 'eval_samples_per_second': 802.324, 'eval_steps_per_second': 50.294, 'epoch': 1.0}


 22%|██▏       | 3002/13745 [04:23<15:32, 11.53it/s]  

{'loss': 0.1234, 'learning_rate': 3.908694070571117e-05, 'epoch': 1.09}


 25%|██▌       | 3502/13745 [05:04<14:02, 12.15it/s]

{'loss': 0.0897, 'learning_rate': 3.7268097489996364e-05, 'epoch': 1.27}


 29%|██▉       | 4002/13745 [05:46<13:34, 11.96it/s]

{'loss': 0.0987, 'learning_rate': 3.5449254274281554e-05, 'epoch': 1.46}


 33%|███▎      | 4502/13745 [06:27<12:24, 12.41it/s]

{'loss': 0.0928, 'learning_rate': 3.363041105856675e-05, 'epoch': 1.64}


 36%|███▋      | 5002/13745 [07:09<12:08, 12.01it/s]

{'loss': 0.0954, 'learning_rate': 3.181156784285195e-05, 'epoch': 1.82}


                                                    
 40%|████      | 5498/13745 [07:54<11:07, 12.35it/s]

{'eval_loss': 0.19474577903747559, 'eval_runtime': 4.1823, 'eval_samples_per_second': 804.818, 'eval_steps_per_second': 50.451, 'epoch': 2.0}


 40%|████      | 5501/13745 [07:57<1:57:44,  1.17it/s]

{'loss': 0.0972, 'learning_rate': 2.9992724627137144e-05, 'epoch': 2.0}


 44%|████▎     | 6001/13745 [08:38<10:58, 11.76it/s]  

{'loss': 0.0484, 'learning_rate': 2.8173881411422337e-05, 'epoch': 2.18}


 47%|████▋     | 6501/13745 [09:20<10:22, 11.63it/s]

{'loss': 0.0389, 'learning_rate': 2.6355038195707533e-05, 'epoch': 2.36}


 51%|█████     | 7001/13745 [10:01<09:16, 12.11it/s]

{'loss': 0.0471, 'learning_rate': 2.4536194979992726e-05, 'epoch': 2.55}


 55%|█████▍    | 7501/13745 [10:43<08:55, 11.66it/s]

{'loss': 0.0486, 'learning_rate': 2.271735176427792e-05, 'epoch': 2.73}


 58%|█████▊    | 8001/13745 [11:24<08:06, 11.80it/s]

{'loss': 0.0469, 'learning_rate': 2.0898508548563116e-05, 'epoch': 2.91}


                                                    
 60%|██████    | 8247/13745 [11:49<07:46, 11.79it/s]

{'eval_loss': 0.19460858404636383, 'eval_runtime': 4.2093, 'eval_samples_per_second': 799.655, 'eval_steps_per_second': 50.127, 'epoch': 3.0}


 62%|██████▏   | 8502/13745 [12:25<07:27, 11.72it/s]  

{'loss': 0.0322, 'learning_rate': 1.907966533284831e-05, 'epoch': 3.09}


 65%|██████▌   | 9002/13745 [13:06<06:56, 11.39it/s]

{'loss': 0.0181, 'learning_rate': 1.7260822117133506e-05, 'epoch': 3.27}


 69%|██████▉   | 9502/13745 [13:48<05:55, 11.92it/s]

{'loss': 0.019, 'learning_rate': 1.54419789014187e-05, 'epoch': 3.46}


 73%|███████▎  | 10002/13745 [14:29<05:28, 11.39it/s]

{'loss': 0.0264, 'learning_rate': 1.3623135685703892e-05, 'epoch': 3.64}


 76%|███████▋  | 10502/13745 [15:11<04:30, 12.00it/s]

{'loss': 0.021, 'learning_rate': 1.1804292469989088e-05, 'epoch': 3.82}


                                                     
 80%|████████  | 10996/13745 [15:56<03:46, 12.11it/s]

{'eval_loss': 0.24686968326568604, 'eval_runtime': 4.2303, 'eval_samples_per_second': 795.685, 'eval_steps_per_second': 49.878, 'epoch': 4.0}


 80%|████████  | 11001/13745 [16:09<1:05:14,  1.43s/it]

{'loss': 0.0203, 'learning_rate': 9.985449254274281e-06, 'epoch': 4.0}


 84%|████████▎ | 11501/13745 [16:50<03:04, 12.16it/s]  

{'loss': 0.0062, 'learning_rate': 8.166606038559476e-06, 'epoch': 4.18}


 87%|████████▋ | 12001/13745 [17:32<02:27, 11.82it/s]

{'loss': 0.0119, 'learning_rate': 6.347762822844672e-06, 'epoch': 4.37}


 91%|█████████ | 12501/13745 [18:13<01:47, 11.57it/s]

{'loss': 0.0091, 'learning_rate': 4.528919607129865e-06, 'epoch': 4.55}


 95%|█████████▍| 13001/13745 [18:55<01:01, 12.14it/s]

{'loss': 0.0097, 'learning_rate': 2.71007639141506e-06, 'epoch': 4.73}


 98%|█████████▊| 13501/13745 [19:37<00:20, 11.81it/s]

{'loss': 0.0056, 'learning_rate': 8.912331757002547e-07, 'epoch': 4.91}


                                                     
100%|██████████| 13745/13745 [20:01<00:00, 12.55it/s]

{'eval_loss': 0.29075872898101807, 'eval_runtime': 4.1983, 'eval_samples_per_second': 801.75, 'eval_steps_per_second': 50.258, 'epoch': 5.0}


100%|██████████| 13745/13745 [20:16<00:00, 11.30it/s]

{'train_runtime': 1219.7128, 'train_samples_per_second': 180.301, 'train_steps_per_second': 11.269, 'train_loss': 0.07007470710271486, 'epoch': 5.0}





TrainOutput(global_step=13745, training_loss=0.07007470710271486, metrics={'train_runtime': 1219.7128, 'train_samples_per_second': 180.301, 'train_steps_per_second': 11.269, 'train_loss': 0.07007470710271486, 'epoch': 5.0})

In [8]:
# Save the fine-tuned model
model.save_pretrained("../models/yelpBERT")

In [9]:
results = trainer.evaluate()

# Print evaluation results
print(results)

  0%|          | 0/211 [00:00<?, ?it/s]

100%|██████████| 211/211 [00:04<00:00, 48.54it/s]

{'eval_loss': 0.29075872898101807, 'eval_runtime': 4.3713, 'eval_samples_per_second': 770.018, 'eval_steps_per_second': 48.269, 'epoch': 5.0}





In [10]:
text = "Nice atmosphere, I will go there for my birthday."
classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, device=device)
classifier(text)


[{'label': 'POSITIVE', 'score': 0.9997957348823547}]

In [11]:
text = "Not really a good place to spend your evening"
classifier(text)

[{'label': 'NEGATIVE', 'score': 0.9999626874923706}]

In [12]:
text = "I ate a hard roll today"
classifier(text)

[{'label': 'NEGATIVE', 'score': 0.9995110034942627}]