In [1]:
!pip install transformers
!pip install datasets
!pip install torchmetrics
!pip install gradio

import gradio
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from transformers import TrainingArguments, Trainer
from datasets import load_dataset
import torch
from torch import nn
from torch.nn import functional as F
import numpy as np
import torchmetrics
from torchmetrics.classification.f_beta import F1Score


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 29.5 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 52.5 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 66.5 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.10.1 tokenizers-0.13.2 transformers-4.24.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.6.1-py3-none-any.whl (441 kB)
[K     |████████████████████████████████| 441 kB 

In [3]:
# import the dataset into a dataset class
dataset = load_dataset("csv", data_files="dataset_balanced.csv", split="train")

# summarize dataset
size = len(dataset["text"])
print(f"Found {size} examples to train on")
print("Schema:")
print(dataset)



Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-ec2a0913075edf17/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-ec2a0913075edf17/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.
Found 1358 examples to train on
Schema:
Dataset({
    features: ['text', 'labels'],
    num_rows: 1358
})


In [4]:

split_dataset = dataset.train_test_split(test_size=0.2, shuffle=True)

train_ds = split_dataset["train"]
test_ds = split_dataset["test"]

max_length = 100

tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-cased")

def tokenize_function(examples):
  return tokenizer(examples["text"], padding='max_length', truncation=True, max_length=max_length)

train_ds = train_ds.map(tokenize_function)
test_ds = test_ds.map(tokenize_function)

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/411 [00:00<?, ?B/s]

  0%|          | 0/1086 [00:00<?, ?ex/s]

  0%|          | 0/272 [00:00<?, ?ex/s]

In [5]:
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-cased",
                                                            num_labels=2)

# set up the training args (where to keep checkpoints + when to evaluate)
training_args = TrainingArguments(output_dir="test_trainer", 
                                  evaluation_strategy="steps",
                                  eval_steps=100,
                                  num_train_epochs=30,
                                  logging_steps=100,
                                  learning_rate=5e-7)

#
#define metrics to compute on each epoch
f1 = torchmetrics.classification.BinaryF1Score()
acc = torchmetrics.classification.BinaryAccuracy()
def compute_metrics(eval_pred):
  f1.reset()
  acc.reset()
  logits, labels = eval_pred
  logits = torch.tensor(logits)
  labels = torch.tensor(labels)
  acc(logits, F.one_hot(labels, num_classes=2))
  f1(logits, F.one_hot(labels, num_classes=2))
  return {"Accuracy": acc.compute(), "F1 score": f1.compute() }

# trainer class that handles the model training loops
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    compute_metrics=compute_metrics,
)

Downloading:   0%|          | 0.00/263M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier

In [6]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 1086
  Num Epochs = 30
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 4080
  Number of trainable parameters = 65783042


Step,Training Loss,Validation Loss,Accuracy,F1 score
100,0.7166,0.702321,0.5,0.0
200,0.7038,0.695022,0.5,0.0
300,0.6953,0.689401,0.5,0.0
400,0.6832,0.683685,0.5,0.0
500,0.6785,0.678204,0.5,0.0
600,0.6656,0.672039,0.501838,0.007326
700,0.6578,0.66427,0.547794,0.201299
800,0.6512,0.653992,0.597426,0.386555
900,0.6403,0.638942,0.619485,0.486352
1000,0.6226,0.622028,0.641544,0.553776


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 272
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 272
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 272


TrainOutput(global_step=4080, training_loss=0.5094501504711076, metrics={'train_runtime': 424.2563, 'train_samples_per_second': 76.793, 'train_steps_per_second': 9.617, 'total_flos': 842927314104000.0, 'train_loss': 0.5094501504711076, 'epoch': 30.0})

In [7]:
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-cased")
max_length = 512

def predict(inputs):
  tokenized_input = tokenizer(inputs,
                              padding='max_length',
                              truncation=True,
                              max_length=max_length, return_tensors="pt")
  #print(tokenized_input)
  preds = model(input_ids=tokenized_input["input_ids"].to("cuda"), attention_mask=tokenized_input["attention_mask"].to("cuda"))
  pred = torch.argmax(preds["logits"], dim=-1)
  print(preds["logits"])
  return "Conspiracy" if pred == 1 else "Not Conspiracy"
  print(preds)

loading file vocab.txt from cache at /root/.cache/huggingface/hub/models--distilbert-base-cased/snapshots/9d7568e4b20ed5db15ee30e99c7219bde9990762/vocab.txt
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-cased/snapshots/9d7568e4b20ed5db15ee30e99c7219bde9990762/tokenizer_config.json
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-cased/snapshots/9d7568e4b20ed5db15ee30e99c7219bde9990762/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-cased",
  "activation": "gelu",
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_

In [8]:
predict("Flashback: There is no one in the White House tasked specifically to oversee a coordinated government-wide response in the event of a pandemic, since the post of senior director for global health security and biothreats on the NSC was eliminated last May. https://t.co/kOQK8XVI9j")

tensor([[ 0.9652, -1.3960]], device='cuda:0', grad_fn=<AddmmBackward0>)


'Not Conspiracy'

In [9]:
import gradio as gr

demo = gr.Interface(fn=predict, inputs="text", outputs="text")

demo.launch()   

Colab notebook detected. To show errors in colab notebook, set `debug=True` in `launch()`

Using Embedded Colab Mode (NEW). If you have issues, please use share=True and file an issue at https://github.com/gradio-app/gradio/
Note: opening the browser inspector may crash Embedded Colab Mode.

To create a public link, set `share=True` in `launch()`.


<IPython.core.display.Javascript object>

(<gradio.routes.App at 0x7fd961cb6f50>, 'http://127.0.0.1:7860/', None)