## Dataset

In [1]:
from google.colab import files
CS_file = files.upload()

Saving CovidSentimentData.csv to CovidSentimentData.csv


In [2]:
import io
import pandas as pd

data = pd.read_csv(io.BytesIO(CS_file['CovidSentimentData.csv']))

In [3]:
from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(data, random_state=42, test_size=0.2)

## Model

In [4]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.12.5-py3-none-any.whl (3.1 MB)
[K     |████████████████████████████████| 3.1 MB 5.0 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 70.8 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 627 kB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 81.7 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 61.1 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting 

In [5]:
from transformers import BertConfig, BertTokenizer, BertForSequenceClassification

model_name = "bert-base-cased"

config = BertConfig.from_pretrained(
    model_name,
    num_labels=2,
)
tokenizer = BertTokenizer.from_pretrained(
    "bert-base-cased",
    do_lower_case=True,
)
model = BertForSequenceClassification.from_pretrained(
    "bert-base-cased",
    config=config,
)

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/426k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/416M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

The model has the following structure. It uses a combination of word, positional and token *embeddings* to create a sequence representation, then passes the data through 12 *transformer encoders* and finally uses a *linear classifier* to produce the final label.
As the model is already pre-trained and we only plan to fine-tune few upper layers, we want to freeze all layers, except for the last encoder and above (`BertPooler` and `Classifier`).

In [6]:
trainable_layers = [model.bert.encoder.layer[-1], model.bert.pooler, model.classifier]
total_params = 0
trainable_params = 0

for p in model.parameters():
        p.requires_grad = False
        total_params += p.numel()

for layer in trainable_layers:
    for p in layer.parameters():
        p.requires_grad = True
        trainable_params += p.numel()

print(f"Total parameters count: {total_params}") # ~108M
print(f"Trainable parameters count: {trainable_params}") # ~7M

Total parameters count: 108311810
Trainable parameters count: 7680002


## Prepare the data

In [7]:
LABEL_LIST = [0,1]
MAX_SEQ_LENGHT = 128

import torch
import transformers
from torch.utils.data import TensorDataset
from transformers.data.processors.utils import InputExample
from transformers.data.processors.glue import glue_convert_examples_to_features


def _create_examples(df, set_type):
    """ Convert raw dataframe to a list of InputExample. Filter malformed examples
    """
    examples = []
    for index, row in df.iterrows():
        if row['target'] not in LABEL_LIST:
            continue
        if not isinstance(row['Tweet'], str):
            continue
            
        guid = f"{index}-{set_type}"
        examples.append(
            InputExample(guid=guid, text_a=row['Tweet'], label=row['target']))
    return examples

def _df_to_features(df, set_type):
    """ Pre-process text. This method will:
    1) tokenize inputs
    2) cut or pad each sequence to MAX_SEQ_LENGHT
    3) convert tokens into ids
    
    The output will contain:
    `input_ids` - padded token ids sequence
    `attention mask` - mask indicating padded tokens
    `token_type_ids` - mask indicating the split between premise and hypothesis
    `label` - label
    """
    examples = _create_examples(df, set_type)
    
    #backward compatibility with older transformers versions
    legacy_kwards = {}
    from packaging import version
    if version.parse(transformers.__version__) < version.parse("2.9.0"):
        legacy_kwards = {
            "pad_on_left": False,
            "pad_token": tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
            "pad_token_segment_id": 0,
        }
    
    return glue_convert_examples_to_features(
        examples=examples,
        tokenizer=tokenizer,
        label_list=LABEL_LIST,
        max_length=MAX_SEQ_LENGHT,
        output_mode="classification",
        **legacy_kwards,
    )

def _features_to_dataset(features):
    """ Convert features from `_df_to_features` into a single dataset
    """
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_attention_mask = torch.tensor(
        [f.attention_mask for f in features], dtype=torch.long
    )
    all_token_type_ids = torch.tensor(
        [f.token_type_ids for f in features], dtype=torch.long
    )
    all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
    dataset = TensorDataset(
        all_input_ids, all_attention_mask, all_token_type_ids, all_labels
    )

    return dataset

train_features = _df_to_features(df_train, "train")
test_features = _df_to_features(df_test, "test")

train_dataset = _features_to_dataset(train_features)
test_dataset = _features_to_dataset(test_features)



In [8]:
BATCH_SIZE = 4
VIRTUAL_BATCH_SIZE = 32
assert VIRTUAL_BATCH_SIZE % BATCH_SIZE == 0 # VIRTUAL_BATCH_SIZE should be divisible by BATCH_SIZE
N_ACCUMULATION_STEPS = int(VIRTUAL_BATCH_SIZE / BATCH_SIZE)

In [9]:
!pip install opacus==0.15.0

Collecting opacus==0.15.0
  Downloading opacus-0.15.0-py3-none-any.whl (125 kB)
[?25l[K     |██▋                             | 10 kB 28.4 MB/s eta 0:00:01[K     |█████▏                          | 20 kB 23.5 MB/s eta 0:00:01[K     |███████▉                        | 30 kB 11.4 MB/s eta 0:00:01[K     |██████████▍                     | 40 kB 9.1 MB/s eta 0:00:01[K     |█████████████                   | 51 kB 5.2 MB/s eta 0:00:01[K     |███████████████▋                | 61 kB 5.7 MB/s eta 0:00:01[K     |██████████████████▎             | 71 kB 5.5 MB/s eta 0:00:01[K     |████████████████████▉           | 81 kB 6.2 MB/s eta 0:00:01[K     |███████████████████████▌        | 92 kB 4.7 MB/s eta 0:00:01[K     |██████████████████████████      | 102 kB 5.1 MB/s eta 0:00:01[K     |████████████████████████████▋   | 112 kB 5.1 MB/s eta 0:00:01[K     |███████████████████████████████▎| 122 kB 5.1 MB/s eta 0:00:01[K     |████████████████████████████████| 125 kB 5.1 MB/s 
Install

In [10]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from opacus.utils.uniform_sampler import UniformWithReplacementSampler

SAMPLE_RATE = BATCH_SIZE / len(train_dataset)

train_sampler=UniformWithReplacementSampler(
    num_samples=len(train_dataset),
    sample_rate=SAMPLE_RATE,
)
train_dataloader = DataLoader(train_dataset, batch_sampler=train_sampler)

test_sampler = SequentialSampler(test_dataset)
test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=BATCH_SIZE)

## Training

In [11]:
import torch

# Move the model to appropriate device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Set the model to train mode (HuggingFace models load in eval mode)
model = model.train()

# Define optimizer

optimizer = torch.optim.AdamW(model.parameters(), lr=0.001, eps=1e-8)
#lambda1 = lambda epoch: 0.65 ** epoch
#scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer=optimizer, lr_lambda=lambda1)

In [12]:
EPOCHS = 3
LOGGING_INTERVAL = 100 # once every how many steps we run evaluation cycle and report metrics
EPSILON = 5.5
DELTA = 1 / len(train_dataloader) # Parameter for privacy accounting. Probability of not achieving privacy guarantees

In [13]:
import numpy as np
from sklearn.metrics import accuracy_score
from tqdm.notebook import tqdm

def accuracy(preds, labels):
    return (preds == labels).mean()

# define evaluation cycle
def evaluate(model):    
    model.eval()

    loss_arr = []
    pred_arr = []
    label_arr = []
    accuracy_arr = []
    
    for batch in test_dataloader:
        batch = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            inputs = {'input_ids':      batch[0],
                      'attention_mask': batch[1],
                      'token_type_ids': batch[2],
                      'labels':         batch[3]}

            outputs = model(**inputs)
            loss, logits = outputs[:2]
            loss = outputs[0]
            logits = torch.abs(outputs[1])
            preds = np.argmax(logits.detach().cpu().numpy(), axis=1)
            labels = inputs['labels'].detach().cpu().numpy()
            
            loss_arr.append(loss.item())
            accuracy_arr.append(accuracy(preds, labels))

    model.train()
    
    return np.mean(loss_arr), np.mean(accuracy_arr)

In [14]:
from opacus import PrivacyEngine

MAX_GRAD_NORM = 0.1

privacy_engine = PrivacyEngine(
    module=model,
    sample_rate=SAMPLE_RATE * N_ACCUMULATION_STEPS,
    target_delta = DELTA,
    target_epsilon = EPSILON, 
    epochs = EPOCHS,
    max_grad_norm=MAX_GRAD_NORM,
)
privacy_engine.attach(optimizer)

  "A ``sample_rate`` has been provided."
  "Secure RNG turned off. This is perfectly fine for experimentation as it allows "


In [15]:
for epoch in range(1, EPOCHS+1):

    torch.manual_seed(19)
    torch.cuda.manual_seed_all(19)

    losses = []
    model.train()

    for step, batch in enumerate(tqdm(train_dataloader)):
        
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'token_type_ids': batch[2],
                  'labels':         batch[3]}

        outputs = model(**inputs) # output = loss, logits, hidden_states, attentions

        loss = outputs[0]
        loss.backward()
        
        losses.append(loss.item())

        # We process small batches of size BATCH_SIZE, 
        # until they're accumulated to a batch of size VIRTUAL_BATCH_SIZE.
        # Only then we make a real `.step()` and update model weights
        if (step + 1) % N_ACCUMULATION_STEPS == 0 or step == len(train_dataloader) - 1:
            optimizer.step()
        else:
            optimizer.virtual_step()

        if step > 0 and step % LOGGING_INTERVAL == 0:
            train_loss = np.mean(losses)
            eps, alpha = optimizer.privacy_engine.get_privacy_spent(DELTA)

            eval_loss, eval_accuracy = evaluate(model)

            print(
                f"Epoch: {epoch} | "
                f"Step: {step} | "
                f"Train loss: {train_loss:.3f} | "
                f"Eval loss: {eval_loss:.3f} | "
                f"Eval accuracy: {eval_accuracy:.3f} | "
                f"ɛ: {eps:.2f} (α: {alpha})"
            )

  0%|          | 0/3824 [00:00<?, ?it/s]



Epoch: 1 | Step: 100 | Train loss: 0.885 | Eval loss: 1.565 | Eval accuracy: 0.828 | ɛ: 2.95 (α: 3.3)
Epoch: 1 | Step: 200 | Train loss: 1.289 | Eval loss: 1.728 | Eval accuracy: 0.828 | ɛ: 3.14 (α: 3.2)
Epoch: 1 | Step: 300 | Train loss: 1.508 | Eval loss: 1.346 | Eval accuracy: 0.828 | ɛ: 3.26 (α: 3.1)
Epoch: 1 | Step: 400 | Train loss: 1.437 | Eval loss: 1.199 | Eval accuracy: 0.828 | ɛ: 3.35 (α: 3.1)
Epoch: 1 | Step: 500 | Train loss: 1.421 | Eval loss: 1.323 | Eval accuracy: 0.828 | ɛ: 3.44 (α: 3.1)
Epoch: 1 | Step: 600 | Train loss: 1.417 | Eval loss: 1.272 | Eval accuracy: 0.828 | ɛ: 3.51 (α: 3.0)
Epoch: 1 | Step: 700 | Train loss: 1.410 | Eval loss: 1.227 | Eval accuracy: 0.828 | ɛ: 3.56 (α: 3.0)
Epoch: 1 | Step: 800 | Train loss: 1.392 | Eval loss: 1.285 | Eval accuracy: 0.828 | ɛ: 3.62 (α: 3.0)
Epoch: 1 | Step: 900 | Train loss: 1.382 | Eval loss: 1.322 | Eval accuracy: 0.828 | ɛ: 3.67 (α: 3.0)
Epoch: 1 | Step: 1000 | Train loss: 1.376 | Eval loss: 1.260 | Eval accuracy: 0.82

  0%|          | 0/3824 [00:00<?, ?it/s]

Epoch: 2 | Step: 100 | Train loss: 1.189 | Eval loss: 1.298 | Eval accuracy: 0.709 | ɛ: 4.55 (α: 2.7)
Epoch: 2 | Step: 200 | Train loss: 1.241 | Eval loss: 1.316 | Eval accuracy: 0.641 | ɛ: 4.57 (α: 2.7)
Epoch: 2 | Step: 300 | Train loss: 1.364 | Eval loss: 1.236 | Eval accuracy: 0.660 | ɛ: 4.59 (α: 2.7)
Epoch: 2 | Step: 400 | Train loss: 1.338 | Eval loss: 1.270 | Eval accuracy: 0.719 | ɛ: 4.61 (α: 2.7)
Epoch: 2 | Step: 500 | Train loss: 1.346 | Eval loss: 1.311 | Eval accuracy: 0.661 | ɛ: 4.63 (α: 2.7)
Epoch: 2 | Step: 600 | Train loss: 1.350 | Eval loss: 1.252 | Eval accuracy: 0.690 | ɛ: 4.65 (α: 2.7)
Epoch: 2 | Step: 700 | Train loss: 1.359 | Eval loss: 1.264 | Eval accuracy: 0.714 | ɛ: 4.67 (α: 2.7)
Epoch: 2 | Step: 800 | Train loss: 1.349 | Eval loss: 1.308 | Eval accuracy: 0.648 | ɛ: 4.69 (α: 2.7)
Epoch: 2 | Step: 900 | Train loss: 1.346 | Eval loss: 1.331 | Eval accuracy: 0.557 | ɛ: 4.71 (α: 2.7)
Epoch: 2 | Step: 1000 | Train loss: 1.348 | Eval loss: 1.280 | Eval accuracy: 0.35

  0%|          | 0/3824 [00:00<?, ?it/s]

Epoch: 3 | Step: 100 | Train loss: 1.223 | Eval loss: 1.401 | Eval accuracy: 0.168 | ɛ: 5.20 (α: 2.6)
Epoch: 3 | Step: 200 | Train loss: 1.293 | Eval loss: 1.383 | Eval accuracy: 0.171 | ɛ: 5.22 (α: 2.6)
Epoch: 3 | Step: 300 | Train loss: 1.404 | Eval loss: 1.287 | Eval accuracy: 0.170 | ɛ: 5.23 (α: 2.6)
Epoch: 3 | Step: 400 | Train loss: 1.368 | Eval loss: 1.322 | Eval accuracy: 0.168 | ɛ: 5.25 (α: 2.6)
Epoch: 3 | Step: 500 | Train loss: 1.379 | Eval loss: 1.381 | Eval accuracy: 0.165 | ɛ: 5.26 (α: 2.6)
Epoch: 3 | Step: 600 | Train loss: 1.384 | Eval loss: 1.331 | Eval accuracy: 0.162 | ɛ: 5.28 (α: 2.6)
Epoch: 3 | Step: 700 | Train loss: 1.399 | Eval loss: 1.304 | Eval accuracy: 0.160 | ɛ: 5.29 (α: 2.6)
Epoch: 3 | Step: 800 | Train loss: 1.384 | Eval loss: 1.352 | Eval accuracy: 0.158 | ɛ: 5.31 (α: 2.6)
Epoch: 3 | Step: 900 | Train loss: 1.381 | Eval loss: 1.381 | Eval accuracy: 0.158 | ɛ: 5.32 (α: 2.6)
Epoch: 3 | Step: 1000 | Train loss: 1.384 | Eval loss: 1.312 | Eval accuracy: 0.16

In [16]:
from google.colab import files
PS_file = files.upload()

Saving prediction_score.csv to prediction_score.csv


In [17]:
ps_data = pd.read_csv('/content/prediction_score.csv')

In [18]:
ps_data

Unnamed: 0,Tweet,target
0,Corona time gains We have no control over the ...,0
1,This is how Chinese people caught Corona Virus...,0
2,an empty Escalator at M Purple Line station. W...,0
3,I am working away from home to earn a living a...,0
4,The virus is too dangerous and may last for ma...,0


In [19]:
ps_features = _df_to_features(ps_data, "test")
ps_dataset = _features_to_dataset(ps_features)
ps_sampler = SequentialSampler(ps_dataset)
ps_dataloader = DataLoader(ps_dataset, sampler=ps_sampler, batch_size=5)



In [20]:
for batch in ps_dataloader:
        batch = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            inputs = {'input_ids':      batch[0],
                      'attention_mask': batch[1],
                      'token_type_ids': batch[2],
                      'labels':         batch[3]}

In [21]:
outputs = model(**inputs)
loss, logits = outputs[:2]
loss = outputs[0]
logits = torch.abs(outputs[1])
preds = np.argmax(logits.detach().cpu().numpy(), axis=1)



In [22]:
logits

tensor([[5.3435, 3.7709],
        [5.6878, 2.3265],
        [5.1261, 3.2730],
        [4.4799, 3.5603],
        [4.5701, 2.1722]], device='cuda:0', grad_fn=<AbsBackward0>)

In [23]:
preds

array([0, 0, 0, 0, 0])