In [35]:
import pandas as pd
df = pd.read_csv('data/training.1600000.processed.noemoticon.csv', header=None, encoding='latin-1')
df = df.loc[:, [0, 5]]
df.columns = ['label', 'sentence']
df=df.sample(frac=1, random_state=0)[:1000]
print(df[df['label']==0].shape[0])
df

505


Unnamed: 0,label,sentence
557138,0,wants to compete! i want hard competition! i w...
349381,0,It seems we are stuck on the ground in Amarill...
182051,0,where the f are my pinking shears? rarararrrar...
571236,0,0ff t0 tHE MEEtiN.. i HAtE WhEN PPl V0lUNtEER...
1339637,4,@ reply me pls
...,...,...
235274,0,Sitting with a temper bed cover over my bed Re...
122382,0,ouch just got a nasty burn
419988,0,Lost my headphones tonight.. Lame 1 week til...
241280,0,THE HILLS SEASON FNALE...tonight


In [36]:
import re
from string import punctuation
def sentence_clean(x):
  x = x.lower() # lower capitals
  x = re.sub(r'@[A-Za-z0-9\._]*', '', x) # remove @XXX
  x = re.sub(r'[A-Za-z]+://[^\s]*', '', x) # remove XXX://XXX
  x = re.sub(r'[{}]+'.format(punctuation), '', x) # remove punctuation
  x = re.sub(r':\)', 'smile', x) # transfer :) to smile
  x = re.sub(r':\(', 'sad', x) # transfer :( to sad
  x = re.sub(r' +', ' ', x)
  return x.split() # token

df['token'] = df['sentence'].apply(sentence_clean)
df['text'] = df['token'].apply(lambda t: ' '.join(t))
df.head()

Unnamed: 0,label,sentence,token,text
557138,0,wants to compete! i want hard competition! i w...,"[wants, to, compete, i, want, hard, competitio...",wants to compete i want hard competition i wan...
349381,0,It seems we are stuck on the ground in Amarill...,"[it, seems, we, are, stuck, on, the, ground, i...",it seems we are stuck on the ground in amarill...
182051,0,where the f are my pinking shears? rarararrrar...,"[where, the, f, are, my, pinking, shears, rara...",where the f are my pinking shears rarararrrara...
571236,0,0ff t0 tHE MEEtiN.. i HAtE WhEN PPl V0lUNtEER...,"[0ff, t0, the, meetin, i, hate, when, ppl, v0l...",0ff t0 the meetin i hate when ppl v0lunteer my...
1339637,4,@ reply me pls,"[reply, me, pls]",reply me pls


In [37]:
import torch
import numpy as np
from tqdm import tqdm
from torch.utils.data import DataLoader
from transformers import AdamW, BertConfig, BertTokenizer, BertForMaskedLM
from transformers import get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch.nn as nn 

def seed(seed=1):
  torch.cuda.manual_seed(seed)
  torch.manual_seed(seed)
  torch.random.manual_seed(seed)
  np.random.seed(seed)

In [38]:
full_text = df.text.values
full_label = df.label.values

for i in range(full_text.shape[0]):
  full_text[i] += " [SEP] this sentence sentiment is [MASK]"

In [39]:
div = int(0.1 * full_text.shape[0])
p = np.random.permutation(full_text.shape[0])
shuffled_full_text, shuffled_full_label = full_text[p], full_label[p]
train_text, train_label = shuffled_full_text[div: ], shuffled_full_label[div: ]
dev_text, dev_label = shuffled_full_text[: div], shuffled_full_label[: div]

In [40]:
train_labels = []
dev_labels = []
label_dict = {0: 0, 4: 1}
for label in train_label:
  train_labels.append(label_dict[label])

for label in dev_label:
  dev_labels.append(label_dict[label])

In [41]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [42]:
MODEL = 'bert-large-uncased-whole-word-masking'

class PModel(nn.Module):
  def __init__(self):
    super().__init__()
    self.numClass = 2

    self.MaskModel = BertForMaskedLM.from_pretrained(MODEL).bert
    self.MaskModel.config.output_hidden_states=True
    self.tokenizer = BertTokenizer.from_pretrained(MODEL)
    self.lastLayer = nn.Sequential(nn.Linear(self.MaskModel.config.hidden_size, self.numClass), nn.Tanh())
    self.softmax = nn.Softmax(dim=-1)

  def forward(self, input_ids, attention_masks, token_type_ids):
    mask_indices = torch.where(self.tokenizer.mask_token_id == input_ids)
    last_hidden_states = self.MaskModel(input_ids=input_ids, attention_mask=attention_masks, token_type_ids=token_type_ids).hidden_states[-1][mask_indices] # (bz, hidden_size) if only one mask
    y = self.lastLayer(last_hidden_states)
    y = self.softmax(y)
    return y


In [43]:
from transformers.models.bert.tokenization_bert import BertTokenizer
from transformers.models.bert.tokenization_bert_fast import BertTokenizerFast
from transformers import AutoTokenizer
tokenizer:BertTokenizerFast = AutoTokenizer.from_pretrained(MODEL, use_fast=True)
def processdata(texts, labels):
  encoded_dict = tokenizer(list(texts),
                      max_length = 128,
                      pad_to_max_length = True,
                      return_attention_mask = True,
                      padding= 'max_length',
                      truncation= True,
                      return_tensors = 'pt',
                      return_token_type_ids = True,
                  )
  input_ids = encoded_dict['input_ids']
  attention_masks = encoded_dict['attention_mask']
  token_type_ids = encoded_dict['token_type_ids']
  labels = torch.tensor(labels)
  return input_ids, attention_masks, token_type_ids, labels

train_input_ids, train_attention_masks, train_token_type_ids, train_labels = processdata(train_text, train_labels)
dev_input_ids, dev_attention_masks, dev_token_type_ids, dev_labels = processdata(dev_text, dev_labels)

In [44]:
batch_size = 16

train_dataset = TensorDataset(train_input_ids, train_attention_masks, train_token_type_ids, train_labels, torch.arange(train_labels.size(0)))
dev_dataset = TensorDataset(dev_input_ids, dev_attention_masks, dev_token_type_ids, dev_labels, torch.arange(dev_labels.size(0)))

train_loader = DataLoader(
            train_dataset,
            sampler = RandomSampler(train_dataset),
            batch_size = batch_size 
        )


dev_loader = DataLoader(
            dev_dataset, 
            sampler = SequentialSampler(dev_dataset), 
            batch_size = batch_size
        )

In [45]:
print(train_input_ids.shape)
print(train_attention_masks.shape)
print(train_token_type_ids.shape)
print(train_labels.shape)

print(len(train_loader))

torch.Size([900, 128])
torch.Size([900, 128])
torch.Size([900, 128])
torch.Size([900])
57


In [46]:
model = PModel()
model.to(device)
model.cuda()

optimizer = AdamW(model.parameters(),
                  lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
          )
EPOCHS = 10
total_steps = len(train_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer, 
                          num_warmup_steps = 100,
                          num_training_steps = total_steps)


Some weights of the model checkpoint at bert-large-uncased-whole-word-masking were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [47]:
from tqdm import tqdm

total_step = len(train_loader)
print(train_input_ids.shape)
print('total step: ', total_step)
criteria = nn.CrossEntropyLoss()

best_dev_loss = 100
for epoch in range(EPOCHS):
  model.train()

  total_train_loss = 0
  total_train_acc  = 0
  for batch_idx, (pair_token_ids, mask_ids, seg_ids, y, _) in tqdm(enumerate(train_loader)):
    pair_token_ids = pair_token_ids.to(device)
    mask_ids = mask_ids.to(device)
    seg_ids = seg_ids.to(device)
    labels = y.to(device)

    optimizer.zero_grad()

    prediction = model(pair_token_ids, mask_ids, seg_ids)
    try:
      loss = criteria(prediction, labels)
    except:
      print('exception: too long a sentence, skipping the batch')
      continue
    
    total_train_loss += loss.item()

    loss.backward()

    # Clip the norm of the gradients to 1.0.
    # This is to help prevent the "exploding gradients" problem.
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

    optimizer.step()
    scheduler.step()


  train_loss = total_train_loss/len(train_loader)
  print('train loss: ', train_loss)

  # Put the model in evaluation mode
  model.eval()

  total_dev_loss = 0

  with torch.no_grad():
    for batch_idx, (pair_token_ids, mask_ids, seg_ids, y, _) in enumerate(dev_loader):

      #clear any previously calculated gradients before performing a backward pass
      optimizer.zero_grad()

      pair_token_ids = pair_token_ids.to(device)
      mask_ids = mask_ids.to(device)
      seg_ids = seg_ids.to(device)
      labels = y.to(device)

      prediction = model(pair_token_ids, mask_ids, seg_ids)
      loss = criteria(prediction, labels)

      total_dev_loss += loss.item()

  # Calculate the average accuracy and loss over all of the batches.
  dev_loss = total_dev_loss/len(dev_loader)
  print('dev loss: ', dev_loss)
  if dev_loss < best_dev_loss:
    best_dev_loss = dev_loss
    print('saving checkpoint')
    torch.save(model, 'models/bert-mlm.pth')
  else:
    print('skip saving checkpoint')

torch.Size([900, 128])
total step:  57


57it [00:12,  4.67it/s]


train loss:  0.6570307361452203
dev loss:  0.5654321823801313
saving checkpoint


57it [00:12,  4.65it/s]


train loss:  0.5438366119276014
dev loss:  0.5427388335977282
saving checkpoint


57it [00:12,  4.64it/s]


train loss:  0.4971362796791813
dev loss:  0.5403973119599479
saving checkpoint


57it [00:12,  4.63it/s]


train loss:  0.4768465459346771
dev loss:  0.5469534184251513
skip saving checkpoint


57it [00:12,  4.63it/s]


train loss:  0.4621815634401221
dev loss:  0.5369722587721688
saving checkpoint


57it [00:12,  4.63it/s]


train loss:  0.4506804320895881
dev loss:  0.5498600474425724
skip saving checkpoint


57it [00:12,  4.62it/s]


train loss:  0.44493296533300164
dev loss:  0.5514935382774898
skip saving checkpoint


57it [00:12,  4.60it/s]


train loss:  0.43085008755064846
dev loss:  0.5323380700179509
saving checkpoint


57it [00:12,  4.61it/s]


train loss:  0.4292616561839455
dev loss:  0.5175131218773978
saving checkpoint


57it [00:12,  4.61it/s]


train loss:  0.4258292542214979
dev loss:  0.536018852676664
skip saving checkpoint


In [48]:
import pandas as pd
df = pd.read_csv('data/testdata.manual.2009.06.14.csv', header=None, encoding='latin-1')
df = df.loc[:, [0, 5]]
df.columns = ['label', 'sentence']
print(df[df['label']==0].shape[0])
df

177


Unnamed: 0,label,sentence
0,4,@stellargirl I loooooooovvvvvveee my Kindle2. ...
1,4,Reading my kindle2... Love it... Lee childs i...
2,4,"Ok, first assesment of the #kindle2 ...it fuck..."
3,4,@kenburbary You'll love your Kindle2. I've had...
4,4,@mikefish Fair enough. But i have the Kindle2...
...,...,...
354,4,"After using LaTeX a lot, any other typeset mat..."
355,0,"On that note, I hate Word. I hate Pages. I hat..."
356,4,Ahhh... back in a *real* text editing environm...
357,0,"Trouble in Iran, I see. Hmm. Iran. Iran so far..."


In [49]:
df['token'] = df['sentence'].apply(sentence_clean)
df['text'] = df['token'].apply(lambda t: ' '.join(t))
df.head()

Unnamed: 0,label,sentence,token,text
0,4,@stellargirl I loooooooovvvvvveee my Kindle2. ...,"[i, loooooooovvvvvveee, my, kindle2, not, that...",i loooooooovvvvvveee my kindle2 not that the d...
1,4,Reading my kindle2... Love it... Lee childs i...,"[reading, my, kindle2, love, it, lee, childs, ...",reading my kindle2 love it lee childs is good ...
2,4,"Ok, first assesment of the #kindle2 ...it fuck...","[ok, first, assesment, of, the, kindle2, it, f...",ok first assesment of the kindle2 it fucking r...
3,4,@kenburbary You'll love your Kindle2. I've had...,"[youll, love, your, kindle2, ive, had, mine, f...",youll love your kindle2 ive had mine for a few...
4,4,@mikefish Fair enough. But i have the Kindle2...,"[fair, enough, but, i, have, the, kindle2, and...",fair enough but i have the kindle2 and i think...


In [50]:
test_text = df.text.values
test_label_ = df.label.values

for i in range(test_text.shape[0]):
  test_text[i] += " [SEP] this sentence sentiment is [MASK]"

test_labels = []
for label in test_label_:
  test_labels.append(label_dict[label])

teset_input_ids, test_attention_masks, test_token_type_ids, test_labels = processdata(test_text, test_labels)

test_dataset = TensorDataset(teset_input_ids, test_attention_masks, test_token_type_ids, test_labels, torch.arange(test_labels.size(0)))

test_loader = DataLoader(
            test_dataset,
            batch_size = batch_size 
        )

In [51]:
from torchmetrics import Accuracy
model = torch.load('models/bert-mlm.pth')
model.eval()

test_acc = Accuracy(num_classes=2).to(device)
with torch.no_grad():
    for batch_idx, (pair_token_ids, mask_ids, seg_ids, y, _) in enumerate(test_loader):

        #clear any previously calculated gradients before performing a backward pass

        pair_token_ids = pair_token_ids.to(device)
        mask_ids = mask_ids.to(device)
        seg_ids = seg_ids.to(device)
        labels = y.to(device)

        prediction = model(pair_token_ids, mask_ids, seg_ids).argmax(dim=1)
        test_acc(prediction, labels)

test_acc = float(test_acc.compute())
print('Test accuracy: %.4f' % test_acc)

Test accuracy: 0.8273


: 