In [None]:
import pandas as pd
# Training dump has no header row; only column 0 (renamed 'label') and
# column 5 (renamed 'sentence') are kept.
df = pd.read_csv(
    'data/training.1600000.processed.noemoticon.csv',
    header=None,
    encoding='latin-1',
)
df = df.loc[:, [0, 5]]
df.columns = ['label', 'sentence']
# Deterministic shuffle of the whole frame, then keep the first 1000 rows.
df = df.sample(frac=1, random_state=0).iloc[:1000]
# How many of the sampled rows carry label 0.
print(len(df.loc[df['label'] == 0]))
df

In [None]:
import re
from string import punctuation
def sentence_clean(x):
  """Normalize a raw tweet and split it into a list of tokens.

  Steps, in order: lowercase, drop @mentions, drop URLs, turn the emoticons
  ``:)`` / ``:(`` into the words ``smile`` / ``sad``, strip all remaining
  ASCII punctuation, then split on whitespace.

  Args:
    x: the raw sentence (str).

  Returns:
    list[str]: the cleaned tokens; empty list for an empty/blank sentence.
  """
  x = x.lower() # lower capitals
  x = re.sub(r'@[A-Za-z0-9\._]*', '', x) # remove @XXX
  x = re.sub(r'[A-Za-z]+://[^\s]*', '', x) # remove XXX://XXX
  # Emoticons must be rewritten BEFORE punctuation is stripped, otherwise the
  # ':' '(' ')' characters are already gone and these rules never fire.
  # Padding with spaces keeps the word separate from an adjacent token.
  x = re.sub(r':\)', ' smile ', x) # transfer :) to smile
  x = re.sub(r':\(', ' sad ', x) # transfer :( to sad
  # re.escape keeps characters such as '\', ']' and '^' literal inside the
  # character class, so every punctuation character is really removed.
  x = re.sub(r'[{}]+'.format(re.escape(punctuation)), '', x) # remove punctuation
  # split() with no argument already collapses runs of whitespace.
  return x.split() # token

# Tokenize every sentence, then rebuild a single-space-joined string per row.
df['token'] = df['sentence'].apply(sentence_clean)
df['text'] = [' '.join(tokens) for tokens in df['token']]
df.head()

In [None]:
import torch
import numpy as np
from tqdm import tqdm
from torch.utils.data import DataLoader
from transformers import AdamW, RobertaForSequenceClassification, RobertaTokenizer
from transformers import get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch.nn as nn 

def seed(seed=1):
  """Seed every random number generator this notebook can draw from.

  Covers Python's ``random``, NumPy's global generator, and PyTorch on the
  CPU and on all CUDA devices (a no-op when CUDA is unavailable).

  Args:
    seed: integer seed shared by all generators (default 1).
  """
  import random  # local import: the notebook's import cell does not bring in `random`

  random.seed(seed)
  np.random.seed(seed)
  torch.manual_seed(seed)
  # manual_seed_all covers every visible GPU, not just the current one.
  torch.cuda.manual_seed_all(seed)

In [None]:
# Pull the cleaned sentences and their raw labels out of the frame as arrays.
full_text = df['text'].values
full_label = df['label'].values

In [None]:
# Fix the RNG state before shuffling: without this the permutation below,
# and therefore the train/dev split, is different on every run.
seed()

# Hold out the first 10% of a random permutation as the dev set.
div = int(0.1 * full_text.shape[0])
p = np.random.permutation(full_text.shape[0])
shuffled_full_text, shuffled_full_label = full_text[p], full_label[p]
train_text, train_label = shuffled_full_text[div: ], shuffled_full_label[div: ]
dev_text, dev_label = shuffled_full_text[: div], shuffled_full_label[: div]

In [None]:
# Raw dataset labels (0 and 4) mapped onto contiguous class ids (0 and 1).
label_dict = {0: 0, 4: 1}
train_labels = [label_dict[label] for label in train_label]
dev_labels = [label_dict[label] for label in dev_label]

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = torch.device("cuda")
else:
  device = torch.device("cpu")

In [None]:
# Checkpoint name passed to from_pretrained for both the model and the tokenizer.
MODEL = "roberta-large"
# Directory passed as cache_dir when the model weights are loaded.
cache_dir = "/projects/cache"

class PModel(nn.Module):
  """Wrapper around a pretrained RoBERTa sequence classifier that returns raw logits."""

  def __init__(self):
    super().__init__()
    self.numClass = 2

    # Size the classification head from numClass explicitly instead of
    # relying on the library default happening to match it.
    self.Model = RobertaForSequenceClassification.from_pretrained(
        MODEL, cache_dir=cache_dir, num_labels=self.numClass)

  def forward(self, input_ids, attention_masks, token_type_ids):
    """Run the wrapped classifier and return its logits.

    Args:
      input_ids: token ids, forwarded as ``input_ids``.
      attention_masks: forwarded as ``attention_mask``.
      token_type_ids: forwarded as ``token_type_ids``.

    Returns:
      The ``.logits`` attribute of the wrapped model's output.
    """
    y = self.Model(input_ids=input_ids, attention_mask=attention_masks, token_type_ids=token_type_ids).logits
    return y


In [None]:
from transformers import AutoTokenizer
# Fast tokenizer loaded from the same checkpoint name as the model.
tokenizer = AutoTokenizer.from_pretrained(
    MODEL,
    use_fast=True,
)
def processdata(texts, labels, max_length=128):
  """Tokenize sentences and pack them, with their labels, into tensors.

  Every sentence is padded or truncated to exactly ``max_length`` tokens.

  Args:
    texts: iterable of sentence strings.
    labels: sequence of class ids, one per sentence.
    max_length: fixed sequence length after padding/truncation (default 128).

  Returns:
    Tuple ``(input_ids, attention_masks, token_type_ids, labels)`` of tensors.
  """
  # `padding='max_length'` is the supported spelling; the deprecated
  # `pad_to_max_length=True` duplicate is dropped to avoid the warning.
  encoded_dict = tokenizer(list(texts),
                      max_length = max_length,
                      padding = 'max_length',
                      truncation = True,
                      return_attention_mask = True,
                      return_token_type_ids = True,
                      return_tensors = 'pt',
                  )
  input_ids = encoded_dict['input_ids']
  attention_masks = encoded_dict['attention_mask']
  token_type_ids = encoded_dict['token_type_ids']
  labels = torch.tensor(labels)
  return input_ids, attention_masks, token_type_ids, labels

# Encode both splits; note that train_labels / dev_labels are rebound from lists to tensors here.
(train_input_ids,
 train_attention_masks,
 train_token_type_ids,
 train_labels) = processdata(train_text, train_labels)
(dev_input_ids,
 dev_attention_masks,
 dev_token_type_ids,
 dev_labels) = processdata(dev_text, dev_labels)

In [None]:
batch_size = 16

# Each example is (input ids, attention mask, token type ids, label, row index).
train_dataset = TensorDataset(
    train_input_ids,
    train_attention_masks,
    train_token_type_ids,
    train_labels,
    torch.arange(train_labels.size(0)),
)
dev_dataset = TensorDataset(
    dev_input_ids,
    dev_attention_masks,
    dev_token_type_ids,
    dev_labels,
    torch.arange(dev_labels.size(0)),
)

# Training batches are drawn in random order; dev batches keep dataset order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, sampler=RandomSampler(train_dataset))
dev_loader = DataLoader(dev_dataset, batch_size=batch_size, sampler=SequentialSampler(dev_dataset))

In [None]:
# Sanity check: shapes of the four training tensors, one per line.
print(
    train_input_ids.shape,
    train_attention_masks.shape,
    train_token_type_ids.shape,
    train_labels.shape,
    sep='\n',
)

# Number of batches per training epoch.
print(len(train_loader))

In [None]:
model = PModel()
# Move the model to the selected device only. The former unconditional
# model.cuda() raised on CPU-only machines despite the CPU fallback in `device`.
model.to(device)

optimizer = AdamW(model.parameters(),
                  lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
          )
EPOCHS = 20
# The scheduler is stepped once per batch, so it needs the total number of batches.
total_steps = len(train_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer, 
                          num_warmup_steps = 100,
                          num_training_steps = total_steps)


In [None]:
from tqdm import tqdm

import os

total_step = len(train_loader)
print(train_input_ids.shape)
print('total step: ', total_step)
criteria = nn.CrossEntropyLoss()

checkpoint_path = 'models/roberta-finetune.pth'
# torch.save does not create missing directories, so make sure it exists up front.
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

# Any real dev loss beats +inf, so the first epoch always writes a checkpoint.
best_dev_loss = float('inf')
for epoch in range(EPOCHS):
  model.train()

  total_train_loss = 0
  train_batches = 0  # batches that actually contributed to the loss
  # Wrap the loader (not the enumerate object) so tqdm knows the total length.
  for batch_idx, (pair_token_ids, mask_ids, seg_ids, y, _) in enumerate(tqdm(train_loader)):
    pair_token_ids = pair_token_ids.to(device)
    mask_ids = mask_ids.to(device)
    seg_ids = seg_ids.to(device)
    labels = y.to(device)

    optimizer.zero_grad()

    prediction = model(pair_token_ids, mask_ids, seg_ids)
    try:
      loss = criteria(prediction, labels)
    except Exception as err:
      # Best-effort skip is kept, but the real error is reported and
      # KeyboardInterrupt / SystemExit are no longer swallowed.
      print('exception while computing the loss, skipping the batch: ', err)
      continue

    total_train_loss += loss.item()
    train_batches += 1

    loss.backward()

    # Clip the norm of the gradients to 1.0.
    # This is to help prevent the "exploding gradients" problem.
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

    optimizer.step()
    scheduler.step()

  # Average over the batches that were really used; max() guards the all-skipped case.
  train_loss = total_train_loss / max(train_batches, 1)
  print('train loss: ', train_loss)

  # Put the model in evaluation mode
  model.eval()

  total_dev_loss = 0

  with torch.no_grad():
    for batch_idx, (pair_token_ids, mask_ids, seg_ids, y, _) in enumerate(dev_loader):
      pair_token_ids = pair_token_ids.to(device)
      mask_ids = mask_ids.to(device)
      seg_ids = seg_ids.to(device)
      labels = y.to(device)

      prediction = model(pair_token_ids, mask_ids, seg_ids)
      loss = criteria(prediction, labels)

      total_dev_loss += loss.item()

  # Calculate the average loss over all of the dev batches.
  dev_loss = total_dev_loss/len(dev_loader)
  print('dev loss: ', dev_loss)
  # Keep only the checkpoint with the lowest dev loss seen so far.
  if dev_loss < best_dev_loss:
    best_dev_loss = dev_loss
    print('saving checkpoint')
    torch.save(model, checkpoint_path)
  else:
    print('skip saving checkpoint')

In [None]:
import pandas as pd
# Test file has no header row; only column 0 (renamed 'label') and
# column 5 (renamed 'sentence') are kept. This rebinds `df`.
df = pd.read_csv(
    'data/testdata.manual.2009.06.14.csv',
    header=None,
    encoding='latin-1',
)
df = df.loc[:, [0, 5]]
df.columns = ['label', 'sentence']
# How many test rows carry label 0.
print(len(df.loc[df['label'] == 0]))
df

In [None]:
# Same cleaning as for the training data: tokenize, then re-join with single spaces.
df['token'] = df['sentence'].apply(sentence_clean)
df['text'] = [' '.join(tokens) for tokens in df['token']]
df.head()

In [None]:
# The classifier only knows the labels listed in label_dict. Rows carrying any
# other raw label (e.g. a neutral class -- assumption, verify against the test
# CSV) would raise KeyError in the mapping below, so they are dropped up front.
known_label = df.label.isin(list(label_dict))
if not known_label.all():
  print('dropping %d test rows with labels outside label_dict' % int((~known_label).sum()))

test_text = df.text.values[known_label.values]
test_label_ = df.label.values[known_label.values]

test_labels = [label_dict[label] for label in test_label_]

test_input_ids, test_attention_masks, test_token_type_ids, test_labels = processdata(test_text, test_labels)

# Same five-field layout as the train/dev datasets (last field is the row index).
test_dataset = TensorDataset(test_input_ids, test_attention_masks, test_token_type_ids, test_labels, torch.arange(test_labels.size(0)))

test_loader = DataLoader(
            test_dataset,
            batch_size = batch_size 
        )

In [None]:
from torchmetrics import Accuracy
# NOTE(security): torch.load unpickles a whole model object, which can execute
# arbitrary code -- only load checkpoints produced by this notebook.
# map_location lets a checkpoint saved on GPU be restored on a CPU-only machine.
model = torch.load('models/roberta-finetune.pth', map_location=device)
model.to(device)
model.eval()

test_acc = Accuracy(num_classes=2).to(device)
with torch.no_grad():
    for pair_token_ids, mask_ids, seg_ids, y, _ in test_loader:
        pair_token_ids = pair_token_ids.to(device)
        mask_ids = mask_ids.to(device)
        seg_ids = seg_ids.to(device)
        labels = y.to(device)

        prediction = model(pair_token_ids, mask_ids, seg_ids)
        # Accumulate batch statistics inside the metric; compute() aggregates them below.
        test_acc(prediction.argmax(dim=1), labels)

# Rebinds test_acc from the metric object to its final scalar value.
test_acc = float(test_acc.compute())
print('Test accuracy: %.4f' % test_acc)