In [2]:
import torch
import torch.nn as nn
torch.manual_seed(12345)
import numpy as np
np.random.seed(12345)
device = torch.device("cuda")

In [3]:
import json
import re

# REMOVE CERTAIN ABBREVIATIONS
def remove_abbreviation(text):
    text = text.replace("'m", " am")
    text = text.replace("'s", " is")
    text = text.replace("'re", " are")
    text = text.replace("'ll", " will")  
    text = text.replace("won't", "will not")
    
    text = text.replace("'ve", " have")  
    text = text.replace("have't", "have not")
    
    text = text.replace("'d", " would")
    text = text.replace("'ve", " have")
    
    text = text.replace("don't", "do not")
    text = text.replace("did't", "did not")
    text = text.replace("can't", "can not")
    text = text.replace("couldn't", "could not")
    return text

def filtered(text):
    # REFERENCE: https://stackoverflow.com/questions/28840908/perfect-regex-for-extracting-url-with-re-findall
    # REMOVE URLS 
    text = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', "", text)
    # REMOVE @...(@USERS)
    text = " ".join(filter(lambda x:x[0]!='@', text.split()))
    # REMOVE ENDINGS
    text = text.replace("<URL>", '')
    text = text.lower()
    text = remove_abbreviation(text)
    # REMOVE PUNCTUATIONS
    text = re.sub("[,.\"\'!@#$%^&*(){}+=-_?/;`~:<>\\\[\]]", "", text)
    return text.strip().lower()

def get_data():
    train_text = []
    train_labels = []
    test_text = []
    test_labels = []
    with open('data/train.jsonl') as json_file: 
        for i in json_file:
            data = json.loads(i)
            if (data['label']) == 'SARCASM':
              train_labels.append(1)
            else:
              train_labels.append(0)
            train_text.append(filtered(data["response"]))

    with open('data/test.jsonl') as json_file: 
        for i in json_file:
            data = json.loads(i)
            test_labels.append(int(data['id'].split("_")[1]))
            test_text.append(filtered(data["response"]))
    return train_text, train_labels, test_text, test_labels


train_text, train_labels, test_text, test_labels = get_data()
# GET EVAL TEXT AND LABELS
eval_text = train_text[:500] + train_text[-500:]
eval_labels = train_labels[:500] + train_labels[-500:]
# GET TRAIN TEXT AND LABELS
train_text = train_text[500:-500]
train_labels = train_labels[500:-500]

print(train_text[0])

when i reminded him that i am a woman  he complained that he didnt feel listened to


In [4]:
from transformers import DistilBertModel, DistilBertTokenizer
# LOAD PRE-TRAINED BERT
bert = DistilBertModel.from_pretrained('distilbert-base-uncased')

In [5]:
max_seq_len = 80

In [6]:
# REFERENCE: https://www.analyticsvidhya.com/blog/2020/07/transfer-learning-for-nlp-fine-tuning-bert-for-text-classification/

# INTIALIZE PRE-TRAINED TOKENIZER
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# TOKENIZE TRAIN
tokens_train = tokenizer(
    train_text,
    max_length = max_seq_len,
    pad_to_max_length=True,
    truncation=True,
    return_token_type_ids=False
)

# TOKENIZE VALIDATION
tokens_eval = tokenizer(
    eval_text,
    max_length = max_seq_len,
    pad_to_max_length=True,
    truncation=True,
    return_token_type_ids=False
)

# TOKENIZE TEST
tokens_test = tokenizer(
    test_text,
    max_length = max_seq_len,
    pad_to_max_length=True,
    truncation=True,
    return_token_type_ids=False
)



In [7]:
# REFERENCE: https://www.analyticsvidhya.com/blog/2020/07/transfer-learning-for-nlp-fine-tuning-bert-for-text-classification/

# GET IDS AND MASK FOR TRAIN, VALIDATION and TEST
train_seq = torch.tensor(tokens_train['input_ids'])
print(train_seq.shape)
train_mask = torch.tensor(tokens_train['attention_mask'])
train_y = torch.tensor(train_labels).to(device)
print(train_y.shape)

eval_seq = torch.tensor(tokens_eval['input_ids'])
print(eval_seq.shape)
eval_mask = torch.tensor(tokens_eval['attention_mask'])
eval_y = torch.tensor(eval_labels).to(device)
print(eval_y.shape)

# for test set
test_seq = torch.tensor(tokens_test['input_ids'])
test_mask = torch.tensor(tokens_test['attention_mask'])
print(train_seq[1])

torch.Size([4000, 80])
torch.Size([4000])
torch.Size([1000, 80])
torch.Size([1000])
tensor([  101,  4922,  8040, 21886,  2480,  7164, 24829,  5602,  2003,  3424,
         1011, 19640, 24106,  2903, 24829,  5602,  5836, 16498,   102,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0])


In [8]:
# REFERENCE: https://www.analyticsvidhya.com/blog/2020/07/transfer-learning-for-nlp-fine-tuning-bert-for-text-classification/

from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
batch_size = 32

# CONSTRUCT DATA LOADER
train_data = TensorDataset(train_seq, train_mask, train_y)

train_sampler = RandomSampler(train_data)

train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)


eval_data = TensorDataset(eval_seq, eval_mask, eval_y)

eval_sampler = RandomSampler(eval_data)

eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=batch_size)


In [9]:
# FREEZE BERT PARAMETERS
for param in bert.parameters():
    param.requires_grad = False

In [10]:
# ACTUAL MODEL FOR FINE-TUNE
class BERT_FT(nn.Module):

    def __init__(self, bert):
      
      super(BERT_FT, self).__init__()
      self.bert = bert 
    
      self.dropout = nn.Dropout(0.5)
      self.relu =  nn.ReLU()
    
      self.fc1 = nn.Linear(768,256)
      self.fc2 = nn.Linear(256,2)

      self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, sent_id, mask):
      output= self.bert(sent_id, attention_mask=mask)
      x = self.fc1(self.dropout(output.last_hidden_state[:,0,:]))
      x = self.relu(x)
      x = self.dropout(x)
      x = self.fc2(x)
      x = self.softmax(x)

      return x

In [11]:
model = BERT_FT(bert)
model = model.to(device)

In [12]:
from transformers import AdamW
# DEFINE OPTIMIZER
optimizer = AdamW(model.parameters(), lr=0.0005, betas=(0.9, 0.999), eps=1e-08, weight_decay=0.001)
criterion = nn.NLLLoss() 

In [13]:
# REFERENCE: https://www.analyticsvidhya.com/blog/2020/07/transfer-learning-for-nlp-fine-tuning-bert-for-text-classification/
def train():
  
  model.train()

  total_loss, total_accuracy = 0, 0

  total_preds=[]
  total_labels = []
  for step,batch in enumerate(train_dataloader):
    batch = [r.to(device) for r in batch]
    
    # GET BATCH DATA
    sent_id, mask, labels = batch
    optimizer.zero_grad()        

    preds = model(sent_id, mask)
    # GET PREDICTION LOGITS
    loss = criterion(preds, labels)
    total_loss = total_loss + loss.item()
    # LOSS BACK PROPAGATION
    loss.backward()

    optimizer.step()
    preds=preds.detach().cpu().numpy()

    total_preds.append(preds.argmax(1))
    total_labels.append(labels.cpu().numpy())
  avg_loss = total_loss / len(train_dataloader)
  
  total_preds  = np.concatenate(total_preds, axis=0)
  return avg_loss, total_preds, np.concatenate(total_labels, axis=0)



def evaluate_eval(model):
  
  model.eval()

  total_loss, total_accuracy = 0, 0
  total_preds=[]
  total_labels = []
  for step,batch in enumerate(eval_dataloader):
    batch = [r.to(device) for r in batch]
 
    sent_id, mask, labels = batch
    
    preds = model(sent_id, mask)


    preds=preds.detach().cpu().numpy()


    total_preds.append(preds.argmax(1))
    total_labels.append(labels.cpu().numpy())

  
  total_preds  = np.concatenate(total_preds, axis=0)

  return total_preds, np.concatenate(total_labels, axis=0)

In [14]:
from sklearn.metrics import f1_score
epochs= 20
best_valid_loss = float('inf')
best_f1 = -10
train_losses=[]
for epoch in range(epochs):
    print('\n Epoch {:} / {:}'.format(epoch + 1, epochs))
    train_loss, total_preds, total_labels = train()
    print("train_acc", sum(total_preds == total_labels)/total_labels.shape[0])
    preds, eval_labels = evaluate_eval(model)
    curr_f1 = f1_score(eval_labels, preds)
    print("eval_f1", curr_f1)
    print("eval_acc", sum(eval_labels == preds)/preds.shape[0])
    if curr_f1 > best_f1:
      torch.save(model, "model123456.pt")
      best_f1 = curr_f1
    # append training and validation loss
    train_losses.append(train_loss)
    print(f'\nTraining Loss: {train_loss:.3f}')


 Epoch 1 / 20
  Batch   100  of    125.
train_acc 0.68875
eval_f1 0.694048616932104
eval_acc 0.635


  "type " + obj.__name__ + ". It won't be checked "



Training Loss: 0.579

 Epoch 2 / 20


KeyboardInterrupt: 

In [None]:
from sklearn.metrics import f1_score
def evaluate(model):
  # GET TEST PREDICTION
  model.eval()
  preds = model(test_seq.to(device), test_mask.to(device))
  preds=preds.detach().argmax(1).cpu().numpy()
  return preds

In [None]:
# LOAD BEST MODEL
model = torch.load("model123456.pt")
# GET PREDICTIONS FOR VALIDATION SET
preds, eval_labels = evaluate_eval(model)
curr_f1 = f1_score(eval_labels, preds)
print("eval_f1", curr_f1)
print("eval_acc", sum(eval_labels == preds)/preds.shape[0])
with torch.no_grad():
  # GET PREDICTIONS FOR TEST SET
  preds = evaluate(model)

In [None]:
assert len(preds) == len(test_labels)

In [None]:
# OUTPUT TO ANSWER.TXT
f = open("answer.txt", 'w+')
for i in range(len(test_labels)):
    if preds[i] == 0:
        curr_pred = "NOT_SARCASM"
    else:
        curr_pred = "SARCASM"
    f.write('twitter_{},{}\n'.format(test_labels[i], curr_pred))
    print('twitter_{},{}\n'.format(test_labels[i], curr_pred))
f.close()
