In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import transformers
from transformers import AutoModel, BertTokenizerFast
# Seed torch and numpy up front so shuffling and weight initialisation are repeatable.
torch.manual_seed(12345)
import numpy as np
np.random.seed(12345)
# Prefer the GPU, but fall back to the CPU so the notebook still runs (slowly)
# on machines without CUDA instead of failing at the first `.to(device)`.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
import json
import re
import nltk 
# Fetch the NLTK stop-word corpus; this has to run before stopwords.words() below.
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# English stop words as a set, consulted by remove_stopwords().
STOPWORDS = set(stopwords.words('english'))
# Single Porter stemmer instance shared by stem_words().
stemmer = PorterStemmer()
def stem_words(text):
    """Stem each whitespace-separated token of `text` with the module-level
    `stemmer` and return the stems joined by single spaces."""
    stems = map(stemmer.stem, text.split())
    return " ".join(stems)

def remove_stopwords(text):
    """Drop every token that appears in STOPWORDS and re-join the remaining
    tokens with single spaces."""
    kept = filter(lambda token: token not in STOPWORDS, text.split())
    return " ".join(kept)

# Ordered (contraction, expansion) pairs applied top to bottom by
# remove_abbriviation(). Matching is case-sensitive plain-substring replacement.
_CONTRACTIONS = (
    ("'m", " am"),
    ("'s", " is"),
    ("'re", " are"),
    ("'ll", " will"),
    ("won't", "will not"),
    ("'ve", " have"),
    ("haven't", "have not"),
    ("have't", "have not"),   # misspelt form kept for backward compatibility
    ("'d", " would"),
    ("don't", "do not"),
    ("didn't", "did not"),
    ("did't", "did not"),     # misspelt form kept for backward compatibility
    ("can't", "can not"),
    ("couldn't", "could not"),
)


def remove_abbriviation(text):
    """Expand common English contractions in `text`.

    Each pair in `_CONTRACTIONS` is applied in order with `str.replace`, so the
    input is expected to be lower-cased already (the patterns are lower-case).

    Fixes over the previous version: "haven't" and "didn't" are now expanded
    (the old patterns were the misspellings "have't" / "did't" and never
    matched the real words), and the duplicated "'ve" rule is gone.

    Args:
        text: input string.

    Returns:
        The string with the known contractions expanded.
    """
    for contraction, expansion in _CONTRACTIONS:
        text = text.replace(contraction, expansion)
    return text

def filtered(text):
    """Normalise one raw tweet/response string for tokenisation.

    Steps, in order:
      1. strip http(s) URLs;
      2. drop every whitespace-separated token that starts with '@' (mentions);
      3. remove literal "@USER " and "<URL>" placeholders;
      4. lower-case and expand contractions via remove_abbriviation();
      5. delete punctuation characters;
      6. collapse runs of whitespace and trim the ends.

    Fixes over the previous version: the punctuation class contained `=-_`,
    which the regex engine reads as the range '=' .. '_' and therefore never
    removed a literal '-'; the hyphen is now escaped so '=', '-' and '_' are all
    stripped. The patterns are raw strings (no invalid escape sequences), and
    whitespace left behind by removed tokens is collapsed.

    Args:
        text: raw input string.

    Returns:
        The cleaned, lower-cased string.
    """
    # REFERENCE: https://stackoverflow.com/questions/28840908/perfect-regex-for-extracting-url-with-re-findall
    text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', "", text)
    # split() never yields empty tokens, so startswith is safe here
    text = " ".join(token for token in text.split() if not token.startswith('@'))
    text = text.replace("@USER ", '')
    text = text.replace("<URL>", '')
    text = text.lower()
    text = remove_abbriviation(text)
    # `\-` keeps the hyphen literal instead of forming a character range
    text = re.sub(r"""[,."'!@#$%^&*(){}+=\-_?/;`~:<>\\\[\]]""", "", text)
    return " ".join(text.split())

def get_data(train_path='data/train.jsonl', test_path='data/test.jsonl'):
    """Load and clean the train and test JSON-lines files.

    Every non-blank line is parsed as one JSON object. The "response" field is
    cleaned with filtered(). For training rows the "label" field is mapped to 1
    when it equals 'SARCASM' and to 0 otherwise. For test rows the integer after
    the first underscore of the "id" field is collected instead of a label.

    Changes over the previous version: files are read as UTF-8 explicitly
    (rather than with the platform default encoding), blank lines are skipped
    instead of crashing json.loads, and the paths are parameters whose defaults
    are the original hard-coded locations.

    Args:
        train_path: path of the training JSONL file.
        test_path: path of the test JSONL file.

    Returns:
        (train_text, train_labels, test_text, test_labels) as four lists; note
        that `test_labels` holds the numeric example ids, not class labels.
    """
    train_text, train_labels = [], []
    test_text, test_labels = [], []

    with open(train_path, encoding='utf-8') as json_file:
        for line in json_file:
            if not line.strip():
                continue
            data = json.loads(line)
            train_labels.append(1 if data['label'] == 'SARCASM' else 0)
            train_text.append(filtered(data["response"]))

    with open(test_path, encoding='utf-8') as json_file:
        for line in json_file:
            if not line.strip():
                continue
            data = json.loads(line)
            test_labels.append(int(data['id'].split("_")[1]))
            test_text.append(filtered(data["response"]))

    return train_text, train_labels, test_text, test_labels
train_text, train_labels, test_text, test_labels = get_data()

# Hold out the first 500 and the last 500 training examples as the evaluation
# split; everything in between remains the training split. The evaluation
# lists are built first, while train_text/train_labels are still complete.
eval_text = [*train_text[:500], *train_text[-500:]]
eval_labels = [*train_labels[:500], *train_labels[-500:]]
train_text, train_labels = train_text[500:-500], train_labels[500:-500]

print(train_text[0])

In [None]:
from transformers import DistilBertModel, DistilBertTokenizer
# Load the pretrained, uncased DistilBERT encoder used as the backbone of the classifier.
bert = DistilBertModel.from_pretrained('distilbert-base-uncased')

In [None]:
# Token length passed as `max_length` to every tokenizer call (sequences are padded/truncated to it).
max_seq_len = 80

In [None]:

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')


def _encode(texts):
    """Tokenize `texts`, padding/truncating every sequence to `max_seq_len`.

    Uses `padding='max_length'`, the supported replacement for the deprecated
    `pad_to_max_length=True` flag. Token type ids are not requested.
    """
    return tokenizer(
        texts,
        max_length=max_seq_len,
        padding='max_length',
        truncation=True,
        return_token_type_ids=False,
    )


# tokenize and encode the training, evaluation and test splits the same way
tokens_train = _encode(train_text)
tokens_eval = _encode(eval_text)
tokens_test = _encode(test_text)

In [None]:

# All dataset tensors are kept on the CPU: the training/evaluation loops move
# each batch (ids, mask, labels) to `device` themselves, and a TensorDataset
# that mixes CPU inputs with GPU labels is fragile (e.g. with DataLoader
# workers or pinned memory).

# for train set
train_seq = torch.tensor(tokens_train['input_ids'])
print(train_seq.shape)
train_mask = torch.tensor(tokens_train['attention_mask'])
train_y = torch.tensor(train_labels)
print(train_y.shape)

# for eval set
eval_seq = torch.tensor(tokens_eval['input_ids'])
print(eval_seq.shape)
eval_mask = torch.tensor(tokens_eval['attention_mask'])
eval_y = torch.tensor(eval_labels)
print(eval_y.shape)

# for test set
test_seq = torch.tensor(tokens_test['input_ids'])
test_mask = torch.tensor(tokens_test['attention_mask'])
print(train_seq[1])

In [None]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# number of examples per batch for both loaders
batch_size = 32

# training set: wrap the tensors and draw batches in random order each epoch
train_data = TensorDataset(train_seq, train_mask, train_y)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# evaluation set: iterate in a fixed order. Shuffling is pointless for
# evaluation and would also consume random numbers from the global torch RNG
# between training epochs.
eval_data = TensorDataset(eval_seq, eval_mask, eval_y)
eval_sampler = SequentialSampler(eval_data)
eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=batch_size)


In [None]:
# Freeze the encoder: none of its parameters will receive gradients, so only
# layers added on top of it can be trained.
bert.requires_grad_(False)

In [None]:
class BERT_Arch(nn.Module):
    """Two-class classification head on top of a supplied encoder.

    The encoder is called as `bert(sent_id, attention_mask=mask)` and must
    return an object exposing `last_hidden_state`. The hidden vector at
    sequence position 0 is passed through
    dropout -> Linear(768, 256) -> ReLU -> dropout -> Linear(256, 2) -> LogSoftmax,
    so the module returns log-probabilities with one row per input sequence.
    """

    def __init__(self, bert):
        super().__init__()

        # encoder producing `last_hidden_state`
        self.bert = bert

        # dropout after the hidden layer / dropout on the encoder output
        self.dropout = nn.Dropout(0.5)
        self.dropout1 = nn.Dropout(0.5)

        # non-linearity between the two dense layers
        self.relu = nn.ReLU()

        # hidden dense layer followed by the 2-way output layer
        self.fc1 = nn.Linear(768, 256)
        self.fc2 = nn.Linear(256, 2)

        # log-softmax over the class dimension
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, sent_id, mask):
        """Return log-probabilities for the batch of token ids `sent_id`
        with attention mask `mask`."""
        encoded = self.bert(sent_id, attention_mask=mask)
        # hidden state at sequence position 0 of every example
        first_token = encoded.last_hidden_state[:, 0, :]

        hidden = self.fc1(self.dropout1(first_token))
        hidden = self.dropout(self.relu(hidden))

        logits = self.fc2(hidden)
        return self.softmax(logits)

In [None]:
# Wrap the encoder in the classification head and move the whole model to `device`.
model = BERT_Arch(bert).to(device)
# Negative log-likelihood loss, used by train() on the model's outputs.
criterion = nn.NLLLoss()


In [None]:
# `transformers.AdamW` is deprecated and no longer shipped by recent
# transformers releases; PyTorch's own AdamW accepts the same hyper-parameters
# used here. Parameters without gradients (the frozen encoder) are skipped
# by the optimizer automatically.
from torch.optim import AdamW
optimizer = AdamW(model.parameters(), lr=0.0005, betas=(0.9, 0.999), eps=1e-08, weight_decay=0.001)

In [None]:
def train():
    """Run one training epoch over the module-level `train_dataloader`.

    Uses the module-level `model`, `optimizer`, `criterion` and `device`.
    For each batch: move the tensors to `device`, reset gradients, run the
    forward pass, compute the loss, back-propagate and step the optimizer.

    Returns:
        (avg_loss, predictions, labels): the mean of the per-batch losses, a
        1-D numpy array with the argmax class of every example, and a 1-D
        numpy array with the matching labels, both in iteration order.
    """
    model.train()

    running_loss = 0
    batch_preds, batch_labels = [], []
    n_batches = len(train_dataloader)

    for step, batch in enumerate(train_dataloader):
        # progress line every 100 batches (never for the very first batch)
        if step != 0 and step % 100 == 0:
            print('  Batch {:>5,}  of  {:>5,}.'.format(step, n_batches))

        # move ids, attention mask and labels to the compute device
        sent_id, mask, labels = (tensor.to(device) for tensor in batch)

        optimizer.zero_grad()

        # forward pass and loss for this batch
        log_probs = model(sent_id, mask)
        loss = criterion(log_probs, labels)
        running_loss += loss.item()

        # backward pass and parameter update (gradient clipping is not applied)
        loss.backward()
        optimizer.step()

        # keep the predicted class ids and the labels as numpy arrays on the CPU
        batch_preds.append(log_probs.detach().cpu().numpy().argmax(1))
        batch_labels.append(labels.cpu().numpy())

    avg_loss = running_loss / n_batches

    # flatten the per-batch arrays into one array per quantity
    return avg_loss, np.concatenate(batch_preds, axis=0), np.concatenate(batch_labels, axis=0)



def evaluate_eval(model):
    """Predict classes for every batch of the module-level `eval_dataloader`.

    The model is switched to eval mode and the whole loop runs under
    `torch.no_grad()`, so no autograd graph is built during inference (the
    previous version tracked gradients for every evaluation batch).

    Args:
        model: callable as `model(sent_id, mask)` returning per-class scores
            of shape (batch, classes).

    Returns:
        (predictions, labels): 1-D numpy arrays with the argmax class of each
        example and the corresponding labels, in iteration order.
    """
    model.eval()

    # per-batch predictions and labels, concatenated at the end
    total_preds = []
    total_labels = []

    with torch.no_grad():
        for step, batch in enumerate(eval_dataloader):

            # progress update after every 100 batches.
            if step % 100 == 0 and not step == 0:
                print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(eval_dataloader)))

            # push the batch to the compute device
            sent_id, mask, labels = [r.to(device) for r in batch]

            preds = model(sent_id, mask)
            preds = preds.detach().cpu().numpy()

            total_preds.append(preds.argmax(1))
            total_labels.append(labels.cpu().numpy())

    total_preds = np.concatenate(total_preds, axis=0)

    return total_preds, np.concatenate(total_labels, axis=0)

In [None]:
from sklearn.metrics import f1_score
epochs = 20
best_valid_loss = float('inf')
best_f1 = -10
train_losses = []

# One training pass followed by one evaluation pass per epoch; the model is
# checkpointed whenever the evaluation F1 score improves on the best so far.
for epoch in range(epochs):
    print('\n Epoch {:} / {:}'.format(epoch + 1, epochs))

    train_loss, total_preds, total_labels = train()
    train_hits = sum(total_preds == total_labels)
    print("train_acc", train_hits/total_labels.shape[0])

    # NOTE: this rebinds `eval_labels` to the array returned by evaluate_eval
    preds, eval_labels = evaluate_eval(model)
    curr_f1 = f1_score(eval_labels, preds)
    print("eval_f1", curr_f1)
    eval_hits = sum(eval_labels == preds)
    print("eval_acc", eval_hits/preds.shape[0])

    if curr_f1 > best_f1:
        # the whole model object is saved, not just its state dict
        torch.save(model, "/data/model12345.pt")
        best_f1 = curr_f1

    # keep the per-epoch training loss
    train_losses.append(train_loss)
    print(f'\nTraining Loss: {train_loss:.3f}')

In [None]:
from sklearn.metrics import f1_score
def evaluate(model, batch_size=64):
    """Predict a class for every row of the module-level test tensors.

    Reads `test_seq` and `test_mask`, moves them to `device` one slice of
    `batch_size` rows at a time and runs `model` on each slice under
    `torch.no_grad()`. The previous version pushed the entire test set through
    the model in a single forward pass, which needs memory proportional to
    the whole test set and depended on the caller to disable autograd.

    Args:
        model: callable as `model(sent_id, mask)` returning per-class scores
            of shape (batch, classes).
        batch_size: number of rows per forward pass; must be non-zero
            (`range` raises ValueError for a step of 0).

    Returns:
        1-D numpy int64 array with the argmax class of each test row; an empty
        array when the test set has no rows.
    """
    model.eval()

    batch_preds = []
    with torch.no_grad():
        for start in range(0, len(test_seq), batch_size):
            stop = start + batch_size
            scores = model(test_seq[start:stop].to(device), test_mask[start:stop].to(device))
            batch_preds.append(scores.argmax(1).cpu())

    if not batch_preds:
        return np.empty(0, dtype=np.int64)
    return torch.cat(batch_preds).numpy()

In [None]:

# Reload the best checkpoint written by the training loop. It is a fully
# pickled model object, so `weights_only=False` is required on PyTorch versions
# whose torch.load defaults to weights-only loading, and `map_location` lets a
# checkpoint saved on the GPU be opened on whatever `device` is in use now.
# SECURITY NOTE: unpickling can execute arbitrary code - only load checkpoint
# files that you produced yourself.
model = torch.load("/data/model12345.pt", map_location=device, weights_only=False)

# Inference only, so run everything below without building autograd graphs.
with torch.no_grad():
    # re-check the reloaded model on the evaluation split
    preds, eval_labels = evaluate_eval(model)
    curr_f1 = f1_score(eval_labels, preds)
    print("eval_f1", curr_f1)
    print("eval_acc", sum(eval_labels == preds)/preds.shape[0])

    # get predictions for test data
    preds = evaluate(model)

In [None]:

# Sanity check: there must be exactly one prediction per collected test id.
assert len(preds) == len(test_labels)

In [None]:
# Write one "twitter_<id>,<label>" line per test example. The context manager
# closes answer.txt even if a write raises part-way through (the previous
# open()/close() pair leaked the handle in that case).
with open("answer.txt", 'w') as f:
    for i, example_id in enumerate(test_labels):
        # class 0 is NOT_SARCASM, anything else is SARCASM
        curr_pred = "NOT_SARCASM" if preds[i] == 0 else "SARCASM"
        line = 'twitter_{},{}\n'.format(example_id, curr_pred)
        f.write(line)
        print(line)
