In [None]:
!pip install sentencepiece
!pip install transformers
!pip install emoji



In [None]:
import transformers
import emoji
from transformers import  AdamW, get_linear_schedule_with_warmup
import re
import torch
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from transformers import DebertaV2Tokenizer, DebertaV2ForSequenceClassification
import sentencepiece
# Seed NumPy and PyTorch with one shared constant so splits and
# initialisation are repeatable across runs.
RANDOM_SEED = 69
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# Use the first CUDA GPU when one is available, otherwise the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda', index=0)

In [None]:
# Mount Google Drive at /content/drive; the dataset, tokenizer and checkpoint
# paths used in the following cells all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Show which accelerator the runtime provides. Guarded so the cell does not
# raise on a CPU-only runtime, matching the CPU fallback used for `device`.
torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'cpu'

'Tesla V100-SXM2-16GB'

In [None]:
# Read the CSV from the mounted Drive into `emoji_data` and print its
# column / dtype / memory summary.
emoji_data = pd.read_csv('/content/drive/MyDrive/Emogen/emogen_mask_70.csv')
emoji_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3071487 entries, 0 to 3071486
Data columns (total 4 columns):
 #   Column         Dtype 
---  ------         ----- 
 0   Text           object
 1   Label          object
 2   Unmasked_Text  object
 3   Class_Text     object
dtypes: object(4)
memory usage: 93.7+ MB


In [None]:
# Distinct labels in order of first appearance; a label's position in this
# array is its integer class id. Missing labels are not a class.
class_names = emoji_data.Label.dropna().unique()
# Derive the id range from the data instead of hard-coding the class count,
# so zip() below can never silently truncate.
values = np.arange(len(class_names))
# NOTE: despite their names, dictionary_emo_val maps id -> label and
# dictionary_val_emo maps label -> id.
dictionary_emo_val = dict(zip(values, class_names))
dictionary_val_emo = dict(zip(class_names, values))
# Encode ONLY the Label column. DataFrame.replace would also rewrite any text
# cell that happens to equal a label, and scans every column to do so.
emoji_updated = emoji_data.assign(Label=emoji_data.Label.map(dictionary_val_emo))

In [None]:
# Directory of the pretrained checkpoint the classifier is initialised from.
PRE_TRAINED_MODEL_NAME = '/content/drive/MyDrive/Emogen/emogen_mask_base_debert'
# Tokenizer loaded from its own saved directory on Drive.
tokenizer = DebertaV2Tokenizer.from_pretrained('/content/drive/MyDrive/Emogen/emogen_mask_base_tokenizer')
# Passed as max_len to the data loaders: every sentence is padded or
# truncated to this many tokens by EmojiDataset.
MAX_LEN = 64

In [None]:
class EmojiDataset(Dataset):
  """Map-style dataset that tokenizes one sentence per lookup.

  Args:
    sentences: indexable collection of texts (each is cast to ``str``).
    targets: indexable collection of integer class ids, aligned with
      ``sentences``.
    tokenizer: object exposing ``encode_plus`` that returns a mapping with
      ``input_ids`` and ``attention_mask`` tensors.
    max_len: length every encoding is padded or truncated to.
  """

  def __init__(self, sentences, targets, tokenizer, max_len):
    self.sentences, self.targets = sentences, targets
    self.tokenizer, self.max_len = tokenizer, max_len

  def __len__(self):
    """Number of sentences in the dataset."""
    return len(self.sentences)

  def __getitem__(self, item):
    """Return the raw text, its encoded tensors and its label for ``item``."""
    text = str(self.sentences[item])
    label = self.targets[item]

    encode_options = dict(
      add_special_tokens=True,
      max_length=self.max_len,
      return_token_type_ids=False,
      padding='max_length',
      return_attention_mask=True,
      return_tensors='pt',
      truncation=True,
    )
    encoded = self.tokenizer.encode_plus(text, **encode_options)

    # return_tensors='pt' yields a leading batch axis; flatten() drops it so
    # the DataLoader can stack samples itself.
    sample = {'review_text': text}
    for key in ('input_ids', 'attention_mask'):
      sample[key] = encoded[key].flatten()
    sample['targets'] = torch.tensor(label, dtype=torch.long)
    return sample

In [None]:
def create_data_loader(df, tokenizer, max_len, batch_size, shuffle=False, num_workers=4):
  """Wrap a dataframe split in an EmojiDataset and return a DataLoader over it.

  Args:
    df: dataframe with a ``Class_Text`` column (input text) and a ``Label``
      column (integer class id).
    tokenizer: tokenizer handed to EmojiDataset.
    max_len: padded/truncated token length of every sample.
    batch_size: number of samples per batch.
    shuffle: reshuffle the samples every epoch. Defaults to False, which
      keeps the original fixed iteration order; pass True for training data.
    num_workers: number of worker processes used for loading.

  Returns:
    A ``torch.utils.data.DataLoader`` yielding dict batches with the keys
    ``review_text``, ``input_ids``, ``attention_mask`` and ``targets``.
  """
  ds = EmojiDataset(
    # to_numpy() gives positional indexing, independent of the (shuffled)
    # dataframe index left behind by train_test_split.
    sentences=df.Class_Text.to_numpy(),
    targets=df.Label.to_numpy(),
    tokenizer=tokenizer,
    max_len=max_len
  )

  return DataLoader(
    ds,
    batch_size=batch_size,
    shuffle=shuffle,
    num_workers=num_workers
  )

In [None]:
# Size the classification head from the label set rather than a hard-coded
# count, so it always agrees with the ids assigned to `class_names`.
model = DebertaV2ForSequenceClassification.from_pretrained(PRE_TRAINED_MODEL_NAME, num_labels=len(class_names))

Some weights of the model checkpoint at /content/drive/MyDrive/Emogen/emogen_mask_base_debert were not used when initializing DebertaV2ForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaV2ForSequenceClassification were not initialized 

In [None]:
# Drop rows with any missing value, then split 90% train / 10% hold-out.
emoji_updated = emoji_updated.dropna()
df_train, df_test = train_test_split(emoji_updated, test_size=0.1, random_state=RANDOM_SEED)
# Halve the hold-out into validation and test. `df_test` holds the whole
# hold-out on the right-hand side and is then overwritten with its test half.
df_val, df_test = train_test_split(df_test, test_size=0.5, random_state=RANDOM_SEED)
df_train.shape, df_val.shape, df_test.shape

((2764338, 4), (153574, 4), (153575, 4))

In [None]:
# One loader per split, all with the same batch size and token length.
# NOTE(review): the training loader is built exactly like the evaluation
# loaders, so every epoch visits the training rows in the same order.
BATCH_SIZE = 64
train_data_loader = create_data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE)
val_data_loader = create_data_loader(df_val, tokenizer, MAX_LEN, BATCH_SIZE)
test_data_loader = create_data_loader(df_test, tokenizer, MAX_LEN, BATCH_SIZE)

In [None]:
#model = SentimentClassifier(len(class_names))
# Move the classifier to the selected device before building the optimizer.
model = model.to(device)
EPOCHS = 5
optimizer = AdamW(model.parameters(), lr=1.5e-5, eps = 1e-6, weight_decay = .01)
# The scheduler is stepped once per batch in train_epoch, so the schedule
# length is batches-per-epoch times the number of epochs.
total_steps = len(train_data_loader) * EPOCHS
# Linear decay over the whole run with no warm-up steps.
scheduler = get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps=0,
  num_training_steps=total_steps
)
# Passed to train_epoch / eval_model, which read the loss from the model
# outputs instead; test_model is the function that calls it directly.
loss_fn = nn.CrossEntropyLoss().to(device)



In [None]:
# Inspect how many tokens the loaded tokenizer reports.
len(tokenizer)

128016

In [None]:
from tqdm.auto import tqdm
def train_epoch(
  model,
  data_loader,
  loss_fn,
  optimizer,
  device,
  scheduler,
  n_examples
):
  """Run one optimisation pass over ``data_loader``.

  Args:
    model: classifier whose forward pass accepts ``input_ids``,
      ``attention_mask`` and ``labels`` and returns an object exposing
      ``.loss`` and ``.logits``.
    data_loader: iterable of dict batches with the keys ``input_ids``,
      ``attention_mask`` and ``targets``.
    loss_fn: unused; kept for interface compatibility because the loss is
      taken from the model outputs.
    optimizer: optimizer stepped once per batch.
    device: device the batch tensors are moved to.
    scheduler: learning-rate scheduler stepped once per batch.
    n_examples: number of samples used as the accuracy denominator.

  Returns:
    ``(accuracy, mean_loss)``: accuracy is a 0-dim double tensor and
    mean_loss is the mean of the per-batch losses (NaN when the loader
    yields no batches).
  """
  model = model.train()

  losses = []
  correct_predictions = 0
  loop = tqdm(data_loader)
  for d in loop:
    input_ids = d["input_ids"].to(device)
    attention_mask = d["attention_mask"].to(device)
    targets = d["targets"].to(device)

    # Clear gradients before the forward/backward pass so that leftovers from
    # an interrupted previous run cannot leak into this step.
    optimizer.zero_grad()

    outputs = model(
      input_ids=input_ids,
      attention_mask=attention_mask,
      labels = targets
    )

    _, preds = torch.max(outputs.logits, dim=1)
    loss = outputs.loss

    # Accumulate as a Python int so an empty loader still produces a result.
    correct_predictions += torch.sum(preds == targets).item()
    losses.append(loss.item())

    loss.backward()
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    scheduler.step()

  accuracy = torch.tensor(correct_predictions, dtype=torch.double) / n_examples
  mean_loss = np.mean(losses) if losses else float('nan')
  return accuracy, mean_loss

def eval_model(model, data_loader, loss_fn, device, n_examples):
  """Score ``model`` on ``data_loader`` without updating its weights.

  Args:
    model: classifier whose forward pass accepts ``input_ids``,
      ``attention_mask`` and ``labels`` and returns an object exposing
      ``.loss`` and ``.logits``.
    data_loader: iterable of dict batches with the keys ``input_ids``,
      ``attention_mask`` and ``targets``.
    loss_fn: unused; kept for interface compatibility because the loss is
      taken from the model outputs.
    device: device the batch tensors are moved to.
    n_examples: number of samples used as the accuracy denominator.

  Returns:
    ``(accuracy, mean_loss)``: accuracy is a 0-dim double tensor and
    mean_loss is the mean of the per-batch losses (NaN when the loader
    yields no batches).
  """
  model = model.eval()

  losses = []
  correct_predictions = 0

  with torch.no_grad():
    for d in data_loader:
      input_ids = d["input_ids"].to(device)
      attention_mask = d["attention_mask"].to(device)
      targets = d["targets"].to(device)

      outputs = model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        labels = targets
      )
      _, preds = torch.max(outputs.logits, dim=1)

      loss = outputs.loss

      # Accumulate as a Python int so an empty loader still produces a result
      # instead of failing on int.double().
      correct_predictions += torch.sum(preds == targets).item()
      losses.append(loss.item())

  accuracy = torch.tensor(correct_predictions, dtype=torch.double) / n_examples
  mean_loss = np.mean(losses) if losses else float('nan')
  return accuracy, mean_loss

In [None]:
#checkpoint = torch.load('/content/drive/MyDrive/Emogen/class_checkpoint_3_28')
#optimizer = AdamW(model.parameters(), lr=1.5e-5, eps = 1e-6, weight_decay = .01)
#optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
#model.load_state_dict(checkpoint['model_state_dict'])

In [None]:
%%time

# Best validation accuracy seen so far; the model is exported whenever a new
# epoch beats it.
best_accuracy = 0

for epoch in range(EPOCHS):

  print(f'Epoch {epoch + 1}/{EPOCHS}')
  print('-' * 10)

  train_acc, train_loss = train_epoch(
    model,
    train_data_loader,
    loss_fn,
    optimizer,
    device,
    scheduler,
    len(df_train)
  )
  # Resumable checkpoint written after every epoch. The scheduler state and
  # the number of finished epochs are stored as well, so a resumed run can
  # continue the learning-rate schedule instead of restarting it.
  torch.save({
            'epoch': epoch + 1,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'scheduler_state_dict': scheduler.state_dict()
            }, '/content/drive/MyDrive/Emogen/class_checkpoint_3_28')
  print(f'Train loss {train_loss} accuracy {train_acc}')

  val_acc, val_loss = eval_model(
    model,
    val_data_loader,
    loss_fn,
    device,
    len(df_val)
  )

  print(f'Val   loss {val_loss} accuracy {val_acc}')
  print()

  # Export the weights (local .bin plus model/tokenizer directories on Drive)
  # only when validation accuracy improves.
  if val_acc > best_accuracy:
    torch.save(model.state_dict(), 'best_model_state.bin')
    model.save_pretrained("/content/drive/MyDrive/Emogen/emogen_MLM_BASE_CLASS_debert")
    tokenizer.save_pretrained("/content/drive/MyDrive/Emogen/emogen_MLM_BASE_CLASS_tokenizer")
    best_accuracy = val_acc

Epoch 1/5
----------


  0%|          | 0/43193 [00:00<?, ?it/s]

Train loss 3.154821055178273 accuracy 0.18598340723891216
Val   loss 3.0620860676964123 accuracy 0.20790628622032375

Epoch 2/5
----------


  0%|          | 0/43193 [00:00<?, ?it/s]

Train loss 3.0113583147612726 accuracy 0.2167470114001978
Val   loss 3.024906112452348 accuracy 0.21671637126076027

Epoch 3/5
----------


  0%|          | 0/43193 [00:00<?, ?it/s]

Train loss 2.9338620243832483 accuracy 0.23256562692405922
Val   loss 3.017226280172666 accuracy 0.2204344485394663

Epoch 4/5
----------


  0%|          | 0/43193 [00:00<?, ?it/s]

Train loss 2.8730007400251716 accuracy 0.2447909770802268
Val   loss 3.010729952255885 accuracy 0.22266789951424068

Epoch 5/5
----------


  0%|          | 0/43193 [00:00<?, ?it/s]

Train loss 2.828152218091966 accuracy 0.25383039266544105
Val   loss 3.010549220144749 accuracy 0.22317579798663836

CPU times: user 17h 6min 48s, sys: 33min 17s, total: 17h 40min 6s
Wall time: 17h 36min 35s


In [None]:
# Write the final training state to the checkpoint path used during training.
# The scheduler state is included so the learning-rate schedule can be
# restored together with the model and optimizer.
torch.save({
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'scheduler_state_dict': scheduler.state_dict()
            }, '/content/drive/MyDrive/Emogen/class_checkpoint_3_28')

In [None]:
# Re-score the in-memory model on the validation split and report the result.
val_acc, val_loss = eval_model(model, val_data_loader, loss_fn, device, len(df_val))
print(f'Val   loss {val_loss} accuracy {val_acc}')

In [None]:
# Export the in-memory model and tokenizer to directories relative to the
# current working directory (not to the Drive paths used elsewhere).
model.save_pretrained("emogen_class_small_debertxsmall")
tokenizer.save_pretrained("emogen_class_small_tokenizer")

In [None]:
# NOTE(review): SentimentClassifier is not defined anywhere in the visible
# notebook, so this cell presumably raises NameError on a fresh kernel — confirm.
# NOTE(review): this rebinds `model`, so the following evaluation cells score
# the weights loaded from this .bin file rather than the model trained above
# — confirm that is intended.
model = SentimentClassifier(len(class_names))
model.load_state_dict(torch.load('/content/drive/MyDrive/Emogen/Emogen_test2emojis_smalldataset.bin'))
model = model.to(device)

In [None]:
# Top-1 accuracy on the held-out test split; the loss is discarded.
test_acc, _ = eval_model(model, test_data_loader, loss_fn, device, len(df_test))

test_acc.item()

In [None]:
def get_predictions(model, data_loader, device=None):
  """Collect top-1 predictions and true labels for every sample in a loader.

  Args:
    model: classifier whose forward pass accepts ``input_ids`` and
      ``attention_mask`` and returns an object exposing ``.logits``.
    data_loader: iterable of dict batches with the keys ``input_ids``,
      ``attention_mask`` and ``targets``.
    device: device the inputs are moved to. When omitted, the device of the
      model's first parameter is used (CPU if the model has no parameters).

  Returns:
    ``(predictions, real_values)``: two 1-D CPU tensors of equal length,
    both empty when the loader yields no batches.
  """
  model = model.eval()

  if device is None:
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

  batch_predictions = []
  batch_targets = []

  with torch.no_grad():
    for d in data_loader:
      input_ids = d["input_ids"].to(device)
      attention_mask = d["attention_mask"].to(device)

      # One forward pass per batch; labels are not needed to read the logits.
      outputs = model(
        input_ids=input_ids,
        attention_mask=attention_mask
      )
      _, preds = torch.max(outputs.logits, dim=1)

      # Keep whole batches and concatenate once at the end instead of
      # stacking one 0-dim tensor per sample.
      batch_predictions.append(preds.cpu())
      batch_targets.append(d["targets"].cpu())

  if not batch_predictions:
    return torch.empty(0, dtype=torch.long), torch.empty(0, dtype=torch.long)

  return torch.cat(batch_predictions), torch.cat(batch_targets)

In [None]:
# Predicted and true class ids for the whole test split.
y_pred, y_test = get_predictions(model, test_data_loader)

In [None]:
# Per-class precision / recall / F1 on the test split, with rows named after
# the entries of `class_names` and values shown to four decimals.
print(classification_report(y_test, y_pred, target_names=class_names, digits = 4))

              precision    recall  f1-score   support

           😭     0.1612    0.1704    0.1657      3221
           😂     0.1439    0.1657    0.1540      3228
           🤣     0.1666    0.1321    0.1474      3081
           🙃     0.1608    0.1149    0.1340      2968
           😶     0.1447    0.0851    0.1071      3068
           🥺     0.2396    0.3566    0.2866      3079
           😊     0.2486    0.3202    0.2799      3139
           🔥     0.3122    0.4378    0.3644      2940
           😍     0.2132    0.2499    0.2301      3125
           🥰     0.2105    0.1764    0.1920      3123
           😅     0.1290    0.0982    0.1116      3003
           🤪     0.1560    0.0929    0.1165      2895
           😱     0.2165    0.2218    0.2191      2962
           😆     0.1573    0.0833    0.1089      3015
           😇     0.3193    0.2280    0.2660      2996
           🤔     0.1959    0.3765    0.2577      2762
           🤧     0.1935    0.1186    0.1471      3203
           😃     0.2238    

In [None]:
def test_preds():
  """Prompt for a sentence and return the three most likely labels for it.

  Reads the notebook-level ``tokenizer``, ``model``, ``device`` and
  ``class_names``.

  Returns:
    The three entries of ``class_names`` with the highest scores, best first.
  """
  # The built-in input() is used directly; `builtins` is never imported here.
  text = input('Gimme a Sentence:   ')
  # NOTE(review): inputs are padded to 150 tokens here while the loaders use
  # MAX_LEN — confirm the difference is intentional.
  x = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=150,
        return_token_type_ids=False,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt',
        truncation=True
      ).to(device)
  # Inference only: disable dropout and gradient tracking.
  model.eval()
  with torch.no_grad():
    outputs = model(x['input_ids'], x['attention_mask'])
  # Accept both a raw logits tensor and an output object exposing `.logits`.
  logits = getattr(outputs, 'logits', outputs)
  _, preds = torch.topk(logits, k=3, dim=1)
  return class_names[preds.detach().cpu().numpy()[0]]

In [None]:
def test_model(model, data_loader, loss_fn, device, n_examples):
  model = model.eval()

  losses = []
  correct_predictions = 0

  with torch.no_grad():
    for d in data_loader:
      input_ids = d["input_ids"].to(device)
      attention_mask = d["attention_mask"].to(device)
      targets = d["targets"].to(device)

      outputs = model(
        input_ids=input_ids,
        attention_mask=attention_mask
      )
      _, preds = torch.topk(outputs,k=3,dim= 1)
      for index, target in enumerate(list(targets)):
          if target in preds[index]:
            correct_predictions += 1
      loss = loss_fn(outputs, targets)

      #correct_predictions += torch.sum(targets in preds)
      losses.append(loss.item())

  return float(correct_predictions) / n_examples, np.mean(losses)

In [None]:
# Top-3 accuracy and mean loss on the held-out test split.
test_acc, test_loss = test_model(model, test_data_loader, loss_fn, device, len(df_test))