In [1]:
# Importing libs
import re
import nltk
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler
from transformers import BertTokenizer, AdamW, BertForSequenceClassification, get_linear_schedule_with_warmup
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, matthews_corrcoef
from tqdm.notebook import trange

In [2]:
data = pd.read_csv('comments_clear.csv')

In [3]:
# Setting up default seeds
SEED = 1234
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

# Device = gpu
device = torch.device("cuda")

In [6]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=6).to(device)
model.load_state_dict(torch.load('trained_model.pt'))
model.eval()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [23]:
# Importing BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Prepare our data to feed it to the model
batch_size = 16
MAX_LEN = 128

input_comms_1 = [tokenizer.encode(sent, add_special_tokens=True, max_length=MAX_LEN, pad_to_max_length=True) for sent in data.body.values[:100000]]
input_comms_1 = torch.tensor(input_comms_1).type(torch.LongTensor)
att_masks_comms_1 = [[float(i>0) for i in seq] for seq in input_comms_1]
att_masks_comms_1 = torch.tensor(att_masks_comms_1).type(torch.LongTensor)

input_1 = TensorDataset(input_comms_1, att_masks_comms_1)
input_1_dataloader = DataLoader(input_1, shuffle=False, batch_size=batch_size)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [24]:
# Feeding data to our model to get predictions
results = []
for step, batch in enumerate(input_1_dataloader):
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask = batch
    output = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
    results.append(output.logits.softmax(dim=-1).tolist())

In [25]:
c1 = []
for a in results:
    for b in a:
        c1.append(b.index(max(b)))

In [26]:
# int to label dict
int_to_label = {
  4: "sadness",
  2: "joy",
  0: "anger",
  1: "fear",
  5: "surprise",
  3: "love"
}

data['enc_emote'] = c1
data['emote'] = list(map(lambda x: int_to_label[x], c1))
data.to_csv('comments_emotions.csv', index=False)

In [28]:
data2 = data[['link_id', 'emote']]

In [30]:
data2.to_csv('comments_emotions.csv', index=False)