# **Data Preprocessing**

In [1]:
import torch
import torchvision
import transformers
import pandas as pd
import numpy as np

path = "/kaggle/input/covid-19-nlp-text-classification/Corona_NLP_train.csv"

train_df = pd.read_csv(path, encoding='latin1')
train_df.Sentiment.replace(to_replace='Extremely Positive', value='Positive',inplace=True)
train_df.Sentiment.replace(to_replace='Extremely Negative', value='Negative',inplace=True)
print(train_df.Sentiment.value_counts())
corona_train = train_df[['OriginalTweet','Sentiment']]

Sentiment
Positive    18046
Negative    15398
Neutral      7713
Name: count, dtype: int64


In [2]:
import re
import random

def process_text(text):
    text = text.lower()
    text = re.sub("https*\S+", "[ URL ]", text)
    # remove extra spaces
    text = re.sub('\n', ' ', text)
    text = re.sub('\s{2,}',' ', text)
    text = re.sub('[^a-zA-Z\s]', '', text)
    return text

def set_seed(seed):
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

set_seed(114514)
corona_train.OriginalTweet = corona_train.OriginalTweet.apply(process_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  corona_train.OriginalTweet = corona_train.OriginalTweet.apply(process_text)


In [3]:
train_list = []
test_list = []

text_id = 'OriginalTweet'
target_id = 'Sentiment'
# Print unique values in the 'Sentiment' column
unique_values = corona_train[target_id].unique()
print("Unique values in 'Sentiment' column:", unique_values)

# Create a dictionary mapping each unique value to an index
result_dict = {value: index for index, value in enumerate(unique_values, 0)}
print("Resulting dictionary:", result_dict)


def get_list(train_df):
    train_list = []
    for i in range(len(train_df)):
        text = train_df.loc[i][text_id]
        text = text.split(" ")
        text_len = text.__len__()
        origin_text = ""
        for a in text:
            if a != " ":
                origin_text += (a.replace(" ", "") + " ")

        one = {
            'id': i + 1,
            'text': origin_text,
            'text_length': text_len,
            'target': result_dict[train_df.loc[i][target_id]]
        }
        train_list.append(one)

    return train_list

train_data = get_list(corona_train)
random.shuffle(train_data)
length = len(train_data)
train_ratio, dev_ratio = 0.9, 0.1
train_dataset = train_data[:int(train_ratio * length) ]
dev_dataset = train_data[int(train_ratio * length): ]

import torch
import numpy
import pandas as pd

# Define dataset
class Dataset(torch.utils.data.Dataset):
    def __init__(self, dataset):
        self.dataset = dataset

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, i):
        text = self.dataset[i]['text']
        label = self.dataset[i]['target']
        return text, label

trainD = Dataset(train_dataset)
devD = Dataset(dev_dataset)

len(trainD), trainD[0]
a = []
for i in trainD:
    a.append(len(i[0].split(" ")))

std = numpy.std(a)
mean = numpy.mean(a)
maxlen, minlen = numpy.max(a), numpy.min(a)

# Print standard deviation and mean
print("Standard Deviation:", std)
print("Mean:", mean)

# Print maximum length and minimum length
print("Maximum Length:", maxlen)
print("Minimum Length:", minlen)

Unique values in 'Sentiment' column: ['Neutral' 'Positive' 'Negative']
Resulting dictionary: {'Neutral': 0, 'Positive': 1, 'Negative': 2}
Standard Deviation: 11.437030662258575
Mean: 32.68024621365514
Maximum Length: 65
Minimum Length: 3


# **BERT Tokenizer**

In [4]:
from transformers import BertTokenizer
model_id = "bert-base-uncased"
token = BertTokenizer.from_pretrained(model_id)

device = "cuda" if torch.cuda.is_available() else "cpu"
seq_len = 128

def collate_fn(data):
    sents = [i[0] for i in data]
    labels = [int(i[1]) for i in data]
    # Encode
    data = token.batch_encode_plus(batch_text_or_text_pairs=sents,
                                   truncation=True,
                                   padding='max_length',
                                   max_length=seq_len,
                                   return_tensors='pt',
                                   return_length=True).to(device)
    # input_ids: encoded numbers
    # attention_mask: is zero-padded position is 0, other position is 1
    input_ids = data['input_ids']
    attention_mask = data['attention_mask']
    token_type_ids = data['token_type_ids']
    labels = torch.LongTensor(labels)
    return input_ids, attention_mask, token_type_ids, labels

# Data loader
loader = torch.utils.data.DataLoader(dataset=trainD,
                                     batch_size=64,
                                     collate_fn=collate_fn,
                                     shuffle=True,
                                     drop_last=False)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

# **Bert-RCNN model**

In [6]:
from transformers import BertModel
from accelerate import Accelerator
from datasets import load_metric
import accelerate
from sklearn.metrics import confusion_matrix, classification_report, f1_score

model_id = 'bert-base-uncased'

class TModel(torch.nn.Module):
    def __init__(self, num_classes=3):
        super().__init__()
        self.pretrained = BertModel.from_pretrained(model_id)
        self.lstm = torch.nn.LSTM(input_size=768, bidirectional=True, hidden_size=768, num_layers=1,
                                  batch_first=False)  # LSTM model
        self.pooling = torch.nn.MaxPool1d(kernel_size=seq_len)  # Max pooling layer
        self.fc = torch.nn.Sequential(
            torch.nn.Dropout(0.25),
            torch.nn.Linear(768 * 3, num_classes),  # No for softmax function, because the cross-entropy loss function in pytorch already included softmax.
        )

    def forward(self, input_ids, attention_mask, token_type_ids):
        out = self.pretrained(input_ids=input_ids,
                              attention_mask=attention_mask,
                              token_type_ids=token_type_ids,
                              output_hidden_states=False)
        hidden_states = out[0]  # Sentence feature vector through Bert [1]
        pooler_out = out[1] # [CLS] [2]
        aim = hidden_states.permute(1, 0, 2) # [seq, batchsize, dim]
        deep_mean, _ = self.lstm(aim)
        deep_mean = deep_mean.permute(1, 0, 2)   # [batchsize, seq, dim]
        deep_mean = deep_mean.permute(0, 2, 1)   # [batchsize, dim, seq]
        out = self.pooling(deep_mean).squeeze(-1)
        out = torch.cat([pooler_out, out], dim=-1)
        out = self.fc(out)
        return out

model = TModel().to(device)
accelerator = Accelerator()
accelerate.__version__

'0.25.0'

# Model training and validation


In [7]:
# Import necessary libraries
from transformers import AdamW
from sklearn.metrics import f1_score, recall_score, accuracy_score
import torch

# Set the number of epochs and configure the optimizer
epoch = 10
optimizer = AdamW(model.parameters(), lr=5e-5, weight_decay=1e-4)
criterion = torch.nn.CrossEntropyLoss()
acc_max = 0.0
model_save_dir = 'saved_Corona_Bert_RCNN.pth'

# Prepare the model, optimizer, and data loader for distributed training
model, optimizer, loader = accelerator.prepare(model, optimizer, loader)

# Create a dev data loader
loader_dev = torch.utils.data.DataLoader(dataset=devD,
                                          batch_size=64,
                                          collate_fn=collate_fn,
                                          shuffle=False,
                                          drop_last=False)

# Load the 'mrpc' metric from the 'glue' dataset
states = load_metric('glue', 'mrpc')

# Lists to store metrics and losses during training
epochs_list = []
accuracies_list = []
recalls_list = []
f1_list = []
train_losses = []
test_losses = []

# Training loop
for j in range(epoch):
    model.train()
    epochs_list.append(j + 1)
    total_train_loss = 0.0
    total_train_hits = 0
    nums = 0
    pre_labels = []
    real_labels = []

    # Iterate over the training data loader
    for i, (input_ids, attention_mask, token_type_ids, labels) in enumerate(loader):
        out = model(input_ids=input_ids,
                    attention_mask=attention_mask,
                    token_type_ids=token_type_ids).to(device)

        # Compute and backpropagate the loss
        loss = criterion(out.to(device), labels.to(device))
        accelerator.backward(loss)
        optimizer.step()
        optimizer.zero_grad()

        # Calculate training accuracy
        out = out.argmax(dim=1)
        hit_nums = (out.cpu() == labels.cpu()).sum().item()
        total_train_hits += hit_nums

        accuracy = hit_nums / len(labels)
        pre_labels.extend(out.cpu().detach())
        real_labels.extend(labels.cpu().detach())

        nums += len(labels)
        total_train_loss += loss.item()

        # Print intermediate training metrics
        # if i % 100 == 0:
        #     print("No.{:} step:".format(i))
        #     print(i, loss.item(), accuracy)

    # Calculate and print training metrics
    cur_train_loss = total_train_loss / len(loader)
    cur_train_acc = accuracy_score(y_true=real_labels, y_pred=pre_labels)
    print("Train  loss: {:.3f} acc : {:.3f}".format(cur_train_loss, cur_train_acc))
    train_losses.append(cur_train_loss)

    # Evaluation on the validation set
    model.eval()
    correct = 0
    total = 0
    total_loss = 0.0
    pre_val_labels = []
    real_val_labels = []
    count = 0

    # Iterate over the validation data loader
    for i, (input_ids, attention_mask, token_type_ids, labels) in enumerate(loader_dev):
        with torch.no_grad():
            out = model(input_ids=input_ids,
                        attention_mask=attention_mask,
                        token_type_ids=token_type_ids).to(device)
            loss = criterion(out.to(device), labels.to(device))
            count += 1
            total_loss += loss

        # Calculate validation accuracy
        out = out.argmax(dim=1)
        correct += (out.cpu().detach() == labels.cpu().detach()).sum().item()
        total += len(labels)

        pre_val_labels.extend(out.cpu().detach())
        real_val_labels.extend(labels.cpu().detach())

    # Calculate and print validation metrics
    cur_dev_acc = accuracy_score(y_true=real_val_labels, y_pred=pre_val_labels)
    cur_dev_f1 = f1_score(y_true=real_val_labels, y_pred=pre_val_labels, average='weighted')
    cur_dev_recall = recall_score(y_true=real_val_labels, y_pred=pre_val_labels, average='weighted')

    accuracies_list.append(cur_dev_acc)
    f1_list.append(cur_dev_f1)
    recalls_list.append(cur_dev_recall)

    # Print classification report
    cr = classification_report(y_true=real_val_labels, y_pred=pre_val_labels, digits=4)
    print(cr)

    # Update the model if validation accuracy improves
    cur_acc = cur_dev_acc
    if cur_acc > acc_max:
        acc_max = cur_acc
        print("Saving Model...")
        torch.save(model.state_dict(), model_save_dir)

    avg_loss = total_loss / count
    test_losses.append(avg_loss)

    print("Epoch {:}  loss: {:.3f} acc : {:.3f}".format(j + 1, avg_loss, cur_acc)) # Dev Result



Downloading builder script:   0%|          | 0.00/1.84k [00:00<?, ?B/s]

Train  loss: 0.466 acc : 0.825
              precision    recall  f1-score   support

           0     0.9449    0.7605    0.8427       789
           1     0.9259    0.8857    0.9054      1793
           2     0.8301    0.9557    0.8885      1534

    accuracy                         0.8878      4116
   macro avg     0.9003    0.8673    0.8788      4116
weighted avg     0.8939    0.8878    0.8871      4116

Saving Model...
Epoch 1  loss: 0.324 acc : 0.888
Train  loss: 0.241 acc : 0.921
              precision    recall  f1-score   support

           0     0.9562    0.8023    0.8725       789
           1     0.9176    0.9314    0.9244      1793
           2     0.8819    0.9394    0.9097      1534

    accuracy                         0.9096      4116
   macro avg     0.9186    0.8910    0.9022      4116
weighted avg     0.9117    0.9096    0.9090      4116

Saving Model...
Epoch 2  loss: 0.271 acc : 0.910
Train  loss: 0.162 acc : 0.948
              precision    recall  f1-score   s

# Test

In [17]:
from sklearn.metrics import precision_recall_fscore_support

path = "/kaggle/input/covid-19-nlp-text-classification/Corona_NLP_test.csv"

test_df = pd.read_csv(path, encoding='latin1')
test_df.Sentiment.replace(to_replace='Extremely Positive', value='Positive',inplace=True)
test_df.Sentiment.replace(to_replace='Extremely Negative', value='Negative',inplace=True)
print(test_df.Sentiment.value_counts())
corona_test = test_df[['OriginalTweet','Sentiment']]

corona_test.OriginalTweet = corona_test.OriginalTweet.apply(process_text)
test_data = get_list(test_df)
testD = Dataset(test_data)

# Testing
pre_val_labels = []  # Predicted labels list
real_val_labels = []  # True labels list

def test():
    # Load the pre-trained model
    save_path = "./saved_Corona_Bert_RCNN.pth"
    model = TModel().to(device)
    model.load_state_dict(torch.load(save_path))
    model.eval()

    # Create a data loader for the test dataset
    loader_test = torch.utils.data.DataLoader(dataset=testD,
                                              batch_size=1,
                                              collate_fn=collate_fn,
                                              shuffle=False,
                                              drop_last=False)

    correct = 0  # Counter for correct predictions
    total = 0  # Counter for total samples

    for i, (input_ids, attention_mask, token_type_ids, labels) in enumerate(loader_test):
        with torch.no_grad():
            out = model(input_ids=input_ids,
                        attention_mask=attention_mask,
                        token_type_ids=token_type_ids).to(device)

        out = out.argmax(dim=1)
        correct += (out.cpu().detach() == labels.cpu().detach()).sum().item()
        total += len(labels)
        pre_val_labels.extend(out.cpu().detach())  # Extend the predicted labels list
        real_val_labels.extend(labels.cpu().detach())  # Extend the true labels list

    # Display classification report
    cr = classification_report(y_true=real_val_labels, y_pred=pre_val_labels, digits=4)
    print(cr)
    

# Execute the test function
test()

Sentiment
Negative    1633
Positive    1546
Neutral      619
Name: count, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  corona_test.OriginalTweet = corona_test.OriginalTweet.apply(process_text)


              precision    recall  f1-score   support

           0     0.8965    0.7835    0.8362       619
           1     0.8656    0.9334    0.8982      1546
           2     0.9170    0.8928    0.9047      1633

    accuracy                         0.8915      3798
   macro avg     0.8930    0.8699    0.8797      3798
weighted avg     0.8927    0.8915    0.8909      3798

