In [None]:
# Mount Google Drive at /content/drive (Colab only); DATASET_PATH in the
# imports cell points below this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## BERT+CNN Model
This model was proposed in <https://link.springer.com/chapter/10.1007/978-3-030-36687-2_77> and implemented by [@ZeroxTM](https://github.com/ZeroxTM) in [this repository](https://github.com/ZeroxTM/BERT-CNN-Fine-Tuning-For-Hate-Speech-Detection-in-Online-Social-Media).

By Alaa Grable

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

In [None]:
import pandas as pd
import numpy as np
import torch
from torch import nn
import gc
from preprocessing import load_and_process
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from models import BERT_CNN

# CSV passed to load_and_process(); it lives under the Google Drive mount point,
# so the drive.mount cell has to run first.
DATASET_PATH = '/content/drive/MyDrive/NLP-project/dataset/train.csv'

def train():
    """Run one training epoch over ``train_dataloader``.

    Uses the notebook-level globals ``model``, ``train_dataloader``,
    ``device``, ``cross_entropy`` and ``optimizer``. Each batch goes through a
    forward pass, a backward pass, gradient-norm clipping at 1.0 and an
    optimizer step. A single-line progress bar is redrawn after every batch.

    Returns:
        float: sum of the per-batch losses divided by the number of batches.
    """
    model.train()
    total_loss = 0
    total = len(train_dataloader)

    for step, batch in enumerate(train_dataloader, start=1):
        sent_id, mask, labels = [r.to(device) for r in batch]
        del batch
        # Free unreferenced objects and cached CUDA blocks before the forward pass.
        gc.collect()
        torch.cuda.empty_cache()

        model.zero_grad()
        preds = model(sent_id.long(), mask)
        loss = cross_entropy(preds, labels)
        total_loss += float(loss.item())
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        # Draw the bar only after this batch's loss has been accumulated, so the
        # displayed value is the true running mean over `step` batches.
        percent = "{0:.3f}".format(100 * (step / float(total)))
        lossp = "{0:.3f}".format(total_loss / step)
        filledLength = int(100 * step // total)
        bar = '█' * filledLength + '>' * (filledLength < 100) + '.' * (99 - filledLength)
        print(f'\rBatch {step}/{total} |{bar}| {percent}% complete, loss={lossp}', end='')

    gc.collect()
    torch.cuda.empty_cache()

    avg_loss = total_loss / len(train_dataloader)

    return avg_loss

# function for evaluating the model
def evaluate():
    """Compute the mean batch loss of ``model`` on ``val_dataloader``.

    Uses the notebook-level globals ``model``, ``val_dataloader``, ``device``
    and ``cross_entropy``. The model is switched to eval mode and every batch
    is scored under ``torch.no_grad()``. A single-line progress bar is redrawn
    after every batch.

    Returns:
        float: sum of the per-batch losses divided by the number of batches.
    """
    print("\n\nEvaluating...")
    model.eval()
    total_loss = 0
    total = len(val_dataloader)

    for step, batch in enumerate(val_dataloader, start=1):
        # push the batch to the target device
        sent_id, mask, labels = [t.to(device) for t in batch]
        del batch
        # Free unreferenced objects and cached CUDA blocks before the forward pass.
        gc.collect()
        torch.cuda.empty_cache()

        with torch.no_grad():
            preds = model(sent_id, mask)
            loss = cross_entropy(preds, labels)
            total_loss += float(loss.item())

        # Draw the bar only after this batch's loss has been accumulated, so the
        # displayed value is the true running mean over `step` batches.
        percent = "{0:.3f}".format(100 * (step / float(total)))
        lossp = "{0:.3f}".format(total_loss / step)
        filledLength = int(100 * step // total)
        bar = '█' * filledLength + '>' * (filledLength < 100) + '.' * (99 - filledLength)
        print(f'\rBatch {step}/{total} |{bar}| {percent}% complete, loss={lossp}', end='')

    gc.collect()
    torch.cuda.empty_cache()
    val_avg_loss = total_loss / len(val_dataloader)

    return val_avg_loss

# Run on the GPU when CUDA is available, otherwise on the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load the dataset
input_ids, attention_masks, labels = load_and_process(DATASET_PATH)
df = pd.DataFrame(list(zip(input_ids, attention_masks)), columns=['input_ids', 'attention_masks'])

# Hold out 20% of the rows (stratified on the labels) ...
train_text, temp_text, train_labels, temp_labels = train_test_split(
    df, labels, random_state=42, test_size=0.2, stratify=labels)

# ... then split the held-out part evenly into validation and test sets.
val_text, test_text, val_labels, test_labels = train_test_split(
    temp_text, temp_labels, random_state=42, test_size=0.5, stratify=temp_labels)

del temp_text
gc.collect()
torch.cuda.empty_cache()

train_count = len(train_labels)
test_count = len(test_labels)
val_count = len(val_labels)


# for train set
train_seq = torch.tensor(train_text['input_ids'].tolist())
train_mask = torch.tensor(train_text['attention_masks'].tolist())
train_y = torch.tensor(train_labels.tolist())

# for validation set
val_seq = torch.tensor(val_text['input_ids'].tolist())
val_mask = torch.tensor(val_text['attention_masks'].tolist())
val_y = torch.tensor(val_labels.tolist())

# for test set
test_seq = torch.tensor(test_text['input_ids'].tolist())
test_mask = torch.tensor(test_text['attention_masks'].tolist())
test_y = torch.tensor(test_labels.tolist())

from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

batch_size = 32

# Training batches are drawn in random order.
train_data = TensorDataset(train_seq, train_mask, train_y)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Validation batches keep the dataset order.
val_data = TensorDataset(val_seq, val_mask, val_y)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

model = BERT_CNN()

model = model.to(device)

# AdamW optimizer. torch.optim.AdamW is used instead of the deprecated
# transformers.AdamW; eps and weight_decay are pinned to the values the
# transformers implementation used by default so the update rule stays comparable.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

# NLLLoss consumes log-probabilities; presumably the model's forward() ends in
# a log-softmax -- TODO confirm in models.py.
cross_entropy = nn.NLLLoss()

epochs = 3
current = 1
# for each epoch
while current <= epochs:

    print(f'\nEpoch {current} / {epochs}:')

    # train model
    train_loss = train()

    # evaluate model
    valid_loss = evaluate()

    print(f'\n\nTraining Loss: {train_loss:.3f}')
    print(f'Validation Loss: {valid_loss:.3f}')

    current = current + 1





Epoch 1 / 3:
Batch 1217/1217 |████████████████████████████████████████████████████████████████████████████████████████████████████| 100.000% complete, loss=0.008

Evaluating...
Batch 153/153 |████████████████████████████████████████████████████████████████████████████████████████████████████| 100.000% complete, loss=0.006

Training Loss: 0.008
Validation Loss: 0.006

Epoch 2 / 3:
Batch 1217/1217 |████████████████████████████████████████████████████████████████████████████████████████████████████| 100.000% complete, loss=0.005

Evaluating...
Batch 153/153 |████████████████████████████████████████████████████████████████████████████████████████████████████| 100.000% complete, loss=0.007

Training Loss: 0.005
Validation Loss: 0.007

Epoch 3 / 3:
Batch 1217/1217 |████████████████████████████████████████████████████████████████████████████████████████████████████| 100.000% complete, loss=0.003

Evaluating...
Batch 153/153 |███████████████████████████████████████████████████████████████████

### Evaluation
Evaluating the whole test set in a single forward pass needs about 6 GiB of GPU memory, which is not available as free memory because of heavy fragmentation. So the model is evaluated on only the first 3000 sequences of the test set.
Because this model is impractical for industrial use (high memory consumption), I did not partition the input in order to evaluate the model on the whole test set.

In [None]:
gc.collect()
torch.cuda.empty_cache()

# Set eval mode explicitly instead of relying on evaluate() having been the
# last function to touch the model.
model.eval()

# Only the first 3000 test sequences are scored; the limit is kept so the
# report stays comparable with the recorded output of this cell.
eval_seq, eval_mask = test_seq[:3000], test_mask[:3000]

# Score in mini-batches so peak GPU memory is bounded by batch_size rather than
# by the size of the evaluated slice.
batch_preds = []
with torch.no_grad():
    for start in range(0, len(eval_seq), batch_size):
        stop = start + batch_size
        out = model(eval_seq[start:stop].to(device), eval_mask[start:stop].to(device))
        batch_preds.append(out.detach().cpu().numpy())
preds = np.concatenate(batch_preds, axis=0)


print("Performance:")
# Predicted class = index of the largest score in each row.
preds = np.argmax(preds, axis=1)
print('Classification Report')
print(classification_report(test_y[:3000], preds))

Performance:
Classification Report
              precision    recall  f1-score   support

           0       0.93      0.95      0.94      1989
           1       0.90      0.86      0.88      1011

    accuracy                           0.92      3000
   macro avg       0.91      0.90      0.91      3000
weighted avg       0.92      0.92      0.92      3000



## BERT+CNN-1D Model


In [None]:
from models import BERT_CNN_1D

def train():
    """Run one training epoch over ``train_dataloader``.

    Uses the notebook-level globals ``model``, ``train_dataloader``,
    ``device``, ``cross_entropy`` and ``optimizer``. Each batch goes through a
    forward pass, a backward pass, gradient-norm clipping at 1.0 and an
    optimizer step. A single-line progress bar is redrawn after every batch.

    Returns:
        float: sum of the per-batch losses divided by the number of batches.
    """
    model.train()
    total_loss = 0
    total = len(train_dataloader)

    for step, batch in enumerate(train_dataloader, start=1):
        sent_id, mask, labels = [r.to(device) for r in batch]
        del batch
        # Free unreferenced objects and cached CUDA blocks before the forward pass.
        gc.collect()
        torch.cuda.empty_cache()

        model.zero_grad()
        preds = model(sent_id.long(), mask)
        loss = cross_entropy(preds, labels)
        total_loss += float(loss.item())
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        # Draw the bar only after this batch's loss has been accumulated, so the
        # displayed value is the true running mean over `step` batches.
        percent = "{0:.3f}".format(100 * (step / float(total)))
        lossp = "{0:.3f}".format(total_loss / step)
        filledLength = int(100 * step // total)
        bar = '█' * filledLength + '>' * (filledLength < 100) + '.' * (99 - filledLength)
        print(f'\rBatch {step}/{total} |{bar}| {percent}% complete, loss={lossp}', end='')

    gc.collect()
    torch.cuda.empty_cache()

    # nn.NLLLoss() already averages over the samples of a batch, so the epoch
    # mean is obtained by dividing by the number of batches only. Dividing by
    # batch_size as well would under-report the loss and make it incomparable
    # with the value returned by evaluate().
    avg_loss = total_loss / len(train_dataloader)

    return avg_loss


# function for evaluating the model
def evaluate():
    """Compute the mean batch loss of ``model`` on ``val_dataloader``.

    Uses the notebook-level globals ``model``, ``val_dataloader``, ``device``
    and ``cross_entropy``. The model is switched to eval mode and every batch
    is scored under ``torch.no_grad()``. A single-line progress bar is redrawn
    after every batch.

    Returns:
        float: sum of the per-batch losses divided by the number of batches.
    """
    print("\n\nEvaluating...")
    model.eval()
    total_loss = 0
    total = len(val_dataloader)

    for step, batch in enumerate(val_dataloader, start=1):
        # push the batch to the target device
        sent_id, mask, labels = [t.to(device) for t in batch]
        del batch
        # Free unreferenced objects and cached CUDA blocks before the forward pass.
        gc.collect()
        torch.cuda.empty_cache()

        with torch.no_grad():
            preds = model(sent_id, mask)
            loss = cross_entropy(preds, labels)
            total_loss += float(loss.item())

        # Draw the bar only after this batch's loss has been accumulated, so the
        # displayed value is the true running mean over `step` batches.
        percent = "{0:.3f}".format(100 * (step / float(total)))
        lossp = "{0:.3f}".format(total_loss / step)
        filledLength = int(100 * step // total)
        bar = '█' * filledLength + '>' * (filledLength < 100) + '.' * (99 - filledLength)
        print(f'\rBatch {step}/{total} |{bar}| {percent}% complete, loss={lossp}', end='')

    gc.collect()
    torch.cuda.empty_cache()
    val_avg_loss = total_loss / len(val_dataloader)

    return val_avg_loss

# Run on the GPU when CUDA is available, otherwise on the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load the dataset
input_ids, attention_masks, labels = load_and_process(DATASET_PATH)
df = pd.DataFrame(list(zip(input_ids, attention_masks)), columns=['input_ids', 'attention_masks'])

# Hold out 20% of the rows (stratified on the labels) ...
train_text, temp_text, train_labels, temp_labels = train_test_split(
    df, labels, random_state=42, test_size=0.2, stratify=labels)

# ... then split the held-out part evenly into validation and test sets.
val_text, test_text, val_labels, test_labels = train_test_split(
    temp_text, temp_labels, random_state=42, test_size=0.5, stratify=temp_labels)

del temp_text
gc.collect()
torch.cuda.empty_cache()

train_count = len(train_labels)
test_count = len(test_labels)
val_count = len(val_labels)


# for train set
train_seq = torch.tensor(train_text['input_ids'].tolist())
train_mask = torch.tensor(train_text['attention_masks'].tolist())
train_y = torch.tensor(train_labels.tolist())

# for validation set
val_seq = torch.tensor(val_text['input_ids'].tolist())
val_mask = torch.tensor(val_text['attention_masks'].tolist())
val_y = torch.tensor(val_labels.tolist())

# for test set
test_seq = torch.tensor(test_text['input_ids'].tolist())
test_mask = torch.tensor(test_text['attention_masks'].tolist())
test_y = torch.tensor(test_labels.tolist())

from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

batch_size = 32

# Training batches are drawn in random order.
train_data = TensorDataset(train_seq, train_mask, train_y)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Validation batches keep the dataset order.
val_data = TensorDataset(val_seq, val_mask, val_y)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

model = BERT_CNN_1D()

model = model.to(device)

# AdamW optimizer. torch.optim.AdamW is used instead of the deprecated
# transformers.AdamW; eps and weight_decay are pinned to the values the
# transformers implementation used by default so the update rule stays comparable.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

# NLLLoss consumes log-probabilities; presumably the model's forward() ends in
# a log-softmax -- TODO confirm in models.py.
cross_entropy = nn.NLLLoss()

epochs = 3
current = 1
# for each epoch
while current <= epochs:

    print(f'\nEpoch {current} / {epochs}:')

    # train model
    train_loss = train()

    # evaluate model
    valid_loss = evaluate()

    print(f'\n\nTraining Loss: {train_loss:.3f}')
    print(f'Validation Loss: {valid_loss:.3f}')

    current = current + 1


Downloading:   0%|          | 0.00/433 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]


Epoch 1 / 3:
Batch 1217/1217 |████████████████████████████████████████████████████████████████████████████████████████████████████| 100.000% complete, loss=0.007

Evaluating...
Batch 153/153 |████████████████████████████████████████████████████████████████████████████████████████████████████| 100.000% complete, loss=0.006

Training Loss: 0.007
Validation Loss: 0.006

Epoch 2 / 3:
Batch 1217/1217 |████████████████████████████████████████████████████████████████████████████████████████████████████| 100.000% complete, loss=0.005

Evaluating...
Batch 153/153 |████████████████████████████████████████████████████████████████████████████████████████████████████| 100.000% complete, loss=0.007

Training Loss: 0.005
Validation Loss: 0.007

Epoch 3 / 3:
Batch 1217/1217 |████████████████████████████████████████████████████████████████████████████████████████████████████| 100.000% complete, loss=0.003

Evaluating...
Batch 153/153 |███████████████████████████████████████████████████████████████████

### Evaluation

In [None]:
gc.collect()
torch.cuda.empty_cache()

# Set eval mode explicitly instead of relying on evaluate() having been the
# last function to touch the model.
model.eval()

# Score the test set in mini-batches so peak GPU memory is bounded by
# batch_size rather than by the size of the whole test set.
batch_preds = []
with torch.no_grad():
    for start in range(0, len(test_seq), batch_size):
        stop = start + batch_size
        out = model(test_seq[start:stop].to(device), test_mask[start:stop].to(device))
        batch_preds.append(out.detach().cpu().numpy())
preds = np.concatenate(batch_preds, axis=0)

# Predicted class = index of the largest score in each row.
preds = np.argmax(preds, axis=1)
print("Performance:")
print("Classification Report:")
print(classification_report(test_y, preds))

Performance:
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.91      0.93      3245
           1       0.84      0.91      0.87      1623

    accuracy                           0.91      4868
   macro avg       0.90      0.91      0.90      4868
weighted avg       0.91      0.91      0.91      4868



## BERT+LSTM Model
This model was proposed in <https://link.springer.com/chapter/10.1007/978-3-030-36687-2_77>.

In [None]:
from models import BERT_LSTM

def train():
    """Run one training epoch over ``train_dataloader``.

    Uses the notebook-level globals ``model``, ``train_dataloader``,
    ``device``, ``cross_entropy`` and ``optimizer``. Each batch goes through a
    forward pass, a backward pass, gradient-norm clipping at 1.0 and an
    optimizer step. A single-line progress bar is redrawn after every batch.

    Returns:
        float: sum of the per-batch losses divided by the number of batches.
    """
    model.train()
    total_loss = 0
    total = len(train_dataloader)

    for step, batch in enumerate(train_dataloader, start=1):
        sent_id, mask, labels = [r.to(device) for r in batch]
        del batch
        # Free unreferenced objects and cached CUDA blocks before the forward pass.
        gc.collect()
        torch.cuda.empty_cache()

        model.zero_grad()
        preds = model(sent_id.long(), mask)
        loss = cross_entropy(preds, labels)
        total_loss += float(loss.item())
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        # Draw the bar only after this batch's loss has been accumulated, so the
        # displayed value is the true running mean over `step` batches.
        percent = "{0:.3f}".format(100 * (step / float(total)))
        lossp = "{0:.3f}".format(total_loss / step)
        filledLength = int(100 * step // total)
        bar = '█' * filledLength + '>' * (filledLength < 100) + '.' * (99 - filledLength)
        print(f'\rBatch {step}/{total} |{bar}| {percent}% complete, loss={lossp}', end='')

    gc.collect()
    torch.cuda.empty_cache()

    avg_loss = total_loss / len(train_dataloader)

    return avg_loss


# function for evaluating the model
def evaluate():
    """Compute the mean batch loss of ``model`` on ``val_dataloader``.

    Uses the notebook-level globals ``model``, ``val_dataloader``, ``device``
    and ``cross_entropy``. The model is switched to eval mode and every batch
    is scored under ``torch.no_grad()``. A single-line progress bar is redrawn
    after every batch.

    Returns:
        float: sum of the per-batch losses divided by the number of batches.
    """
    print("\n\nEvaluating...")
    model.eval()
    total_loss = 0
    total = len(val_dataloader)

    for step, batch in enumerate(val_dataloader, start=1):
        # push the batch to the target device
        sent_id, mask, labels = [t.to(device) for t in batch]
        del batch
        # Free unreferenced objects and cached CUDA blocks before the forward pass.
        gc.collect()
        torch.cuda.empty_cache()

        with torch.no_grad():
            preds = model(sent_id, mask)
            loss = cross_entropy(preds, labels)
            total_loss += float(loss.item())

        # Draw the bar only after this batch's loss has been accumulated, so the
        # displayed value is the true running mean over `step` batches.
        percent = "{0:.3f}".format(100 * (step / float(total)))
        lossp = "{0:.3f}".format(total_loss / step)
        filledLength = int(100 * step // total)
        bar = '█' * filledLength + '>' * (filledLength < 100) + '.' * (99 - filledLength)
        print(f'\rBatch {step}/{total} |{bar}| {percent}% complete, loss={lossp}', end='')

    gc.collect()
    torch.cuda.empty_cache()
    val_avg_loss = total_loss / len(val_dataloader)

    return val_avg_loss

# Run on the GPU when CUDA is available, otherwise on the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load the dataset
input_ids, attention_masks, labels = load_and_process(DATASET_PATH)
df = pd.DataFrame(list(zip(input_ids, attention_masks)), columns=['input_ids', 'attention_masks'])

# Hold out 20% of the rows (stratified on the labels) ...
train_text, temp_text, train_labels, temp_labels = train_test_split(
    df, labels, random_state=42, test_size=0.2, stratify=labels)

# ... then split the held-out part evenly into validation and test sets.
val_text, test_text, val_labels, test_labels = train_test_split(
    temp_text, temp_labels, random_state=42, test_size=0.5, stratify=temp_labels)

del temp_text
gc.collect()
torch.cuda.empty_cache()

train_count = len(train_labels)
test_count = len(test_labels)
val_count = len(val_labels)


# for train set
train_seq = torch.tensor(train_text['input_ids'].tolist())
train_mask = torch.tensor(train_text['attention_masks'].tolist())
train_y = torch.tensor(train_labels.tolist())

# for validation set
val_seq = torch.tensor(val_text['input_ids'].tolist())
val_mask = torch.tensor(val_text['attention_masks'].tolist())
val_y = torch.tensor(val_labels.tolist())

# for test set
test_seq = torch.tensor(test_text['input_ids'].tolist())
test_mask = torch.tensor(test_text['attention_masks'].tolist())
test_y = torch.tensor(test_labels.tolist())

from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

batch_size = 32

# Training batches are drawn in random order.
train_data = TensorDataset(train_seq, train_mask, train_y)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Validation batches keep the dataset order.
val_data = TensorDataset(val_seq, val_mask, val_y)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

model = BERT_LSTM()

model = model.to(device)

# AdamW optimizer. torch.optim.AdamW is used instead of the deprecated
# transformers.AdamW; eps and weight_decay are pinned to the values the
# transformers implementation used by default so the update rule stays comparable.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

# NLLLoss consumes log-probabilities; presumably the model's forward() ends in
# a log-softmax -- TODO confirm in models.py.
cross_entropy = nn.NLLLoss()

epochs = 3
current = 1
# for each epoch
while current <= epochs:

    print(f'\nEpoch {current} / {epochs}:')

    # train model
    train_loss = train()

    # evaluate model
    valid_loss = evaluate()

    print(f'\n\nTraining Loss: {train_loss:.3f}')
    print(f'Validation Loss: {valid_loss:.3f}')

    current = current + 1



Epoch 1 / 3:
Batch 1217/1217 |████████████████████████████████████████████████████████████████████████████████████████████████████| 100.000% complete, loss=0.007

Evaluating...
Batch 153/153 |████████████████████████████████████████████████████████████████████████████████████████████████████| 100.000% complete, loss=0.006

Training Loss: 0.007
Validation Loss: 0.006

Epoch 2 / 3:
Batch 1217/1217 |████████████████████████████████████████████████████████████████████████████████████████████████████| 100.000% complete, loss=0.005

Evaluating...
Batch 153/153 |████████████████████████████████████████████████████████████████████████████████████████████████████| 100.000% complete, loss=0.008

Training Loss: 0.005
Validation Loss: 0.008

Epoch 3 / 3:
Batch 1217/1217 |████████████████████████████████████████████████████████████████████████████████████████████████████| 100.000% complete, loss=0.003

Evaluating...
Batch 153/153 |███████████████████████████████████████████████████████████████████

### Evaluation

In [None]:
gc.collect()
torch.cuda.empty_cache()

# Set eval mode explicitly instead of relying on evaluate() having been the
# last function to touch the model.
model.eval()

# Score the test set in mini-batches so peak GPU memory is bounded by
# batch_size rather than by the size of the whole test set.
batch_preds = []
with torch.no_grad():
    for start in range(0, len(test_seq), batch_size):
        stop = start + batch_size
        out = model(test_seq[start:stop].to(device), test_mask[start:stop].to(device))
        batch_preds.append(out.detach().cpu().numpy())
preds = np.concatenate(batch_preds, axis=0)


print("Performance:")
# model's performance: predicted class = index of the largest score in each row
preds = np.argmax(preds, axis=1)
print('Classification Report')
print(classification_report(test_y, preds))

Performance:
Classification Report
              precision    recall  f1-score   support

           0       0.94      0.93      0.94      3245
           1       0.87      0.88      0.87      1623

    accuracy                           0.91      4868
   macro avg       0.90      0.91      0.90      4868
weighted avg       0.92      0.91      0.92      4868



## BERT+LSTM-CNN Model

In [None]:
from models import BERT_LSTM_CNN

def train():
    """Run one training epoch over ``train_dataloader``.

    Uses the notebook-level globals ``model``, ``train_dataloader``,
    ``device``, ``cross_entropy`` and ``optimizer``. Each batch goes through a
    forward pass, a backward pass, gradient-norm clipping at 1.0 and an
    optimizer step. A single-line progress bar is redrawn after every batch.

    Returns:
        float: sum of the per-batch losses divided by the number of batches.
    """
    model.train()
    total_loss = 0
    total = len(train_dataloader)

    for step, batch in enumerate(train_dataloader, start=1):
        sent_id, mask, labels = [r.to(device) for r in batch]
        del batch
        # Free unreferenced objects and cached CUDA blocks before the forward pass.
        gc.collect()
        torch.cuda.empty_cache()

        model.zero_grad()
        preds = model(sent_id.long(), mask)
        loss = cross_entropy(preds, labels)
        total_loss += float(loss.item())
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        # Draw the bar only after this batch's loss has been accumulated, so the
        # displayed value is the true running mean over `step` batches.
        percent = "{0:.3f}".format(100 * (step / float(total)))
        lossp = "{0:.3f}".format(total_loss / step)
        filledLength = int(100 * step // total)
        bar = '█' * filledLength + '>' * (filledLength < 100) + '.' * (99 - filledLength)
        print(f'\rBatch {step}/{total} |{bar}| {percent}% complete, loss={lossp}', end='')

    gc.collect()
    torch.cuda.empty_cache()

    avg_loss = total_loss / len(train_dataloader)

    return avg_loss


# function for evaluating the model
def evaluate():
    """Compute the mean batch loss of ``model`` on ``val_dataloader``.

    Uses the notebook-level globals ``model``, ``val_dataloader``, ``device``
    and ``cross_entropy``. The model is switched to eval mode and every batch
    is scored under ``torch.no_grad()``. A single-line progress bar is redrawn
    after every batch.

    Returns:
        float: sum of the per-batch losses divided by the number of batches.
    """
    print("\n\nEvaluating...")
    model.eval()
    total_loss = 0
    total = len(val_dataloader)

    for step, batch in enumerate(val_dataloader, start=1):
        # push the batch to the target device
        sent_id, mask, labels = [t.to(device) for t in batch]
        del batch
        # Free unreferenced objects and cached CUDA blocks before the forward pass.
        gc.collect()
        torch.cuda.empty_cache()

        with torch.no_grad():
            preds = model(sent_id, mask)
            loss = cross_entropy(preds, labels)
            total_loss += float(loss.item())

        # Draw the bar only after this batch's loss has been accumulated, so the
        # displayed value is the true running mean over `step` batches.
        percent = "{0:.3f}".format(100 * (step / float(total)))
        lossp = "{0:.3f}".format(total_loss / step)
        filledLength = int(100 * step // total)
        bar = '█' * filledLength + '>' * (filledLength < 100) + '.' * (99 - filledLength)
        print(f'\rBatch {step}/{total} |{bar}| {percent}% complete, loss={lossp}', end='')

    gc.collect()
    torch.cuda.empty_cache()
    val_avg_loss = total_loss / len(val_dataloader)

    return val_avg_loss

# Run on the GPU when CUDA is available, otherwise on the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load the dataset
input_ids, attention_masks, labels = load_and_process(DATASET_PATH)
df = pd.DataFrame(list(zip(input_ids, attention_masks)), columns=['input_ids', 'attention_masks'])

# Hold out 20% of the rows (stratified on the labels) ...
train_text, temp_text, train_labels, temp_labels = train_test_split(
    df, labels, random_state=42, test_size=0.2, stratify=labels)

# ... then split the held-out part evenly into validation and test sets.
val_text, test_text, val_labels, test_labels = train_test_split(
    temp_text, temp_labels, random_state=42, test_size=0.5, stratify=temp_labels)

del temp_text
gc.collect()
torch.cuda.empty_cache()

train_count = len(train_labels)
test_count = len(test_labels)
val_count = len(val_labels)


# for train set
train_seq = torch.tensor(train_text['input_ids'].tolist())
train_mask = torch.tensor(train_text['attention_masks'].tolist())
train_y = torch.tensor(train_labels.tolist())

# for validation set
val_seq = torch.tensor(val_text['input_ids'].tolist())
val_mask = torch.tensor(val_text['attention_masks'].tolist())
val_y = torch.tensor(val_labels.tolist())

# for test set
test_seq = torch.tensor(test_text['input_ids'].tolist())
test_mask = torch.tensor(test_text['attention_masks'].tolist())
test_y = torch.tensor(test_labels.tolist())

from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

batch_size = 32

# Training batches are drawn in random order.
train_data = TensorDataset(train_seq, train_mask, train_y)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Validation batches keep the dataset order.
val_data = TensorDataset(val_seq, val_mask, val_y)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

model = BERT_LSTM_CNN()

model = model.to(device)

# AdamW optimizer. torch.optim.AdamW is used instead of the deprecated
# transformers.AdamW; eps and weight_decay are pinned to the values the
# transformers implementation used by default so the update rule stays comparable.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

# NLLLoss consumes log-probabilities; presumably the model's forward() ends in
# a log-softmax -- TODO confirm in models.py.
cross_entropy = nn.NLLLoss()

epochs = 3
current = 1
# for each epoch
while current <= epochs:

    print(f'\nEpoch {current} / {epochs}:')

    # train model
    train_loss = train()

    # evaluate model
    valid_loss = evaluate()

    print(f'\n\nTraining Loss: {train_loss:.3f}')
    print(f'Validation Loss: {valid_loss:.3f}')

    current = current + 1


Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]



Downloading:   0%|          | 0.00/433 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]


Epoch 1 / 3:
Batch 1217/1217 |████████████████████████████████████████████████████████████████████████████████████████████████████| 100.000% complete, loss=0.008

Evaluating...
Batch 153/153 |████████████████████████████████████████████████████████████████████████████████████████████████████| 100.000% complete, loss=0.007

Training Loss: 0.008
Validation Loss: 0.007

Epoch 2 / 3:
Batch 1217/1217 |████████████████████████████████████████████████████████████████████████████████████████████████████| 100.000% complete, loss=0.005

Evaluating...
Batch 153/153 |████████████████████████████████████████████████████████████████████████████████████████████████████| 100.000% complete, loss=0.008

Training Loss: 0.005
Validation Loss: 0.008

Epoch 3 / 3:
Batch 1217/1217 |████████████████████████████████████████████████████████████████████████████████████████████████████| 100.000% complete, loss=0.003

Evaluating...
Batch 153/153 |███████████████████████████████████████████████████████████████████

### Evaluation

In [None]:
gc.collect()
torch.cuda.empty_cache()

# Set eval mode explicitly instead of relying on evaluate() having been the
# last function to touch the model.
model.eval()

# Score the test set in mini-batches so peak GPU memory is bounded by
# batch_size rather than by the size of the whole test set.
batch_preds = []
with torch.no_grad():
    for start in range(0, len(test_seq), batch_size):
        stop = start + batch_size
        out = model(test_seq[start:stop].to(device), test_mask[start:stop].to(device))
        batch_preds.append(out.detach().cpu().numpy())
preds = np.concatenate(batch_preds, axis=0)


print("Performance:")
# model's performance: predicted class = index of the largest score in each row
preds = np.argmax(preds, axis=1)
print('Classification Report')
print(classification_report(test_y, preds))

Performance:
Classification Report
              precision    recall  f1-score   support

           0       0.93      0.96      0.94      3245
           1       0.91      0.85      0.88      1623

    accuracy                           0.92      4868
   macro avg       0.92      0.90      0.91      4868
weighted avg       0.92      0.92      0.92      4868



## BERT+CNN-LSTM Model

In [None]:
from models import BERT_CNN_LSTM

def train():
    """Run one training epoch over ``train_dataloader``.

    Uses the notebook-level globals ``model``, ``train_dataloader``,
    ``device``, ``cross_entropy`` and ``optimizer``. Each batch goes through a
    forward pass, a backward pass, gradient-norm clipping at 1.0 and an
    optimizer step. A single-line progress bar is redrawn after every batch.

    Returns:
        float: sum of the per-batch losses divided by the number of batches.
    """
    model.train()
    total_loss = 0
    total = len(train_dataloader)

    for step, batch in enumerate(train_dataloader, start=1):
        sent_id, mask, labels = [r.to(device) for r in batch]
        del batch
        # Free unreferenced objects and cached CUDA blocks before the forward pass.
        gc.collect()
        torch.cuda.empty_cache()

        model.zero_grad()
        preds = model(sent_id.long(), mask)
        loss = cross_entropy(preds, labels)
        total_loss += float(loss.item())
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        # Draw the bar only after this batch's loss has been accumulated, so the
        # displayed value is the true running mean over `step` batches.
        percent = "{0:.3f}".format(100 * (step / float(total)))
        lossp = "{0:.3f}".format(total_loss / step)
        filledLength = int(100 * step // total)
        bar = '█' * filledLength + '>' * (filledLength < 100) + '.' * (99 - filledLength)
        print(f'\rBatch {step}/{total} |{bar}| {percent}% complete, loss={lossp}', end='')

    gc.collect()
    torch.cuda.empty_cache()

    avg_loss = total_loss / len(train_dataloader)

    return avg_loss


# function for evaluating the model
def evaluate():
    """Compute the mean batch loss of ``model`` on ``val_dataloader``.

    Uses the notebook-level globals ``model``, ``val_dataloader``, ``device``
    and ``cross_entropy``. The model is switched to eval mode and every batch
    is scored under ``torch.no_grad()``. A single-line progress bar is redrawn
    after every batch.

    Returns:
        float: sum of the per-batch losses divided by the number of batches.
    """
    print("\n\nEvaluating...")
    model.eval()
    total_loss = 0
    total = len(val_dataloader)

    for step, batch in enumerate(val_dataloader, start=1):
        # push the batch to the target device
        sent_id, mask, labels = [t.to(device) for t in batch]
        del batch
        # Free unreferenced objects and cached CUDA blocks before the forward pass.
        gc.collect()
        torch.cuda.empty_cache()

        with torch.no_grad():
            preds = model(sent_id, mask)
            loss = cross_entropy(preds, labels)
            total_loss += float(loss.item())

        # Draw the bar only after this batch's loss has been accumulated, so the
        # displayed value is the true running mean over `step` batches.
        percent = "{0:.3f}".format(100 * (step / float(total)))
        lossp = "{0:.3f}".format(total_loss / step)
        filledLength = int(100 * step // total)
        bar = '█' * filledLength + '>' * (filledLength < 100) + '.' * (99 - filledLength)
        print(f'\rBatch {step}/{total} |{bar}| {percent}% complete, loss={lossp}', end='')

    gc.collect()
    torch.cuda.empty_cache()
    val_avg_loss = total_loss / len(val_dataloader)

    return val_avg_loss

# Run on the GPU when CUDA is available, otherwise on the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load the dataset
input_ids, attention_masks, labels = load_and_process(DATASET_PATH)
df = pd.DataFrame(list(zip(input_ids, attention_masks)), columns=['input_ids', 'attention_masks'])

# Hold out 20% of the rows (stratified on the labels) ...
train_text, temp_text, train_labels, temp_labels = train_test_split(
    df, labels, random_state=42, test_size=0.2, stratify=labels)

# ... then split the held-out part evenly into validation and test sets.
val_text, test_text, val_labels, test_labels = train_test_split(
    temp_text, temp_labels, random_state=42, test_size=0.5, stratify=temp_labels)

del temp_text
gc.collect()
torch.cuda.empty_cache()

train_count = len(train_labels)
test_count = len(test_labels)
val_count = len(val_labels)


# for train set
train_seq = torch.tensor(train_text['input_ids'].tolist())
train_mask = torch.tensor(train_text['attention_masks'].tolist())
train_y = torch.tensor(train_labels.tolist())

# for validation set
val_seq = torch.tensor(val_text['input_ids'].tolist())
val_mask = torch.tensor(val_text['attention_masks'].tolist())
val_y = torch.tensor(val_labels.tolist())

# for test set
test_seq = torch.tensor(test_text['input_ids'].tolist())
test_mask = torch.tensor(test_text['attention_masks'].tolist())
test_y = torch.tensor(test_labels.tolist())

from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler


batch_size = 32

# Training batches are drawn in random order.
train_data = TensorDataset(train_seq, train_mask, train_y)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Validation batches keep the dataset order.
val_data = TensorDataset(val_seq, val_mask, val_y)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

model = BERT_CNN_LSTM()

model = model.to(device)

# AdamW optimizer. torch.optim.AdamW is used instead of the deprecated
# transformers.AdamW; eps and weight_decay are pinned to the values the
# transformers implementation used by default so the update rule stays comparable.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

# NLLLoss consumes log-probabilities; presumably the model's forward() ends in
# a log-softmax -- TODO confirm in models.py.
cross_entropy = nn.NLLLoss()

epochs = 3
current = 1
# for each epoch
while current <= epochs:

    print(f'\nEpoch {current} / {epochs}:')

    # train model
    train_loss = train()

    # evaluate model
    valid_loss = evaluate()

    print(f'\n\nTraining Loss: {train_loss:.3f}')
    print(f'Validation Loss: {valid_loss:.3f}')

    current = current + 1



Epoch 1 / 3:
Batch 1217/1217 |████████████████████████████████████████████████████████████████████████████████████████████████████| 100.000% complete, loss=0.008

Evaluating...
Batch 153/153 |████████████████████████████████████████████████████████████████████████████████████████████████████| 100.000% complete, loss=0.006

Training Loss: 0.008
Validation Loss: 0.006

Epoch 2 / 3:
Batch 1217/1217 |████████████████████████████████████████████████████████████████████████████████████████████████████| 100.000% complete, loss=0.005

Evaluating...
Batch 153/153 |████████████████████████████████████████████████████████████████████████████████████████████████████| 100.000% complete, loss=0.007

Training Loss: 0.005
Validation Loss: 0.007

Epoch 3 / 3:
Batch 1217/1217 |████████████████████████████████████████████████████████████████████████████████████████████████████| 100.000% complete, loss=0.003

Evaluating...
Batch 153/153 |███████████████████████████████████████████████████████████████████

### Evaluation

In [None]:
gc.collect()
torch.cuda.empty_cache()

# Set eval mode explicitly instead of relying on evaluate() having been the
# last function to touch the model.
model.eval()

# Score the test set in mini-batches so peak GPU memory is bounded by
# batch_size rather than by the size of the whole test set.
batch_preds = []
with torch.no_grad():
    for start in range(0, len(test_seq), batch_size):
        stop = start + batch_size
        out = model(test_seq[start:stop].to(device), test_mask[start:stop].to(device))
        batch_preds.append(out.detach().cpu().numpy())
preds = np.concatenate(batch_preds, axis=0)


print("Performance:")
# model's performance: predicted class = index of the largest score in each row
preds = np.argmax(preds, axis=1)
print('Classification Report')
print(classification_report(test_y, preds))

Performance:
Classification Report
              precision    recall  f1-score   support

           0       0.94      0.94      0.94      3245
           1       0.87      0.88      0.88      1623

    accuracy                           0.92      4868
   macro avg       0.91      0.91      0.91      4868
weighted avg       0.92      0.92      0.92      4868



## BERT+2CNN-1D Model

In [None]:
from models import BERT_2CNN_1D

def train():
    """Run one training epoch over ``train_dataloader`` and return the mean batch loss.

    Uses the notebook-level globals ``model``, ``train_dataloader``, ``device``,
    ``cross_entropy`` (the loss callable) and ``optimizer``. After every batch a
    single-line progress bar is rewritten in place on stdout, showing the
    running mean of the losses of all batches processed so far.

    Returns:
        float: sum of the per-batch losses divided by the number of batches.
    """
    model.train()
    total_loss = 0
    total = len(train_dataloader)

    for step, batch in enumerate(train_dataloader, start=1):
        # Move every tensor of the batch to the target device once.
        sent_id, mask, labels = [r.to(device) for r in batch]
        del batch
        # NOTE(review): collecting garbage and emptying the CUDA cache on every
        # batch is kept from the original; it is costly and may be unnecessary
        # -- confirm before removing.
        gc.collect()
        torch.cuda.empty_cache()

        model.zero_grad()
        preds = model(sent_id.long(), mask)
        loss = cross_entropy(preds, labels)
        total_loss += float(loss.item())
        loss.backward()
        # Clip the global gradient norm to 1.0 before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        # Report progress only after this batch's loss has been accumulated, so
        # the displayed value is the true mean over `step` batches (previously
        # the sum of step-1 losses was divided by step).
        percent = "{0:.3f}".format(100 * (step / float(total)))
        lossp = "{0:.3f}".format(total_loss / step)
        filledLength = int(100 * step // total)
        bar = '█' * filledLength + '>' * (filledLength < 100) + '.' * (99 - filledLength)
        print(f'\rBatch {step}/{total} |{bar}| {percent}% complete, loss={lossp}', end='')

    gc.collect()
    torch.cuda.empty_cache()

    avg_loss = total_loss / total

    return avg_loss


# function for evaluating the model
def evaluate():
    """Compute the mean batch loss of ``model`` over ``val_dataloader``.

    Uses the notebook-level globals ``model``, ``val_dataloader``, ``device``
    and ``cross_entropy`` (the loss callable). The model is switched to eval
    mode and every forward pass runs under ``torch.no_grad()``. After every
    batch a single-line progress bar is rewritten in place on stdout, showing
    the running mean of the losses of all batches processed so far.

    Returns:
        float: sum of the per-batch losses divided by the number of batches.
    """
    print("\n\nEvaluating...")
    model.eval()
    total_loss = 0
    total = len(val_dataloader)
    for step, batch in enumerate(val_dataloader, start=1):
        # push the batch to gpu
        sent_id, mask, labels = [t.to(device) for t in batch]
        del batch
        # NOTE(review): per-batch garbage collection / cache flush is kept from
        # the original; it is costly and may be unnecessary -- confirm before
        # removing.
        gc.collect()
        torch.cuda.empty_cache()
        with torch.no_grad():
            preds = model(sent_id, mask)
            loss = cross_entropy(preds, labels)
            total_loss += float(loss.item())

        # Report progress only after this batch's loss has been accumulated, so
        # the displayed value is the true mean over `step` batches (previously
        # the sum of step-1 losses was divided by step).
        percent = "{0:.3f}".format(100 * (step / float(total)))
        lossp = "{0:.3f}".format(total_loss / step)
        filledLength = int(100 * step // total)
        bar = '█' * filledLength + '>' * (filledLength < 100) + '.' * (99 - filledLength)
        print(f'\rBatch {step}/{total} |{bar}| {percent}% complete, loss={lossp}', end='')

    gc.collect()
    torch.cuda.empty_cache()
    val_avg_loss = total_loss / total

    return val_avg_loss

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load the dataset
input_ids, attention_masks, labels = load_and_process(DATASET_PATH)
# Pair each row's input ids with its attention mask so both are split together.
df = pd.DataFrame(list(zip(input_ids, attention_masks)), columns=['input_ids', 'attention_masks'])

# Stratified split: 80% train, then the held-out 20% is halved into
# validation and test (10% each). random_state=42 keeps the split repeatable.
train_text, temp_text, train_labels, temp_labels = train_test_split(df, labels,
                             random_state=42, test_size=0.2, stratify=labels)

val_text, test_text, val_labels, test_labels = train_test_split(temp_text, temp_labels,
                         random_state=42, test_size=0.5, stratify=temp_labels)

# The intermediate hold-out frame is no longer needed once it has been split.
del temp_text
gc.collect()
torch.cuda.empty_cache()

train_count = len(train_labels)
test_count = len(test_labels)
val_count = len(val_labels)


# for train set
train_seq = torch.tensor(train_text['input_ids'].tolist())
train_mask = torch.tensor(train_text['attention_masks'].tolist())
train_y = torch.tensor(train_labels.tolist())

# for validation set
val_seq = torch.tensor(val_text['input_ids'].tolist())
val_mask = torch.tensor(val_text['attention_masks'].tolist())
val_y = torch.tensor(val_labels.tolist())

# for test set
test_seq = torch.tensor(test_text['input_ids'].tolist())
test_mask = torch.tensor(test_text['attention_masks'].tolist())
test_y = torch.tensor(test_labels.tolist())

from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

batch_size = 32

# Training batches are drawn in random order; validation batches keep dataset order.
train_data = TensorDataset(train_seq, train_mask, train_y)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_seq, val_mask, val_y)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

model = BERT_2CNN_1D()

model = model.to(device)

# Adam optimizer
# transformers' own AdamW is deprecated in favour of the PyTorch implementation,
# so bind the same name to torch.optim.AdamW instead. eps and weight_decay are
# passed explicitly to mirror the defaults of the deprecated class (PyTorch's
# defaults are eps=1e-8, weight_decay=1e-2). The two update rules apply eps
# slightly differently, so results may differ marginally.
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

# NOTE(review): NLLLoss expects log-probabilities as input; presumably the model's
# final layer is a log-softmax -- confirm against models.BERT_2CNN_1D.
cross_entropy = nn.NLLLoss()

epochs = 3
current = 1
# for each epoch
while current <= epochs:

    print(f'\nEpoch {current} / {epochs}:')

    # train model
    train_loss = train()

    # evaluate model
    valid_loss = evaluate()
    
    print(f'\n\nTraining Loss: {train_loss:.3f}')
    print(f'Validation Loss: {valid_loss:.3f}')

    current = current + 1



Epoch 1 / 3:
Batch 1217/1217 |████████████████████████████████████████████████████████████████████████████████████████████████████| 100.000% complete, loss=0.007

Evaluating...
Batch 153/153 |████████████████████████████████████████████████████████████████████████████████████████████████████| 100.000% complete, loss=0.007

Training Loss: 0.007
Validation Loss: 0.007

Epoch 2 / 3:
Batch 1217/1217 |████████████████████████████████████████████████████████████████████████████████████████████████████| 100.000% complete, loss=0.005

Evaluating...
Batch 153/153 |████████████████████████████████████████████████████████████████████████████████████████████████████| 100.000% complete, loss=0.006

Training Loss: 0.005
Validation Loss: 0.006

Epoch 3 / 3:
Batch 1217/1217 |████████████████████████████████████████████████████████████████████████████████████████████████████| 100.000% complete, loss=0.003

Evaluating...
Batch 153/153 |███████████████████████████████████████████████████████████████████

### Evaluation

In [None]:
# Free memory left over from training before running test-set inference.
gc.collect()
torch.cuda.empty_cache()

# Select inference mode explicitly instead of relying on the last call made
# before this cell having been evaluate(), which is what set it previously.
model.eval()

# Forward the test set in fixed-size chunks rather than as one giant batch, so
# peak memory does not grow with the size of the test split.
eval_batch_size = 32
chunk_outputs = []
with torch.no_grad():
    for seq_chunk, mask_chunk in zip(test_seq.split(eval_batch_size),
                                     test_mask.split(eval_batch_size)):
        out = model(seq_chunk.to(device), mask_chunk.to(device))
        # Move each chunk's output to the CPU straight away to release device memory.
        chunk_outputs.append(out.detach().cpu())
preds = torch.cat(chunk_outputs).numpy()


print("Performance:")
# model's performance
# The predicted class is the index of the largest score in each output row.
preds = np.argmax(preds, axis=1)
print('Classification Report')
print(classification_report(test_y, preds))

Performance:
Classification Report
              precision    recall  f1-score   support

           0       0.94      0.95      0.94      3245
           1       0.89      0.87      0.88      1623

    accuracy                           0.92      4868
   macro avg       0.91      0.91      0.91      4868
weighted avg       0.92      0.92      0.92      4868

