In [1]:
import os, shutil
os.environ["CUDA_VISIBLE_DEVICES"]="2, 3"
# os.environ["CUDA_LAUNCH_BLOCKING"]="1"
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import torchvision
import torchvision.utils as vutils
from torch.autograd import Variable
from tqdm.notebook import tqdm_notebook
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.sampler import SubsetRandomSampler
import torchvision.transforms as transforms
import random
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from pathlib import Path
from PIL import Image
import cv2
import time
from tqdm.notebook import tqdm_notebook
import time
import matplotlib.animation as animation
from mpl_toolkits.axes_grid1 import ImageGrid
import sys, time
from nibabel.testing import data_path
import nibabel as nib
from PIL import Image
import scipy.ndimage as ndi
import itertools
import time
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding
import tqdm
import nltk
nltk.download('punkt')
from sklearn.metrics import roc_auc_score
from tqdm import tqdm

[nltk_data] Downloading package punkt to /ifshome/rgaggar/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# FOR REPRODUCIBILTY
torch.manual_seed(100)
np.random.seed(0)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False 
def seed_worker(worker_id):
    worker_seed = torch.initial_seed() % 2**32
    np.random.seed(worker_seed)
    random.seed(worker_seed)

g = torch.Generator()
g.manual_seed(100)

<torch._C.Generator at 0x7f0f71cd37f0>

In [3]:
print(torch.cuda.is_available())
print(torch.cuda.device_count())
device = torch.device("cuda")

True
2


In [4]:
data = pd.read_csv("FinalDataset.csv")
data.head()

Unnamed: 0,sentence,class
0,What city in the United States has the highest...,0
1,"At work, wishing I was out on the boat",0
2,A smile is a curve that sets everything straig...,0
3,Does sleep quality mediate the association bet...,0
4,What city was found on the west bank of the ri...,0


In [5]:
data['class'].value_counts()

0    400015
1    376930
Name: class, dtype: int64

In [6]:
train_data, test_data = train_test_split(data, test_size=0.1, random_state=42)
train_data, val_data = train_test_split(train_data, test_size=0.1, random_state=42)

In [7]:
print('Train:', len(train_data))
print('Val:', len(val_data))
print('Test:', len(test_data))

Train: 629325
Val: 69925
Test: 77695


In [8]:
train_data['class'].value_counts()

0    324086
1    305239
Name: class, dtype: int64

In [9]:
val_data['class'].value_counts()

0    36099
1    33826
Name: class, dtype: int64

In [10]:
test_data['class'].value_counts()

0    39830
1    37865
Name: class, dtype: int64

In [11]:
# CREATE DATASET CLASS FOR DATALOADERS
class Dataset(Dataset):
    def __init__(self, dataframe):
        self.texts = dataframe['sentence'].values.tolist()
        self.labels = dataframe['class'].values.tolist()
        
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        X = self.texts[idx]
        y = self.labels[idx]
        X = X.lower()
        tokens = nltk.word_tokenize(X, language="english")
        X = " ".join(tokens)
        X = X.strip()
        
        return X, y

In [12]:
class transformer(nn.Module):
    def __init__(self, base_model):
        super(transformer, self).__init__()

        self.bert = base_model
        self.fc = nn.Linear(1024, 1)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, input_ids, attention_mask):
        bert_out = self.bert(input_ids=input_ids,
                             attention_mask=attention_mask)[0][:, 0]
        x = self.fc(bert_out)        
        x = self.sigmoid(x)

        return x

In [13]:
BERT_MODEL = "roberta-large"
tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL)
base_model = AutoModel.from_pretrained(BERT_MODEL)

model = transformer(base_model)
model = torch.nn.DataParallel(model)
model = model.to(device)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
def collate_fn(batch):    
    x, y = [], []
    for text, label in batch:
        x.append(text)
        y.append(label)
    padded_text = tokenizer(x, padding=True, truncation=True, return_tensors='pt')
    
    return padded_text, torch.FloatTensor(y)

In [15]:
train_dataloader = DataLoader(Dataset(train_data), batch_size=20, shuffle=True, num_workers=4, collate_fn=collate_fn, pin_memory=True, worker_init_fn=seed_worker, generator=g)
val_dataloader = DataLoader(Dataset(val_data), batch_size=20, num_workers=4, collate_fn=collate_fn, pin_memory=True, worker_init_fn=seed_worker, generator=g)

In [16]:
for train_input, train_label in train_dataloader:
    print(np.array(train_input['input_ids']).shape)
    break

(20, 227)


In [17]:
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

In [18]:
count_train = 0
count_test = 0
best_val_loss = float('inf')
early_stopping_threshold_count = 0

for epoch in range(5):
    total_acc_train = 0
    total_loss_train = 0

    model.train()

    for train_input, train_label in tqdm(train_dataloader):
        
        attention_mask = train_input['attention_mask'].to(device)
        input_ids = train_input['input_ids'].squeeze(1).to(device)
        train_label = train_label.to(device)

        output = model(input_ids, attention_mask)

        loss = criterion(output, train_label.float().unsqueeze(1))

        total_loss_train += loss.item()
        
        pred = (output >= 0.5).int()
        label = train_label.unsqueeze(1)
        try:
            auc_roc = roc_auc_score(label.cpu(), pred.cpu())
        except:
            count_train += 1
            auc_roc = 0
        total_acc_train += auc_roc

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    with torch.no_grad():
        total_acc_val = 0
        total_loss_val = 0

        model.eval()

        for val_input, val_label in val_dataloader:
            attention_mask = val_input['attention_mask'].to(device)
            input_ids = val_input['input_ids'].squeeze(1).to(device)

            val_label = val_label.to(device)

            output = model(input_ids, attention_mask)

            loss = criterion(output, val_label.float().unsqueeze(1))

            total_loss_val += loss.item()

            pred = (output >= 0.5).int()
            label = val_label.unsqueeze(1)
            try:
                auc_roc = roc_auc_score(label.cpu(), pred.cpu())
            except:
                count_test += 1
                auc_roc = 0
            total_acc_val += auc_roc

        print("Epochs:", epoch + 1)
        print("Train Loss:", total_loss_train/len(train_dataloader))
        print("Train AUCROC:", total_acc_train/(len(train_dataloader)-count_train))
        print("Val Loss:", total_loss_val/len(val_dataloader))
        print("Val AUCROC:", total_acc_val/(len(val_dataloader)-count_test))

        if best_val_loss > total_loss_val:
            best_val_loss = total_loss_val
            torch.save(model, f"roberta-large.pt")
            print("Saved model")
            early_stopping_threshold_count = 0
        else:
            early_stopping_threshold_count += 1

        if early_stopping_threshold_count >= 1:
            print("Early stopping")
            break
            
        count_train = 0
        count_test = 0
            
        print("====================================================================================")

100%|██████████| 31467/31467 [4:49:47<00:00,  1.81it/s]  


Epochs: 1
Train Loss: 0.14732785197870177
Train AUCROC: 0.9397453103053991
Val Loss: 0.13147977263514754
Val AUCROC: 0.9524559160825116


  0%|          | 0/31467 [00:00<?, ?it/s]

Saved model


100%|██████████| 31467/31467 [4:53:12<00:00,  1.79it/s]  


Epochs: 2
Train Loss: 0.09923866626995069
Train AUCROC: 0.9608894660308552
Val Loss: 0.12800786583858825
Val AUCROC: 0.9544972547838452


  0%|          | 0/31467 [00:00<?, ?it/s]

Saved model


100%|██████████| 31467/31467 [4:51:20<00:00,  1.80it/s]  


Epochs: 3
Train Loss: 0.0739321762049123
Train AUCROC: 0.9711811176650829
Val Loss: 0.1326957078462018
Val AUCROC: 0.9576071873797672
Early stopping
