In [1]:
#!/usr/bin/env python3
import numpy as np
import torch
import lightning.pytorch as pl
import torchmetrics
import torchvision
from torchinfo import summary
from torchview import draw_graph
from IPython.display import display
import torch.nn as nn
import torch.optim as optim
from transformers import BertTokenizer, BertModel
from transformers import GPT2Tokenizer, GPT2Model
from torch.utils.data import DataLoader,Dataset
import json
from torch.nn.functional import pad
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F
import pandas as pd
from transformers import BartTokenizer, BartForConditionalGeneration
import requests

## Load the tokenizer and model of your choice ##

### GPT2 Tokenizer ###

In [2]:
# Name of the pretrained GPT2 checkpoint shared by the tokenizer and the model
GPT2_CHECKPOINT_NAME = 'gpt2'

tokenizer_gpt2 = GPT2Tokenizer.from_pretrained(GPT2_CHECKPOINT_NAME)
model_gpt2 = GPT2Model.from_pretrained(GPT2_CHECKPOINT_NAME)

# Register '[PAD]' as the tokenizer's pad token. Kept as the last expression so
# the cell still displays the call's return value.
# NOTE(review): the model's embeddings are not resized after adding the token;
# confirm the pad token id is never fed to the model.
tokenizer_gpt2.add_special_tokens({'pad_token': '[PAD]'})

1

### BERT Tokenizer ###

In [3]:
# Name of the pretrained BERT checkpoint shared by the tokenizer and the model
BERT_CHECKPOINT_NAME = 'bert-base-uncased'

tokenizer_bert = BertTokenizer.from_pretrained(BERT_CHECKPOINT_NAME)
model_bert = BertModel.from_pretrained(BERT_CHECKPOINT_NAME)

### BART ###
Uncomment the cell below if you want to see BART summaries of the conversations.

In [4]:
# Load pre-trained BART model and tokenizer
#tokenizer_bart = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
#model_bart = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')

In [5]:
# Pick the device string first, then report what hardware was found
device = "cuda" if torch.cuda.is_available() else "cpu"

if device == "cpu":
    print("Only CPU is available...")
else:
    print(torch.cuda.get_device_name())
    print(torch.cuda.get_device_properties("cuda"))
    print("Number of devices:", torch.cuda.device_count())


Only CPU is available...


# Encoder Networks #

This dataset joins each conversation into a single string and tokenizes it with the BERT/GPT2 tokenizer, so it can be passed to the classifier networks later.

In [6]:

class encoderNetwork(Dataset):
    """Dataset that tokenizes one conversation per item for the classifiers.

    Despite the name, this is a ``Dataset`` and holds no network: each item is a
    conversation joined into a single string and passed through ``tokenizer``.

    Args:
        conversations: Indexable collection whose entries are lists of
            utterance strings.
        targets: Indexable collection of labels, parallel to ``conversations``.
        tokenizer: Callable used as ``tokenizer(text, padding=True,
            truncation=True, return_tensors="pt")``; its result is indexed with
            ``"input_ids"`` and ``"attention_mask"``.
        pretrainedModel: Model name. ``'BERT'`` selects the BERT text format;
            any other value is handled as GPT2.
    """

    def __init__(self, conversations, targets, tokenizer, pretrainedModel):
        self.conversations = conversations  # data
        self.model = pretrainedModel  # model name, compared against 'BERT'
        self.targets = targets  # target labels
        self.tokenizer = tokenizer  # tokenizer

    def __len__(self):
        """Return the number of conversations."""
        return len(self.conversations)

    def __getitem__(self, idx):
        """Tokenize conversation ``idx``.

        The utterances are joined with single spaces into one string. For
        ``'BERT'`` every utterance first gets `` [SEP][CLS]`` appended.

        Returns:
            Tuple ``(input_ids, attention_mask, label)``; the first two have
            their leading dimension squeezed away, ``label`` is
            ``torch.tensor(targets[idx])``.
        """
        if self.model == 'BERT':
            segmentedText = [f'{sentence} [SEP][CLS]' for sentence in self.conversations[idx]]
            text = " ".join(segmentedText)
        else:  # GPT2
            text = " ".join(self.conversations[idx])

        encoding = self.tokenizer(text, padding=True, truncation=True, return_tensors="pt")

        # squeeze(0) removes only the leading dimension. A bare squeeze() would
        # also collapse a one-token sequence to a 0-dim tensor, which cannot be
        # padded into a batch later.
        input_ids = encoding["input_ids"].squeeze(0)
        attention_mask = encoding["attention_mask"].squeeze(0)
        label = torch.tensor(self.targets[idx])

        return input_ids, attention_mask, label

# Classifier Networks #
Note: Each classifier network requires the corresponding pretrained model from above to initialize.

### GPT2 Classifier ###

In [7]:
# Classify the conversation with GPT2 as the base model
class GPT2ClassifierNetwork(pl.LightningModule):
    """Two-class conversation classifier on top of a frozen GPT-2 model.

    The GPT-2 last hidden state is averaged over the sequence dimension and
    passed through three linear layers (hidden_size -> 512 -> 256 -> 2).

    Args:
        gpt2_model: GPT-2 model to use as the base. When ``None`` (the default)
            ``GPT2Model.from_pretrained('gpt2')`` is loaded at construction
            time, so nothing is loaded when the class is merely defined and
            instances do not share one default model object.
    """

    def __init__(self, gpt2_model=None):
        super().__init__()

        if gpt2_model is None:
            gpt2_model = GPT2Model.from_pretrained('gpt2')

        # Freeze GPT-2 weights
        # Done for space limitation on GPU
        for param in gpt2_model.parameters():
            param.requires_grad = False

        self.gpt2 = gpt2_model
        self.fc1 = nn.Linear(self.gpt2.config.hidden_size, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, 2)
        self.loss_fn = nn.CrossEntropyLoss()
        self.accuracy = torchmetrics.classification.Accuracy(task='binary')

    def forward(self, input_ids, attention_mask):
        """Return the two-class logits for a batch of tokenized conversations."""
        outputs = self.gpt2(input_ids, attention_mask=attention_mask)
        last_hidden_states = outputs['last_hidden_state']

        # Take the mean of last hidden states along the sequence dimension.
        # NOTE(review): the mean covers every position, including those where
        # attention_mask is 0. Left unchanged because a masked mean would alter
        # the outputs of already-trained weights.
        pooled_output = torch.mean(last_hidden_states, dim=1)

        # Apply linear layers
        x = F.relu(self.fc1(pooled_output))
        x = F.relu(self.fc2(x))
        logits = self.fc3(x)

        return logits

    def _shared_step(self, batch):
        """Compute (loss, accuracy) for one (input_ids, attention_mask, labels) batch."""
        input_ids, attention_mask, labels = batch
        logits = self(input_ids, attention_mask)
        loss = self.loss_fn(logits, labels)

        # Predicted class is the index of the larger logit
        preds = torch.argmax(logits, dim=1)
        acc = self.accuracy(preds, labels)
        return loss, acc

    def training_step(self, batch, batch_idx):
        """Log epoch-level 'train_acc' / 'train_loss' and return the loss."""
        loss, acc = self._shared_step(batch)

        # Log metrics
        self.log('train_acc', acc, on_step=False, on_epoch=True)
        self.log('train_loss', loss, on_step=False, on_epoch=True)

        return loss

    def validation_step(self, batch, batch_idx):
        """Log epoch-level 'val_acc' / 'val_loss' and return both in a dict."""
        loss, acc = self._shared_step(batch)

        # Log metrics
        self.log('val_acc', acc, on_step=False, on_epoch=True)
        self.log('val_loss', loss, on_step=False, on_epoch=True)

        return {"val_loss": loss, "val_acc": acc}

    def configure_optimizers(self):
        """Return an AdamW optimizer over this module's parameters (lr=2e-5)."""
        return torch.optim.AdamW(self.parameters(), lr=2e-5)

### BERT Classifier ###

This classifier has an extra linear layer, uses the BERT-specific pooler output instead of averaging the last hidden state as the GPT2 model does, and could be loaded onto the GPU without freezing BERT's weights. Otherwise, it is the same as the GPT2 version.

In [8]:
# Classify the conversation with BERT as the base model
class BERTClassifierNetwork(pl.LightningModule):
    """Two-class conversation classifier on top of a BERT model.

    BERT's ``pooler_output`` is passed through four linear layers
    (hidden_size -> 512 -> 256 -> 128 -> 2). The BERT weights are not frozen
    here.

    Args:
        bert_model: BERT model to use as the base. When ``None`` (the default)
            ``BertModel.from_pretrained('bert-base-uncased')`` is loaded at
            construction time, so nothing is loaded when the class is merely
            defined and instances do not share one default model object.
    """

    def __init__(self, bert_model=None):
        super().__init__()

        if bert_model is None:
            bert_model = BertModel.from_pretrained('bert-base-uncased')

        self.bert = bert_model
        self.fc1 = nn.Linear(self.bert.config.hidden_size, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, 128)
        self.fc4 = nn.Linear(128, 2)  # Binary classification, so output size is 2
        self.loss_fn = nn.CrossEntropyLoss()
        self.accuracy = torchmetrics.classification.Accuracy(task='binary')

    def forward(self, input_ids, attention_mask):
        """Return the two-class logits for a batch of tokenized conversations."""
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        pooled_output = outputs['pooler_output']

        # Apply linear layers
        x = F.relu(self.fc1(pooled_output))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        logits = self.fc4(x)

        return logits

    def _shared_step(self, batch):
        """Compute (loss, accuracy) for one (input_ids, attention_mask, labels) batch."""
        input_ids, attention_mask, labels = batch
        logits = self(input_ids, attention_mask)
        loss = self.loss_fn(logits, labels)

        # Predicted class is the index of the larger logit
        preds = torch.argmax(logits, dim=1)
        acc = self.accuracy(preds, labels)
        return loss, acc

    def training_step(self, batch, batch_idx):
        """Log epoch-level 'train_acc' / 'train_loss' and return the loss."""
        loss, acc = self._shared_step(batch)

        # Log metrics
        self.log('train_acc', acc, on_step=False, on_epoch=True)
        self.log('train_loss', loss, on_step=False, on_epoch=True)

        return loss

    def validation_step(self, batch, batch_idx):
        """Log epoch-level 'val_acc' / 'val_loss' and return both in a dict."""
        loss, acc = self._shared_step(batch)

        # Log metrics
        self.log('val_acc', acc, on_step=False, on_epoch=True)
        self.log('val_loss', loss, on_step=False, on_epoch=True)

        return {"val_loss": loss, "val_acc": acc}

    def configure_optimizers(self):
        """Return an AdamW optimizer over this module's parameters (lr=2e-5)."""
        return torch.optim.AdamW(self.parameters(), lr=2e-5)

## Helper Functions ##

In [9]:
# Collate function for the DataLoaders: sequences in a batch have different
# lengths, so they are padded to the longest one before being stacked.
def custom_collate_fn(batch):
    """Pad a batch of ``(input_ids, attention_mask, label)`` items.

    Args:
        batch: Sequence of ``(input_ids, attention_mask, label)`` tuples, as
            produced by the dataset's ``__getitem__``.

    Returns:
        Tuple ``(padded_input_ids, padded_attention_masks, labels)``. The first
        two are right-padded with 0 to the longest sequence in the batch
        (nothing is truncated); ``labels`` is ``torch.tensor(labels)``.
    """
    input_ids, attention_masks, labels = zip(*batch)

    # A one-token sequence may arrive as a 0-dim tensor; promote it to length 1
    # so pad_sequence can handle it like any other sequence.
    input_ids = [torch.atleast_1d(ids) for ids in input_ids]
    attention_masks = [torch.atleast_1d(mask) for mask in attention_masks]

    # pad_sequence pads every sequence up to the longest one in the batch
    padded_input_ids = pad_sequence(input_ids, batch_first=True, padding_value=0)
    padded_attention_masks = pad_sequence(attention_masks, batch_first=True, padding_value=0)

    return padded_input_ids, padded_attention_masks, torch.tensor(labels)

# Get the E-redial input data from the file
def get_input_data(filename, timeout=30):
    """Download the conversation JSON and split it into parallel structures.

    The JSON is read as a list of conversations; each conversation provides
    ``'conversationId'`` and ``'messages'``, and each message provides
    ``'role'`` and ``'text'``.

    Args:
        filename: URL of the JSON file (fetched with ``requests.get``).
        timeout: Seconds passed to ``requests.get`` as ``timeout`` so a stalled
            connection cannot hang the notebook indefinitely.

    Returns:
        ``None`` when the response status is not 200 (a message is printed).
        Otherwise a tuple ``(wholeConv, idList, forPretrainConv, convRoleList)``:

        * ``wholeConv``: dict keyed by ``hash(conversationId)`` whose values are
          ``[helper_sentences, seeker_sentences]``.
        * ``idList``: conversation ids in file order.
        * ``forPretrainConv``: per conversation, every sentence in original order.
        * ``convRoleList``: per conversation, the role of every sentence
          (1 for role 1, 2 for any other role), parallel to ``forPretrainConv``.
    """
    # Fetch jsonFile from URL
    response = requests.get(filename, timeout=timeout)

    # Check if the request was successful
    if response.status_code != 200:
        print(f"Failed to fetch data from {filename}")
        return None
    messages = json.loads(response.text)
    # To read a local copy instead, open the file and use json.load(f).

    wholeConv = {}          # hash(conversationId) -> [helper sentences, seeker sentences]
    forPretrainConv = []    # per conversation: all sentences, original order
    convRoleList = []       # per conversation: role of each sentence
    idList = []

    for message in messages:
        preservedOrderList = []
        totalHelperMssg = []
        totalSeekerMssg = []
        cur_roles = []
        conv_id = message['conversationId']
        idList.append(conv_id)

        for utterance in message['messages']:
            role = utterance['role']
            sentence = utterance['text']
            if role == 1:
                totalHelperMssg.append(sentence)
            else:
                totalSeekerMssg.append(sentence)
                # Adjust 0 role to 2 for NN processing later
                role = 2
            cur_roles.append(role)
            preservedOrderList.append(sentence)  # keep the original utterance order

        # Add this conversation to the containers
        convRoleList.append(cur_roles)
        forPretrainConv.append(preservedOrderList)
        wholeConv[hash(conv_id)] = [totalHelperMssg, totalSeekerMssg]

    return wholeConv, idList, forPretrainConv, convRoleList

 
# Merge consecutive utterances by the same speaker into a single string, so
# every boundary left in a conversation marks a change of speaker.
def combineConsecutiveSpeakerSentences(input_lists, roles_list):
    """Collapse runs of same-role utterances in every conversation.

    Args:
        input_lists: One list of utterance strings per conversation.
        roles_list: Parallel to ``input_lists``; the role of each utterance.

    Returns:
        One list per conversation, holding one space-joined string per run of
        consecutive same-role utterances. A conversation without utterances
        yields ``[""]``.
    """
    merged_conversations = []

    for utterances, speakers in zip(input_lists, roles_list):
        turns = []
        active_speaker = None   # None means no utterance has been seen yet
        pending = ""

        for utterance, speaker in zip(utterances, speakers):
            if active_speaker is None:
                # Nothing accumulated yet: open the first run
                active_speaker, pending = speaker, utterance
                continue

            if active_speaker == speaker:
                # Same speaker keeps talking: extend the current run
                pending += " " + utterance
                continue

            # Speaker changed: close the finished run and open a new one
            turns.append(pending)
            active_speaker, pending = speaker, utterance

        # The run still being accumulated is always emitted
        turns.append(pending)
        merged_conversations.append(turns)

    return merged_conversations

# Get target data from the file  # Data stored as convId,target
def get_target_data(filename, timeout=30):
    """Download the label file and return the labels as an integer array.

    Each line is split on ``','``; only lines with exactly two fields are used
    and the second field is parsed with ``int``. The conversation id in the
    first field is ignored.

    Args:
        filename: URL of the label file (fetched with ``requests.get``).
        timeout: Seconds passed to ``requests.get`` as ``timeout`` so a stalled
            connection cannot hang the notebook indefinitely.

    Returns:
        ``None`` when the response status is not 200 (a message is printed),
        otherwise a 1-D ``numpy`` int64 array with one label per used line.

    Raises:
        ValueError: If the second field of a two-field line is not an integer.
    """
    # Fetch the label file from URL
    response = requests.get(filename, timeout=timeout)

    # Check if the request was successful
    if response.status_code != 200:
        print(f"Failed to fetch data from {filename}")
        return None

    targets = []
    for line in response.text.split('\n'):
        values = line.strip().split(',')
        if len(values) == 2:  # ignore empty or malformed lines
            # Ignore convID, just grab the target label
            targets.append(int(values[1]))

    # Build the array from plain ints with an explicit dtype instead of
    # relying on numpy converting a list of 0-dim torch tensors.
    return np.array(targets, dtype=np.int64)


# Function to pad conversations to a common length
def pad_conversations(conversations):
    """Pad every conversation with ``''`` up to the longest conversation.

    Args:
        conversations: Iterable of conversations, each a sequence of utterances.

    Returns:
        A new list with one list per conversation, all of equal length; shorter
        conversations are extended with empty strings. An empty input returns
        ``[]``. The input conversations are not modified.
    """
    # Materialize once so the input can be any iterable and is read twice safely
    conversations = list(conversations)

    # default=0 keeps max() from raising on an empty collection
    max_length = max((len(conv) for conv in conversations), default=0)

    return [
        list(conv) + [''] * (max_length - len(conv))
        for conv in conversations
    ]

#Show the bart summary of a conversation
def showBartSummary(idList, wholeConv, num_conversations=2):
    """Print BART summaries for the first conversations in ``idList``.

    For each conversation the two sentence lists stored in ``wholeConv`` are
    each joined with spaces and summarized with ``generateSummaryBart``; the
    first is printed as the "Left" summary and the second as the "Right" one.

    Args:
        idList: Conversation ids, in the order they should be shown.
        wholeConv: Dict keyed by ``hash(conversation_id)`` whose values are
            ``[left_sentences, right_sentences]``.
        num_conversations: Maximum number of conversations to summarize.
            Defaults to 2; fewer are shown when ``idList`` is shorter.

    Returns:
        None. The summaries are printed.
    """
    # Slicing never raises when idList is shorter than num_conversations
    for count, conv_id in enumerate(idList[:num_conversations], start=1):
        print('Working on id#',count)
        #Get current conversation, split into recommender and seeker (left/right)
        curConv = wholeConv[hash(conv_id)]
        leftString = ' '.join(curConv[0])
        rightString = ' '.join(curConv[1])

        #Make summary of the conversation
        leftSummary = generateSummaryBart(leftString)
        rightSummary = generateSummaryBart(rightString)
        print(f'*****Conversation #{count}*****\n')
        print('Left Summary:\n', leftSummary)
        print('\nRight Summary:\n',rightSummary)
        print('\n\n')

# Summarize a piece of text with BART.
def generateSummaryBart(text):
    """Return the BART summary of ``text``.

    Uses the module-level ``tokenizer_bart`` and ``model_bart``, which must be
    defined before this function is called.

    Args:
        text: The text to summarize; it is tokenized with truncation at
            ``max_length=1024``.

    Returns:
        The first generated sequence, decoded with special tokens skipped.
    """
    encoded = tokenizer_bart(text, return_tensors="pt", max_length=1024, truncation=True)

    # Generation settings passed to model_bart.generate alongside the encoded input
    generation_options = {
        "max_length": 150,
        "length_penalty": 2.0,
        "num_beams": 4,
        "early_stopping": True,
    }
    generated = model_bart.generate(**encoded, **generation_options)

    return tokenizer_bart.decode(generated[0], skip_special_tokens=True)
    

# E-redial Data #

In [10]:
# Setup input files.
data = 'https://f23deeplearning-eredial-classifier-demo.nyc3.digitaloceanspaces.com/test.json'

# Setup target files
targets = 'https://f23deeplearning-eredial-classifier-demo.nyc3.digitaloceanspaces.com/E-redial-TEST-LABELS.txt'

#Get input data from file
# get_input_data returns None when the download fails; stop here with a clear
# message instead of an opaque unpacking error.
inputData = get_input_data(data)
if inputData is None:
    raise RuntimeError(f"Could not load conversations from {data}")
wholeConv, idList, convData, convRoleList = inputData

#Merge consecutive utterances by the same speaker
testInput = combineConsecutiveSpeakerSentences(convData, convRoleList)

#Get target labels
testLabels = get_target_data(targets)
if testLabels is None:
    raise RuntimeError(f"Could not load target labels from {targets}")

# Every conversation is looked up in testLabels by index, so there must be at
# least as many labels as conversations.
if len(testLabels) < len(testInput):
    raise ValueError(
        f"Found {len(testLabels)} labels for {len(testInput)} conversations"
    )


What does the data look like?

In [11]:
#Show the first 2 conversations (fewer if the test set holds fewer)
# Slicing avoids an IndexError when testInput has less than two conversations.
for convNum, conversation in enumerate(testInput[:2]):
    print(f'\n\n******Conversation Number:{convNum}******')
    for utterance, sentence in enumerate(conversation):
        print(f'Sentence #{utterance} -- {sentence}')
    print('\n')
    




******Conversation Number:0******
Sentence #0 -- Hello
Sentence #1 -- Hi! How are you?
Sentence #2 -- Good.
Sentence #3 -- What are you looking for today?
Sentence #4 -- Well, I enjoy romantic comedy, and chic movies! I really like [Steel Magnolias (1989)]. I've seen it a few times.
Sentence #5 -- As you enjoy romantic comedy, and chic movies,[Girls Trip (2017)] might be something you're interested in.When four lifelong friends travel to New Orleans for the annual Essence Festival, sisterhoods are rekindled,wild sides are rediscovered,and there's enough dancing drinking brawling and romancing to make the Big Easy blush.You may like it once you watched.
Sentence #6 -- Yes, I have heard great things about that one. I want to see it soon.
Sentence #7 -- Since you like romantic comedy, and chic movies , then I think these movie well suit your taste[Divine Secrets of the Ya-Ya Sisterhood (2002)]I really liked was that they DIDN'T fall back on that old chestnut of somebody dying to serve a

In [12]:
#Uncomment Only if you have loaded BART to look at summaries
#Show 2 conversation summaries
#showBartSummary(idList,wholeConv)

# Classifier Showcase #

In [13]:
TRAIN_BATCH_SIZE = 4  # not referenced in the cells visible here
TEST_BATCH_SIZE = 5  # batch_size of the GPT2 and BERT test DataLoaders
NUM_EPOCHS = 60  # passed to pl.Trainer as max_epochs

### GPT2 ###

In [14]:
# Location of the trained GPT2 classifier checkpoint
gpt2_checkpoint_url = "https://f23deeplearning-eredial-classifier-demo.nyc3.digitaloceanspaces.com/gpt2-epoch=59-step=11340-v6.ckpt"

# Tokenize the test conversations for the GPT2 classifier
gpt2_test_dataset = encoderNetwork(testInput, testLabels, tokenizer_gpt2, 'GPT2')

# Batch the dataset in file order; custom_collate_fn pads each batch
gpt2_test_dataloader = DataLoader(
    gpt2_test_dataset,
    batch_size=TEST_BATCH_SIZE,
    shuffle=False,
    collate_fn=custom_collate_fn,
    num_workers=8,
)

# CSV logger for this run
gpt2_logger = pl.loggers.CSVLogger("lightning_logs", name="ClassifierTest", version="gpt2")

# Trainer used here only to run validation on the test set
gpt2_trainer = pl.Trainer(
    logger=gpt2_logger,
    max_epochs=NUM_EPOCHS,
    enable_progress_bar=True,
    log_every_n_steps=0,
    enable_checkpointing=True,
    callbacks=[pl.callbacks.TQDMProgressBar(refresh_rate=50)],
)

# Restore the classifier from the checkpoint, mapping its tensors to the CPU
gpt2_classifier_network = GPT2ClassifierNetwork.load_from_checkpoint(
    gpt2_checkpoint_url,
    map_location='cpu',
)

# Evaluate; kept as the last expression so the metrics are displayed
gpt2_trainer.validate(gpt2_classifier_network, gpt2_test_dataloader)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Downloading: "https://f23deeplearning-eredial-classifier-demo.nyc3.digitaloceanspaces.com/gpt2-epoch=59-step=11340-v6.ckpt" to /home/jovyan/.cache/torch/hub/checkpoints/gpt2-epoch=59-step=11340-v6.ckpt
100%|██████████| 481M/481M [00:16<00:00, 30.3MB/s] 
  rank_zero_warn(


Validation: 0it [00:00, ?it/s]

[{'val_acc': 0.6466666460037231, 'val_loss': 0.664914071559906}]

### BERT ###

In [15]:
# Location of the trained BERT classifier checkpoint
bert_checkpoint_url = "https://f23deeplearning-eredial-classifier-demo.nyc3.digitaloceanspaces.com/bert-epoch=59-step=11340-v6.ckpt"

# Tokenize the test conversations for the BERT classifier
bert_test_dataset = encoderNetwork(testInput, testLabels, tokenizer_bert, 'BERT')

# Batch the dataset in file order; custom_collate_fn pads each batch
bert_test_dataloader = DataLoader(
    bert_test_dataset,
    batch_size=TEST_BATCH_SIZE,
    shuffle=False,
    collate_fn=custom_collate_fn,
    num_workers=2,
)

# CSV logger for this run
bert_logger = pl.loggers.CSVLogger("lightning_logs", name="ClassifierTest", version="bert")

# Trainer used here only to run validation on the test set
bert_trainer = pl.Trainer(
    logger=bert_logger,
    max_epochs=NUM_EPOCHS,
    enable_progress_bar=True,
    log_every_n_steps=0,
    enable_checkpointing=True,
    callbacks=[pl.callbacks.TQDMProgressBar(refresh_rate=50)],
)

# Restore the classifier from the checkpoint, mapping its tensors to the CPU
bert_classifier_network = BERTClassifierNetwork.load_from_checkpoint(
    bert_checkpoint_url,
    map_location='cpu',
)

# Evaluate; kept as the last expression so the metrics are displayed
bert_trainer.validate(bert_classifier_network, bert_test_dataloader)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Downloading: "https://f23deeplearning-eredial-classifier-demo.nyc3.digitaloceanspaces.com/bert-epoch=59-step=11340-v6.ckpt" to /home/jovyan/.cache/torch/hub/checkpoints/bert-epoch=59-step=11340-v6.ckpt
100%|██████████| 1.23G/1.23G [00:19<00:00, 66.4MB/s]
  rank_zero_warn(
  rank_zero_warn(


Validation: 0it [00:00, ?it/s]

[{'val_acc': 0.2199999988079071, 'val_loss': 0.744948148727417}]