### Import libraries

In [1]:
# Import libraries
from tqdm import tqdm
import random
import numpy as np
import pandas as pd
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
import datasets
from transformers import AutoTokenizer, AutoModelForCausalLM, get_linear_schedule_with_warmup
from nltk.translate import bleu_score
from rouge_score import rouge_scorer
from nltk.util import ngrams
from torch.nn.utils.rnn import pad_sequence
from matplotlib import pyplot as plt

# Set device to CUDA if available
device = "cuda" if torch.cuda.is_available else "cpu"

# Set random seed for consistency
rand_seed = 7
torch.manual_seed(rand_seed)
random.seed(rand_seed)
np.random.seed(rand_seed)
torch.cuda.manual_seed_all(rand_seed)

print(device)

cuda


### Load dataset

In [2]:
# Import DailyDialog dataset
train_dataset = datasets.load_dataset("daily_dialog", split="train")
test_dataset = datasets.load_dataset("daily_dialog", split="test")
val_dataset = datasets.load_dataset("daily_dialog", split="validation")

Using custom data configuration default
Reusing dataset daily_dialog (C:\Users\leoma\.cache\huggingface\datasets\daily_dialog\default\1.0.0\c03444008e9508b8b76f1f6793742d37d5e5f83364f8d573c2747bff435ea55c)
Using custom data configuration default
Reusing dataset daily_dialog (C:\Users\leoma\.cache\huggingface\datasets\daily_dialog\default\1.0.0\c03444008e9508b8b76f1f6793742d37d5e5f83364f8d573c2747bff435ea55c)
Using custom data configuration default
Reusing dataset daily_dialog (C:\Users\leoma\.cache\huggingface\datasets\daily_dialog\default\1.0.0\c03444008e9508b8b76f1f6793742d37d5e5f83364f8d573c2747bff435ea55c)


In [3]:
# Check datasets
print(train_dataset)
print(test_dataset)
print(val_dataset)

Dataset({
    features: ['dialog', 'act', 'emotion'],
    num_rows: 11118
})
Dataset({
    features: ['dialog', 'act', 'emotion'],
    num_rows: 1000
})
Dataset({
    features: ['dialog', 'act', 'emotion'],
    num_rows: 1000
})


In [4]:
# Show an example of train data
train_dataset[0]

{'dialog': ['Say , Jim , how about going for a few beers after dinner ? ',
  ' You know that is tempting but is really not good for our fitness . ',
  ' What do you mean ? It will help us to relax . ',
  " Do you really think so ? I don't . It will just make us fat and act silly . Remember last time ? ",
  " I guess you are right.But what shall we do ? I don't feel like sitting at home . ",
  ' I suggest a walk over to the gym where we can play singsong and meet some of our friends . ',
  " That's a good idea . I hear Mary and Sally often go there to play pingpong.Perhaps we can make a foursome with them . ",
  ' Sounds great to me ! If they are willing , we could ask them to go dancing with us.That is excellent exercise and fun , too . ',
  " Good.Let ' s go now . ",
  ' All right . '],
 'act': [3, 4, 2, 2, 2, 3, 4, 1, 3, 4],
 'emotion': [0, 0, 0, 0, 0, 0, 4, 4, 4, 4]}

In [5]:
# Print the counts of conversation lengths for training set
train_dialog_len = [len(dialog) for dialog in train_dataset['dialog']]
pd.Series(train_dialog_len).value_counts().sort_index()

2      483
3      386
4     1980
5      942
6     1183
7      805
8     1069
9      717
10     966
11     436
12     977
13     272
14     262
15     158
16     154
17      94
18      63
19      34
20      48
21      27
22      12
23      13
24      11
25       4
26       7
27       1
28       3
29       5
30       2
31       1
32       2
35       1
dtype: int64

In [6]:
# Print the counts of conversation lengths for test set
test_dialog_len = [len(dialog) for dialog in test_dataset['dialog']]
pd.Series(test_dialog_len).value_counts().sort_index()

2      42
3      40
4     177
5      90
6     117
7      55
8      94
9      62
10     93
11     47
12     81
13     22
14     29
15     10
16     18
17      7
18      5
19      5
20      3
21      2
26      1
dtype: int64

In [125]:
# Print a dialog
test_dataset['dialog'][11]

['Are you busy tomorrow morning ? ',
 " I'm free . What's up ? ",
 ' Someone has to pick up the boss at the airport . ',
 " Oh , I just remembered I've got a report to write . "]

### Evaluation metrics

#### BLEU-1,2,4

In [4]:
"""
Function to calculate BLEU-1,2,4 score

Input:
NOTE: All sentence must be SPLIT INTO WORDS
ref_list - list of reference sentences, e.g. []
hypo - hypothesis sentence

Return:
BLEU-1 score, BLEU-2 score, BLEU-4 score
"""
# Function to calculate BLEU score
# Input: 
def calc_bleu_score(ref_list, prediction):
    weights = [
         (1, 0),
         (1./3., 1./3., 1./3.),
         (1./4., 1./4., 1./4., 1./4.)
    ]
    bleu_scores = bleu_score.sentence_bleu(ref_list, prediction, weights) 
    return bleu_scores

#### ROUGE-1,2,L

In [5]:
"""
Function to calculate ROUGE-1,2,L score

Input:
NOTE: All sentence must be in FULL-SENTENCE STRING format
target - target sentence
prediction - prediction sentence

Return:
Dict{'rouge1': ..., 'rouge2': ..., 'rougeL': ...}
each key corresponds to:
[0] - precision
[1] - recall
[2] - f1-score
"""

def calc_rouge_score(target, prediction):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = scorer.score(target, prediction)
    return scores

#### Distinct-1,2

In [6]:
"""
Function to calculate Distinct-1,2 score

Input:
NOTE: All sentence must be SPLIT INTO WORDS
predictions - a LIST of prediction sentences (from all test data predictions)
n - specify n in n-grams

Return:
distinct score - float

"""

def calc_distinct_score(predictions, n):
    # Get ngrams
    ngram_list = []
    for prediction in predictions:
        line_ngrams = list(ngrams(prediction, n))
        ngram_list = ngram_list + line_ngrams
    
    # Initialize variables
    appeared_ngram = []
    total = 0
    distinct = 0
    
    # Count number of distinct ngrams
    for ngram in ngram_list:
        total += 1
        if ngram not in appeared_ngram:
            distinct += 1
            appeared_ngram.append(ngram)
    
    return distinct / total

### DialoGPT

### Evaluate pre-trained model over the evaluation metrics

#### Get evaluation metrics from pre-trained DialoGPT without any fine-tuning

In [7]:
# Function to tokenize and flatten an input conversation into one single list
def tokenize_flatten_conv(conv, tokenizer):
    flatten = lambda large_list: [item for sublist in large_list for item in sublist]
    conversation = [tokenizer.encode(line) + [tokenizer.eos_token_id] for line in conv] # Tokenize
    conversation = flatten(conversation)                                                # Flatten
    return conversation

In [8]:
# Function to calculate mean score from a list of scores
def calc_mean_scores(scores_list):
    return np.mean(np.array(scores_list))

# Function to calculate evaluation metrics on a given model and dataset
# Mode: 'conv' - predict utterance based on previous conversation history
#       'sep'  - predict utterance based on only the previous utterance
def eval_model_on_metrics(model, tokenizer, test_dataset, mode):

    # Initialize lists to store scores and generated texts
    bleu_1_scores = []
    bleu_2_scores = []
    bleu_4_scores = []
    rouge_1_scores = []
    rouge_2_scores = []
    rouge_L_scores = []
    distinct = []

    # Loop through each dialogue in test dataset
    for dialogue in tqdm(test_dataset, desc="Evaluation of model", unit="dialogues"):
        for i in range(0, len(dialogue), 2):
                
            # If the input sentence is the last sentence, skip the prediction
            if i == len(dialogue)-1:
                continue
                
            # mode 'conv' (conversation): Use uttr 0 to predict uttr 1, use uttr 0+1+2 to predict uttr 3, ...    
            if mode == 'conv':
                for j in range(i+1):
                    # Add EOS token to the end of each input utterance
                    new_input = dialogue[j] + tokenizer.eos_token

                    # Convert the sentence into tensor of token ids with the tokenizer
                    new_input_ids = tokenizer.encode(new_input, return_tensors='pt').to(device)

                    # Concat the new input utterance to the chat history if applicable
                    chat_input_ids = torch.cat([chat_input_ids, new_input_ids], dim=-1) if j > 0 else new_input_ids
                    
            # mode 'sep' (separate): Use uttr 0 to predict uttr 1, use uttr 2 to predict uttr 3, ...
            elif mode == 'sep':
                # Only use one line to predict
                new_input = dialogue[i] + tokenizer.eos_token
                
                # Convert the sentence into tensor of token ids with the tokenizer
                chat_input_ids = tokenizer.encode(new_input, return_tensors='pt').to(device)
            
            # Get the output token ids from the model
            output_ids = model.generate(chat_input_ids, max_length=1000, pad_token_id=tokenizer.eos_token_id)
            
            # Convert the output token ids back into a sentence
            new_output = tokenizer.decode(output_ids[:, chat_input_ids.shape[-1]:][0], skip_special_tokens=True)
        
            # Get the target utterance, tokenized output & tokenized target utterance
            target_utterance = dialogue[i+1]
            
            # Split the sentence into words
            new_output_split = new_output.split()
            target_utterance_split = target_utterance.split()
        
            # Calculate BLEU-1,2,4 scores and save into lists
            bleu_scores = calc_bleu_score([target_utterance_split], new_output_split)
            bleu_1_scores.append(bleu_scores[0])
            bleu_2_scores.append(bleu_scores[1])
            bleu_4_scores.append(bleu_scores[2])
        
            # Calculate ROUGE-1,2,L scores and save into lists
            rouge_scores = calc_rouge_score(target_utterance, new_output)
            rouge_1_scores.append(rouge_scores['rouge1'][2])
            rouge_2_scores.append(rouge_scores['rouge2'][2])
            rouge_L_scores.append(rouge_scores['rougeL'][2])
        
            # Save the tokenized output into a list for future Distinct score calculation
            distinct.append(new_output_split)

    # Calculate Distinct-1,2 scores with the saved tokenized outputs
    distinct_1_score = calc_distinct_score(distinct, 1)
    distinct_2_score = calc_distinct_score(distinct, 2)
    
    # Print the evaluation metrics
    print(f"Mean BLEU-1 score: {calc_mean_scores(bleu_1_scores)}")
    print(f"Mean BLEU-2 score: {calc_mean_scores(bleu_2_scores)}")
    print(f"Mean BLEU-4 score: {calc_mean_scores(bleu_4_scores)}")
    print(f"Mean ROUGE-1 F-score: {calc_mean_scores(rouge_1_scores)}")
    print(f"Mean ROUGE-2 F-score: {calc_mean_scores(rouge_2_scores)}")
    print(f"Mean ROUGE-L F-score: {calc_mean_scores(rouge_L_scores)}")
    print(f"Distinct-1 score: {distinct_1_score}")
    print(f"Distinct-2 score: {distinct_2_score}")

In [100]:
# Function to get model response output for testing, given a LIST of utterances in a conversation history
def chatbot(chat_history, model, tokenizer):
    step = 0
    for uttr in chat_history:
        new_input = uttr + tokenizer.eos_token
        new_input_ids = tokenizer.encode(new_input, return_tensors='pt').to(device)
        chat_input_ids = torch.cat([chat_history_ids, new_input_ids], dim=-1) if step > 0 else new_input_ids
    chat_history_ids = model.generate(chat_input_ids, max_length=1000, pad_token_id=tokenizer.eos_token_id)
    new_output = tokenizer.decode(chat_history_ids[:, chat_input_ids.shape[-1]:][0], skip_special_tokens=True)
    return new_output

#### Load tokenizer and model

In [128]:
# Import tokenizer & pre-trained DialoGPT model
dialogpt_tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-small")
dialogpt_model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-small").to(device)

# Set the padding token to be EOS token
dialogpt_tokenizer.pad_token = dialogpt_tokenizer.eos_token
#dialogpt_model.config.pad_token_id = dialogpt_model.config.eos_token_id

In [130]:
# Test dialogue inputs
test_dialogue1 = [' Can I help you ? ',]
test_dialogue2 = ['How are you today ? ',
 ' Great , thanks . ',
 ' Can I help you ? ',]
test_dialogue3 = [' All right , young man . Tell me how it got started . ']
test_dialogue4 = ["Good morning . What's the matter with you ? ",
 ' Good morning , doctor . I have a terrible headache . ',
 ' All right , young man . Tell me how it got started . ',]

In [13]:
# Evaluate performance of the model
eval_model_on_metrics(dialogpt_model, dialogpt_tokenizer, test_dataset['dialog'], "conv")

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
Evaluation of model: 100%|██████████████████████████████████████████████████| 1000/1000 [12:25<00:00,  1.34dialogues/s]


Mean BLEU-1 score: 0.02312607321774004
Mean BLEU-2 score: 0.0012744197886526104
Mean BLEU-4 score: 0.0004805906812546459
Mean ROUGE-1 F-score: 0.09395238010630211
Mean ROUGE-2 F-score: 0.01525255405873308
Mean ROUGE-L F-score: 0.08921185684966101
Distinct-1 score: 0.095533003036599
Distinct-2 score: 0.23878256859580355


In [14]:
eval_model_on_metrics(dialogpt_model, dialogpt_tokenizer, test_dataset['dialog'], "sep")

Evaluation of model: 100%|██████████████████████████████████████████████████| 1000/1000 [15:45<00:00,  1.06dialogues/s]


Mean BLEU-1 score: 0.030905711424916316
Mean BLEU-2 score: 0.0013275425407620158
Mean BLEU-4 score: 0.00031980848622221417
Mean ROUGE-1 F-score: 0.11009679650523381
Mean ROUGE-2 F-score: 0.01757537246256902
Mean ROUGE-L F-score: 0.10206082506446801
Distinct-1 score: 0.08359971202303816
Distinct-2 score: 0.21850120870265916


In [132]:
print(chatbot(test_dialogue1, dialogpt_model, dialogpt_tokenizer))
print(chatbot(test_dialogue2, dialogpt_model, dialogpt_tokenizer))

I'm not sure what to do with this information.
I'm not sure what to do with this information.


In [131]:
# Try to get model-generated responses
print(chatbot(test_dialogue3, dialogpt_model, dialogpt_tokenizer))
print(chatbot(test_dialogue4, dialogpt_model, dialogpt_tokenizer))

I'm a young man, and I'm not sure what you're talking about.
I'm a young man, and I'm not sure what you're talking about.


#### Try larger DialoGPT model

In [133]:
# Free memory
try:
    del dialogpt_model
    del dialogpt_tokenizer
except NameError:
    pass
torch.cuda.empty_cache()

# Import tokenizer & pre-trained DialoGPT model
dialogpt_tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium")
dialogpt_model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-medium").to(device)

# Set the padding token to be EOS token
dialogpt_tokenizer.pad_token = dialogpt_tokenizer.eos_token
#dialogpt_model.config.pad_token_id = dialogpt_model.config.eos_token_id

In [16]:
# Evaluate performance of the model
eval_model_on_metrics(dialogpt_model, dialogpt_tokenizer, test_dataset['dialog'], "conv")

Evaluation of model: 100%|██████████████████████████████████████████████████| 1000/1000 [21:15<00:00,  1.28s/dialogues]


Mean BLEU-1 score: 0.03242453170678008
Mean BLEU-2 score: 0.00203300302785073
Mean BLEU-4 score: 0.0007096823609430663
Mean ROUGE-1 F-score: 0.10993791006540518
Mean ROUGE-2 F-score: 0.02592668508612644
Mean ROUGE-L F-score: 0.10397484538465324
Distinct-1 score: 0.1036715185748425
Distinct-2 score: 0.2910552061495458


In [17]:
eval_model_on_metrics(dialogpt_model, dialogpt_tokenizer, test_dataset['dialog'], "sep")

Evaluation of model: 100%|██████████████████████████████████████████████████| 1000/1000 [30:51<00:00,  1.85s/dialogues]


Mean BLEU-1 score: 0.0386720532867369
Mean BLEU-2 score: 0.001967099045135996
Mean BLEU-4 score: 0.0008031037805097597
Mean ROUGE-1 F-score: 0.1290583710952275
Mean ROUGE-2 F-score: 0.027180262753334586
Mean ROUGE-L F-score: 0.12064876074836307
Distinct-1 score: 0.08578091330401157
Distinct-2 score: 0.24273453292242436


In [134]:
# Try to get model-generated responses
print(chatbot(test_dialogue1, dialogpt_model, dialogpt_tokenizer))
print(chatbot(test_dialogue2, dialogpt_model, dialogpt_tokenizer))

I'm not sure what you mean by help.
I'm not sure what you mean by help.


In [135]:
print(chatbot(test_dialogue3, dialogpt_model, dialogpt_tokenizer))
print(chatbot(test_dialogue4, dialogpt_model, dialogpt_tokenizer))

I'm not your young man, young man.
I'm not your young man, young man.


### Fine-tune pre-trained model with DailyDialogue

In [18]:
# Function to tokenize dataset into a LIST of dialogue tensors with tokenized ids
def tokenize_dataset(dataset, max_conv_len, dialogpt_tokenizer):
    output_list = []
    for dialogue in tqdm(dataset, desc="Tokenizing and flattening dialogues", unit="dialogues"):
        tokenized_conv = tokenize_flatten_conv(dialogue, dialogpt_tokenizer) # Tokenize & flatten
        tokenized_conv = torch.tensor(tokenized_conv)                        # Turn tokenized output into tensor
        
        # Truncate the tokenized output tensor if it exceeds the maximum conversation length
        if len(tokenized_conv) > max_conv_len:
            tokenized_conv = tokenized_conv[:max_conv_len]
            
        output_list.append(tokenized_conv)  # Save the (truncated) conversation into a list
    return output_list


In [19]:
# Collate function to pad the conversation to maximum conversation length
def collate(conv):
    
    ## NOTE: dialogpt_tokenizer referenced on the cell above during model loading
    return pad_sequence(conv, batch_first=True, padding_value=dialogpt_tokenizer.pad_token_id)


#### Train data

In [40]:
# Model save path
model_output_dir = './models/'

# Function to fine-tune a DialoGPT model
def fine_tune_dialogpt(dialogpt_model, dialogpt_tokenizer):
    
    n_epochs = 20
    max_conv_len = 128
    batch_size = 8
    best_perplexity = 100
    early_stop = 3
    no_improve_count = 0

    # Tokenize training & validation datasets
    train_data_list = tokenize_dataset(train_dataset['dialog'], max_conv_len, dialogpt_tokenizer)
    val_data_list = tokenize_dataset(val_dataset['dialog'], max_conv_len, dialogpt_tokenizer)

    # Create DataLoaders for training & validation datasets
    train_dataloader = DataLoader(train_data_list, batch_size=batch_size, shuffle=True, collate_fn=collate, drop_last=True)
    val_dataloader = DataLoader(val_data_list, batch_size=8, shuffle=False, collate_fn=collate, drop_last=False)

    # Define optimizer & scheduler
    optimizer = torch.optim.AdamW(dialogpt_model.parameters(), lr=5e-5)
    total_num_of_steps = len(train_dataloader)*n_epochs
    scheduler = get_linear_schedule_with_warmup(optimizer, total_num_of_steps // 10, total_num_of_steps)

    # Loop through each epoch
    for n in range(n_epochs):
    
        # Initialize training phase
        train_loss = 0.0
        eval_loss = 0.0
        dataloader_len = len(train_dataloader)
        print(f"Running epoch {n+1}...")
        dialogpt_model.train()
    
        # Training phase
        for batch in tqdm(train_dataloader, desc="Fine-tuning the model", unit="batch"):
        
            # Pass the inputs and labels to device
            inputs, labels = (batch, batch)
            inputs = inputs.to(device)
            labels = inputs.to(device)
        
            # Pass to the model and get the loss
            outputs = dialogpt_model(inputs, labels=labels)
            loss = outputs[0]  # the output is a tuple, the first element is the loss
        
            # Backpropagation
            dialogpt_model.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.step()
        
            # Save training loss
            train_loss += loss.item()
        
        # Calculate mean training loss
        avg_train_loss = train_loss / dataloader_len
        print(f"Train_loss: {avg_train_loss}")
    
        # Evaluation phase
        dialogpt_model.eval()
        for batch in tqdm(val_dataloader, desc="Evaluating the model", unit="batch"):
        
            # Pass the inputs and labels to device
            inputs, labels = (batch, batch)
            inputs = inputs.to(device)
            labels = inputs.to(device)
        
            # Pass to the model and get the loss
            with torch.no_grad():
                outputs = dialogpt_model(inputs, labels=labels)
                eval_loss += outputs[0].item()
    
        # Calculate perplexity
        avg_eval_loss = eval_loss / len(val_dataloader)
        perplexity = torch.exp(torch.tensor(avg_eval_loss))
        print(f"Perplexity: {perplexity}")
    
        # Save the model if it is the current best one
        if perplexity < best_perplexity:
            best_perplexity = perplexity
            dialogpt_model.save_pretrained(model_output_dir)  # Save the model
            print(f"Saved best model.    Epoch: {n+1}    Perplexity: {perplexity}")
            no_improve_count = 0
        else:
            no_improve_count += 1
    
        # Early stop
        if no_improve_count == early_stop:
            print("Stopping training since eval score didn't improve for {early_stop} epochs.")
            break

#### Fine-tune the model & evaluate the fine-tuned model on evaluation metrics on test data

In [21]:
# Free memory
try:
    del dialogpt_model
    del dialogpt_tokenizer
except NameError:
    pass
torch.cuda.empty_cache()

# Load tokenizer and initialize the EOS token as PAD token
dialogpt_tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-small")
dialogpt_tokenizer.pad_token = dialogpt_tokenizer.eos_token

# Import pre-trained DialoGPT model & define PAD token
dialogpt_model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-small").to(device)
#dialogpt_model.config.pad_token_id = dialogpt_model.config.eos_token_id

# Run the training function
fine_tune_dialogpt(dialogpt_model, dialogpt_tokenizer)

Tokenizing and flattening dialogues: 100%|███████████████████████████████| 11118/11118 [00:12<00:00, 892.12dialogues/s]
Tokenizing and flattening dialogues: 100%|█████████████████████████████████| 1000/1000 [00:01<00:00, 821.32dialogues/s]


Running epoch 1...


Fine-tuning the model: 100%|████████████████████████████████████████████████████| 1389/1389 [04:36<00:00,  5.02batch/s]


Train_loss: 2.6811932248979957


Evaluating the model: 100%|███████████████████████████████████████████████████████| 125/125 [00:06<00:00, 20.14batch/s]


Perplexity: 7.937488079071045
Saved best model.    Epoch: 1    Perplexity: 7.937488079071045
Running epoch 2...


Fine-tuning the model: 100%|████████████████████████████████████████████████████| 1389/1389 [04:36<00:00,  5.03batch/s]


Train_loss: 2.005671069637488


Evaluating the model: 100%|███████████████████████████████████████████████████████| 125/125 [00:06<00:00, 20.48batch/s]


Perplexity: 6.8491692543029785
Saved best model.    Epoch: 2    Perplexity: 6.8491692543029785
Running epoch 3...


Fine-tuning the model: 100%|████████████████████████████████████████████████████| 1389/1389 [04:42<00:00,  4.92batch/s]


Train_loss: 1.8424023593218743


Evaluating the model: 100%|███████████████████████████████████████████████████████| 125/125 [00:06<00:00, 19.61batch/s]


Perplexity: 6.379498481750488
Saved best model.    Epoch: 3    Perplexity: 6.379498481750488
Running epoch 4...


Fine-tuning the model: 100%|████████████████████████████████████████████████████| 1389/1389 [04:47<00:00,  4.83batch/s]


Train_loss: 1.718144028327549


Evaluating the model: 100%|███████████████████████████████████████████████████████| 125/125 [00:06<00:00, 19.03batch/s]


Perplexity: 6.116057872772217
Saved best model.    Epoch: 4    Perplexity: 6.116057872772217
Running epoch 5...


Fine-tuning the model: 100%|████████████████████████████████████████████████████| 1389/1389 [04:42<00:00,  4.92batch/s]


Train_loss: 1.6138428967577507


Evaluating the model: 100%|███████████████████████████████████████████████████████| 125/125 [00:05<00:00, 21.42batch/s]


Perplexity: 5.9696044921875
Saved best model.    Epoch: 5    Perplexity: 5.9696044921875
Running epoch 6...


Fine-tuning the model: 100%|████████████████████████████████████████████████████| 1389/1389 [04:22<00:00,  5.30batch/s]


Train_loss: 1.5229063176096043


Evaluating the model: 100%|███████████████████████████████████████████████████████| 125/125 [00:05<00:00, 21.22batch/s]


Perplexity: 5.8800048828125
Saved best model.    Epoch: 6    Perplexity: 5.8800048828125
Running epoch 7...


Fine-tuning the model: 100%|████████████████████████████████████████████████████| 1389/1389 [04:37<00:00,  5.00batch/s]


Train_loss: 1.441306722902239


Evaluating the model: 100%|███████████████████████████████████████████████████████| 125/125 [00:05<00:00, 21.40batch/s]


Perplexity: 5.8708295822143555
Saved best model.    Epoch: 7    Perplexity: 5.8708295822143555
Running epoch 8...


Fine-tuning the model: 100%|████████████████████████████████████████████████████| 1389/1389 [04:28<00:00,  5.18batch/s]


Train_loss: 1.367401417551322


Evaluating the model: 100%|███████████████████████████████████████████████████████| 125/125 [00:06<00:00, 20.35batch/s]


Perplexity: 5.81952428817749
Saved best model.    Epoch: 8    Perplexity: 5.81952428817749
Running epoch 9...


Fine-tuning the model: 100%|████████████████████████████████████████████████████| 1389/1389 [04:46<00:00,  4.85batch/s]


Train_loss: 1.3014992149379119


Evaluating the model: 100%|███████████████████████████████████████████████████████| 125/125 [00:06<00:00, 18.92batch/s]


Perplexity: 5.88892126083374
Running epoch 10...


Fine-tuning the model: 100%|████████████████████████████████████████████████████| 1389/1389 [04:49<00:00,  4.81batch/s]


Train_loss: 1.2434352300766134


Evaluating the model: 100%|███████████████████████████████████████████████████████| 125/125 [00:06<00:00, 18.88batch/s]


Perplexity: 5.870273590087891
Running epoch 11...


Fine-tuning the model: 100%|████████████████████████████████████████████████████| 1389/1389 [04:47<00:00,  4.83batch/s]


Train_loss: 1.1918842951285058


Evaluating the model: 100%|███████████████████████████████████████████████████████| 125/125 [00:06<00:00, 19.43batch/s]

Perplexity: 5.91548490524292
Stopping training since eval score didn't improve for {early_stop} epochs.





In [136]:
# Free memory
try:
    del dialogpt_model
except NameError:
    pass
    
torch.cuda.empty_cache()

# Load best saved model
dialogpt_model = AutoModelForCausalLM.from_pretrained(model_output_dir).to(device)

In [23]:
# Evaluate performance of the model
eval_model_on_metrics(dialogpt_model, dialogpt_tokenizer, test_dataset['dialog'], "conv")

Evaluation of model: 100%|██████████████████████████████████████████████████| 1000/1000 [20:50<00:00,  1.25s/dialogues]


Mean BLEU-1 score: 0.04890141882678121
Mean BLEU-2 score: 0.008890029526589923
Mean BLEU-4 score: 0.0038604564463968903
Mean ROUGE-1 F-score: 0.1435712952922434
Mean ROUGE-2 F-score: 0.048553571879876384
Mean ROUGE-L F-score: 0.1367040996430867
Distinct-1 score: 0.0875288328463786
Distinct-2 score: 0.2431566882416397


In [24]:
eval_model_on_metrics(dialogpt_model, dialogpt_tokenizer, test_dataset['dialog'], "sep")

Evaluation of model: 100%|██████████████████████████████████████████████████| 1000/1000 [17:52<00:00,  1.07s/dialogues]


Mean BLEU-1 score: 0.053865721621866126
Mean BLEU-2 score: 0.006653280556751681
Mean BLEU-4 score: 0.00227100247652534
Mean ROUGE-1 F-score: 0.1649563729158313
Mean ROUGE-2 F-score: 0.05005987018899637
Mean ROUGE-L F-score: 0.15516420030494096
Distinct-1 score: 0.10102463932010555
Distinct-2 score: 0.30718851719675283


In [137]:
# Try to get model-generated responses
print(chatbot(test_dialogue1, dialogpt_model, dialogpt_tokenizer))
print(chatbot(test_dialogue2, dialogpt_model, dialogpt_tokenizer))

 Yes, I'd like to buy a new shirt. 
 Yes, I'd like to buy a new shirt. 


In [138]:
print(chatbot(test_dialogue3, dialogpt_model, dialogpt_tokenizer))
print(chatbot(test_dialogue4, dialogpt_model, dialogpt_tokenizer))

 Well, first of all, it was a lot of work. 
 Well, first of all, it was a lot of work. 
