In [1]:
from nltk.corpus import brown
from nltk.lm import MLE, Laplace
from nltk.lm.preprocessing import padded_everygram_pipeline
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

import os
import csv
import copy
import random

from transformers import pipeline
from rouge_score import rouge_scorer

In [2]:
random.seed(42)

## Preparing the data

In [3]:
brown.categories()

['adventure',
 'belles_lettres',
 'editorial',
 'fiction',
 'government',
 'hobbies',
 'humor',
 'learned',
 'lore',
 'mystery',
 'news',
 'religion',
 'reviews',
 'romance',
 'science_fiction']

In [4]:
genre = 'science_fiction'
sentences = brown.sents(categories=genre)

In [5]:
print(list(sentences[0]))

['Now', 'that', 'he', 'knew', 'himself', 'to', 'be', 'self', 'he', 'was', 'free', 'to', 'grok', 'ever', 'closer', 'to', 'his', 'brothers', ',', 'merge', 'without', 'let', '.']


In [6]:
sentences = [[word.lower() for word in sentence] for sentence in sentences]

In [7]:
print(list(sentences[0]))

['now', 'that', 'he', 'knew', 'himself', 'to', 'be', 'self', 'he', 'was', 'free', 'to', 'grok', 'ever', 'closer', 'to', 'his', 'brothers', ',', 'merge', 'without', 'let', '.']


In [8]:
total_length = 0
i = 0

while total_length < 350:
    total_length += len(sentences[i])
    i += 1

train_data = sentences[i:]
test_data = sentences[:i]

print(f'Total amounts of tokens in the test set: {total_length}')

Total amounts of tokens in the test set: 354


In [9]:
def mask_sentence(sentence):
    if len(sentence) < 1:
        return sentence
    
    masked_sentence = sentence.copy()
    mask_idx = random.randint(0, len(masked_sentence) - 1)
    masked_sentence[mask_idx] = '[MASK]'
    
    return masked_sentence

**DISCLAIMER**\
The exercise states that every 7th word should be masked. However, distilBERT only masks one word per sentence.\
In order to compare the results of BLEU/ROUGE more accurately, we have changed the code to mask only one word for the ngram as well.

In [10]:
masked_data = copy.deepcopy(test_data)

for i, sentence in enumerate(test_data):
    masked_sent = mask_sentence(sentence)
    masked_data[i] = masked_sent

In [11]:
folder_name = 'data'

if not os.path.exists(folder_name):
      os.makedirs(folder_name)

csv_file_path = os.path.join(folder_name, 'masked_sentences.csv')

def save_masked_sents_to_csv(masked_sents, file_path):
    with open(file_path, mode='w', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            for sentence in masked_sents:
                writer.writerow(sentence)

save_masked_sents_to_csv(masked_data, csv_file_path)

## Training

In [12]:
n_grams = 3
train_set_lm, vocab = padded_everygram_pipeline(n_grams, train_data)

In [13]:
lm = MLE(n_grams)
print(f'Length of vocab before training: {len(lm.vocab)}')
lm.fit(train_set_lm, vocab)
print(f'Length of vocab after training: {len(lm.vocab)}')

Length of vocab before training: 0
Length of vocab after training: 2993


In [14]:
lm.vocab.lookup(train_data[0])

('in',
 'his',
 'mind',
 'he',
 'spoke',
 'simultaneously',
 'the',
 'english',
 'sentence',
 'and',
 'the',
 'martian',
 'word',
 'and',
 'felt',
 'closer',
 'grokking',
 '.')

In [15]:
lm.vocab.lookup(['stop', 'it', 'elmo'])

('<UNK>', 'it', '<UNK>')

## Usage

In [16]:
print(lm.counts)

<NgramCounter with 3 ngram orders and 50781 ngrams>


In [17]:
round(lm.score('mars', ['from']), 4)

0.0217

In [18]:
round(lm.logscore('mars', ['from']), 4)

-5.5236

In [19]:
lm.generate(10, random_seed=42)

['lady', "''", '?', '?', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>']

In [20]:
def ngrams_predictions(model, masked_sents):
    predicted_sents = []

    for sentence in masked_sents:
        modified_sentence = []

        for word in sentence:
            if word == '[MASK]':
                context = tuple(modified_sentence[-(model.order-1):])
                predicted_word = model.generate(1, context)
                modified_sentence.append(predicted_word)
            else:
                modified_sentence.append(word)

        predicted_sents.append(modified_sentence)

    return predicted_sents

In [21]:
def compare_sentences(masked_sents, pred_sents, original_sents):
    for i in range(2):
        print(f'Masked sentence:\n{' '.join(masked_sents[i])}')
        print(f'Predicted sentence:\n{' '.join(pred_sents[i])}')
        print(f'Original sentence:\n{' '.join(original_sents[i])}')
        print()

In [22]:
pred_sents_ngram = ngrams_predictions(lm, masked_data)
compare_sentences(masked_data, pred_sents_ngram, test_data)

Masked sentence:
now that he knew himself to be self he was free to grok ever closer to his brothers , merge [MASK] let .
Predicted sentence:
now that he knew himself to be self he was free to grok ever closer to his brothers , merge out let .
Original sentence:
now that he knew himself to be self he was free to grok ever closer to his brothers , merge without let .

Masked sentence:
self's [MASK] was and is and ever had been .
Predicted sentence:
self's '' was and is and ever had been .
Original sentence:
self's integrity was and is and ever had been .



## Distilbert

In [23]:
model = pipeline('fill-mask', model='distilbert-base-uncased')

Device set to use mps:0


In [24]:
def bert_predictions(model, masked_data):
    predicted_sents = []

    for sentence in masked_data:        
        sentence_str = ' '.join(sentence)
        
        try:
            output = model(sentence_str)
            y_pred = output[0]['sequence']

            predicted_sents.append(y_pred.split())
        
        except Exception as e:
            print(f'Error processing sentence: {e}\n')
    
    return predicted_sents

In [25]:
pred_sents_bert = bert_predictions(model, masked_data)
compare_sentences(masked_data, pred_sents_bert, test_data)

Masked sentence:
now that he knew himself to be self he was free to grok ever closer to his brothers , merge [MASK] let .
Predicted sentence:
now that he knew himself to be self he was free to grok ever closer to his brothers, merge and let.
Original sentence:
now that he knew himself to be self he was free to grok ever closer to his brothers , merge without let .

Masked sentence:
self's [MASK] was and is and ever had been .
Predicted sentence:
self ' s heart was and is and ever had been.
Original sentence:
self's integrity was and is and ever had been .



## Human evaluation

In [26]:
def load_and_clean_predictions(csv_file):
    pred_sents_friend = []

    with open(csv_file, mode='r', encoding='utf-8-sig') as file:
        reader = csv.reader(file)
        for row in reader:
            cleaned_row = [word for word in row if word.strip() != '']
            pred_sents_friend.append(cleaned_row)
    
    return pred_sents_friend

pred_sents_friend = load_and_clean_predictions('data/pred_sents_friend.csv')
compare_sentences(masked_data, pred_sents_friend, test_data)

Masked sentence:
now that he knew himself to be self he was free to grok ever closer to his brothers , merge [MASK] let .
Predicted sentence:
now that he knew himself to be self he was free to grok ever closer to his brothers , merge is let .
Original sentence:
now that he knew himself to be self he was free to grok ever closer to his brothers , merge without let .

Masked sentence:
self's [MASK] was and is and ever had been .
Predicted sentence:
self's always was and is and ever had been .
Original sentence:
self's integrity was and is and ever had been .



## Results

In [27]:
def compute_bleu_score(predicted_sents, original_sents):
    scores = []

    for pred, ref in zip(predicted_sents, original_sents):
        score = sentence_bleu([ref], pred, smoothing_function=smoothie)
        scores.append(score)
    
    return sum(scores) / len(scores)

smoothie = SmoothingFunction().method4

In [28]:
def compute_rouge_scores(predicted_sents, original_sents):
    scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"])

    rouge1_score = 0
    rouge2_score = 0
    rougeL_score = 0

    for pred, ref in zip(predicted_sents, original_sents):
        score = scorer.score(' '.join(ref), ' '.join(pred))

        rouge1_score += score['rouge1'].fmeasure
        rouge2_score += score['rouge2'].fmeasure
        rougeL_score += score['rougeL'].fmeasure
        
    num_sents = len(original_sents)

    rouge1_score /= num_sents
    rouge2_score /= num_sents
    rougeL_score /= num_sents
    
    return [rouge1_score, rouge2_score, rougeL_score]

In [29]:
bleu_score_ngram = compute_bleu_score(pred_sents_ngram, masked_data)
bleu_score_bert = compute_bleu_score(pred_sents_bert, masked_data)
bleu_score_friend = compute_bleu_score(pred_sents_friend, masked_data)

rouge_scores_ngram = compute_rouge_scores(pred_sents_ngram, masked_data)
rouge_scores_bert = compute_rouge_scores(pred_sents_bert, masked_data)
rouge_scores_friend = compute_rouge_scores(pred_sents_friend, masked_data)

In [30]:
print(f'Average BLEU scores')
print('---------------------')
print(f'BLEU\nngram: {bleu_score_ngram:.4f}\nbert: {bleu_score_bert:.4f}\nhuman: {bleu_score_friend:.4f}')
print()
print(f'Average ROUGE scores')
print('---------------------')
print(f'ROUGE-1\nngram: {rouge_scores_ngram[0]:.4f}\nbert: {rouge_scores_bert[0]:.4f}\nhuman: {rouge_scores_friend[0]:.4f}')
print(f'ROUGE-2\nngram: {rouge_scores_ngram[1]:.4f}\nbert: {rouge_scores_bert[1]:.4f}\nhuman: {rouge_scores_friend[1]:.4f}')
print(f'ROUGE-L\nngram: {rouge_scores_ngram[2]:.4f}\nbert: {rouge_scores_bert[2]:.4f}\nhuman: {rouge_scores_friend[2]:.4f}')

Average BLEU scores
---------------------
BLEU
ngram: 0.8704
bert: 0.5794
human: 0.8704

Average ROUGE scores
---------------------
ROUGE-1
ngram: 0.9511
bert: 0.9417
human: 0.9244
ROUGE-2
ngram: 0.9049
bert: 0.8957
human: 0.8744
ROUGE-L
ngram: 0.9511
bert: 0.9417
human: 0.9244
