In [None]:
#!pip install spacy
#!python -m spacy download en_core_web_sm
#!pip install transformers

In [1]:
import pandas as pd
import numpy as np
import json
import spacy
import ast
import pickle
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch
import os
from collections import defaultdict
from scipy.stats import entropy
from scipy.spatial.distance import jensenshannon
from transformers.utils import logging
logging.set_verbosity_error()  # Suppress warnings from transformers library

  from .autonotebook import tqdm as notebook_tqdm


### Load Dataset

In [2]:
# Load the pre-tagged corpus from disk.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file —
# only load "tagged_sentences.pkl" when it comes from a trusted source.
# Presumably a list of sentences, each a list of (text, pos, idx) tuples like
# the ones tag_sentence returns — TODO confirm against the script that wrote it.
with open("tagged_sentences.pkl","rb") as f:
    tagged_sentences = pickle.load(f)

In [3]:
# spaCy English pipeline; tag_sentence calls it to assign POS tags.
nlp = spacy.load("en_core_web_sm")

In [4]:
# Load the GPT-2 tokenizer and register a dedicated padding token so that
# n-grams of different token lengths can be batched.
# (A tokenizer has no device; only the model is moved to the GPU, so the former
# `device="cuda"` argument was dropped — it had no effect.)
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
# Resize model embeddings to accommodate the new token
model = GPT2LMHeadModel.from_pretrained('gpt2').to("cuda")
model.resize_token_embeddings(len(tokenizer))

Embedding(50258, 768)

In [5]:
def tag_sentence(text):
    '''
    Run the spaCy pipeline on `text` and describe every token.

    Returns a list with one (token text, token.pos_ tag, token position)
    tuple per token, in document order.
    '''
    tagged_tokens = []
    position = 0
    for token in nlp(text):
        tagged_tokens.append((token.text, token.pos_, position))
        position += 1
    return tagged_tokens

In [6]:
def extract_ngrams_from_sentence(sent_idx, ngram_size, tagged_sentence):
    '''
    Slide a window of `ngram_size` tokens over one tagged sentence.

    Returns a list of (sent_idx, start offset, window) tuples, one per
    window position; the list is empty when the sentence is shorter than
    `ngram_size`.
    '''
    window_count = len(tagged_sentence) - ngram_size + 1
    if window_count <= 0:
        return []
    return [
        (sent_idx, start, tagged_sentence[start:start + ngram_size])
        for start in range(window_count)
    ]

In [7]:
def get_next_word_pos(sentence, next_word, ngram_position, ngram_size):
  '''
  Get the part of speech tag for the next word in a sentence

  The sentence prefix that ends with the n-gram (its first
  ngram_position + ngram_size tokens) is re-joined with single spaces,
  `next_word` is appended, and the result is re-tagged with tag_sentence.

  sentence:       sequence of (word, pos, idx) tuples
  next_word:      candidate continuation text (e.g. a decoded model token)
  ngram_position: index of the n-gram's first token within `sentence`
  ngram_size:     number of tokens in the n-gram

  Returns the POS tag of the last token of the re-tagged text, or 'SPACE'
  when `next_word` is empty or consists only of whitespace.
  '''
  # Decoded model tokens usually carry their own leading space; strip it so the
  # join below does not produce a double space inside the text being tagged.
  candidate = next_word.strip()
  if not candidate:
    # Nothing to tag: without this guard an empty continuation would make the
    # last tagged token the n-gram's own final word, and its POS would be
    # reported as if it were the continuation.
    return 'SPACE'

  prefix_tokens = [word for word, _pos, _idx in sentence[:ngram_position + ngram_size]]
  tagged = tag_sentence(" ".join(prefix_tokens + [candidate]))
  return tagged[-1][1]

In [8]:
def extract_ngram_ground_truth_pairings(ngram_size, tagged_sentences):
    '''
    Pair every n-gram in the corpus with the token that actually follows it.

    Returns a flat list of (ngram, next_token) tuples across all sentences.
    `next_token` is None for an n-gram that ends its sentence; sentences
    shorter than `ngram_size` contribute nothing.
    '''
    pairings = []
    for sentence in tagged_sentences:
        sentence_length = len(sentence)
        for start in range(sentence_length - ngram_size + 1):
            end = start + ngram_size
            follower = sentence[end] if end < sentence_length else None
            pairings.append((sentence[start:end], follower))
    return pairings

In [9]:
def compute_kl_divergence(real_matrix, generated_matrix, epsilon=1e-10):
    '''
    Compute KL divergence between two matrices

    For every row label (n-gram) present in both frames, the two rows are
    normalised to sum to 1 and KL(real || generated) is computed.

    real_matrix, generated_matrix: DataFrames indexed by n-gram with one
        column per next-word POS tag. The frames are matched by column LABEL,
        so they may list the columns in different orders or lack some of each
        other's columns (a missing column counts as probability 0).
    epsilon: lower bound applied to the generated probabilities so the
        divergence stays finite where the model assigns zero mass.

    Returns a dict {ngram: KL divergence}. N-grams missing from
    `generated_matrix` are omitted; a row whose total mass is 0 in either
    frame scores 0.0.
    '''
    # Compare like with like: frames reloaded from CSV (or built from different
    # corpora) do not necessarily share column order or even column sets.
    columns = real_matrix.columns.union(generated_matrix.columns)
    real_aligned = real_matrix.reindex(columns=columns, fill_value=0.0)
    gen_aligned = generated_matrix.reindex(columns=columns, fill_value=0.0)

    kl_scores = {}
    for ngram in real_aligned.index:
        if ngram not in gen_aligned.index:
            continue

        real_probs = real_aligned.loc[ngram].to_numpy(dtype=float)
        gen_probs = gen_aligned.loc[ngram].to_numpy(dtype=float)

        real_sum = real_probs.sum()
        gen_sum = gen_probs.sum()
        if real_sum == 0 or gen_sum == 0:
            kl_scores[ngram] = 0.0
            continue

        real_probs = real_probs / real_sum
        # Avoid log(0) / division by zero for tags the model never produced.
        gen_probs = np.clip(gen_probs / gen_sum, epsilon, 1)

        kl_scores[ngram] = entropy(real_probs, gen_probs)
    return kl_scores

In [10]:
def compute_js_divergence(real_matrix, generated_matrix, epsilon=1e-10):
    """
    Compute Jensen-Shannon divergence between real and generated POS probability matrices.

    For every row label (n-gram) present in both frames, the two rows are
    normalised to sum to 1 and their Jensen-Shannon divergence (natural log)
    is computed.

    real_matrix, generated_matrix: DataFrames indexed by n-gram with one
        column per next-word POS tag. The frames are matched by column LABEL,
        so they may list the columns in different orders or lack some of each
        other's columns (a missing column counts as probability 0).
    epsilon: lower bound applied to the generated probabilities.

    Returns a dict {ngram: JS divergence}. N-grams missing from
    `generated_matrix` are omitted; a row whose total mass is 0 in either
    frame scores 0.0.
    """
    # Compare like with like: frames reloaded from CSV (or built from different
    # corpora) do not necessarily share column order or even column sets.
    columns = real_matrix.columns.union(generated_matrix.columns)
    real_aligned = real_matrix.reindex(columns=columns, fill_value=0.0)
    gen_aligned = generated_matrix.reindex(columns=columns, fill_value=0.0)

    js_scores = {}
    for ngram in real_aligned.index:
        if ngram not in gen_aligned.index:
            continue

        real_probs = real_aligned.loc[ngram].to_numpy(dtype=float)
        gen_probs = gen_aligned.loc[ngram].to_numpy(dtype=float)

        real_sum = real_probs.sum()
        gen_sum = gen_probs.sum()
        if real_sum == 0 or gen_sum == 0:
            js_scores[ngram] = 0.0
            continue

        real_probs = real_probs / real_sum
        gen_probs = np.clip(gen_probs / gen_sum, epsilon, 1)

        # scipy's jensenshannon() returns the JS *distance*, i.e. the square
        # root of the divergence; square it to report the divergence itself.
        js_scores[ngram] = jensenshannon(real_probs, gen_probs) ** 2

    return js_scores

In [11]:
# Define training and testing sets
# Sequential 80/20 split with no shuffling: the first 80% of the sentences form
# the training set and the remainder the test set.
train_size = int(len(tagged_sentences)*0.8)
train_sentences = tagged_sentences[:train_size]
test_sentences = tagged_sentences[train_size:]

In [12]:
# Sanity check: number of tagged sentences loaded.
len(tagged_sentences)

10000

In [17]:
# Define the ngram size
# Number of tokens in each context n-gram. The commented-out values are
# presumably the settings of earlier runs (cf. the unprefixed and "n4_" CSV
# names further down).
#ngram_size = 3
#ngram_size = 4
ngram_size = 5

In [18]:
# Slide an ngram_size-token window over every training sentence and collect
# the resulting (sentence index, start offset, n-gram) triples in one flat list.
text_ngrams = []
for ix, tagged_sentence in enumerate(train_sentences):
  text_ngrams += extract_ngrams_from_sentence(ix, ngram_size, tagged_sentence)

In [19]:
# Turn each n-gram back into plain text (its token texts joined by single
# spaces); these strings are the prompts handed to the model.
text_segments = [
    " ".join(token[0] for token in entry[2])
    for entry in text_ngrams
]

In [20]:
# Sanity check: number of prompts (one per extracted n-gram).
len(text_segments)

161289

In [30]:
# Generate the next word for each ngram
# This will be used to create the probability matrix
#
# GPT-2 is a decoder-only model: the new token is predicted from the LAST
# position of every row. Pad on the left so that position holds the n-gram's
# final token; with right padding it would hold a [PAD] token for every row
# shorter than the longest one.
tokenizer.padding_side = "left"
text_encodings = tokenizer(text_segments, truncation=True, padding=True,return_tensors="pt")
input_ids = text_encodings['input_ids']
attention_mask = text_encodings['attention_mask']
batch_size = 64

generated_outputs = []

for i in range(0, len(input_ids), batch_size):
  batch_input_ids = input_ids[i:i+batch_size].to("cuda")
  batch_attention_mask = attention_mask[i:i+batch_size].to("cuda")

  with torch.no_grad():
      # Greedy decoding of exactly one token. `temperature` only applies when
      # sampling, and the per-step scores were never read, so neither is
      # requested any more.
      sequences = model.generate(
          input_ids=batch_input_ids,
          attention_mask=batch_attention_mask,
          max_new_tokens=1,
          do_sample=False,
          pad_token_id=tokenizer.pad_token_id
      )
  # Keep results on the CPU so GPU memory use stays bounded by one batch.
  generated_outputs.append(sequences.cpu())

generated_outputs = torch.cat(generated_outputs, dim=0)
# The last column is the single newly generated token of each row.
next_word_tokens = tokenizer.batch_decode(generated_outputs[:,-1], skip_special_tokens=True)



In [None]:
# Build transition matrices
def dd():
    '''Return a fresh defaultdict(int); used as the row factory of each matrix.'''
    return defaultdict(int)
# Nested counters addressed as matrix[pos_ngram][next_pos]; unseen pairs read as 0.
# transition_matrix collects the model's continuations, gt_transition_matrix
# the continuations found in the corpus.
# NOTE(review): dd is presumably a named function (not a lambda) so the nested
# defaultdicts stay picklable — confirm.
transition_matrix = defaultdict(dd)
gt_transition_matrix = defaultdict(dd)

In [None]:
# Populate the transition matrix with counts
# Start from empty matrices so that re-running this cell does not add fresh
# counts on top of the already-normalised values left by a previous run.
transition_matrix.clear()
gt_transition_matrix.clear()

# zip() would silently drop the tail if the two lists ever got out of sync.
assert len(text_ngrams) == len(next_word_tokens), "one generated token per n-gram expected"

# Model side: POS n-gram -> POS of the token the model generated after it.
for ngram, next_word in zip(text_ngrams, next_word_tokens):
    sent_idx = ngram[0]
    ngram_position = ngram[1]
    sentence = train_sentences[sent_idx]
    next_word_pos = get_next_word_pos(sentence, next_word, ngram_position, ngram_size)
    pos_ngram = "->".join([token[1] for token in ngram[2]])
    transition_matrix[pos_ngram][next_word_pos] += 1

# Ground truth: POS n-gram -> POS of the token that really follows it
# ('NONE' when the n-gram ends its sentence).
ngrams_with_next = extract_ngram_ground_truth_pairings(ngram_size, train_sentences)
for ngram, next_word in ngrams_with_next:
  pos_ngram = "->".join([token[1] for token in ngram])
  next_word_pos = next_word[1] if next_word else 'NONE'
  gt_transition_matrix[pos_ngram][next_word_pos] += 1

# Drop the references to the last generation batch, if they still exist
# (a bare `del` raises NameError when this cell is run a second time).
for leftover in ("batch_input_ids", "batch_attention_mask"):
    globals().pop(leftover, None)

# Turn each row of counts into a probability distribution.
for matrix in (transition_matrix, gt_transition_matrix):
  for pos_counts in matrix.values():
    total = sum(pos_counts.values())
    for pos in pos_counts:
      pos_counts[pos] = pos_counts[pos] / total

transition_df = pd.DataFrame(transition_matrix).transpose().fillna(0)
gt_transition_df = pd.DataFrame(gt_transition_matrix).transpose().fillna(0)

# Give both frames the same POS columns in the same order. Tags seen on only
# one side (e.g. 'NONE' in the ground truth, 'SPACE' in the model output) are
# added as all-zero columns; existing columns keep their values.
pos_columns = sorted(set(transition_df.columns) | set(gt_transition_df.columns) | {'SPACE', 'NONE'})
transition_df = transition_df.reindex(columns=pos_columns, fill_value=0.0)
gt_transition_df = gt_transition_df.reindex(columns=pos_columns, fill_value=0.0)


In [33]:
# Compare the model's next-POS distributions with the training-set ground
# truth. The same sorted column list is selected from both frames so the two
# rows handed to each comparison line up tag by tag.
# `columns` is reused by a later cell; kl_scores / js_scores are reassigned
# further down before anything is saved.
columns = sorted(gt_transition_df.columns)
kl_scores = compute_kl_divergence(gt_transition_df[columns], transition_df[columns])
js_scores = compute_js_divergence(gt_transition_df[columns], transition_df[columns])

In [34]:
# Next-POS distributions observed in the held-out test sentences
# ('NONE' marks an n-gram that ends its sentence).
test_transition_matrix = defaultdict(dd)
test_ngrams_with_next = extract_ngram_ground_truth_pairings(ngram_size, test_sentences)
for ngram, next_word in test_ngrams_with_next:
  pos_ngram = "->".join([token[1] for token in ngram])
  next_word_pos = next_word[1] if next_word else 'NONE'
  test_transition_matrix[pos_ngram][next_word_pos] += 1

# Turn each row of counts into a probability distribution.
for pos_counts in test_transition_matrix.values():
  total = sum(pos_counts.values())
  for pos in pos_counts:
    pos_counts[pos] = pos_counts[pos] / total

test_transition_df = pd.DataFrame(test_transition_matrix).transpose().fillna(0)

# Make sure every POS column of the model's frame (plus 'SPACE') exists here
# too, so the frames can be compared column by column. Missing tags become
# all-zero columns; columns that already exist keep their values.
pos_columns = sorted(set(test_transition_df.columns) | set(transition_df.columns) | {'SPACE'})
test_transition_df = test_transition_df.reindex(columns=pos_columns, fill_value=0.0)

In [35]:
# Compare the model's next-POS distributions with those of the held-out test
# sentences, using the sorted column list `columns` built in an earlier cell.
kl_scores = compute_kl_divergence(test_transition_df[columns], transition_df[columns])
js_scores = compute_js_divergence(test_transition_df[columns], transition_df[columns])

In [36]:
# Persist the three probability matrices.
# File names follow the n-gram size so runs with different sizes never
# overwrite each other: size 3 uses the bare names ("transition_df.csv"),
# any other size k is prefixed with "n<k>_" (e.g. "n5_transition_df.csv").
file_prefix = "" if ngram_size == 3 else f"n{ngram_size}_"
transition_df.to_csv(f"{file_prefix}transition_df.csv")
gt_transition_df.to_csv(f"{file_prefix}gt_transition_df.csv")
test_transition_df.to_csv(f"{file_prefix}test_transition_df.csv")

In [37]:
# Reload the saved probability matrices for the current n-gram size.
# Naming scheme: size 3 uses the bare names ("transition_df.csv"), any other
# size k is prefixed with "n<k>_" (e.g. "n5_transition_df.csv").
# The n-gram labels were written as the unnamed first CSV column, which
# read_csv exposes as "Unnamed: 0"; it becomes the index again here.
file_prefix = "" if ngram_size == 3 else f"n{ngram_size}_"
transition_df = pd.read_csv(f"{file_prefix}transition_df.csv").set_index("Unnamed: 0")
gt_transition_df = pd.read_csv(f"{file_prefix}gt_transition_df.csv").set_index("Unnamed: 0")
test_transition_df = pd.read_csv(f"{file_prefix}test_transition_df.csv").set_index("Unnamed: 0")

In [38]:
# Model vs. held-out test distributions, computed on the frames reloaded from
# CSV; these are the scores written to the *_kl_scores / *_js_scores files below.
kl_scores = compute_kl_divergence(test_transition_df, transition_df)
js_scores = compute_js_divergence(test_transition_df, transition_df)

In [39]:
# Wrap the {ngram: score} dicts in Series (indexed by n-gram) for saving.
kl_scores_srs = pd.Series(kl_scores)
js_scores_srs = pd.Series(js_scores)

In [40]:
# Save the model-vs-test scores.
# File names follow the n-gram size: size 3 uses the bare names
# ("kl_scores.csv"), any other size k is prefixed with "n<k>_"
# (e.g. "n5_kl_scores.csv").
file_prefix = "" if ngram_size == 3 else f"n{ngram_size}_"
kl_scores_srs.to_csv(f"{file_prefix}kl_scores.csv")
js_scores_srs.to_csv(f"{file_prefix}js_scores.csv")

In [41]:
# Model vs. training-set ground truth, computed on the frames reloaded from CSV.
kl_scores = compute_kl_divergence(gt_transition_df, transition_df)
js_scores = compute_js_divergence(gt_transition_df, transition_df)

kl_scores_srs = pd.Series(kl_scores)
js_scores_srs = pd.Series(js_scores)

# File names follow the n-gram size: size 3 uses the bare names
# ("gt_kl_scores.csv"), any other size k is prefixed with "n<k>_"
# (e.g. "n5_gt_kl_scores.csv").
file_prefix = "" if ngram_size == 3 else f"n{ngram_size}_"
kl_scores_srs.to_csv(f"{file_prefix}gt_kl_scores.csv")
js_scores_srs.to_csv(f"{file_prefix}gt_js_scores.csv")

## POS TAG Meanings
* ADJ: Adjective
* ADP: Adposition
* ADV: Adverb
* AUX: Auxiliary
* CONJ: Conjunction
* CCONJ: Coordinating conjunction
* DET: Determiner
* INTJ: Interjection
* NOUN: Noun
* NUM: Numeral
* PART: Particle
* PRON: Pronoun
* PROPN: Proper noun
* PUNCT: Punctuation
* SCONJ: Subordinating conjunction
* SPACE: Space
* SYM: Symbol
* VERB: Verb
* X: Other
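* EOL: End of line
* NONE: No next word — marker used in this notebook's ground-truth matrices when an n-gram ends its sentence (not a spaCy tag)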