In [36]:
from transformers import AutoModelForCausalLM, AutoTokenizer, AdamWeightDecay, pipeline, create_optimizer, Trainer, TrainingArguments
from transformers import DefaultDataCollator
from datasets import load_metric
from sklearn.model_selection import train_test_split
import tensorflow as tf
import torch
from datasets import Dataset
import plotly.express as px
import plotly.io as pio
import pandas as pd
import math
import os
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
os.environ["TOKENIZERS_PARALLELISM"] = "false"
pio.renderers.default = 'notebook_connected'
from wordcloud import WordCloud
from tqdm import tqdm
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
import matplotlib.pyplot as plt
%matplotlib inline
# Flag to let us know if we're currently running in Google Colab or locally
import sys
IN_COLAB = 'google.colab' in sys.modules

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/faculty/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [37]:
tokenizer_auto = AutoTokenizer.from_pretrained("gpt2")
tokenizer_auto.pad_token = tokenizer_auto.eos_token
auto_model = AutoModelForCausalLM.from_pretrained("gpt2", pad_token_id=tokenizer_auto.eos_token_id)

In [44]:
def tokenization(data):
    tokens = tokenizer_auto(data["sentences"], padding="max_length", truncation=True, max_length=70)
    return tokens

In [39]:
def create_labels(text):
    text["labels"] = text["input_ids"].copy()
    return text

In [40]:
data_arcane= pd.read_csv('/project/nlp_text_generation_sentences_data_arcane_champions.csv')

In [41]:
data_arcane_train_val, test_set_arcane = train_test_split(data_arcane, test_size=0.2, random_state=200)

data_arcane_train_val_set = Dataset.from_pandas(data_arcane_train_val)
data_arcane_train_val_set = data_arcane_train_val_set.train_test_split(shuffle = True, seed = 200, test_size=0.25)
train_arcane = data_arcane_train_val_set["train"]
val_arcane = data_arcane_train_val_set["test"]

In [42]:
sentences = [len(x.split()) for x in data_arcane_train_val["sentences"]]
px.histogram(sentences, nbins=400, marginal="rug", labels={"value":"Sentences Length (words)"})

In [45]:
# Apply the tokenizer in batch mode and drop all the columns except the tokenization result
train_token_arcane = train_arcane.map(tokenization, batched = True, remove_columns=["Unnamed: 0"], num_proc=10)
val_token_arcane = val_arcane.map(tokenization, batched = True, remove_columns=["Unnamed: 0"], num_proc=10)

Map (num_proc=10):   0%|          | 0/282 [00:00<?, ? examples/s]

Map (num_proc=10):   0%|          | 0/95 [00:00<?, ? examples/s]

In [46]:
# Add the labels column using map()
lm_train_arcane = train_token_arcane.map(create_labels, batched=True, num_proc=10)
lm_val_arcane = val_token_arcane.map(create_labels, batched=True, num_proc=10)

Map (num_proc=10):   0%|          | 0/282 [00:00<?, ? examples/s]

Map (num_proc=10):   0%|          | 0/95 [00:00<?, ? examples/s]

In [None]:
# learning_rates = [0.001, 0.0001, 0.00005] #
# weight_decays = [0.01, 0.005, 0.001] #, 
# epochs= [1,2,3]
# best_result_arc= None
# best_loss_arc=  float('inf')

# for epoch in epochs:
#     for lr in learning_rates:
#         for wd in weight_decays:
#             auto_model = AutoModelForCausalLM.from_pretrained("gpt2", pad_token_id=tokenizer.eos_token_id)
#             training_args_arc = TrainingArguments(output_dir=f'./results_arc', 
#                                               num_train_epochs= epoch, 
#                                               logging_strategy="epoch",
#                                               #logging_steps=20, 
#                                               save_strategy="epoch",
#                                               #save_steps=20,
#                                               evaluation_strategy='epoch',
#                                               #eval_steps=20,
#                                               learning_rate= lr ,
#                                               per_device_train_batch_size=1, 
#                                               per_device_eval_batch_size=1,
#                                               #warmup_steps=5, 
#                                               weight_decay=wd, 
#                                               logging_dir='./logs',
#                                               load_best_model_at_end=True)


#             trainer_arc = Trainer(model=auto_model,  
#                               args=training_args_arc, 
#                               train_dataset=lm_train_arcane, 
#                               eval_dataset=lm_val_arcane)
#             #                    compute_metrics=compute_metrics)


#             trainer_arc.train()   

#             result_arc = trainer_arc.evaluate(eval_dataset=lm_val_arcane)

#             # Check if the current result is the best so far
#             if result_arc['eval_loss'] < best_loss_arc:
#                 best_loss_arc = result_arc['eval_loss']
#                 best_result_arc = {'epoch': epoch, 'learning_rate': lr, 'weight_decay': wd, 'eval_loss': best_loss_arc}                
#                 trainer_arc.save_model(f'./best_model_arc')
#             #reset the model for the next loop of training     
# print('Best result:', best_result_arc)

In [47]:
best_result={'epoch': 2, 'learning_rate': 0.00005, 'weight_decay': 0.001, 'eval_loss': 1.278050}

In [None]:
# perplexity =math.exp(best_result_arc['eval_loss'])
# print(perplexity)

In [10]:
auto_model = AutoModelForCausalLM.from_pretrained("gpt2", pad_token_id=tokenizer.eos_token_id)
training_args_arc = TrainingArguments(output_dir=f'./results_arc', 
                                  num_train_epochs= 2, 
                                  logging_strategy="epoch",
                                  #logging_steps=20, 
                                  save_strategy="epoch",
                                  #save_steps=20,
                                  evaluation_strategy='epoch',
                                  #eval_steps=20,
                                  learning_rate= 0.00005 ,
                                  per_device_train_batch_size=1, 
                                  per_device_eval_batch_size=1,
                                  #warmup_steps=5, 
                                  weight_decay=0.001, 
                                  logging_dir='./logs',
                                  load_best_model_at_end=True)


trainer_arc = Trainer(model=auto_model,  
                  args=training_args_arc, 
                  train_dataset=lm_train_arcane, 
                  eval_dataset=lm_val_arcane)
#                    compute_metrics=compute_metrics)


trainer_arc.train()
trainer_arc.save_model(f'./best_model_arc')


distutils Version classes are deprecated. Use packaging.version instead.





Epoch,Training Loss,Validation Loss
1,1.538,1.27805
2,1.048,1.294168


In [11]:
result_arc = trainer_arc.evaluate(eval_dataset=lm_val_arcane)
perplexity =math.exp(result_arc['eval_loss'])
print(perplexity)

3.5896337731626167


In [48]:
best_model_arc = AutoModelForCausalLM.from_pretrained(f'./best_model_arc')

In [80]:
text_generator = pipeline(
    "text-generation",
    model=best_model_arc,
    tokenizer=tokenizer_auto,
    framework="pt",
    temperature=1.0,
    top_k=50,
    top_p=0.9,
    do_sample=True,
    min_length= 100,
    min_new_tokens=0, 
    max_new_tokens=100)

In [81]:
test_sentence = "A strong, lawful kingdom with a prestigious military history"
text_generator(test_sentence)[0]["generated_text"].replace("\n", " ")

'A strong, lawful kingdom with a prestigious military history and strong military reputation was quickly growing and increasingly unlikely to survive until something new was discovered to improve the system of their war. For some time the army had been on a course of great importance as it developed from a more expensive military automatons to a more effective defensive technology such as a powerful and versatile warhorn and a variety of powerful new technology to make them even more versatile and useful in battle! This trend culminated in a long string of scandals that led some of the most trusted and respected'

Error Analysis

BLEU for test set (examining the quality of the generated text)

In [87]:

arcane_text_generator_set = pipeline(
    "text-generation",
    model=best_model_arc,
    tokenizer=tokenizer_auto,
    framework="pt",
    temperature=1.0,
    top_k=50,
    top_p=0.9,
    do_sample=True,
    min_length= 40,
    max_length=200)

def generate_text_set(test_sentence):
    generated_text=arcane_text_generator_set(test_sentence)[0]["generated_text"].replace("\n", " ")
    generated_text = generated_text[len(test_sentence):].lstrip()
    return generated_text

In [88]:
test_sentence = "A strong, lawful kingdom with a prestigious military history"
generate_text_set(test_sentence)

'and a long history of alliances with allies all across the empire it became an even more powerful weapon than ever before when a rogue and powerful criminal mastermind sought the secrets of the city of Zaun'

In [89]:
test_set_arcane['label']= test_set_arcane.shift(-1)["sentences"]

In [90]:
test_set_arcane= test_set_arcane.dropna()

In [91]:
test_set_arcane['generated']= test_set_arcane['sentences'].apply(lambda x : generate_text_set(x))

In [92]:
print(test_set_arcane['generated'])

419    dripping from the temples and temples in Zaun ...
428    crystal and if he has plans to get one of his ...
115    as well as Jayce had seen his skills grow over...
287    % chance of it being hit with a baseball ball ...
237    but would not be taken for granted as a necess...
                             ...                        
47     but instead to save some of her young comrades...
245    and technology. During his time working with t...
323    though it might be better to use a single crys...
226    in order to enhance her strength and strength ...
264                                           from decay
Name: generated, Length: 93, dtype: object


In [117]:
from nltk.translate.bleu_score import SmoothingFunction, corpus_bleu, sentence_bleu
def bleu(ref, gen):
    ''' 
    calculate pair wise bleu score. uses nltk implementation
    Args:
        references : a list of reference sentences 
        candidates : a list of candidate(generated) sentences
    Returns:
        bleu score(float)
    '''
    ref_bleu = []
    gen_bleu = []
    for l in gen:
        gen_bleu.append(l.split())
    for i,l in enumerate(ref):
        ref_bleu.append([l.split()])
    cc = SmoothingFunction()
    score_bleu = corpus_bleu(ref_bleu, gen_bleu, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=cc.method4)
    return score_bleu

In [118]:
BLEU_arcane= bleu(test_set＿arcane['label'].tolist(),test_set_arcane['generated'].tolist())

In [119]:
print(BLEU_arcane)

0.0028013918585460765


BLEU score on the new paragraph (holistic way)

In [10]:
arcane_para= """ 
Two sisters. Two cities. One discovery that will change the world forever. In the cities of Piltover and Zaun, unrest stirs as inventors and thieves, politicians and crime lords chafe against the constraints of a society torn asunder.
"""

In [11]:
target=""" 
As dissent reaches a fever pitch, two sisters steal an artifact of untold power. Discovery and danger intertwine as heroes are born and bonds are broken. Will this power change the world, or lead it to ruin? This is the world of Arcane. Nestled at the heart of the continent, Piltover stands on the precipice of progress. The invention of hextech shines with subtle promise of a glorious future, and two brilliant scientists lead the charge. Such potential is not without cost, and failure could prove devastating as politicians vie for a power they barely understand. As unrest stirs in the Undercity and upheaval looms, the people of Piltover must protect their future… no matter the cost. 
In the shadows of progress, far below the ivory towers of Piltover, the Undercity waits. The air there is thick with shouts and smoke, and each alleyway promises danger, desire...or both. But at its heart, this sister city thrums with hope, ingenuity, and an unbreakable spirit that neither greedy chembarons nor aggressive enforcers could ever take away. As the age of hextech looms, her citizens no longer see themselves as Topsiders. Instead, they look up with desperate determination and dream of something more.
"""

In [49]:
arcane_text_generator_para = pipeline(
    "text-generation",
    model=best_model_arc,
    tokenizer=tokenizer_auto,
    framework="pt",
    temperature=1.0,
    top_k=120,
    top_p=0.9,
    do_sample=True,
    min_length= len(target.split(" "))+len(arcane_para.split(" ")),
    min_new_tokens=0, 
    max_new_tokens=400)

In [50]:
def generate_text_para(test_sentence):
    generated_text=arcane_text_generator_para(test_sentence)[0]["generated_text"].replace("\n", " ")
    generated_text = generated_text[len(test_sentence):].lstrip()
    return generated_text

In [51]:
generated_arcane=generate_text_para(arcane_para)


You have modified the pretrained model configuration to control generation. This is a deprecated strategy to control generation and will be removed soon, in a future version. Please use a generation configuration file (see https://huggingface.co/docs/transformers/main_classes/text_generation)



In [19]:
generated_arcane="""
Earning nothing is always a struggle and a new path that they know but will solve the problems from a better point of view a better one of Zaun itself. But in Piltover today the city itself is a battle for undertake a new path the first step in transforming mankind into a more agile and more versatile beast To the north the new academy Zaun has developed to help them navigate the city streets as they navigate the increasingly empty streets of Piltover and learn from them what it all means to be an engineer or a true lawyer and what it means to solve dilemmas the first time their efforts to learn from Piltovers parents parents could be futile at best. The cities of Piltover and Zaun have been in upheaval for a long time and the warring factions in Piltovers are yet to resolve their differences peacefully but soon something will change in Zaun and a new order of Zaunian Zaun will soon be at stake 
"""


In [20]:
print(generated_arcane)


Earning nothing is always a struggle and a new path that they know but will solve the problems from a better point of view a better one of Zaun itself. But in Piltover today the city itself is a battle for undertake a new path the first step in transforming mankind into a more agile and more versatile beast To the north the new academy Zaun has developed to help them navigate the city streets as they navigate the increasingly empty streets of Piltover and learn from them what it all means to be an engineer or a true lawyer and what it means to solve dilemmas the first time their efforts to learn from Piltovers parents parents could be futile at best. The cities of Piltover and Zaun have been in upheaval for a long time and the warring factions in Piltovers are yet to resolve their differences peacefully but soon something will change in Zaun and a new order of Zaunian Zaun will soon be at stake 



In [110]:
# Tokenize the paragraphs into lists of words
ref_tokens = nltk.word_tokenize(target.lower())
gen_tokens = nltk.word_tokenize(generated_arcane.lower())

# Calculate the BLEU score with 4-gram precision
bleu_score = sentence_bleu([ref_tokens], gen_tokens, weights=(0.25, 0.25, 0.25, 0.25))

print("BLEU score:", bleu_score)

BLEU score: 3.731163506456045e-155


BERT SCORE

In [52]:
import numpy as np

In [53]:
def get_similarity(embedding_1:np.array, embedding_2:np.array) -> float:
  dot_product = np.dot(embedding_1, embedding_2)
  norm1 = np.linalg.norm(embedding_1)
  norm2 = np.linalg.norm(embedding_2)
  cosine_similarity = dot_product / (norm1 * norm2)
  return cosine_similarity  

In [54]:
from transformers import AutoTokenizer, AutoModel
import torch

# Initialize the BERT tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = AutoModel.from_pretrained('bert-base-uncased')

# Encode the sentences using the BERT tokenizer
encoded_input_1 = tokenizer(target, padding=True, truncation=True, return_tensors='pt')

# Pass the encoded input through the BERT model to obtain the document embeddings
outputs = model(**encoded_input_1)

document_embeddings_1 = outputs.last_hidden_state[:, 0, :]
document_embeddings_1 = document_embeddings_1.detach().numpy()

# Encode the sentences using the BERT tokenizer
encoded_input_2 = tokenizer(generated_arcane, padding=True, truncation=True, return_tensors='pt')

# Pass the encoded input through the BERT model to obtain the document embeddings
outputs = model(**encoded_input_2)

document_embeddings_2 = outputs.last_hidden_state[:, 0, :]
document_embeddings_2 = document_embeddings_2.detach().numpy()
similarity_bert = get_similarity(document_embeddings_1[0], document_embeddings_2[0])
print(similarity_bert)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


0.83365035


Topic modeling comparing similarity

In [22]:
import gensim
from gensim import corpora, models, similarities
import numpy

In [35]:

# define the two text passages as a list of strings


# preprocess the text by tokenizing and removing stop words
texts = [gensim.utils.simple_preprocess(story) for story in [target, generated_arcane]]

# create a dictionary of unique terms in the text
dictionary = corpora.Dictionary(texts)

# convert the text into a bag-of-words representation
corpus = [dictionary.doc2bow(text) for text in texts]

# perform LDA topic modeling on the corpus
numpy.random.seed(58) 
lda_model = models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=2)

# extract the topic distributions for each document
doc1_topics = lda_model.get_document_topics(corpus[0])
doc2_topics = lda_model.get_document_topics(corpus[1])

# compare the topic distributions using cosine similarity
similarity = similarities.MatrixSimilarity([doc1_topics])
cos_sim = similarity[doc2_topics][0]

# print the similarity score
print("Cosine similarity:", cos_sim)

Cosine similarity: 0.83918697
