# In [None]:
import spacy
from transformers import pipeline
from nltk.tokenize import word_tokenize
from sentence_transformers import SentenceTransformer, util
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import math

# spaCy English pipeline; evaluate_sentence uses it for tokenization and POS tags.
nlp = spacy.load("en_core_web_sm")
# GPT-2 language model and its tokenizer; evaluate_sentence uses them to
# compute the perplexity of a candidate sentence.
per_model_name = 'gpt2'
per_model = GPT2LMHeadModel.from_pretrained(per_model_name)
per_tokenizer = GPT2Tokenizer.from_pretrained(per_model_name)

# Sentence-embedding model; evaluate_sentence uses it for the cosine
# similarity between the original and the candidate sentence.
cos_model = SentenceTransformer('all-MiniLM-L6-v2')

def evaluate_sentence(sen_new,sen_ori,word):
    """Score a candidate sentence obtained by inserting ``word`` into ``sen_ori``.

    The score is the plain sum of three terms:

    * POS stability -- the fraction of position-aligned tokens whose spaCy
      ``pos_`` tag is identical in both sentences.  Tokens of ``sen_new``
      whose text equals ``word`` are dropped before aligning, and only the
      first ``min(len(original), len(candidate))`` positions are compared.
    * Fluency -- ``1 / (1 + 0.01 * log(perplexity + 1))`` where perplexity
      is ``exp`` of the GPT-2 language-modelling loss on ``sen_new``.
    * Similarity -- cosine similarity between the sentence embeddings of
      ``sen_ori`` and ``sen_new``.

    Args:
        sen_new: Candidate sentence that contains the inserted word.
        sen_ori: Original sentence the candidate was derived from.
        word: The inserted word; every token with exactly this text is
            ignored in the POS comparison.

    Returns:
        The total score as a float (higher is better).
    """
    ori_pos = [token.pos_ for token in nlp(sen_ori)]
    new_pos = [token.pos_ for token in nlp(sen_new) if token.text != word]

    # zip() stops at the shorter list, which is exactly the compared span.
    compared = min(len(ori_pos), len(new_pos))
    if compared:
        unchanged = sum(a == b for a, b in zip(ori_pos, new_pos))
        unchanged_pos_ratio = unchanged / compared
    else:
        # Nothing to align (e.g. an empty sentence): avoid dividing by zero.
        unchanged_pos_ratio = 0.0

    inputs = per_tokenizer(sen_new, return_tensors="pt")
    with torch.no_grad():
        outputs = per_model(**inputs, labels=inputs["input_ids"])
    # Scaling the loss by the token count and dividing it back out cancels,
    # so the perplexity is simply exp(loss).
    perplexity = torch.exp(outputs.loss).item()

    alpha = 0.01
    if math.isnan(perplexity):
        # A NaN here would make the total NaN and break max() over scores;
        # treat it as the worst possible fluency instead.
        perplexity_score = 0.0
    else:
        # log(perplexity + 1) dampens large perplexities; alpha keeps this
        # term close to 1 for fluent sentences.
        perplexity_score = 1 / (1 + alpha * math.log(perplexity + 1))

    original_embedding = cos_model.encode(sen_ori, convert_to_tensor=True)
    new_embedding = cos_model.encode(sen_new, convert_to_tensor=True)
    cosine_similarity = util.pytorch_cos_sim(original_embedding, new_embedding).item()

    return unchanged_pos_ratio + perplexity_score + cosine_similarity

# Masked-language-model pipeline (bert-base-uncased); insert_appropriate_word
# takes its top prediction for the mask as the word to insert.
fill_mask = pipeline("fill-mask", model="bert-base-uncased")

from nltk.corpus import words
import nltk

# Download the NLTK "words" corpus at import time and keep it as a set so
# is_english_word can do fast membership tests.
nltk.download('words')
english_words = set(words.words())

def is_english_word(word):
    """Return True when the lower-cased ``word`` is in ``english_words``."""
    normalized = word.lower()
    return normalized in english_words

def insert_appropriate_word(sentence, mask_token="[MASK]"):
    """Insert one predicted word into ``sentence`` at the best-scoring position.

    For every spaCy token of ``sentence`` a mask is placed directly after
    the token (and after its trailing whitespace, if any), the top
    prediction of the ``fill_mask`` pipeline is substituted for the mask,
    and the resulting candidate is scored with ``evaluate_sentence``.
    A prediction is discarded when it already occurs in ``sentence``
    (plain substring test) or when ``is_english_word`` rejects it.

    Args:
        sentence: The sentence to extend.
        mask_token: Placeholder written into the sentence for the
            ``fill_mask`` pipeline; it has to be the mask token that
            pipeline's model recognizes.

    Returns:
        ``(best_sentence, inserted_word)`` for the highest-scoring candidate.
        NOTE: when no candidate survives the filters, the unchanged
        ``sentence`` is returned as a bare string, not a tuple. This
        asymmetry is kept for backward compatibility, so callers must
        handle both shapes.
    """
    # Reuse the module-level spaCy pipeline instead of reloading it per call.
    doc = nlp(sentence)
    sen_list = []
    word_list = []

    for token in doc:
        # text_with_ws is the token plus its trailing whitespace, so the
        # insertion point never lands inside the following token.
        mask_position = token.idx + len(token.text_with_ws)
        head = sentence[:mask_position]
        tail = sentence[mask_position:]
        if not token.whitespace_:
            # No whitespace follows the token: separate the mask ourselves.
            head += " "
        if tail:
            tail = " " + tail

        predictions = fill_mask(head + mask_token + tail)
        insert_word = predictions[0]['token_str']
        if insert_word in sentence or not is_english_word(insert_word):
            continue

        # Build the candidate from the pieces rather than str.replace so a
        # literal mask token already present in the sentence is untouched.
        sen_list.append(head + insert_word + tail)
        word_list.append(insert_word)

    if not sen_list:
        print("Can not insert a word !!!")
        return sentence

    score_list = [
        evaluate_sentence(candidate, sentence, inserted)
        for candidate, inserted in zip(sen_list, word_list)
    ]
    best_score = max(score_list)
    max_index = score_list.index(best_score)
    print("max_score: ", best_score)
    return sen_list[max_index], word_list[max_index]