In [None]:
!pip install torch==1.4.0
!pip install transformers==2.9.0
!pip install pytorch_lightning==0.7.5

## Run inference with any question as input

In [1]:
# https://github.com/huggingface/transformers/issues/4411
import torch
from transformers import T5ForConditionalGeneration,T5Tokenizer
import tensorflow_hub as hub
from rouge import Rouge 
from typing import List, Dict
from tqdm.notebook import tqdm
import numpy as np
import sacrebleu
import re
import sys
from loguru import logger

In [2]:
logger.info("Processing the utterances now...")

def load_slots_dict()-> Dict[str,str]:
    """
    RETURNS
    -------
        (dict of string): a list of all slots that we've encountered before
    """
    try:
        slots_dict ={}
        with open('known_slots.txt', 'r', encoding='utf8') as f:
            for row in f:
                SPLIT_TOKEN = ' <equals> '
                slot, placeholder  = row.split(SPLIT_TOKEN)
                slots_dict[slot] = placeholder.rstrip("\n")
        return slots_dict
    except:
        logger.warning('known_slots.txt not found. Please specify the instance of the slots!')
        return None
    
raw_sent= []
try:
    with open('sample_utterances.txt', 'r', encoding='utf8') as f:
        for sent in f:
            raw_sent.append(sent)
except:
    logger.warning("Please provide sample_utterances.txt.... Exitting now")
    sys.exit(1)
        
slots = set()
for sent in raw_sent:
    get_slots = re.findall(r'{(.*?)}', sent)
    slots.update(get_slots)

known_slot_dict = load_slots_dict()

#If can't find load_slots_dict
if not known_slot_dict:
    file1 = open("known_slots.txt", "a")
    for slot in slots:
        known_slot_dict={}
        placeholder = input("Teach me an instance of " + slot + ":")
        known_slot_dict[slot] = str(placeholder)
        file1.write(str(slot) + ' <equals> ' + str(placeholder) + '\n')
    file1.close()

#Unpack slots that we already know into a set
slots_known = {*known_slot_dict.keys()}

#If there exists some unidentifiable slots
if not slots.issubset(slots_known):
    slots_seen = slots.intersection(slots_known)
    unknown_slots = slots.difference(slots_seen)
    file1 = open("known_slots.txt", "a")
    for slot in unknown_slots:
        placeholder = input("Teach me an instance of " + slot + ":")
        known_slot_dict[slot] = str(placeholder)
        file1.write(str(slot) + ' <equals> ' + str(placeholder) + '\n')
    file1.close()
        
        
def multiple_replace(known_slots_dict: Dict[str,str], text: str):
    """
    Function takes a dictionary of known_slots (slots: placeholder) and replace everything in the text 
    that has the {slots} pattern with its placeholder.
    Reference
    ---------
    https://stackoverflow.com/questions/15175142/how-can-i-do-multiple-substitutions-using-regex-in-python
    
    ARGS
    ----
        known_slots_dict: dictionary of known_slots (slots: placeholder)
        text: raw sentence (with slots) to be replaced
    """ 
    # Create a regular expression  from the dictionary keys
    regex = re.compile("{(%s)}" % "|".join(map(re.escape, known_slots_dict.keys())))
    
    # For each match, look-up corresponding value in dictionary
    return regex.sub(lambda mo: known_slots_dict[mo.string[mo.start()+1:mo.end()-1]], text) 


# Replace slots in strings
CLEAN_SEN = []
for sent in raw_sent:
    CLEAN_SEN.append(multiple_replace(known_slot_dict,sent))
    
logger.info("Finished cleaning the utterances...")

2020-07-07 14:22:43.387 | INFO     | __main__:<module>:1 - Processing the utterances now...
2020-07-07 14:22:43.391 | INFO     | __main__:<module>:87 - Finished cleaning the utterances...


In [3]:
logger.info("Setupping up models to generate paraphrases...")
def set_seed(seed):
  torch.manual_seed(seed)
  if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

set_seed(42)

model = T5ForConditionalGeneration.from_pretrained('t5_paraphrase')
tokenizer = T5Tokenizer.from_pretrained('t5-base')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logger.info("Generating paraphrases on {device}.Recommend GPU for speed, of course!", device=device)
model = model.to(device)

logger.info("Preparing USE Embedding...Might have to download...")
#USE Embedder
url = "https://tfhub.dev/google/universal-sentence-encoder-large/4"
embed = hub.load(url)
logger.info("Done preparing USE Embedding")


#Beam-search config
MAX_SEQ_LEN = 256
# Number of Paraphrases you want to generate
NB_GENERATED = 200
# Top N to keep
TOP_TO_KEEP = 50
#TOP_P (values bwn 0-1): threshold to keep token for nucleus sampling
TOP_P = 0.90
TOP_K = 75


logger.info("Generating Paraphrase now")
#Put in proper format
INPUT_SEN = []
for sent in CLEAN_SEN:
    INPUT_SEN.append("paraphrase: " + sent + " </s>")

def get_n_best_para(input_sentence: str, paraphrases: List[str], top_n: int = 1) -> List[str]:
    """
    RETURNS
    -------
        (list of strings): top n paraphrases that are most semantically similar (using USE embeddings) and most
            different structurally (using L-Rouge) to the input_sentence
    """ 
    rouge = Rouge() 
    rouge_scrs = [1- rouge.get_scores(input_sentence, para)[0]['rouge-l']['f'] for para in paraphrases]
    
    #NOTE: Measure similarity using inner-product on USE embedding.
    #enc_input_sentence, *enc_paraphrases = self.embed([input_sentence] + paraphrases)
    enc_input_sentence = embed([input_sentence])
    enc_paraphrases = embed(paraphrases)
    MEANING_DIV_RATIO = 0.85
    scored_paraphrases = [
        (paraphrase, np.inner(enc_input_sentence['outputs'].numpy(), enc_paraphrase)[0] * MEANING_DIV_RATIO + score * (1-MEANING_DIV_RATIO))
        for (paraphrase, enc_paraphrase, score) in zip(paraphrases, enc_paraphrases['outputs'].numpy(),rouge_scrs)
    ]
    #Sort on meaning, then diversity
    top_n_paraphrases = sorted(scored_paraphrases, key=lambda x: x[1], reverse=True)[:top_n]
    return [x[0] for x in top_n_paraphrases]

para_list =[]
for idx, text in tqdm(enumerate(INPUT_SEN)):
    logger.info("Generating for utterance {}", idx+1)
    encoding = tokenizer.encode_plus(text,pad_to_max_length=True, return_tensors="pt")
    input_ids, attention_masks = encoding["input_ids"].to(device), encoding["attention_mask"].to(device)

    # https://huggingface.co/transformers/model_doc/t5.html?highlight=generate#overview
    # https://huggingface.co/transformers/main_classes/model.html?highlight=generate#transformers.PreTrainedModel.generate
    beam_outputs = model.generate(
        input_ids=input_ids, attention_mask=attention_masks,
        do_sample=True,
        max_length=MAX_SEQ_LEN,
        early_stopping=True,
        top_k=TOP_K,
        top_p=TOP_P,
        num_return_sequences=NB_GENERATED
    )
    paraphrases =[]
    for beam_output in beam_outputs:
        sent = tokenizer.decode(beam_output, skip_special_tokens=True,clean_up_tokenization_spaces=True)
        if sent.lower() != text.lower() and sent not in paraphrases:
            paraphrases.append(sent)
            
    top_para = get_n_best_para(CLEAN_SEN[idx], paraphrases, TOP_TO_KEEP)
    print("INPUT SENTENCE :", CLEAN_SEN[idx])
    print("PARAPRHASES :")
    for i, paraphrase in enumerate(top_para):
        print("n°%d : %s" % (i, paraphrase))
    para_list.append(paraphrases)


2020-07-07 14:22:43.409 | INFO     | __main__:<module>:1 - Setupping up models to generate paraphrases...
2020-07-07 14:22:49.195 | INFO     | __main__:<module>:13 - Generating paraphrases on cpu.Recommend GPU for speed, of course!
2020-07-07 14:22:49.200 | INFO     | __main__:<module>:16 - Preparing USE Embedding...Might have to download...
INFO:absl:Using /var/folders/3l/0t74styx5wz0y5tkfgsvhycwfc38wl/T/tfhub_modules to cache modules.
2020-07-07 14:22:59.864 | INFO     | __main__:<module>:20 - Done preparing USE Embedding
2020-07-07 14:22:59.864 | INFO     | __main__:<module>:34 - Generating Paraphrase now


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

2020-07-07 14:22:59.882 | INFO     | __main__:<module>:65 - Generating for utterance 1


INPUT SENTENCE : What is the sales of the title for period?
PARAPRHASES :
n°0 : What are the sales of the title for period?
n°1 : What is the sales of the title for period?
n°2 : What are the sales of the title during a period?
n°3 : What are the sales of title for period?
n°4 : How much sales does a title for period do?
n°5 : What are the sales of the title in period?
n°6 : What are the sales of a title for period?
n°7 : What is the sales of title for period?
n°8 : What are sales of the title for period?
n°9 : What are sales of title for period?
n°10 : What is the sales of a title for period?
n°11 : What are sales of titles of period?
n°12 : What are sales of a title for period?
n°13 : What is the sale of the title for period?
n°14 : How much are the sales of the title for period?
n°15 : What is the sale of title for period?
n°16 : How does the sales of a title of period?
n°17 : What is the sale of a title for period?
n°18 : How much are sales of title for period?
n°19 : How does the 

In [4]:
import spacy
import textacy
def mult_repl_text(replace_dict: Dict[str,str], text: str):
    """
    Function takes a dictionary of known_slots {noun clause: slot} and replace all noun clause in text into slots.
    Reference
    ---------
    https://stackoverflow.com/questions/15175142/how-can-i-do-multiple-substitutions-using-regex-in-python
    
    ARGS
    ----
        replace_dict: dictionary of {noun_clause: slots} 
        text: raw sentence (with noun clauses) to be replaced
    RETURNS
    -------
        (string): text with all noun clauses replaced to slots
    """ 
    # Create a regular expression  from the dictionary keys
    regex = re.compile("(%s)" % "|".join(map(re.escape,replace_dict.keys())))
    
    # For each match, look-up corresponding value in dictionary
    return regex.sub(lambda mo: replace_dict[mo.string[mo.start():mo.end()]], text) 

nlp = spacy.load('en_core_web_md', parser=False)
final_list = []
for paraphrases in para_list:
    for para in paraphrases:
        dict_replace = {}
        doc = nlp(para)
        for nc in doc.noun_chunks: # use np instead of np.text
            for slot in slots:
                if nlp.vocab[known_slot_dict[slot]].similarity(nc) > 0.8:
                    dict_replace[nc.text] = '{' + slot + '}'
        if dict_replace:
            cleaned_sent = mult_repl_text(dict_replace, para)
            print (cleaned_sent)
            print("--------")
            final_list.append(cleaned_sent)

What is the sales of {game}?
--------
What is the sales of {game} for {timeInterval}?
--------
What are the sales of {game} for {timeInterval}?
--------
What are the sales of {game} for {timeInterval}?
--------
What is the sale of {game} for {timeInterval}?
--------
What is the sale of {game} for {timeInterval}?
--------
How is the sales of {game} for {timeInterval}?
--------
How does the sales of {game} for {timeInterval}?
--------
What is the sales of {game} for {timeInterval}?
--------
What is the sale of {game}?
--------
What are the sales of {game} for {timeInterval}?
--------
What is the sales of {game} for {timeInterval}?
--------
How many people are sold of {game} in {timeInterval}?
--------
How is sales of {game} for {timeInterval}?
--------
What is the sales of {game}?
--------
How much sales did a name get for {timeInterval}?
--------
How can we calculate sales of {game} for {timeInterval}?
--------
What are the sales of {game}?
--------
What are sales of {game} for {timeInt

In [13]:
import json

target ={
  "metadata": {
    "schemaVersion": "1.0",
    "importType": "LEX",
    "importFormat": "JSON"
  },
  "resource": {
    "description": "intent description",
    "name": "",
    "version": "version number",
    "fulfillmentActivity": {
      "type": "ReturnIntent"
    },
    "sampleUtterances": []
  }
}
INTENT_NAME = "Hello_World"

target["resource"]["name"] = INTENT_NAME
target["resource"]["sampleUtterances"] = final_list
json_out = json.dumps(target, indent=4)

with open(f"{INTENT_NAME}.json", "w") as f: 
    f.write(json_out) 
from zipfile import ZipFile

with ZipFile("new_intent.zip", "w") as newzip:
        newzip.write(f"{INTENT_NAME}.json")

