In [None]:
!pip install torch==1.4.0
!pip install transformers==2.9.0
!pip install pytorch_lightning==0.7.5

## Run inference with any question as input

In [None]:
# https://github.com/huggingface/transformers/issues/4411
import torch
from transformers import T5ForConditionalGeneration,T5Tokenizer
import tensorflow_hub as hub
from rouge import Rouge 
from typing import List, Dict
from tqdm import tqdm
import numpy as np
import sacrebleu
import re
from loguru import logger

In [None]:
# Mark the start of the utterance pre-processing stage (slot discovery and substitution below).
logger.info("Processing the utterances now...")

def load_slots_dict()-> Dict[str,str]:
    """
    Load the slot -> placeholder mapping persisted in ``known_slots.txt``.

    Every row of the file is expected to look like ``<slot> <equals> <placeholder>``.
    Blank rows are ignored and rows without the separator are skipped with a warning, so a
    single bad row does not discard the whole mapping.

    RETURNS
    -------
        (dict of string to string): every known slot mapped to its placeholder, or ``None``
            when ``known_slots.txt`` does not exist (a warning is logged in that case).
    """
    SPLIT_TOKEN = ' <equals> '
    slots_dict = {}
    try:
        with open('known_slots.txt', 'r', encoding='utf8') as f:
            for row in f:
                row = row.rstrip("\n")
                if not row.strip():
                    # Blank rows (e.g. a trailing empty line) carry no mapping.
                    continue
                # partition() splits on the first separator only, so a placeholder that itself
                # contains the separator text is kept whole.
                slot, separator, placeholder = row.partition(SPLIT_TOKEN)
                if not separator:
                    logger.warning('Skipping malformed row in known_slots.txt: {!r}', row)
                    continue
                slots_dict[slot] = placeholder
    except FileNotFoundError:
        # Only a missing file means "nothing learnt yet"; other I/O errors are left to propagate
        # instead of being reported as "not found".
        logger.warning('known_slots.txt not found. Please specify the instance of the slots!')
        return None
    return slots_dict
    
# Read the raw utterances, one per line. The trailing newline is dropped so it does not leak
# into later string handling, and blank lines are skipped because they hold no utterance.
raw_sent = []
with open('sample_utterances.txt', 'r', encoding='utf8') as f:
    for sent in f:
        sent = sent.rstrip("\n")
        if sent.strip():
            raw_sent.append(sent)

# Collect every distinct {slot} name that appears in the utterances.
slots = set()
for sent in raw_sent:
    slots.update(re.findall(r'{(.*?)}', sent))

# load_slots_dict() returns None when known_slots.txt is missing; normalise that to an empty
# dict so the set operations below work even when there is nothing to teach.
known_slot_dict = load_slots_dict() or {}

# Slots that already have a placeholder vs. slots that still need one.
slots_known = {*known_slot_dict.keys()}
slots_seen = slots.intersection(slots_known)
unknown_slots = slots.difference(slots_seen)

# Ask once for each unknown slot, keeping the answer both in memory and in known_slots.txt.
# The file is written with the same encoding it is read with.
if unknown_slots:
    with open("known_slots.txt", "a", encoding="utf8") as slot_file:
        # sorted() only makes the prompt order deterministic; sets have no stable order.
        for slot in sorted(unknown_slots):
            placeholder = input("Teach me an instance of " + slot + ":")
            known_slot_dict[slot] = str(placeholder)
            slot_file.write(str(slot) + ' <equals> ' + str(placeholder) + '\n')
        
        
def multiple_replace(known_slots_dict: Dict[str,str], text: str) -> str:
    """
    Replace every ``{slot}`` in ``text`` whose slot name is a key of ``known_slots_dict`` with
    the corresponding placeholder. ``{...}`` patterns whose name is not a key are left untouched.

    Reference
    ---------
    https://stackoverflow.com/questions/15175142/how-can-i-do-multiple-substitutions-using-regex-in-python

    ARGS
    ----
        known_slots_dict: dictionary of known_slots (slots: placeholder)
        text: raw sentence (with slots) to be replaced

    RETURNS
    -------
        (string): ``text`` with the known slots substituted; ``text`` unchanged when
            ``known_slots_dict`` is empty.
    """
    # With no keys the alternation would be empty, the pattern would match a literal "{}" and
    # the lookup of the empty key would raise KeyError — there is simply nothing to replace.
    if not known_slots_dict:
        return text

    # One alternation over all (escaped) slot names, wrapped in literal braces.
    regex = re.compile(r"\{(%s)\}" % "|".join(map(re.escape, known_slots_dict.keys())))

    # Group 1 is the slot name without braces; a function replacement inserts the placeholder
    # verbatim (no backslash/group-reference processing).
    return regex.sub(lambda mo: known_slots_dict[mo.group(1)], text)


# Substitute the known slots of every raw utterance with their placeholders.
CLEAN_SEN = [multiple_replace(known_slot_dict, utterance) for utterance in raw_sent]

logger.info("Finished cleaning the utterances...")

In [None]:
# Mark the start of the model-loading stage.
# NOTE(review): the message reads "Setupping up"; probably meant "Setting up" — left unchanged here.
logger.info("Setupping up models to generate paraphrases...")
def set_seed(seed):
    """
    Seed PyTorch's random number generator, and the CUDA generators when CUDA is available.

    ARGS
    ----
        seed: value forwarded to ``torch.manual_seed`` / ``torch.cuda.manual_seed_all``
    """
    torch.manual_seed(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)

# Fix the torch RNG state before anything else in this cell runs.
set_seed(42)

# Paraphrase model (loaded from 't5_paraphrase') and the 't5-base' tokenizer.
model = T5ForConditionalGeneration.from_pretrained('t5_paraphrase')
tokenizer = T5Tokenizer.from_pretrained('t5-base')

# Run on the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
logger.info("Generating paraphrases on {device}.Recommend GPU for speed, of course!", device=device)
model = model.to(device)

# Universal Sentence Encoder, loaded through TF Hub.
logger.info("Preparing USE Embedding...Might have to download...")
url = "https://tfhub.dev/google/universal-sentence-encoder-large/4"
embed = hub.load(url)
logger.info("Done preparing USE Embedding")


# Generation settings consumed by the generation loop further down.
MAX_SEQ_LEN = 256    # passed as max_length
NB_GENERATED = 100   # number of paraphrases to generate per utterance
TOP_TO_KEEP = 10     # how many of the generated paraphrases are kept after ranking
TOP_P = 0.90         # nucleus-sampling threshold, between 0 and 1
#TODO: Do a sweep in the right top_k/top_p value.

logger.info("Generating Paraphrase now")
# Wrap every cleaned utterance in the "paraphrase: ... </s>" prompt format.
INPUT_SEN = ["paraphrase: " + cleaned + " </s>" for cleaned in CLEAN_SEN]

def get_n_best_para(input_sentence: str, paraphrases: List[str], top_n: int = 1,
                    meaning_div_ratio: float = 0.65) -> List[str]:
    """
    Rank candidate paraphrases against ``input_sentence`` and keep the best ``top_n``.

    Each candidate receives one blended score:
        ``inner(USE(input), USE(candidate)) * meaning_div_ratio
          + (1 - ROUGE-L F(input, candidate)) * (1 - meaning_div_ratio)``
    so a higher embedding similarity and a lower ROUGE-L overlap both raise the score.

    ARGS
    ----
        input_sentence: the sentence the paraphrases were generated from
        paraphrases: candidate paraphrases to rank
        top_n: number of candidates to return
        meaning_div_ratio: weight of the embedding term; the ROUGE term gets the remainder

    RETURNS
    -------
        (list of strings): the ``top_n`` highest-scoring paraphrases, best first; an empty list
            when ``paraphrases`` is empty.
    """
    # Nothing to rank: return before the scorer and the embedder are asked to process an
    # empty batch.
    if not paraphrases:
        return []

    rouge = Rouge()
    diversity_scores = [
        1 - rouge.get_scores(input_sentence, para)[0]['rouge-l']['f'] for para in paraphrases
    ]

    # NOTE: Measure similarity using inner-product on USE embedding.
    # The input embedding is converted to numpy once instead of once per candidate.
    # NOTE(review): assumes embed(...) returns a mapping whose 'outputs' entry holds one row per
    # input sentence — confirm against the loaded TF Hub module.
    input_vector = embed([input_sentence])['outputs'].numpy()
    paraphrase_vectors = embed(paraphrases)['outputs'].numpy()

    scored_paraphrases = [
        (paraphrase,
         np.inner(input_vector, vector)[0] * meaning_div_ratio + diversity * (1 - meaning_div_ratio))
        for (paraphrase, vector, diversity) in zip(paraphrases, paraphrase_vectors, diversity_scores)
    ]
    # Sort on the blended score, best first.
    top_n_paraphrases = sorted(scored_paraphrases, key=lambda x: x[1], reverse=True)[:top_n]
    return [x[0] for x in top_n_paraphrases]

# enumerate(tqdm(...)) instead of tqdm(enumerate(...)): tqdm can then read the length of
# INPUT_SEN and show a real progress bar with a total.
for idx, text in enumerate(tqdm(INPUT_SEN)):
    logger.info("Generating for utterance {}", idx+1)
    encoding = tokenizer.encode_plus(text,pad_to_max_length=True, return_tensors="pt")
    input_ids, attention_masks = encoding["input_ids"].to(device), encoding["attention_mask"].to(device)

    # https://huggingface.co/transformers/model_doc/t5.html?highlight=generate#overview
    # https://huggingface.co/transformers/main_classes/model.html?highlight=generate#transformers.PreTrainedModel.generate
    # do_sample=True: the sequences are sampled with top-k / top-p filtering.
    # NOTE(review): early_stopping looks like a beam-search option and may have no effect
    # here — confirm against the generate() documentation.
    beam_outputs = model.generate(
        input_ids=input_ids, attention_mask=attention_masks,
        do_sample=True,
        max_length=MAX_SEQ_LEN,
        early_stopping=True,
        top_k=100,
        top_p=TOP_P,
        num_return_sequences=NB_GENERATED
    )

    # Compare candidates with the cleaned utterance itself. `text` carries the "paraphrase: "
    # prefix and the " </s>" suffix, so comparing against it could never filter out an echo
    # of the input.
    source_sentence = CLEAN_SEN[idx].strip().lower()
    paraphrases =[]
    for beam_output in beam_outputs:
        sent = tokenizer.decode(beam_output, skip_special_tokens=True,clean_up_tokenization_spaces=True)
        if sent.strip().lower() != source_sentence and sent not in paraphrases:
            paraphrases.append(sent)

    top_para = get_n_best_para(CLEAN_SEN[idx], paraphrases, TOP_TO_KEEP)
    print("INPUT SENTENCE :", CLEAN_SEN[idx])
    print("PARAPRHASES :")
    for i, paraphrase in enumerate(top_para):
        print("n°%d : %s" % (i, paraphrase))


In [None]:
import wandb
import torch
from transformers import T5ForConditionalGeneration,T5Tokenizer
import tensorflow_hub as hub
from rouge import Rouge 
from typing import List, Dict
from tqdm import tqdm
import numpy as np
from loguru import logger

# Grid sweep: one entry per generation argument, each with the list of values to try.
sweep_config = dict(
    name="My Sweep",
    method="grid",
    parameters={
        param_name: {"values": grid}
        for param_name, grid in (
            ("top_k", [50, 75, 100, 125, 150, 175]),
            ("top_p", [0.5, 0.6, 0.7, 0.8, 0.9, 1]),
            ("num_return_sequences", [10, 20, 40, 60, 80, 100, 300, 500, 800]),
        )
    },
)

# Register the sweep; the returned id is handed to the agent at the end of the cell.
sweep_id = wandb.sweep(sweep_config, project='project-name')

# Paraphrase model, moved to the GPU when one is visible.
model = T5ForConditionalGeneration.from_pretrained('t5_paraphrase')
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = model.to(device)

# Tokenizer and the Universal Sentence Encoder (loaded through TF Hub).
tokenizer = T5Tokenizer.from_pretrained('t5-base')
url = "https://tfhub.dev/google/universal-sentence-encoder-large/4"
embed = hub.load(url)

def set_seed(seed):
    """
    Seed torch, plus every CUDA device's generator when CUDA is available.

    ARGS
    ----
        seed: the seed value handed to torch
    """
    torch.manual_seed(seed)
    cuda_present = torch.cuda.is_available()
    if cuda_present:
        torch.cuda.manual_seed_all(seed)

def train():
    """
    Run one sweep trial: sample paraphrases of a fixed sentence with the trial's
    ``top_k`` / ``top_p`` / ``num_return_sequences`` and log two averages to W&B.

    Logged metrics
    --------------
        'avg_embed score': mean inner product between the USE embedding of the input sentence
            and the embedding of each distinct paraphrase
        'avg_rouge': mean of ``1 - ROUGE-L F`` between the input sentence and each paraphrase

    Nothing is logged (a warning is emitted instead) when no paraphrase differing from the
    input is generated.
    """
    hyperparam_defaults = dict(
        top_k = 40,
        top_p = 0.4,
        num_return_sequences = 5,
    )
    # NOTE(review): this project name differs from the one passed to wandb.sweep
    # ('project-name') — confirm which project the runs are meant to land in.
    wandb.init(project='hyperparameter-sweeps-comparison', config=hyperparam_defaults)
    config = wandb.config
    set_seed(42)
    MAX_SEQ_LEN = 256
    INPUT_SEN = "What is the sales of the game on the platform this year?"
    text = "paraphrase: " + INPUT_SEN + " </s>"

    encoding = tokenizer.encode_plus(text,pad_to_max_length=True, return_tensors="pt")
    input_ids, attention_masks = encoding["input_ids"].to(device), encoding["attention_mask"].to(device)

    beam_outputs = model.generate(
        input_ids=input_ids, attention_mask=attention_masks,
        do_sample=True,
        max_length=MAX_SEQ_LEN,
        early_stopping=True,
        top_k= config.top_k,
        top_p= config.top_p,
        num_return_sequences= config.num_return_sequences
    )

    # Filter against the sentence itself: `text` includes the "paraphrase: " prefix and the
    # " </s>" suffix, so comparing with it could never drop an echo of the input.
    source_sentence = INPUT_SEN.strip().lower()
    paraphrases =[]
    for beam_output in beam_outputs:
        sent = tokenizer.decode(beam_output, skip_special_tokens=True,clean_up_tokenization_spaces=True)
        if sent.strip().lower() != source_sentence and sent not in paraphrases:
            paraphrases.append(sent)

    if not paraphrases:
        # Averages over an empty list are undefined; skip the metrics for this trial.
        logger.warning("No paraphrase differing from the input was generated; skipping metrics for this run.")
        return

    # Convert the input embedding once and reuse it for every paraphrase.
    input_vector = embed([INPUT_SEN])['outputs'].numpy()
    paraphrase_vectors = embed(paraphrases)['outputs'].numpy()
    avg_USE = np.mean([np.inner(input_vector, paraphrase_vector)[0]
                       for paraphrase_vector in paraphrase_vectors])

    # The scorer has to be instantiated here and fed INPUT_SEN: neither a `rouge` object nor an
    # `input_sentence` variable exists in this scope.
    rouge = Rouge()
    avg_rouge = np.mean([1 - rouge.get_scores(INPUT_SEN, para)[0]['rouge-l']['f'] for para in paraphrases])
    metrics = {'avg_embed score': avg_USE, 'avg_rouge': avg_rouge}
    wandb.log(metrics)

# Start a W&B agent for the sweep registered as `sweep_id`; `train` is the function it is given to run.
wandb.agent(sweep_id, function=train)


Create sweep with ID: mjehgs9a
Sweep URL: https://app.wandb.ai/leoputera2407/project-name/sweeps/mjehgs9a


INFO:absl:Using /var/folders/3l/0t74styx5wz0y5tkfgsvhycwfc38wl/T/tfhub_modules to cache modules.
INFO:wandb.wandb_agent:Running runs: []
INFO:wandb.wandb_agent:Agent received command: run
INFO:wandb.wandb_agent:Agent starting run with config:
	num_return_sequences: 10
	top_k: 50
	top_p: 0.5


wandb: Agent Starting Run: jvi1tfnz with config:
	num_return_sequences: 10
	top_k: 50
	top_p: 0.5
wandb: Agent Started Run: jvi1tfnz




INFO:wandb.wandb_agent:Running runs: ['jvi1tfnz']
