## DialoGPT - Main

In [None]:
!pip install transformers

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

In [None]:
tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-large")

Downloading:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/642 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/0.99M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

In [None]:
model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-large")

Downloading:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

In [None]:
# Read one user utterance and encode it, terminated by the EOS token.
# step stays at 0 here, so no earlier conversation is prepended to the input.
step = 0
new_user_input_ids = tokenizer.encode(
    input(">> User:") + tokenizer.eos_token,
    return_tensors='pt',
)
if step > 0:
  bot_input_ids = torch.cat([chat_history_ids, new_user_input_ids], dim=-1)
else:
  bot_input_ids = new_user_input_ids

>> User:Do you mind providing your email address?


In [None]:
# Using the same input and then passing it to DialoGPT over and over again without any history

In [None]:
# Draw 19 independent sampled replies for the same encoded input.
# bot_input_ids is never updated inside the loop, so no history is fed back.
for i in range(19):
  chat_history_ids = model.generate(
      bot_input_ids,
      max_length=1000,
      pad_token_id=tokenizer.eos_token_id,
      no_repeat_ngram_size=3,
      do_sample=True,
      top_k=100,
      top_p=0.7,
      temperature=0.8,
  )

  # Tokens after the prompt length are the newly generated reply.
  print(f"DialoGPT: {tokenizer.decode(chat_history_ids[0, bot_input_ids.shape[-1]:], skip_special_tokens=True)}")

DialoGPT: Just sent you a PM.
DialoGPT: I PM'd you.
DialoGPT: You can email me at daniel.co.uk
DialoGPT: No problem, sent you a PM.
DialoGPT: Yes please.
DialoGPT: I don't mind at all. I'll PM you.
DialoGPT: I'll send it to you now.
DialoGPT: No problem, I sent you a message.
DialoGPT: I sent you a PM
DialoGPT: Just PM'd you.
DialoGPT: No problem. I'll send it to you.
DialoGPT: No problem. PM me.
DialoGPT: PM sent.
DialoGPT: PM sent.
DialoGPT: Sure thing.
DialoGPT: PM sent
DialoGPT: I sent you a message.
DialoGPT: I'm pretty sure that I sent you a PM with my email address.
DialoGPT: Yes I'll send it to you via PM.


In [None]:
# Let's see how the model responds if we ask it the same question over and over again, but this time
# we provide it with the conversation history.

In [None]:
# Fixed question that is asked repeatedly in the history experiment below.
prompt = "Do you mind providing your email address?"

In [None]:
# Ask the same question 10 times, feeding the growing conversation back in.
for step in range(10):
  new_user_input_ids = tokenizer.encode(prompt + tokenizer.eos_token, return_tensors='pt')

  # From the second turn on, the previous output (history + reply) is prepended.
  if step > 0:
    bot_input_ids = torch.cat([chat_history_ids, new_user_input_ids], dim=-1)
  else:
    bot_input_ids = new_user_input_ids

  chat_history_ids = model.generate(
      bot_input_ids,
      max_length=1000,
      pad_token_id=tokenizer.eos_token_id,
      no_repeat_ngram_size=3,
      do_sample=True,
      top_k=100,
      top_p=0.7,
      temperature=0.8,
  )

  # Only the tokens generated after the input are decoded and shown.
  print(f"DialoGPT: {tokenizer.decode(chat_history_ids[0, bot_input_ids.shape[-1]:], skip_special_tokens=True)}")

DialoGPT: No problem, I'll PM you.
DialoGPT: It's a joke.
DialoGPT: Do u mind providing a response to my email address?
DialoGPT: I'm pretty sure I already did
DialoGPT: That's just what I do.
DialoGPT: And you are not?
DialoGPT: Sure, I will do that.
DialoGPT: You are you kidding?
DialoGPT: Heh?
DialoGPT: Are you?


In [None]:
# well, I would have expected the same reply from someone if I kept asking for his email over and over again :-)

# This one is not useful, I guess.
# Let's play around with the values of temperature using the original approach (no history)

In [None]:
# Re-read and re-encode a single user utterance so bot_input_ids holds only
# this turn again (step is reset to 0, so no history is prepended).
step = 0
new_user_input_ids = tokenizer.encode(
    input(">> User:") + tokenizer.eos_token,
    return_tensors='pt',
)
if step > 0:
  bot_input_ids = torch.cat([chat_history_ids, new_user_input_ids], dim=-1)
else:
  bot_input_ids = new_user_input_ids

>> User:Do you mind providing your email address?


In [None]:
# Same single-turn experiment as before, now with temperature raised to 1.0.
for i in range(19):
  chat_history_ids = model.generate(
      bot_input_ids,
      max_length=1000,
      pad_token_id=tokenizer.eos_token_id,
      no_repeat_ngram_size=3,
      do_sample=True,
      top_k=100,
      top_p=0.7,
      temperature=1.0,
  )

  # Decode only the tokens that follow the prompt.
  print(f"DialoGPT: {tokenizer.decode(chat_history_ids[0, bot_input_ids.shape[-1]:], skip_special_tokens=True)}")

DialoGPT: sent you a PM.
DialoGPT: Yes, it's a private message.
DialoGPT: no problem! i sent you one.
DialoGPT: It's a secret
DialoGPT: Just sent it.
DialoGPT: PM'd it to you
DialoGPT: PM'd you
DialoGPT: pm me
DialoGPT: just sent it to you
DialoGPT: Yeah sure, I'll PM you it.
DialoGPT: Sure, what's yours?
DialoGPT: PM sent.
DialoGPT: Hey, yeah, I'll PM you it.
DialoGPT: no, I'll PM it to you
DialoGPT: Done, thanks!
DialoGPT: Sure thing!
DialoGPT: I'll send you a message.
DialoGPT: PM'd you.
DialoGPT: It's on the top of the page.


In [None]:
# Same single-turn experiment with temperature lowered to 0.6 (10 samples).
for i in range(10):
  chat_history_ids = model.generate(
      bot_input_ids,
      max_length=1000,
      pad_token_id=tokenizer.eos_token_id,
      no_repeat_ngram_size=3,
      do_sample=True,
      top_k=100,
      top_p=0.7,
      temperature=0.6,
  )

  # Decode only the tokens that follow the prompt.
  print(f"DialoGPT: {tokenizer.decode(chat_history_ids[0, bot_input_ids.shape[-1]:], skip_special_tokens=True)}")

DialoGPT: PM sent
DialoGPT: PM sent.
DialoGPT: I sent you a PM
DialoGPT: I sent you a PM.
DialoGPT: Sure, I'll PM it to you.
DialoGPT: PM sent.
DialoGPT: Sent you a PM.
DialoGPT: PM sent.
DialoGPT: Sent you a PM
DialoGPT: I sent you a PM


In [None]:
# so high temperature results in more diverse and creative answers even though it decreases its confidence
# in its top choices

# "Lower temperatures make the model increasingly confident in its top choices, while temperatures greater than 1 decrease confidence."


In [None]:
# let's turn off the sampling
# With do_sample=False the model decodes greedily. top_k, top_p and temperature
# only shape the sampling distribution, so they are omitted here instead of
# being passed and suggesting they influence the result.
# The loop is kept on purpose: it demonstrates that every run gives the same reply.
for i in range(10):

  chat_history_ids = model.generate(
      bot_input_ids,
      max_length=1000,
      pad_token_id=tokenizer.eos_token_id,
      no_repeat_ngram_size=3,
      do_sample=False,
  )

  # Decode only the tokens that follow the prompt.
  print("DialoGPT: {}".format(tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)))

DialoGPT: I sent you a PM.
DialoGPT: I sent you a PM.
DialoGPT: I sent you a PM.
DialoGPT: I sent you a PM.
DialoGPT: I sent you a PM.
DialoGPT: I sent you a PM.
DialoGPT: I sent you a PM.
DialoGPT: I sent you a PM.
DialoGPT: I sent you a PM.
DialoGPT: I sent you a PM.


In [None]:
# As expected, with sampling turned off the same answer is generated every time.

In [None]:
# Let's play around with top_p and top_k:
# sampling is back on, with top_p raised to 0.9 and temperature at 0.9.
for i in range(10):
  chat_history_ids = model.generate(
      bot_input_ids,
      max_length=1000,
      pad_token_id=tokenizer.eos_token_id,
      no_repeat_ngram_size=3,
      do_sample=True,
      top_k=100,
      top_p=0.9,
      temperature=0.9,
  )

  # Decode only the tokens that follow the prompt.
  print(f"DialoGPT: {tokenizer.decode(chat_history_ids[0, bot_input_ids.shape[-1]:], skip_special_tokens=True)}")

DialoGPT: I think they just ask you to provide a username and email address
DialoGPT: No problem. Sent you a message
DialoGPT: I can send you the imgur link when i get home.
DialoGPT: sent you a message
DialoGPT: PM'd. Sorry for not getting back to you sooner!
DialoGPT: Sure, sent you a PM.
DialoGPT: Sure thing, PM me with your email.
DialoGPT: Hey it's me, your email.
DialoGPT: I'll PM you.
DialoGPT: I've emailed you a code


In [None]:
# so top_p will help in increasing the length of the generated answer but as top p is quite high(0.9), increasing
# top_k will quickly result in going off topic

In [None]:
# In short- To generate diverse answers from DialoGPT, turn on the sampling, provide no history, increase the
# temperature slightly and keep top_k in the range of 100 while maximizing top_p.

## DialoGPT - Revisited - Functional Implementation

In [None]:
!pip install transformers

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import numpy as np
import pandas as pd

In [None]:
# Load the DialoGPT-large tokenizer and weights.
tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-large")
model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-large")

# Use the first CUDA device when one is available, otherwise stay on the CPU.
if torch.cuda.is_available():
  device = "cuda:0"
else:
  device = "cpu"
model.to(device)

### Get Answers function

In [None]:
def get_answer(prompts,hyperParameters,num_responses):
  '''
  Generate single-turn DialoGPT replies for every prompt.

  Each prompt is encoded on its own (terminated by the EOS token, no chat
  history) and passed to model.generate repeatedly. Relies on the notebook
  globals `tokenizer`, `model` and `device`.

  prompts: List of input questions (strings).
  hyperParameters: Dictionary of hyper-parameters:
      'do_sample', 'top_k', 'temperature': forwarded to model.generate.
      'top_p': a single value when 'top_p_step' is None or missing; otherwise a
          (start, stop) pair swept with np.arange(start, stop, top_p_step),
          i.e. the stop value itself is excluded.
      'top_p_step': step of the top_p sweep, or None / missing for a fixed top_p.
      'no_repeat_ngram_size': optional, defaults to 3.
  num_responses: Number of responses to generate for each question. When top_p
      is swept, this many responses are generated for every top_p value.

  Returns: a list with one (answers, prompt) tuple per prompt, in input order,
  where answers is the list of decoded reply strings.

  NOTE(review): top_p is a cumulative-probability threshold, so sweep bounds
  above 1.0 presumably cannot filter any further -- confirm against the
  transformers version in use.
  '''
  if not prompts:
    return []

  # Resolve the top_p values once; they are the same for every prompt.
  top_p_step = hyperParameters.get('top_p_step')
  if top_p_step is not None:
    top_p_start, top_p_stop = hyperParameters['top_p'][0], hyperParameters['top_p'][1]
    # np.arange yields numpy scalars; hand plain Python floats to generate().
    top_p_values = [float(p) for p in np.arange(top_p_start, top_p_stop, top_p_step)]
  else:
    top_p_values = [hyperParameters['top_p']]

  # Arguments shared by every generate() call.
  generation_kwargs = dict(
      max_length=1000,
      pad_token_id=tokenizer.eos_token_id,
      no_repeat_ngram_size=hyperParameters.get('no_repeat_ngram_size', 3),
      do_sample=hyperParameters['do_sample'],
      top_k=hyperParameters['top_k'],
      temperature=hyperParameters['temperature'],
  )

  answers_with_prompt = []
  for prompt in prompts:
      print(prompt)
      bot_input_ids = tokenizer.encode(prompt + tokenizer.eos_token, return_tensors='pt').to(device)
      prompt_length = bot_input_ids.shape[-1]

      answers = []
      for _ in range(num_responses):
        for top_p in top_p_values:
          output_ids = model.generate(bot_input_ids, top_p=top_p, **generation_kwargs)
          # Tokens after the prompt are the newly generated reply.
          answers.append(tokenizer.decode(output_ids[0, prompt_length:], skip_special_tokens=True))
      answers_with_prompt.append((answers,prompt))
  return answers_with_prompt

### Create CSV function

In [None]:
def createCSV(paraphrased_answers_list, path, name_of_the_file):
  '''
  Flatten (answers, question) pairs into a two-column table, drop repeated
  answers, write the table to a CSV file and return it.

  Parameters:-
    paraphrased_answers_list: output of the get_answer function, i.e. an
        iterable of (answers, question) tuples where answers is a list of strings
    path: directory where the csv file should be stored
    name_of_the_file: name of the output csv file

  Returns:-
    The de-duplicated pandas DataFrame with the columns 'input_sentences' and
    'output_sentences'. The original row index is kept (and written to the CSV).
  '''
  import os  # local import so the function does not depend on a notebook-level import

  input_sentences = []
  output_sentences = []
  for (answers, question) in paraphrased_answers_list:
    # Repeat the question once per answer so both columns stay aligned.
    input_sentences.extend([question] * len(answers))
    output_sentences.extend(answers)

  dataFrame = pd.DataFrame(
      {'input_sentences': input_sentences, 'output_sentences': output_sentences},
      columns=['input_sentences','output_sentences'])

  # removing duplicate answers: the first row of each distinct answer text is
  # kept, regardless of which question it belongs to
  dataFrame = dataFrame.drop_duplicates(subset=['output_sentences'])

  # os.path.join copes with a trailing separator in `path`, and an empty `path`
  # resolves to the current directory instead of the filesystem root.
  dataFrame.to_csv(os.path.join(path, name_of_the_file))
  return dataFrame

### Hyper-Parameters Explanation

**About DialoGPT**

DialoGPT is formulated as an autoregressive (AR) language model. It is based on the multi-layer transformer architecture, much like GPT-2. DialoGPT is trained on 147M multi-turn dialogues extracted from Reddit discussion threads.

----------------------------------------------------------
**Hyper-Parameters:-**

***-> Temperature:*** Temperature sampling is implemented by dividing logits by the temperature value before feeding them into softmax. A temperature greater than 1 generally decreases the confidence, while a lower temperature makes the model increasingly confident in its top choices. Therefore, we have to find a balance for the value of temperature. If we decrease the temperature too much, the model will become too confident in its top choices and will end up generating the same answer again and again. On the other hand, if we increase the temperature too much, it will start diverging from the original intent. A value of `0.8` was found to be perfect for our case (the value was chosen after testing multiple values).

----------------------------------------------------------
***-> Top_k and Top_p:*** Top_k means that we sort the tokens by probability and then zero out the probabilities of everything below the k'th token. In short, sort the output tokens by probability, select the top k, and then disregard everything that doesn't fall within the first k output tokens. This reduces the probability space from which we sample our output, because we select the top k outputs and disregard everything else. If k is small, this can result in less diversity and variation in the output answers. We can make top_k large, but that can result in including totally off-topic words.

To address the above problems, top_p is used. It introduces a cumulative distribution: as soon as the cumulative probability exceeds p, the remaining tokens are cut off from the probability space.

In other words, instead of sampling only from the most likely K words, Top-p sampling chooses from the smallest possible set of words whose cumulative probability exceeds the probability p. The probability mass is then redistributed among this set of words. This way, the size of the set of words (a.k.a. the number of words in the set) can dynamically increase and decrease according to the next word's probability distribution.

<img src="https://huggingface.co/blog/assets/02_how-to-generate/top_p_sampling.png" alt="drawing" width="500"/>

*In this way, we still avoid sampling egregiously wrong tokens, but preserve variety when the highest scoring tokens have low confidence.*

**!!Important** : For our case, after extensive testing, we found out that `top_p` will help in increasing the length of the generated answer but as top p is quite high`(0.9)`, increasing `top_k` will quickly result in going off topic.

So in short, To generate diverse answers from DialoGPT, turn on the sampling, provide no history, increase the
temperature slightly and keep top_k in the range of 100 while maximizing top_p.

For detailed explanation, please refer to [HuggingFace](https://huggingface.co/blog/how-to-generate)

----------------------------------------------------------
***-> do_sample***: In its most basic form, sampling means randomly picking the next word according to its conditional probability distribution. So, as the name suggests, when sampling is off the model will produce the same result over and over again without sampling other tokens from the distribution.

### Sample Outputs

In [None]:
# Single hand-written prompt (the input sentence is not paraphrased).
# Prompts are always passed as a list of strings.
prompts = ['Do you mind providing your email address?']

# top_p is swept over a range by default. To use one fixed top_p instead,
# put the plain value in place of the tuple and set 'top_p_step' to None.
#
# With a top_p range, the number of responses generated per prompt is:
'''total_responses = num_responses * (top_p[1]/top_p_step - (top_p[0]/top_p_step))'''

# NOTE(review): top_p is a cumulative-probability threshold, so the upper bound
# of 10 looks larger than needed -- confirm the (0.5, 10) range is intended.
# Generation hyper-parameters
hyperParameters = {
    'do_sample': True,
    'top_k': 100,
    'top_p': (0.5, 10),
    'top_p_step': 0.5,
    'temperature': 0.8,
}

num_responses = 20
out = get_answer(prompts, hyperParameters, num_responses)

Do you mind providing your email address?


In [None]:
# Flatten the generated answers and save them under /content.
output = createCSV(out, path='/content', name_of_the_file='question_answer_pairs.csv')

In [None]:
# Display the table returned by createCSV for the single-prompt run.
output

Unnamed: 0,input_sentences,output_sentences
0,Do you mind providing your email address?,Sent you a PM.
1,Do you mind providing your email address?,"I sent you a message, thanks!"
2,Do you mind providing your email address?,"Sure, I sent you a PM."
3,Do you mind providing your email address?,"I do mind, and it's a gmail, if that helps."
4,Do you mind providing your email address?,"yes, just sent you a PM"
...,...,...
375,Do you mind providing your email address?,"Sure, just PM it to me."
376,Do you mind providing your email address?,PMing you!
377,Do you mind providing your email address?,yeah I sent you a PM.
378,Do you mind providing your email address?,sent via PM


In [None]:
# Collect the distinct answers across all prompts, keeping first-seen order.
# dict.fromkeys de-duplicates in a single pass, instead of re-scanning a list
# for every sentence.
unique = list(dict.fromkeys(
    sentence for (sentences, _) in out for sentence in sentences))

In [None]:
# Output directly taken from pegasus (num_sequences = 10, beam = 20, chaining = False)
# Right now, for testing, I have copied the output from other notebook to this notebook
# I will merge both once everything is finalized

prompts = ['Do you want to provide your email address?',
   'Do you want to give your email address?',
   'Are you willing to give your email address?',
   'Are you willing to provide your email address?',
   'Do you want to provide an email address?',
   'Do you want to send an email?',
   'Do you want your email address to be public?',
   'Do you want to use your email address?',
   'Do you have an email address?',
   'Is it okay to give your email address?']
# You can also specify multiple types of questions above

# Note: By default the top_p value will be a range
# If you want to use an absolute value for top_p
# just replace the tuple with the absolute value and set
# the 'top_p_step = None'

# Note, when using range of top_p, the total number of paraphrases
# generated will be:-
'''total_responses = num_responses * (top_p[1]/top_p_step - (top_p[0]/top_p_step))'''

# initialize the hyper-parameters
hyperParameters = {'no_repeat_ngram_size':3,
                   'do_sample':True,
                   'top_k':100,
                   'top_p':(0.5,10),
                   'top_p_step':0.5,
                   'temperature':0.8}


num_responses = 10
# Pass the list defined in this cell. The previously used name
# `questions_paraphrased` is not defined anywhere in the notebook, so the call
# only worked with leftover kernel state.
out = get_answer(prompts, hyperParameters,num_responses)

Do you want to provide your email address?
Do you want to give your email address?
Are you willing to give your email address?
Are you willing to provide your email address?
Do you want to provide an email address?
Do you want to send an email?
Do you want your email address to be public?
Do you want to use your email address?
Do you have an email address?
Is it okay to give your email address?


In [None]:
# Save the answers for the paraphrased prompts; the returned table is displayed.
createCSV(out, path='/content', name_of_the_file='question_answer_pairs_1.csv')

Unnamed: 0,input_sentences,output_sentences
0,Do you want to provide your email address?,I sent you a PM.
1,Do you want to provide your email address?,Yeah sure!
2,Do you want to provide your email address?,"No, no, no. I just want to know what's going o..."
3,Do you want to provide your email address?,"Thank you, I'll try that and see if the fix works"
4,Do you want to provide your email address?,Yes. Please. Let me know. I need to know becau...
...,...,...
1895,Is it okay to give your email address?,"As long as it makes sense, yup."
1896,Is it okay to give your email address?,"Sure, why not. Do you want anything in return?"
1897,Is it okay to give your email address?,"I just wanted to let you know, you're amazing ..."
1898,Is it okay to give your email address?,Feel free.


In [None]:


# Collect the distinct answers across all prompts, keeping first-seen order.
# dict.fromkeys de-duplicates in a single pass, instead of re-scanning a list
# for every sentence.
unique = list(dict.fromkeys(
    sentence for (sentences, _) in out for sentence in sentences))

## GPT-J

In [None]:
!pip install gptj

Collecting gptj
  Downloading gptj-2.2.5-py3-none-any.whl (14 kB)
Collecting cryptography
  Downloading cryptography-35.0.0-cp36-abi3-manylinux_2_24_x86_64.whl (3.5 MB)
[K     |████████████████████████████████| 3.5 MB 14.4 MB/s 
Collecting ProfanityDetector
  Downloading ProfanityDetector-0.2-py3-none-any.whl (14 kB)
Installing collected packages: ProfanityDetector, cryptography, gptj
Successfully installed ProfanityDetector-0.2 cryptography-35.0.0 gptj-2.2.5


In [None]:
from GPTJ.Basic_api import SimpleCompletion

# Free-form prompt for the plain GPT-J completion below.
prompt = 'why do you want my email address?'

In [None]:
# Settings for the GPT-J completion request: output length, temperature, top-p.
max_length, temperature, top_probability = 100, 0.09, 1.0

In [None]:
# Bundle the prompt and the settings defined above into a SimpleCompletion object.
# NOTE(review): presumably no request is sent until simple_completion() is called -- confirm.
query = SimpleCompletion(prompt, length=max_length, t=temperature, top=top_probability)

In [None]:
# Run the completion. The recorded output shows the generated text printed and
# the same text returned as a string.
query.simple_completion()



I am a human being, not a spam bot.

I am a human being, not a spam bot.

I am a human being, not a spam bot.

I am a human being, not a spam bot.

I am a human being, not a spam bot.

I am a human being, not a spam bot.

I am a human being, not a spam bot.

I am a human being, not


'\n\nI am a human being, not a spam bot.\n\nI am a human being, not a spam bot.\n\nI am a human being, not a spam bot.\n\nI am a human being, not a spam bot.\n\nI am a human being, not a spam bot.\n\nI am a human being, not a spam bot.\n\nI am a human being, not a spam bot.\n\nI am a human being, not'

In [None]:
from GPTJ.gptj_api import Completion
# Context label passed to the few-shot Completion object below.
context = 'chatbot'

In [None]:
# Few-shot question -> answer examples for the GPT-J context.
# A dict keeps only ONE value per key: when a question is listed twice, the
# later answer silently replaces the earlier one. The two entries that were
# being discarded this way are kept as comments so the effective set is explicit:
#   "Do you mind providing your email address?": "Why do you want my email address?"
#   "Are you interested in a software engineer role?": "Are you interested in a software engineer role?"
examples = {
    "Do you mind providing your email address?": "Do I have to?",
    "Are you interested in a software engineer role?": "Yes, I would love to work as a software engineer.",
    "Would you mind sharing your CV": "Absolutely not. Here it is.",
    "What is your name?": "My full name is M.Mehyar Ali"}


In [None]:
# Build the few-shot Completion object from the context label and the example pairs.
context_setting = Completion(context, examples)

In [None]:
# Ask the few-shot context a question that is not among the examples.
prompt = "Would you mind sharing your phone number?"
temperature, top_probability = 0.09, 1.0
response = context_setting.completion(
    prompt,
    temperature=temperature,
    top_p=top_probability,
)


In [None]:
# Show the text returned for the phone-number question.
print(response)

I'm not sure if this is the right place to ask this, but I'm not sure where else to ask.
I'm a new user of Ubuntu and I'm having a problem with my phone. I have a Samsung Galaxy S3 and I'm using the latest version of Ubuntu. I'm not sure if this is the right place to ask this, but I'm not sure where else to ask.
I'm having a problem with my phone. I have a


In [None]:
!pip install transformers