

```
# GPT2
```



In [None]:
# receives
# .txt file containing a list of question-answer pairs identified by 'P: ' and 'R: '
# .txt file with a list of questions, one question per line

# or

# .txt file containing raw text
# .txt file with a list of questions, one question per line

# produces
# .txt file containing the posed questions and GPT-2's respective answers, identified by 'P: ' and 'R: '

In [None]:
pip install tensorflow
pip install gpt-2-simple

In [None]:
import gpt_2_simple as gpt2
import os

# size of the pretrained GPT-2 model to fetch
MODEL_NAME = "355M"

# download_gpt2 stores the model under ./models/<model name>; skip the download
# when that folder is already there so re-running the cell is cheap
# (delete the folder to force a fresh download, e.g. after an interrupted one)
if not os.path.isdir(os.path.join("models", MODEL_NAME)):
    gpt2.download_gpt2(model_name = MODEL_NAME)

In [None]:
# file management
def open_file(filename, encoding=None):
    """Read a text file and return its lines.

    Args:
        filename: path of the file to read.
        encoding: text encoding handed to ``open``; ``None`` (the default)
            keeps the platform's default encoding.

    Returns:
        list of str: the lines of the file, each one keeping its trailing
        newline when the file has one.
    """
    # the context manager closes the handle even when reading fails
    with open(filename, 'r', encoding=encoding) as read_file:
        return read_file.readlines()

def write_file(filename, content, encoding=None):
    """Write a sequence of strings to a text file, replacing its contents.

    Args:
        filename: path of the file to write (created or truncated).
        content: iterable of strings written one after the other; no line
            separators are added.
        encoding: text encoding handed to ``open``; ``None`` (the default)
            keeps the platform's default encoding.
    """
    # the context manager flushes and closes the handle even when writing fails
    with open(filename, 'w', encoding=encoding) as file_write:
        file_write.writelines(content)

In [None]:
# mount google drive
# NOTE(review): google.colab is presumably only importable inside a Google Colab
# runtime - confirm before running this cell anywhere else
from google.colab import drive
# mounts Google Drive at /content/drive
drive.mount('/content/drive')

In [None]:
# finetunes GPT-2 355M model
def finetune_gpt2(domain_file_path, run_name, steps, model_name='355M'):
    """Finetune a pretrained GPT-2 model on a domain file.

    Args:
        domain_file_path: path to the .txt file used as the finetuning dataset.
        run_name: name of the finetuning run, handed to gpt-2-simple.
        steps: number of finetuning steps.
        model_name: name of the pretrained model to start from; defaults to
            '355M', the model this notebook downloads.

    Returns:
        The TensorFlow session created for the finetuning, to be passed on to
        the generation step.
    """
    sess = gpt2.start_tf_sess()
    gpt2.finetune(sess,
                  dataset = domain_file_path,
                  model_name = model_name,
                  steps = steps,
                  run_name = run_name,
                  # reporting intervals, in steps
                  print_every = 10,
                  sample_every = 100
                  )

    print('GPT-2 Finetune Complete')
    return sess

In [None]:
# generates GPT-2's answer to each posed question and appends the question-answer pairs to a file
def retrieve_ans(domain_type, sess, run_name, questions_file_path, save_file_path):
    """Generate an answer for every posed question and append the pairs to a file.

    Args:
        domain_type: 'faqs' when the model was finetuned on 'P: ' / 'R: '
            question-answer pairs, 'text' when it was finetuned on raw text.
        sess: session returned by the finetuning step.
        run_name: name of the finetuning run to generate from.
        questions_file_path: path to a .txt file with one question per line;
            blank lines are ignored.
        save_file_path: path of the file the 'P: ' / 'R: ' pairs are appended
            to, one pair per question followed by an empty line.

    Raises:
        ValueError: if domain_type is neither 'faqs' nor 'text'.
    """
    # fail early: an unknown type would otherwise leave the prefix undefined
    # (or silently reuse the one from the previous question)
    if domain_type not in ('faqs', 'text'):
        raise ValueError("domain_type must be 'faqs' or 'text', got %r" % (domain_type,))

    # in 'faqs' mode the identifiers are part of the prompt and are requested
    # back from the generator, so the generated text is written as-is
    include_prefix = domain_type == 'faqs'
    count = 0

    for raw_question in open_file(questions_file_path):
        # a blank line is not a question
        if not raw_question.strip():
            continue

        # exactly one trailing newline, also for a last line that has none,
        # so 'R: ' always starts on its own line
        question = raw_question.rstrip('\n') + '\n'

        # adds identifiers as a prefix
        if include_prefix:
            ques_prefix = 'P: ' + question + 'R: '
        else:
            ques_prefix = question

        # answer generation
        answer = gpt2.generate(sess = sess,
                run_name = run_name,
                # the higher the temperature, the higher the randomness in generation
                temperature = 0.2,
                prefix = ques_prefix,
                include_prefix = include_prefix,
                truncate = '\n\n',
                nsamples = 1,
                return_as_list = True)[0]

        if include_prefix:
            entry = answer
        else:
            entry = 'P: ' + question + 'R: ' + answer

        # appended one answer at a time so finished answers survive an
        # interruption of a long generation run
        with open(save_file_path, 'a') as f:
            f.write(entry + '\n\n')

        count += 1
        print('Answer %d retrieved!' % count)

    print('File with posed questions and respective answers created!')

In [None]:
# NOTES

# domain_file_path - path to the file containing the domain
# can be a file containing question-answer pairs identified with 'P: ' and 'R: ', respectively, or a file containing raw text
# FAQs                      # Text
# P: question1              # Paragraph1Line1
# R: answer1                # Paragraph1Line2
# \n                        # \n
# P: question2              # Paragraph2Line1
# R: answer2                # Paragraph2Line2
# \n                        # \n
# must be a .txt file

# run_name - name for gpt-2 finetuning session
# used to continue finetuning from a given step

# steps - number of steps to perform finetuning

# domain_type - 'faqs', for a list of question-answer pairs identified with 'P: ' and 'R: '
# domain_type - 'text', for a file containing unstructured raw text

# questions_file_path - path to the file containing all questions, one question per line
# Q1
# Q2
# Q3
# ...
# must be a .txt file

# save_file_path - path to the file where the posed questions and respective retrieved answers are to be saved

In [None]:
# fill in every value below before running this cell
run_name = None                   # name of the finetuning run (str)
steps = None                      # number of finetuning steps (int)
domain_type = None                # 'faqs' or 'text'

domain_file_path = None           # path to the .txt file containing the domain
input_questions_file_path = None  # path to the .txt file with one question per line
save_file_path = None             # path to the file where the answers are saved

# stop with a clear message while a placeholder is still unset
_settings = {
    'run_name': run_name,
    'steps': steps,
    'domain_type': domain_type,
    'domain_file_path': domain_file_path,
    'input_questions_file_path': input_questions_file_path,
    'save_file_path': save_file_path,
}
_unset = [name for name, value in _settings.items() if value is None]
if _unset:
    raise ValueError('Set these values before running: ' + ', '.join(_unset))

sess = finetune_gpt2(domain_file_path, run_name, steps)
# uses the names defined above and the configured domain type
retrieve_ans(domain_type, sess, run_name, input_questions_file_path, save_file_path)