In [None]:
!pip install transformers
import tensorflow as tf
import tensorflow_hub as hub
import transformers
import os
import numpy as np

In [None]:
# Directory containing the reference markdown articles that the cells below read.
# NOTE(review): absolute Google Drive path — presumably requires Drive to be mounted in Colab; confirm.
zendesk_articles = "/content/drive/MyDrive/Colab Notebooks/Machine Learning/data/ZendeskArticles"

## 0-qa.py

In [None]:
def question_answer(question, reference):
    """
    Find a snippet of text within a reference document that answers a question.

    Args:
        question: a string containing the question to answer.
        reference: a string containing the reference document in which to
            look for the answer.

    Returns:
        A string containing the answer, or None if no answer was found
        (the predicted span is empty, e.g. the predicted end position comes
        before the predicted start position).

    Uses the bert-uncased-tf2-qa model from the tensorflow-hub library and the
    pre-trained BertTokenizer,
    bert-large-uncased-whole-word-masking-finetuned-squad, from the
    transformers library. Both are loaded on every call.
    """
    tokenizer = transformers.BertTokenizer.from_pretrained(
        'bert-large-uncased-whole-word-masking-finetuned-squad')
    model = hub.load("https://tfhub.dev/see--/bert-uncased-tf2-qa/1")

    question_tokens = tokenizer.tokenize(question)
    paragraph_tokens = tokenizer.tokenize(reference)
    tokens = ['[CLS]'] + question_tokens + ['[SEP]'] + paragraph_tokens + ['[SEP]']

    input_word_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_mask = [1] * len(input_word_ids)
    # Segment 0 covers [CLS] + question + [SEP]; segment 1 covers reference + [SEP].
    input_type_ids = [0] * (1 + len(question_tokens) + 1) + [1] * (len(paragraph_tokens) + 1)

    # Each input becomes an int32 tensor with a leading batch dimension of 1.
    model_inputs = [
        tf.expand_dims(tf.convert_to_tensor(values, dtype=tf.int32), 0)
        for values in (input_word_ids, input_mask, input_type_ids)
    ]
    outputs = model(model_inputs)

    # using `[1:]` will enforce an answer. `outputs[0][0][0]` is the ignored '[CLS]' token logit
    short_start = int(tf.argmax(outputs[0][0][1:])) + 1
    short_end = int(tf.argmax(outputs[1][0][1:])) + 1

    # An end position before the start position yields an empty slice, which
    # converts to an empty string: report that as "no answer" with None.
    answer_tokens = tokens[short_start: short_end + 1]
    answer = tokenizer.convert_tokens_to_string(answer_tokens)
    return answer if answer else None



In [None]:
# Read the Peer Learning Days article and ask one sample question about it.
with open(f"{zendesk_articles}/PeerLearningDays.md") as article_file:
    reference = article_file.read()

print(question_answer('When are PLDs?', reference))


on - site days from 9 : 00 am to 3 : 00 pm


## 1. Create the loop

In [None]:
# Skeleton of the chat loop: prompt with "Q: " until the user enters an exit
# keyword (matched case-insensitively). There is no answering logic yet, so
# every other input gets an empty "A: " reply.
while True:
    print("Q: ", end="")
    inpt = input()
    if inpt.lower() not in ["exit", "quit", "goodbye", "bye"]:
        print("A: ")
        continue
    print("A: Goodbye")
    break

Q: Hello
A: 
Q: How are you?
A: 
Q: BYE
A: Goodbye


## 2. Answer Questions

In [None]:
def answer_loop(reference):
    """
    Interactively answer questions from a single reference text.

    reference is the reference text searched for each answer.
    Prompts with "Q: " and prints each reply prefixed with "A: " until the user
    enters exit, quit, goodbye or bye (case-insensitive), which prints
    "A: Goodbye" and ends the loop.
    If the answer cannot be found in the reference text, respond with
    Sorry, I do not understand your question.
    """
    farewells = ["exit", "quit", "goodbye", "bye"]

    # Tokenizer, model and reference tokens are prepared once and reused for every question.
    tokenizer = transformers.BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
    model = hub.load("https://tfhub.dev/see--/bert-uncased-tf2-qa/1")
    paragraph_tokens = tokenizer.tokenize(reference)
    # The reference half of the model input never changes: its tokens plus the closing [SEP].
    reference_part = paragraph_tokens + ['[SEP]']

    while True:
        print("Q: ", end="")
        inpt = input()
        if inpt.lower() in farewells:
            print("A: Goodbye")
            return

        question_part = ['[CLS]'] + tokenizer.tokenize(inpt) + ['[SEP]']
        tokens = question_part + reference_part
        word_ids = tokenizer.convert_tokens_to_ids(tokens)
        features = (
            word_ids,                                               # input_word_ids
            [1] * len(word_ids),                                    # input_mask
            [0] * len(question_part) + [1] * len(reference_part),   # input_type_ids
        )
        # Each feature becomes an int32 tensor with a leading batch dimension of 1.
        outputs = model([
            tf.expand_dims(tf.convert_to_tensor(feature, dtype=tf.int32), 0)
            for feature in features
        ])

        start_logits, end_logits = outputs[0][0], outputs[1][0]
        # Slicing off index 0 ignores the '[CLS]' logit, which enforces an answer position.
        short_start = tf.argmax(start_logits[1:]) + 1
        short_end = tf.argmax(end_logits[1:]) + 1
        answer = tokenizer.convert_tokens_to_string(tokens[short_start: short_end + 1])
        print(f"A: {answer}" if answer else "A: Sorry, I do not understand your question.")

In [None]:
# Run the interactive question loop against the Peer Learning Days article.
with open(f"{zendesk_articles}/PeerLearningDays.md") as article_file:
    reference = article_file.read()

answer_loop(reference)

Q: What time are PLDs?
A: 9 : 00 am to 3 : 00 pm
Q: What are Mock Interviews?
A: Sorry, I do not understand your question.
Q: What does PLD stand for?
A: peer learning days
Q: EXIT
A: Goodbye


## 3. Semantic Search

In [None]:
def semantic_search(corpus_path, sentence):
    """
    Perform semantic search on a corpus of documents.

    Args:
        corpus_path: path to the directory of reference documents on which to
            perform semantic search; only files whose names end in ".md" are read.
        sentence: the sentence from which to perform semantic search.

    Returns:
        The reference text of the document most similar to sentence, measured
        by cosine similarity between Universal Sentence Encoder embeddings.

    Raises:
        ValueError: if corpus_path contains no ".md" documents.
    """
    # Read documents from corpus_path itself rather than a notebook-level
    # folder, so the function works for any corpus. os.listdir returns names
    # in arbitrary order; sorting makes tie-breaking reproducible.
    docs = []
    for name in sorted(os.listdir(corpus_path)):
        if name.endswith(".md"):
            with open(os.path.join(corpus_path, name)) as f:
                docs.append(f.read())
    if not docs:
        raise ValueError(f"no .md documents found in {corpus_path!r}")

    univ_sent_enc = hub.load("https://tfhub.dev/google/universal-sentence-encoder-large/5")
    # Embed the query together with the documents in one call; row 0 is the query.
    embeddings = np.asarray(univ_sent_enc([sentence] + docs))
    query_vec, doc_vecs = embeddings[0], embeddings[1:]

    # Cosine similarity of every document against the query, computed in one shot.
    cosine_sims = doc_vecs @ query_vec / (
        np.linalg.norm(doc_vecs, axis=1) * np.linalg.norm(query_vec))
    return docs[int(np.argmax(cosine_sims))]

In [None]:
# Print the text of the corpus document that semantic_search returns for a sample question.
print(semantic_search(zendesk_articles, 'When are PLDs?'))

PLD Overview
Peer Learning Days (PLDs) are a time for you and your peers to ensure that each of you understands the concepts you've encountered in your projects, as well as a time for everyone to collectively grow in technical, professional, and soft skills. During PLD, you will collaboratively review prior projects with a group of cohort peers.
PLD Basics
PLDs are mandatory on-site days from 9:00 AM to 3:00 PM. If you cannot be present or on time, you must use a PTO. 
No laptops, tablets, or screens are allowed until all tasks have been whiteboarded and understood by the entirety of your group. This time is for whiteboarding, dialogue, and active peer collaboration. After this, you may return to computers with each other to pair or group program. 
Peer Learning Days are not about sharing solutions. This doesn't empower peers with the ability to solve problems themselves! Peer learning is when you share your thought process, whether through conversation, whiteboarding, debugging, or li

## 4. Multi-reference Question Answering

In [None]:
def question_answer(corpus_path):
    """
    Interactively answer questions from multiple reference texts.

    For each question, the corpus document whose Universal Sentence Encoder
    embedding has the highest cosine similarity to the question is selected,
    and the BERT QA model extracts the answer span from that document.

    Args:
        corpus_path: path to the directory of reference documents; only files
            whose names end in ".md" are read.

    Prompts with "Q: " and prints each reply prefixed with "A: " until the user
    enters exit, quit, goodbye or bye (case-insensitive), which prints
    "A: Goodbye" and ends the loop. When the extracted span is empty, prints
    "A: Sorry, I do not understand your question."

    Raises:
        ValueError: if corpus_path contains no ".md" documents.

    NOTE: this definition reuses the name of the single-reference
    question_answer(question, reference) defined earlier in the notebook;
    running this cell rebinds that name.
    """
    model = hub.load("https://tfhub.dev/see--/bert-uncased-tf2-qa/1")
    tokenizer = transformers.BertTokenizer.from_pretrained(
        'bert-large-uncased-whole-word-masking-finetuned-squad')

    # Read documents from corpus_path itself rather than a notebook-level
    # folder, so the function works for any corpus. os.listdir returns names
    # in arbitrary order; sorting makes tie-breaking reproducible.
    docs = []
    for name in sorted(os.listdir(corpus_path)):
        if name.endswith(".md"):
            with open(os.path.join(corpus_path, name)) as f:
                docs.append(f.read())
    if not docs:
        raise ValueError(f"no .md documents found in {corpus_path!r}")

    univ_sent_enc = hub.load("https://tfhub.dev/google/universal-sentence-encoder-large/5")
    # Document embeddings and their norms do not depend on the question: compute once.
    doc_embeddings = np.asarray(univ_sent_enc(docs))
    doc_norms = np.linalg.norm(doc_embeddings, axis=1)
    # Tokenized documents, filled lazily so each document is tokenized at most once.
    tokenized_docs = {}

    while True:
        print("Q: ", end="")
        inpt = input()
        if inpt.lower() in ["exit", "quit", "goodbye", "bye"]:
            print("A: Goodbye")
            break

        # Pick the document most similar to the question (cosine similarity).
        query_vec = np.asarray(univ_sent_enc([inpt]))[0]
        cosine_sims = doc_embeddings @ query_vec / (doc_norms * np.linalg.norm(query_vec))
        docnum = int(np.argmax(cosine_sims))
        if docnum not in tokenized_docs:
            tokenized_docs[docnum] = tokenizer.tokenize(docs[docnum])
        paragraph_tokens = tokenized_docs[docnum]

        question_tokens = tokenizer.tokenize(inpt)
        tokens = ['[CLS]'] + question_tokens + ['[SEP]'] + paragraph_tokens + ['[SEP]']
        input_word_ids = tokenizer.convert_tokens_to_ids(tokens)
        input_mask = [1] * len(input_word_ids)
        # Segment 0 covers [CLS] + question + [SEP]; segment 1 covers document + [SEP].
        input_type_ids = [0] * (1 + len(question_tokens) + 1) + [1] * (len(paragraph_tokens) + 1)
        # Each input becomes an int32 tensor with a leading batch dimension of 1.
        model_inputs = [
            tf.expand_dims(tf.convert_to_tensor(values, dtype=tf.int32), 0)
            for values in (input_word_ids, input_mask, input_type_ids)
        ]
        outputs = model(model_inputs)
        # using `[1:]` will enforce an answer. `outputs[0][0][0]` is the ignored '[CLS]' token logit
        short_start = int(tf.argmax(outputs[0][0][1:])) + 1
        short_end = int(tf.argmax(outputs[1][0][1:])) + 1
        answer_tokens = tokens[short_start: short_end + 1]
        answer = tokenizer.convert_tokens_to_string(answer_tokens)
        if answer:
            print(f"A: {answer}")
        else:
            print("A: Sorry, I do not understand your question.")

In [None]:
# Start the interactive multi-reference question loop over the articles directory.
question_answer(zendesk_articles)

Q: When are PLDs?
A: on - site days from 9 : 00 am to 3 : 00 pm
Q: What are Mock Interviews?
A: help you train for technical interviews
Q: What does PLD stand for?
A: peer learning days
Q: goodbye
A: Goodbye
