# **Imports**

In [None]:
!pip install wget
!pip install spacy
!python -m spacy download en_core_web_sm
!pip install -U sentence-transformers
!pip install rake_nltk


%matplotlib inline
import torch
import numpy as np
import gensim
import math
import string
import json
import os
import matplotlib.pyplot as plt
import seaborn as sns
#import wget
import spacy
import scipy.stats
import nltk
import re
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
from collections import defaultdict


from rake_nltk import Rake
import gensim.downloader as api

from transformers import pipeline
from transformers import BertTokenizer, BertModel


# Build the stop-word set used by preprocess_text.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
stop_words=stop_words.copy()
# 'no' and 'not' must survive stop-word filtering: the yes/no detection
# further down looks for exactly these tokens in the preprocessed answers.
stop_words.remove('no')
stop_words.remove('not')

# spaCy pipeline used for tokenization and lemmatization in preprocess_text.
spacy_nlp = spacy.load("en_core_web_sm")

# Word vectors fetched through gensim's downloader; used for the
# answer-token / question-token similarity scores.
model = api.load("glove-wiki-gigaword-50")
# Question-answering pipeline; no checkpoint is named here, so the library's
# default model for this task is used.
qa_model = pipeline("question-answering")

In [None]:
# Fetch the two cleaned datasets by Google Drive file id.
# NOTE(review): gdown is not installed by the pip cell above; this presumably
# relies on the hosting environment already providing it - confirm.
!gdown 1VPzI4vDoANpTmb0NnfoxJjM_zChfKFuP #class 5
!gdown 1pHAQ3mbL1yOps2-Nzfzs891G4wtOXoKQ #class 6

Downloading...
From: https://drive.google.com/uc?id=1VPzI4vDoANpTmb0NnfoxJjM_zChfKFuP
To: /content/class_05.clean.txt
100% 503k/503k [00:00<00:00, 5.94MB/s]
Downloading...
From: https://drive.google.com/uc?id=1pHAQ3mbL1yOps2-Nzfzs891G4wtOXoKQ
To: /content/class_06.clean.txt
100% 1.32M/1.32M [00:00<00:00, 10.9MB/s]


In [None]:
#PATH
# Locations of the two tab-separated datasets; exactly one of the two loading
# cells below assigns one of these to `path`.
animal_path='/content/class_05.clean.txt'
artifact_path= '/content/class_06.clean.txt'

# **Load dataset**

In [None]:
#ANIMALS
# Run this cell OR the ARTIFACTS cell below, not both: each one overwrites
# num_questions, num_items, path, is_animals and triples.
num_questions = 44
num_items = 93

path=animal_path
is_animals = True

# Each line is split on tabs; later cells read fields 0, 1 and 2 as
# item, question and answer. The context manager closes the file handle.
with open(path, "r") as file:
  triples = [line.split('\t') for line in file.read().splitlines()]


In [None]:
#ARTIFACTS
# Run this cell OR the ANIMALS cell above, not both: each one overwrites
# num_questions, num_items, path, is_animals and triples.
num_questions = 55
num_items = 134

path=artifact_path
is_animals = False

# Each line is split on tabs; later cells read fields 0, 1 and 2 as
# item, question and answer. The context manager closes the file handle.
with open(path, "r") as file:
  triples = [line.split('\t') for line in file.read().splitlines()]


In [None]:
# items = []
# questions = []
# answers = []

# corpus = {}

# for i, triple in enumerate(triples):
#   item = triple[0]
#   question = triple[1]
#   answer = triple[2]

#   if item not in corpus:
#     items.append(item)
#     corpus[item] = {
#         'questions': [],
#         'answers': [],
#     }

#   corpus[item]['questions'].append(question)
#   corpus[item]['answers'].append(answer)

# for i, item in enumerate(items):
#   questions.append(corpus[item]['questions'])
#   answers.append(corpus[item]['answers'])


In [None]:
# Regroup the flat list of rows into per-item lists. The file is expected to
# hold exactly `num_questions` consecutive rows per item, so the rows are cut
# into fixed-size chunks.
if len(triples) % num_questions != 0:
  # A partial trailing chunk would leave `items` longer than
  # `questions`/`answers` and make later cells fail with an opaque IndexError.
  raise ValueError(
      f"expected a multiple of {num_questions} rows, got {len(triples)}; "
      "was the matching dataset cell (ANIMALS / ARTIFACTS) executed?")

items = []      # item name of each chunk (field 0 of its first row)
questions = []  # questions[i][j]: j-th question of item i (field 1)
answers = []    # answers[i][j]: j-th answer of item i (field 2)

for start in range(0, len(triples), num_questions):
  group = triples[start:start + num_questions]

  items.append(group[0][0])
  questions.append([row[1] for row in group])
  answers.append([row[2] for row in group])

# **Extract keywords Functions**

In [None]:
def preprocess_text(sentence, stopwords, lemmatize=True):
  """Tokenize `sentence` with spaCy and return its lower-cased content tokens.

  Args:
    sentence: raw text to process.
    stopwords: collection of lower-case words to drop.
    lemmatize: use each token's lemma when True, its surface text otherwise.

  Returns:
    List of lower-cased token strings, in order, without the stop words and
    without tokens that occur as a substring of `string.punctuation`.
  """
  kept = []

  for tok in spacy_nlp(sentence):
    word = (tok.lemma_ if lemmatize else tok.text).lower()

    if word not in stopwords and word not in string.punctuation:
      kept.append(word)

  return kept

In [None]:
def extract_best_similarity(item, question, answer, model, threshold=0.5):
  """Select the answer tokens that are most related to the question.

  Both texts are preprocessed with `preprocess_text`. Every answer token gets
  a score: the mean `model.similarity` to the question tokens known to the
  model, 1 for numeric tokens, 0 for the item itself and for tokens that
  cannot be scored.

  Args:
    item: name of the item the question is about.
    question: question text.
    answer: answer text from which keywords are extracted.
    model: word-vector model supporting `token in model` and
      `model.similarity(a, b)`.
    threshold: minimum score for an answer token to be kept.

  Returns:
    A list of tokens:
      * [] when nothing is left of the answer after preprocessing;
      * ['yes'] or ['no'] when the answer starts with that token;
      * the whole preprocessed answer when `item` is unknown to the model;
      * the tokens scoring above `threshold`, or the single best-scoring
        token when none does.
  """
  filtered_answer = preprocess_text(answer, stop_words, True)
  filtered_question = preprocess_text(question, stop_words, True)

  # Nothing survived preprocessing: there is no keyword to extract (and
  # indexing [0] below would raise).
  if not filtered_answer:
    return []

  if filtered_answer[0] in ('yes', 'no'):
    return [filtered_answer[0]]

  # Scoring is only attempted for items the model knows.
  if item not in model:
    return filtered_answer

  # Loop-invariant: the question tokens that can actually be compared.
  question_in_vocab = [tok for tok in filtered_question if tok in model]

  scores = []
  for token in filtered_answer:
    if token == item:  # always discard item in answer
      scores.append(0)
      continue

    # Always keep numbers. Checked before the vocabulary test so that numbers
    # missing from the model are kept as well.
    if token.replace('.', '').isnumeric():
      scores.append(1)
      continue

    # Unscorable token, or no comparable question token (the latter used to
    # cause a division by zero).
    if token not in model or not question_in_vocab:
      scores.append(0)
      continue

    sims = [model.similarity(token, tok_q) for tok_q in question_in_vocab]
    scores.append(sum(sims) / len(sims))

  scores = np.array(scores)
  selected = [filtered_answer[k] for k in np.where(scores > threshold)[0]]
  if selected:
    return selected

  # No token is similar enough: fall back to the single best one.
  return [filtered_answer[int(scores.argmax())]]

In [None]:
#get elements with yes and no

def extract_yes_no_answers(questions, answers, is_animals = False):
  """Normalize every answer to 'yes', 'no' or a cleaned-up token string.

  Iterates over the module-level `items` and `num_questions`; `answers[i][j]`
  is the j-th answer of the i-th item.

  Args:
    questions: unused; kept so existing calls keep working.
    answers: per-item lists of raw answer strings.
    is_animals: when True, applies a hand-made correction for one entry of
      the animals dataset.

  Returns:
    A list (one entry per item) of lists (one string per question). Each
    string is 'yes'/'no' when the preprocessed answer starts with that token,
    'no' when it contains 'not', and otherwise the preprocessed tokens joined
    by spaces, minus tokens that contain, or are contained in, the item name
    (an empty string when no token is left).
  """
  yes_no_final_answers = []

  for i, item in enumerate(items):
    item_answers = []

    for j in range(num_questions):
      filtered_answer = preprocess_text(answers[i][j], stop_words, True)

      # `filtered_answer` can be empty (e.g. the answer held only stop words);
      # such answers fall through to the last branch and yield ''.
      if filtered_answer and filtered_answer[0] in ('yes', 'no'):
        item_answers.append(filtered_answer[0])

      elif 'not' in filtered_answer:
        item_answers.append('no')

      else:
        # Substring test in both directions, so inflected forms of the item
        # name (item + suffix) are dropped together with the name itself.
        kept = [tok for tok in filtered_answer
                if tok not in item and item not in tok]
        item_answers.append(' '.join(kept))

    yes_no_final_answers.append(item_answers)

  #ONLY FOR ANIMALS(rats)
  # Manual override of one specific answer; guarded so the function also
  # works on inputs that do not contain that position.
  if (is_animals and len(yes_no_final_answers) > 50
      and len(yes_no_final_answers[50]) > 1):
    yes_no_final_answers[50][1] = 'are vertabrates'

  return yes_no_final_answers

In [None]:
#Generate answer from question-answer transformer
def get_qna(questions, answers):
  """Run the question-answering pipeline on every (question, answer) pair.

  Iterates over the module-level `items` and `num_questions`, using
  `answers[i][j]` as the context for `questions[i][j]`.

  Args:
    questions: per-item lists of question strings.
    answers: per-item lists of context strings.

  Returns:
    A list (one entry per item) of lists with exactly `num_questions` strings
    each: the pipeline's 'answer' field, or the unchanged context when the
    pipeline fails for that pair.
  """
  new_answers = []

  for i, item in enumerate(items):
    item_answers = []

    for j in range(num_questions):
      question = questions[i][j]
      context = answers[i][j]

      try:
        extracted = qa_model(question = question, context = context)['answer']
      except Exception as exc:
        # Keep one entry per question: skipping the append would shift every
        # later answer of this item and break indexing by j downstream.
        print(f"error occured, i: {i}, j:{j}, question: {question} context: {context} ({exc})")
        extracted = context

      item_answers.append(extracted)

    new_answers.append(item_answers)

  return new_answers


In [None]:
#get elements above threshold qa
def get_best_keywords(items, questions, answers, new_answers):
  """Build the final keyword list for every (item, question) pair.

  An extracted answer that has at most 30% of the words of the original
  answer is used as is (split on whitespace); otherwise it is narrowed down
  with `extract_best_similarity` (threshold 0.60, module-level `model`).
  Duplicates are removed while keeping first-occurrence order.

  Args:
    items: item names.
    questions: per-item lists of questions.
    answers: per-item lists of original answers.
    new_answers: per-item lists of extracted answers.

  Returns:
    best_keywords[i][j]: list of keyword strings for question j of item i,
    for j in range(num_questions).
  """
  best_keywords = []

  for item_idx, item in enumerate(items):
    per_item = []

    for q_idx in range(num_questions):
      extracted = new_answers[item_idx][q_idx]
      original = answers[item_idx][q_idx]

      already_short = len(extracted.split()) <= 0.30 * len(original.split())
      if already_short:
        candidates = extracted.split()
      else:
        candidates = extract_best_similarity(
            item, questions[item_idx][q_idx], extracted, model, threshold = 0.60)

      # dict keys keep insertion order, which gives an ordered de-duplication.
      per_item.append(list(dict.fromkeys(candidates)))

    best_keywords.append(per_item)

  return best_keywords

# **Compute best keywords**

In [None]:
# Three-stage pipeline:
# 1. normalize the raw answers (yes / no / cleaned token string);
yes_no_answers = extract_yes_no_answers(questions, answers, is_animals)
# 2. let the QA model pick a span, using the normalized answer as context;
qna_answers = get_qna(questions, yes_no_answers)
# 3. narrow each span down to keywords; the raw answers are only used as the
#    length reference here.
best_keywords = get_best_keywords(items, questions, answers, qna_answers)


In [None]:
#print result
f = open('class_06.values.tsv', 'w+')

for i, item in enumerate(items):
    item_answers = []
    for j in range(num_questions):
      keyword = ''
      for idx, elem in enumerate(best_keywords[i][j]):
        if idx > 0:
          keyword += ';'
        keyword += elem
      f.write(f'{item} \t {questions[i][j]} \t {keyword} \n')


# **Transformer**

## Bert with all questions of a specific item

In [None]:
# Tokenizer and encoder used for the contextual word embeddings below.
tokenizer_bert = BertTokenizer.from_pretrained("bert-base-uncased")
# output_hidden_states=True: later cells read `.hidden_states` and sum its
# last four entries instead of using only the final layer.
model_bert = BertModel.from_pretrained("bert-base-uncased", output_hidden_states=True)

In [None]:
def compute_similarity(w1, w2):
  """Cosine similarity between two vectors.

  Args:
    w1: first vector (NumPy array).
    w2: second vector (NumPy array).

  Returns:
    dot(w1, w2.T) / (|w1| * |w2|); 0 (after printing a notice) when either
    vector has zero norm.
  """
  norm_a, norm_b = np.linalg.norm(w1), np.linalg.norm(w2)

  if norm_a != 0 and norm_b != 0:
    return np.dot(w1, w2.T) / (norm_a * norm_b)

  # The similarity is undefined for a zero vector.
  print("norms equal 0")
  return 0

In [None]:
def compute_mappings(split_question, split_answer):
  """Map every BERT word piece back to the word it came from.

  Each word is tokenized on its own with `tokenizer_bert`; a word that yields
  k word pieces contributes its index k times.

  Args:
    split_question: list of question words.
    split_answer: list of answer words.

  Returns:
    (map_idx_question, map_idx_answer): two lists of ints in which entry p is
    the index of the word that produced the p-th word piece.
  """
  def piece_to_word_index(words):
    # One entry per word piece, holding the index of its source word.
    return [word_idx
            for word_idx, word in enumerate(words)
            for _ in tokenizer_bert.tokenize(word)]

  map_idx_question = piece_to_word_index(split_question)
  map_idx_answer = piece_to_word_index(split_answer)

  return map_idx_question, map_idx_answer

In [None]:
def compute_bert_embeddings(question, answer):
  """Embed a question and an answer separately with BERT.

  Each text is tokenized with `tokenizer_bert`, passed through `model_bert`,
  and its last four hidden-state tensors are summed element-wise.

  Args:
    question: question text.
    answer: answer text.

  Returns:
    (outputs_question_sum, outputs_answer_sum): one tensor per text, with the
    shape of a single hidden-state tensor (presumably
    (1, sequence_length, hidden_size) - TODO confirm). The tensors carry no
    autograd history.
  """
  def embed(text):
    encoded = tokenizer_bert(text, return_tensors="pt")
    hidden_states = model_bert(**encoded).hidden_states
    return torch.stack(hidden_states[-4:], dim=0).sum(dim=0)

  # Pure inference: skip autograd bookkeeping to save memory and time.
  with torch.no_grad():
    outputs_question_sum = embed(question)
    outputs_answer_sum = embed(answer)

  return outputs_question_sum, outputs_answer_sum

In [None]:
# Build one BERT input per question of the FIRST item only (index 0 is
# hard-coded below): the preprocessed question followed by the preprocessed
# answer, joined by spaces.
batches = []
# Per input: the word lists and the word-piece -> word index maps needed to
# locate each word inside the encoded sequence later on.
mappings = []

for i in range(num_questions): #all the questions for a specific item

  preprocessed_question = preprocess_text(questions[0][i], stop_words, True)
  preprocessed_answer = preprocess_text(answers[0][i], stop_words, True)

  map_idx_question, map_idx_answer = compute_mappings(preprocessed_question, preprocessed_answer)
  mappings.append({
      'map_idx_question': map_idx_question,
      'map_idx_answer': map_idx_answer,
      'preprocessed_question': preprocessed_question,
      'preprocessed_answer': preprocessed_answer
  })
  # Question words first, then answer words: the offsets computed in the
  # analysis cell depend on this order.
  batches.append(' '.join(preprocessed_question) + ' ' + ' '.join(preprocessed_answer))


In [None]:
# Encode all inputs at once; padding=True pads them to a common length so
# they can be stacked into one tensor.
tokenized = tokenizer_bert(batches, return_tensors="pt", padding=True)

# Pure inference: no autograd graph is needed, which keeps memory usage down.
with torch.no_grad():
  output = model_bert(**tokenized)

# Despite its name this is not the final layer alone but the element-wise sum
# of the last four hidden-state tensors.
last_hidden_states = torch.stack(output.hidden_states[-4:], dim=0).sum(dim=0)

In [None]:
# For each encoded (question + answer) input, compare every answer word with
# every question word: a word's embedding is the mean of the rows of
# last_hidden_states that belong to its word pieces, and pairs are scored
# with compute_similarity. Results are printed for inspection.
for idx in range(last_hidden_states.shape[0]):

  map_idx_question = mappings[idx]['map_idx_question']
  map_idx_answer = mappings[idx]['map_idx_answer']
  preprocessed_question = mappings[idx]['preprocessed_question']
  preprocessed_answer = mappings[idx]['preprocessed_answer']

  # Word pieces of the whole input with '[CLS]' prepended, so that the same
  # offsets index both this list and the rows of last_hidden_states[idx].
  # Computed once per input; it is also needed by the summary print below
  # even when the question has no words.
  phrase_token = tokenizer_bert.tokenize(batches[idx])
  phrase_token_cls = ['[CLS]'] + phrase_token

  avg_general = []  # mean similarity of each answer word to the question words

  for i, token_answer in enumerate(preprocessed_answer): #answers
    # A word that produced no word piece has no rows to average.
    if i not in map_idx_answer:
      continue

    # Answer pieces come after the leading special token (+1) and after all
    # question pieces.
    offset_start_answer = map_idx_answer.index(i) + 1 + len(map_idx_question)
    offset_end_answer = offset_start_answer + map_idx_answer.count(i)

    answer_tok = last_hidden_states[idx, offset_start_answer: offset_end_answer]
    ans_tok = answer_tok.mean(dim=0) # word embedding for word i in answer

    similarities = []

    for j, token_question in enumerate(preprocessed_question):
      if j not in map_idx_question:
        continue

      # Question pieces start right after the leading special token (+1).
      offset_start_question = map_idx_question.index(j) + 1
      offset_end_question = offset_start_question + map_idx_question.count(j)

      question_tok = last_hidden_states[idx, offset_start_question: offset_end_question]
      quest_tok = question_tok.mean(dim=0) # word embedding for word j in question

      similarity = compute_similarity(ans_tok.detach().numpy(), quest_tok.detach().numpy())
      similarities.append(similarity)

      print(phrase_token_cls)

      print(f'answer_token decoded: {phrase_token_cls[offset_start_answer: offset_end_answer]}')
      print(f'question_token decoded: {phrase_token_cls[offset_start_question: offset_end_question]}')
      print(f'similarity: {similarity}')
      print()

    avg_general.append(np.array(similarities).mean())
    print("---------------------------------")
    print(f'similarity mean for {phrase_token_cls[offset_start_answer: offset_end_answer]} is: {np.array(similarities).mean()}')
    print("---------------------------------")
    print()
    print()

  # Only the first input is inspected; remove this break to print all of them.
  break

