In [74]:
import streamlit as st
import pandas as pd
import faiss
import gensim
import numpy as np
import re

def token(text):
  """Normalize a sentence and split it into lowercase word tokens.

  Punctuation (any character that is neither a word character nor
  whitespace) is removed, the text is lower-cased, and the result is split
  on runs of whitespace.

  Args:
    text: The raw sentence to tokenize (str).

  Returns:
    A list of lowercase tokens. The list is empty when ``text`` contains no
    word characters.
  """
  # Strip punctuation but keep word characters and whitespace.
  res = re.sub(r'[^\w\s]', '', text)
  # str.split() with no argument splits on any run of whitespace (spaces,
  # tabs, newlines) and never yields empty tokens, so "" -> [] instead of the
  # [''] that split(" ") would produce.
  return res.lower().split()


# Load the question-answer dataset.
df = pd.read_csv("../data/Question_Answer_Dataset_v1.2_S10.csv")

# Load the precomputed vectors from the .npz archive: key "x" is used as the
# question vectors and key "y" as the answer vectors.
# NOTE(review): presumably generated with the pre-trained word2vec model and
# row-aligned with `df` -- confirm against the code that wrote vector.npz.
vector = np.load("../data/vector.npz")
ques_vec = vector["x"]
ans_vec = vector["y"]

# Load the trained word2vec model.
# Hint: You should use the word2vec model pre-trained with both question and answer sets.
trained_w2v = gensim.models.Word2Vec.load("../data/w2v-advanced.model")

# App title
#st.set_page_config(page_title="Word2vec Question and Answer Chatbot")

# Add header image 
#st.image("/data/header-chat-box.png")

# chat title 
#st.title("Word2vec Question and Answer Chatbot")

# Store generated responses
#if "messages" not in st.session_state.keys():
#    st.session_state.messages = [{"role": "assistant", "content": "How may I help you?"}]

# Display chat messages
#for message in st.session_state.messages:
#    with st.chat_message(message["role"]):
#        st.write(message["content"])

# Function to generate the embedding for query question
def trained_sentence_vec(sent):
    """Embed a tokenized sentence as the mean of its in-vocabulary word vectors.

    Args:
        sent: Iterable of word tokens (e.g. the output of
            ``gensim.utils.simple_preprocess``).

    Returns:
        A 1-D numpy array: the element-wise mean of the word2vec vectors of
        every token that is present in ``trained_w2v.wv``.

    Raises:
        ValueError: If none of the tokens are in the model vocabulary, so no
            sentence embedding can be computed.
    """
    # Keep only the tokens the trained model has a vector for.
    qu_voc = [tm for tm in sent if tm in trained_w2v.wv]
    if not qu_voc:
        # Without this guard np.vstack([]) fails with an opaque
        # "need at least one array to concatenate" ValueError.
        raise ValueError(
            "Cannot embed sentence: none of its tokens are in the word2vec vocabulary."
        )
    # Stack the word vectors row-wise, one row per retained word ...
    emb = np.vstack([trained_w2v.wv[tm] for tm in qu_voc])
    # ... and average over the words to obtain a single sentence vector.
    return np.mean(emb, axis=0)

# Function to find the answer through vector search
### Hint ###
# Function inputs: qr_sentence, ques_vec, and ans_vec
# Function output: the index of the optimal answer
# Function goal: do vector search among both question and answer sets
###
def find_answer(qr_sentence, ques_vec, ans_vec=None):
    """Return the dataset row index of the best answer for a query sentence.

    The query is tokenized with ``gensim.utils.simple_preprocess``, embedded
    with ``trained_sentence_vec`` and compared by cosine similarity (inner
    product of L2-normalized vectors) against the question vectors and,
    optionally, the answer vectors.

    Args:
        qr_sentence: Raw query sentence (str).
        ques_vec: 2-D array of question vectors, one row per question-answer
            pair.
        ans_vec: Optional 2-D array of answer vectors with the same shape as
            ``ques_vec`` (row ``i`` belongs to pair ``i``). When given, the
            answers are searched as well and a hit on answer row ``i`` is
            reported as pair index ``i``. Defaults to ``None`` (questions
            only).

    Returns:
        int: Index of the most similar question-answer pair, always within
        ``range(len(ques_vec))``.

    Raises:
        ValueError: If ``ques_vec`` is not a non-empty 2-D array, if
            ``ans_vec`` does not have the same shape as ``ques_vec``, or if no
            query token is in the word2vec vocabulary.
    """
    # Tokenize and embed the query sentence.
    tokens = gensim.utils.simple_preprocess(qr_sentence)
    #tokens = token(qr_sentence)
    # faiss needs a C-contiguous float32 matrix with one query per row.
    # np.array copies, so the in-place normalization below stays local.
    q = np.array(trained_sentence_vec(tokens), dtype=np.float32).reshape(1, -1)

    # np.array always copies here, so normalizing `x` in place never modifies
    # the caller's array.
    x = np.array(ques_vec, dtype=np.float32, order="C")
    if x.ndim != 2 or x.shape[0] == 0:
        raise ValueError("ques_vec must be a non-empty 2-D array of vectors.")
    # Number of question-answer pairs and feature (vector) dimensions.
    n_q_a, n_dim = x.shape

    if ans_vec is not None:
        y = np.array(ans_vec, dtype=np.float32, order="C")
        if y.shape != x.shape:
            raise ValueError("ans_vec must have the same shape as ques_vec.")
        # Questions occupy index positions [0, n_q_a), answers
        # [n_q_a, 2 * n_q_a).
        x = np.concatenate([x, y], axis=0)

    # Build a flat inner-product index; on L2-normalized vectors the inner
    # product equals cosine similarity.
    index = faiss.index_factory(n_dim, "Flat", faiss.METRIC_INNER_PRODUCT)
    faiss.normalize_L2(x)
    index.add(x)

    # Only the single best hit is used, so one neighbor is enough; there is no
    # need to rank every vector in the index.
    faiss.normalize_L2(q)
    _, idx = index.search(q, k=1)
    ans_idx = int(idx[0][0])

    # A hit in the answer half of the index maps back to its pair index.
    if ans_idx >= n_q_a:
        ans_idx -= n_q_a

    return ans_idx

In [73]:
# NOTE(review): leftover debugging cell. On a fresh kernel this raises
# NameError because `idx` is only assigned by a later cell
# (`idx = find_answer(prompt,ques_vec)`); remove it or move it below that cell.
idx

NameError: name 'idx' is not defined

In [80]:
# Sample query used to smoke-test the retrieval function.
prompt = "Does the Hymenoptera order include ants?"
# Row label of the best-matching question-answer pair.
idx = find_answer(qr_sentence=prompt, ques_vec=ques_vec)
# Display the stored answer for that pair (cell output).
df.loc[idx, "Answer"]

'yes'

In [79]:
# Preview the first 20 rows of the question-answer dataset.
df.head(20)

Unnamed: 0,ArticleTitle,Question,Answer,DifficultyFromQuestioner,DifficultyFromAnswerer,ArticleFile
0,Alessandro_Volta,Was Alessandro Volta a professor of chemistry?,Alessandro Volta was not a professor of chemis...,easy,easy,data/set4/a10
1,Alessandro_Volta,Did Alessandro Volta invent the remotely opera...,Alessandro Volta did invent the remotely opera...,easy,easy,data/set4/a10
2,Alessandro_Volta,Was Alessandro Volta taught in public schools?,Volta was taught in public schools.,easy,easy,data/set4/a10
3,Alessandro_Volta,Who did Alessandro Volta marry?,Alessandro Volta married Teresa Peregrini.,medium,medium,data/set4/a10
4,Alessandro_Volta,What did Alessandro Volta invent in 1800?,"In 1800, Alessandro Volta invented the voltaic...",medium,easy,data/set4/a10
5,Alessandro_Volta,What is the battery made by Alessandro Volta c...,The battery made by Volta is credited as the f...,medium,medium,data/set4/a10
6,Alessandro_Volta,Did Alessandro Volta die and retire in the sam...,Alessandro Volta retired and died in the same ...,hard,hard,data/set4/a10
7,Alessandro_Volta,When did Alessandro Volta improve and popular...,Alessandro Volta improved and popularized the ...,hard,hard,data/set4/a10
8,Alessandro_Volta,How long was Alessandro Volta a professor at t...,Alessandro Volta was a professor at the Univer...,hard,hard,data/set4/a10
9,Alessandro_Volta,Was Alessandro Volta an Egyptian?,no,easy,easy,data/set4/a10


In [67]:
# Scratch reproduction of the index-building steps used inside find_answer,
# run at top level so the intermediate objects can be inspected.
test_tokens = gensim.utils.simple_preprocess(prompt)
test = trained_sentence_vec(test_tokens)
n_dim = ques_vec.shape[1]  # feature (vector) dimensions
n_q_a = ques_vec.shape[0]  # number of question-answer pairs

x = np.array(ques_vec).astype(np.float32)
q = test.reshape(1, -1)

# The index dimensionality must be the vector size (columns of x), not the
# number of stored vectors (rows); otherwise it cannot be searched with `q`.
index = faiss.index_factory(n_dim, "Flat", faiss.METRIC_INNER_PRODUCT)
# Add all questions into the faiss index, one row per question (no transpose).
# Normalizing first makes the inner product equal to cosine similarity.
faiss.normalize_L2(x)
index.add(x)

