In [1]:
import pandas as pd
import numpy as np
import nltk
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Fetch NLTK's "popular" resource collection quietly (no progress output).
# Presumably this provides the tokenizer models and stop-word lists used below -- confirm
# against the installed NLTK version.
nltk.download('popular', quiet=True)

import warnings

# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides library warnings that may be useful while debugging.
warnings.simplefilter(action="ignore")

In [2]:
# English stop-word list from NLTK.
# The corpus reader is reached through `nltk.corpus` instead of the imported `stopwords`
# name because this assignment rebinds `stopwords` to a plain list; looking the reader up
# through that same name would raise AttributeError as soon as the cell is run a second time.
stopwords = nltk.corpus.stopwords.words('english')

# Reading the dataset (question/answer pairs used as the retrieval corpus)
df = pd.read_csv('../Dataset/train.csv')
df.head(1)

Unnamed: 0,question,answer
0,What type of organism is commonly used in prep...,mesophilic organisms. Mesophiles grow best in ...


In [3]:
def tokenizer(text):
    """Split ``text`` into word tokens.

    Tokenization is delegated to NLTK's ``word_tokenize``; each resulting token
    has leading/trailing whitespace stripped.

    Args:
        text: The string to tokenize.

    Returns:
        A list of token strings, in the order produced by ``word_tokenize``.
    """
    return [piece.strip() for piece in word_tokenize(text)]

# Normalizes free-text input; the call in get_response is currently disabled.
def clean_input(input):
    """Lower-case ``input`` and drop ASCII digits and punctuation.

    Args:
        input: The raw string to normalize.

    Returns:
        The lower-cased string with every character from ``string.digits`` and
        ``string.punctuation`` removed and surrounding whitespace trimmed.
    """
    # A single deletion table covers both character classes.
    unwanted = str.maketrans('', '', string.digits + string.punctuation)
    # Lower-casing happens before the deletion, then the edges are trimmed.
    return input.lower().translate(unwanted).strip()

# Fitted TF-IDF model for the question corpus. Kept at module level so the corpus is
# vectorized once instead of on every call to solve(). Re-running this cell resets it.
_tfidf_cache = {}


def _get_tfidf_index():
    """Return ``(vectorizer, matrix)`` fitted on ``df['question']``.

    The pair is rebuilt only when the global ``df`` has been rebound to a
    different object since the last fit. In-place edits to the same DataFrame
    are NOT detected; re-run this cell (or rebind ``df``) after such edits.
    """
    if _tfidf_cache.get('source') is not df:
        vectorizer = TfidfVectorizer(tokenizer=tokenizer, stop_words=stopwords, lowercase=True)
        matrix = vectorizer.fit_transform(tuple(df['question']))
        _tfidf_cache.update(source=df, vectorizer=vectorizer, matrix=matrix)
    return _tfidf_cache['vectorizer'], _tfidf_cache['matrix']


# Solving the question using cosine similarity and tfidf vectorizer
def solve(question):
    """Find the stored question most similar to ``question``.

    The query is projected into the TF-IDF space fitted on ``df['question']``
    and compared against every stored question with cosine similarity.

    Args:
        question: The query string.

    Returns:
        A tuple ``(cos_results, index)`` where ``cos_results`` holds the
        similarity of the query to each stored question (one row, one column
        per stored question) and ``index`` is the position of the highest score.
    """
    vectorizer, matrix = _get_tfidf_index()
    question_vector = vectorizer.transform([question])
    cos_results = cosine_similarity(question_vector, matrix)
    # cos_results has a single row, so the flattened argmax is the column position.
    index = np.argmax(cos_results, axis=None)
    return cos_results, index

# Getting the response
def get_response(question, threshold=0.65):
    """Answer ``question`` with the stored answer of the most similar known question.

    The question is lower-cased and passed to ``solve``; punctuation and digits
    are left in place (``clean_input`` is not applied).

    Args:
        question: The user's question as a string.
        threshold: Minimum cosine similarity the best match must reach for its
            answer to be returned. Defaults to 0.65.

    Returns:
        The matching entry of ``df['answer']``, or a fixed apology string when
        the best similarity score is below ``threshold``.
    """
    question = question.lower()
    similarity_results, index_res = solve(question)
    if similarity_results[0, index_res] < threshold:
        return 'I\'m sorry, I\'m not able to understand. Could you please attempt again?'
    # index_res is a row *position* (argmax over the similarity row), so look the answer
    # up positionally; label-based `df['answer'][i]` breaks on any non-default index.
    return df['answer'].iloc[index_res]

In [4]:
# Sanity check: ask a question copied verbatim from the training data.
# Change n to any row position to try a different question.
n = 3
question = df['question'].iloc[n]
print("Question: ", question)
print("Answer: ",get_response(question))
# The recorded output shows the stored answer being returned for this row.

Question:  What is the least dangerous radioactive decay?
Answer:  alpha decay. All radioactive decay is dangerous to living things, but alpha decay is the least dangerous.


In [5]:
# Example of a query with no sufficiently similar stored question:
# the fallback apology is returned instead of an answer.
fallback_reply = get_response("hello there!")
print(fallback_reply)

I'm sorry, I'm not able to understand. Could you please attempt again?


In [6]:
# Trying to get an answer to a question taken from the separate test file
# (not expected to be present in the training data).
df_test = pd.read_csv('../Dataset/test.csv')
n_test = 1
question = df_test['question'].iloc[n_test]
print("Question: ", question)
print("Answer: ",get_response(question))


Question:  What term in biotechnology means a genetically exact copy of an organism?
Answer:  I'm sorry, I'm not able to understand. Could you please attempt again?


In [7]:
# Trying to get an answer to a made-up question that is not in the dataset.
# NOTE(review): the recorded output is an answer about spring tides, so the best match
# cleared the similarity threshold without addressing the question -- presumably because
# few query tokens remain after stop-word removal; verify before relying on the threshold.
made_up_reply = get_response("what is the moon ?")
print(made_up_reply)

spring. Spring tides occur during the new moon and full moon. The Sun and Moon are in a straight line either on the same side of Earth or on opposite sides. Their gravitational pull combines to cause very high and very low tides. Spring tides have the greatest tidal range.
