In [None]:
!pip install sentence_similarity



In [None]:
import pandas as pd
import numpy as np

In [None]:
#read the cleaned question/answer table from disk, then display it as the cell output
data = pd.read_csv(filepath_or_buffer="Cleaned_Questions.csv")
data

Unnamed: 0,ID,Question,Answer
0,0,How can I contact you?,If you have any questions when you apply for o...
1,1,What is the deadline to apply for your PhD and...,There is no specific deadline for the applicat...
2,2,What materials are required if I want to apply...,Please have the following documents ready befo...
3,3,How can my referees send their recommendation ...,When you submit your applications on line (see...
4,4,What is the GPA requirement?,Please find the GPA requirements from https://...
5,5,Is GRE required?,No.
6,6,(International student) What is the requiremen...,Please find the requirement on English exams f...
7,7,(English waiver) I am a US citizen now. Howeve...,"If you are an American citizen, there is no En..."
8,8,Am I eligible to apply if I do not have a Bach...,Yes. You will be assigned with deficiency cour...
9,9,"What are the starting dates of the Spring, Sum...",Please find the university calendar from https...


## Baseline Model (Euclidean distance / cosine similarity)

In [None]:

from sentence_similarity import sentence_similarity

#sentence scorer reused by every model below; built from the DistilBERT checkpoint
similarityModel = sentence_similarity(
  model_name='distilbert-base-uncased',
  embedding_type='cls_token_embedding',
)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
testQ = "When do I register for calsses?"

#best candidate seen so far as (row index, euclidean distance); a smaller distance wins
mostSimilar = (-1, float('inf'))

for i, candidate in enumerate(data['Question']):
  score = similarityModel.get_score(testQ, candidate, metric="euclidean")

  #strict '<' keeps the earliest row when two rows are equally close
  if score < mostSimilar[1]:
    mostSimilar = (i, score)

matched = mostSimilar[0]

print("Question mathched:")
print(data['Question'][matched])

print()

print("Answer:")
print(data['Answer'][matched])


Question mathched:
Where can I find my deficiency courses?

Answer:
Please talk with (or email) the department graduate advisor (the current graduate advisor is Dr. Huiping Cao).


# Second Model (weighted average of question and answer similarity)

In [None]:
testQ = "When do I register for calsses?"

#share of the final score that comes from the question text; the answer text gets the rest
qWeight = 0.8

#weighted blend of the test question's cosine score against each row's question and answer
scores = [
  similarityModel.get_score(testQ, question, metric="cosine") * qWeight
  + similarityModel.get_score(testQ, answer, metric="cosine") * (1 - qWeight)
  for question, answer in zip(data['Question'], data['Answer'])
]

#tuple with (class, similarity)
#NOTE(review): the cosine score is treated as a similarity (higher = closer), the same way
#make_prediction sorts it in descending order, so the best row is the MAXIMUM. The previous
#version selected the minimum, i.e. the least similar row.
mostSimilar = (-1, float('-inf'))
for i, score in enumerate(scores):
  if score > mostSimilar[1]:
    mostSimilar = (i, score)

print("Question mathched:")
print(data['Question'][mostSimilar[0]])

print()

print("Answer:")
print(data['Answer'][mostSimilar[0]])

Question mathched:
(International student) What is the requirement on my English (e.g., TOEFL/IELTS) scores? Can I waive my TOEFL/IELTS exams?

Answer:
Please find the requirement on English exams from https://isss.nmsu.edu/index-8/ . Note that students from some countries can have their TOEFL/IELTS exams waived. You can find the list of such countries from https://isss.nmsu.edu/index-8/ .


# Third Model (synonym comparison)



In [None]:
import nltk
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize  
from collections import OrderedDict
from nltk.stem import PorterStemmer
#fetch the NLTK resources used below; the recorded output shows this does nothing new when they are already present
#'wordnet' backs the wordnet.synsets lookups in get_syns
nltk.download('wordnet')
#'punkt' is presumably the tokenizer data word_tokenize relies on - confirm against the NLTK docs
nltk.download('punkt')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
#list of characters and words to be removed after word tokenization
#entries must be lowercase: get_syn_bucket and process_question lowercase every token before
#checking it against this list, so a capitalised entry (the former 'I') could never match
not_considered = ['.', ',', 'i', 'this', 'that', 'a', 'her', 'she', 'of', 'me', 'at', 'do', 'to', 'am', 'and', '?', 'the', "'", 's', "’", '“']


#build the processed synonym list for a single word
def get_syns(word):
  """Return the stemmed, lowercased, de-duplicated WordNet lemma names for `word`.

  Every lemma name of every synset returned by wordnet.synsets(word) is stemmed
  and lowercased. Duplicates are dropped keeping first-seen order, and entries
  that still contain an underscore (multi-word lemmas) are discarded.
  """
  stemmer = PorterStemmer()

  #flatten synsets -> lemmas -> names, in lookup order
  lemma_names = (
    lemma.name()
    for synset in wordnet.synsets(word)
    for lemma in synset.lemmas()
  )

  #stem first, then lowercase, exactly one pass per name
  normalized = (stemmer.stem(name).lower() for name in lemma_names)

  #OrderedDict keys give order-preserving de-duplication
  unique_terms = OrderedDict.fromkeys(normalized)

  return [term for term in unique_terms if "_" not in term]




#given a sentence, collect the stems of its words plus the synonyms of every word
def get_syn_bucket(sentence):
  """Return the de-duplicated "synonym bucket" for `sentence`.

  The sentence is tokenized and lowercased; tokens listed in not_considered or
  containing an apostrophe are dropped. The bucket holds the stem of each
  remaining token followed by get_syns(token) for each of them, with duplicates
  removed in first-seen order.
  """
  stemmer = PorterStemmer()

  #tokenize and lowercase in one pass
  lowered = (token.lower() for token in word_tokenize(sentence))

  #drop filler words/punctuation and anything carrying an apostrophe
  kept = [token for token in lowered if token not in not_considered and "'" not in token]

  #the sentence's own (stemmed) words come first ...
  stems = [stemmer.stem(token) for token in kept]

  #... followed by the synonyms of each unstemmed word
  synonyms = [syn for token in kept for syn in get_syns(token)]

  #order-preserving de-duplication
  return list(OrderedDict.fromkeys(stems + synonyms))

In [None]:
#make row labels 0..n-1 so the positional lookups used elsewhere line up with the labels
data.reset_index(drop=True, inplace=True)

#new column: one synonym bucket (a list) per row, derived from that row's question text
data['syn_bucket'] = data['Question'].map(get_syn_bucket)
data

ID             int64
Question      object
Answer        object
syn_bucket    object
dtype: object


Unnamed: 0,ID,Question,Answer,syn_bucket
0,0,How can I contact you?,If you have any questions when you apply for o...,"[how, can, contact, you, tin, buttock, nate, a..."
1,1,What is the deadline to apply for your PhD and...,There is no specific deadline for the applicat...,"[what, is, deadlin, appli, for, your, phd, mas..."
2,2,What materials are required if I want to apply...,Please have the following documents ready befo...,"[what, materi, are, requir, if, want, appli, f..."
3,3,How can my referees send their recommendation ...,When you submit your applications on line (see...,"[how, can, my, refere, send, their, recommend,..."
4,4,What is the GPA requirement?,Please find the GPA requirements from https://...,"[what, is, gpa, requir, be, exist, equal, cons..."
5,5,Is GRE required?,No.,"[is, gre, requir, be, exist, equal, constitut,..."
6,6,(International student) What is the requiremen...,Please find the requirement on English exams f...,"[(, intern, student, ), what, is, requir, on, ..."
7,7,(English waiver) I am a US citizen now. Howeve...,"If you are an American citizen, there is no En...","[(, english, waiver, ), us, citizen, now, howe..."
8,8,Am I eligible to apply if I do not have a Bach...,Yes. You will be assigned with deficiency cour...,"[am, elig, appli, if, not, have, bachelor, deg..."
9,9,"What are the starting dates of the Spring, Sum...",Please find the university calendar from https...,"[what, are, start, date, spring, summer, fall,..."


In [None]:
#turn a raw question into the stemmed token list that is compared against the synonym buckets
def process_question(q):
  """Tokenize `q`, lowercase the tokens, drop filtered tokens and stem the rest.

  A token is dropped when its lowercase form is listed in not_considered or
  contains an apostrophe. Returns the stems in their original order.
  """
  stemmer = PorterStemmer()
  stems = []

  for token in word_tokenize(q):
    lowered = token.lower()

    #skip filler words/punctuation and anything carrying an apostrophe
    if lowered in not_considered or "'" in lowered:
      continue

    stems.append(stemmer.stem(lowered))

  return stems


#takes in a question and returns the top matching questions from the data set
def make_prediction(q, include_cosine = True, print_freqs = False, top_k = 3):
  """Rank the dataset questions against `q` and return the best `top_k`.

  Each row is scored by how many tokens of process_question(q) occur in that
  row's 'syn_bucket'. Rows are taken tier by tier, highest overlap first, and
  lower tiers are only consulted when the higher ones hold fewer than `top_k`
  rows. When at least `top_k` rows tie on the best overlap, the result is the
  same as the earlier top-tier-only behaviour.

  Parameters:
    q: the user's question (string).
    include_cosine: if True, order rows inside a tier by the cosine score from
      similarityModel (highest first) and return (score, row_index) tuples.
      If False, keep dataset order inside a tier and return bare row indices.
    print_freqs: if True, print the per-row overlap counts.
    top_k: maximum number of results to return (default 3).

  Returns:
    A list of at most `top_k` entries: (score, row_index) tuples when
    include_cosine is True, otherwise row indices.
  """
  qList = process_question(q)

  #overlap[i] = number of question tokens found in row i's synonym bucket
  overlap = [
    sum(1 for wd in qList if wd in data.at[i, 'syn_bucket'])
    for i in range(len(data))
  ]

  if print_freqs:
    print(overlap)

  ranked = []

  #walk the overlap tiers from best to worst until enough candidates are collected
  for count in sorted(set(overlap), reverse=True):
    if len(ranked) >= top_k:
      break

    tier = [i for i, c in enumerate(overlap) if c == count]

    if include_cosine:
      #cosine score is the tie breaker inside a tier; it is only computed for tiers that are needed
      tier = sorted(
        ((similarityModel.get_score(q, data['Question'][i], metric="cosine"), i) for i in tier),
        reverse=True,
      )

    ranked.extend(tier)

  return ranked[:top_k]

  
      





In [None]:
#demo question: rank the dataset against it and print the best-matching dataset questions

question = "When does the semester start?"


#pass the variable defined above; the previous `que` was never defined in this notebook
preds = make_prediction(question, include_cosine = True, print_freqs=True)

#with include_cosine=True each entry is a (score, row index) pair
for pair in preds:
  print(data['Question'][pair[1]])
  print('\n')

[0, 0, 0, 0, 0, 0, 0, 1, 0, 2, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 2, 0, 1, 1, 1, 0, 0, 2, 2, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 2, 0, 1, 1, 1]
I got a D grade in a graduate course. How does it affect my GPA? Will it be counted as graduate credit?


What are the starting dates of the Spring, Summer, and Fall semesters?


When should I start taking PhD qualifying exam? and when will I get notifications to register for the PhD qualifying exams?




# Practical Demo

In [None]:
#get user input
question = input("Enter quesiton about NMSU Graduate School (or 'quit' to exit):\n")

#input driven loop: keep answering until the user types 'quit'
while (question != 'quit'):

  #make predictions on input; each entry is a (score, row index) pair
  preds = make_prediction(question, include_cosine = True, print_freqs=False)

  print("Which of the following most closly relates to your question?.")

  #print the top associated questions, numbered from 1
  for cnt, pair in enumerate(preds, start=1):
    print(str(cnt) + ") " + data['Question'][pair[1]])
    print()

  #get user input of which question to give an answer to
  print("Respond with the question number ('1', '2', '3') that most closely relates to your question.")
  selected = input("If none of the prompted questions relate to yours, respond 'skip' and try rewording it:\n").strip()

  print()

  #'skip' just falls through to the next prompt
  if (selected != 'skip'):
    #only accept a number that refers to one of the options actually listed;
    #this also rejects '0', which would otherwise wrap around to the last option
    if (selected.isdecimal() and 1 <= int(selected) <= len(preds)):
      print("This is the candid answer for the question you chose:")
      print(data['Answer'][preds[int(selected) - 1][1]])
    else:
      print("Unexpected Input")

  question = input("Enter question about NMSU Graduate School (or 'quit' to exit):\n")

Enter quesiton about NMSU Graduate School (or 'quit' to exit):
this s
Which of the following most closly relates to your question?.
1) What are the GA opportunities?

2) Is GRE required?

3) What is the GPA requirement?

Respond with the question number ('1', '2', '3') that most closely relates to your question.
If none of the prompted questions relate to yours, respond 'skip' and try rewording it:
blaabala

Unexpected Input


KeyboardInterrupt: ignored