In [1]:
import numpy as np
import nltk
import string
import random

In [2]:
# Load the whole knowledge-base document into memory. The context manager
# closes the file handle even if reading fails; errors='ignore' silently drops
# any bytes that cannot be decoded.
with open("/content/UOP.txt", 'r', errors='ignore') as f:
    text_doc = f.read()

In [3]:
# Normalise case once up front; the chat loop lower-cases user input as well,
# so corpus sentences and queries are compared in the same case.
text_doc = text_doc.lower() #convert text to lowercase

# Fetch the NLTK resources used below (stored under the local nltk_data directory).
nltk.download('punkt')   #punkt tokenizer models used by sent_tokenize / word_tokenize
nltk.download('wordnet') #wordnet dictionary used by WordNetLemmatizer
nltk.download('omw-1.4') #Open Multilingual Wordnet data; presumably needed by the wordnet reader in this NLTK version - TODO confirm

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [4]:
# Split the corpus two ways: the sentences are the candidate answers the bot
# can return; the words are kept as a flat token list.
sentence_tokens = nltk.sent_tokenize(text_doc) #tokenize into sentences
word_tokens = nltk.word_tokenize(text_doc) #tokenize into words

In [None]:
# Spot-check one tokenized sentence.
sentence_tokens[4]

'the university of peradeniya hosts nine faculties, four postgraduate institutes (including the newly added postgraduate institute of medical sciences), 20 centres and units, 73 departments, and teaches about 12,000 students in the fields of medicine, agriculture, arts, science, engineering, dental sciences, veterinary medicine and animal science, management, and allied health science.'

In [None]:
# Spot-check a slice of the word tokens (punctuation marks appear as separate tokens).
word_tokens[100:112]

['crest',
 'motto',
 'sarvasva',
 'locanam',
 'sasthram',
 '(',
 'sanskrit',
 ')',
 'motto',
 'in',
 'english',
 'knowledge']

# *Text Processing*

In [5]:
# Single WordNet lemmatizer instance shared by LemTokens.
lemmer = nltk.stem.WordNetLemmatizer()

def LemTokens(tokens):
    """Lemmatize every token with the shared ``lemmer``.

    tokens: iterable of word strings.
    Returns a new list holding one lemmatized string per input token, in the
    same order.
    """
    lemmas = []
    for word in tokens:
        lemmas.append(lemmer.lemmatize(word))
    return lemmas

In [6]:
# Translation table for str.translate: every ASCII punctuation code point maps
# to None, i.e. the character is deleted.
remove_punct_dict = str.maketrans('', '', string.punctuation)

In [7]:
def LemNormalize(text):
    """Normalize raw text into a list of lemmatized word tokens.

    The text is lower-cased, stripped of punctuation via ``remove_punct_dict``,
    split into words with ``nltk.word_tokenize`` and finally passed through
    ``LemTokens``.
    """
    cleaned = text.lower().translate(remove_punct_dict)
    words = nltk.word_tokenize(cleaned)
    return LemTokens(words)

## Greeting commands for ChatBot

In [8]:
# Words that mark a user message as a greeting (greet() compares them in lower case).
greet_user = ("hello", "hi","howdy","hey", "greetings", "sup", "whassup")

# Replies the bot chooses from at random when it detects a greeting.
greet_bot = ["hi", "hey", "Welcome!", "hi there", "hello", "Good day!","Hello there!"]

In [9]:
def greet(sentence):
  """Return a random greeting reply if ``sentence`` contains a greeting word.

  Each whitespace-separated word is compared, case-insensitively and with
  surrounding punctuation removed, against ``greet_user``.

  Args:
    sentence: the user's message.

  Returns:
    A random entry of ``greet_bot`` when a greeting word is found, otherwise
    ``None``.
  """
  for word in sentence.split():
    # Strip leading/trailing punctuation so "Hello!" or "hi," still count as greetings.
    if word.strip(string.punctuation).lower() in greet_user:
      return random.choice(greet_bot)
  return None

# Intelligence of ChatBot

### 1. `TfidfVectorizer` in `sklearn`

The `TfidfVectorizer` is a feature extraction method provided by the `scikit-learn` library, used to convert a collection of raw text documents into a matrix of TF-IDF (Term Frequency-Inverse Document Frequency) features. This transformation helps to quantify the importance of a word within a document relative to its occurrence across a collection of documents. Essentially, it scales down the impact of more frequently occurring words (which might be common across documents, like 'the' or 'and') and highlights more significant words that help to distinguish a document's content. This is particularly useful for applications like document classification, clustering, and information retrieval, where capturing the unique essence of each document in a large corpus is crucial.

### 2. `cosine_similarity` in `sklearn`

Cosine similarity measures the cosine of the angle between two vectors, so it compares their direction rather than their magnitude. In the context of text processing, it is used to determine how similar two documents are, based on their vector representations (like those obtained from `TfidfVectorizer`). The cosine similarity value ranges from -1 to 1, where 1 indicates that the documents are identical in terms of their direction in the vector space (highly similar), 0 means they are orthogonal (completely different), and -1 indicates they are diametrically opposed. Because TF-IDF weights are never negative, the similarity between two TF-IDF vectors always falls between 0 and 1. This metric is widely used in applications like document clustering, text mining, and recommendation systems, where understanding the degree of similarity between texts is important.

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [11]:
def generate_response(user_response):
  """Return the corpus sentence most similar to ``user_response``.

  The document sentences in the global ``sentence_tokens`` plus the query are
  turned into TF-IDF vectors (tokenized by ``LemNormalize``, English stop words
  removed) and the query vector is compared with every document sentence by
  cosine similarity.

  Args:
    user_response: the user's message.

  Returns:
    The best-matching sentence, or an apology string when there is no sentence
    to search or none has a non-zero similarity to the query.
  """
  no_match = "I am sorry! I don't understand you"

  # The chat loop appends the query to sentence_tokens before calling this
  # function. Detect that case so the query is never one of its own candidates,
  # while still working when the function is called directly.
  if sentence_tokens and sentence_tokens[-1] == user_response:
    candidates = sentence_tokens[:-1]
  else:
    candidates = list(sentence_tokens)
  if not candidates:
    return no_match

  # token_pattern=None: a custom tokenizer is supplied, so the default pattern
  # is unused; passing None avoids a warning being printed mid-conversation.
  TfidVec = TfidfVectorizer(tokenizer=LemNormalize, token_pattern=None, stop_words='english')
  tfidf = TfidVec.fit_transform(candidates + [user_response])  # last row is the query

  # Similarity of the query (last row) to each candidate sentence only.
  scores = cosine_similarity(tfidf[-1], tfidf[:-1]).flatten()
  best = int(scores.argmax())
  if scores[best] == 0:  # exactly 0: no weighted term in common with any sentence
    return no_match
  return candidates[best]

## ChatBot Pipeline

In [None]:
# Interactive chat loop: read a line, then either end the chat, acknowledge
# thanks, return a greeting, or answer with the most similar corpus sentence.
flag=True
print("BOT: Hello! I'm PERA, Official ChatBot of University of Peradeniya. I am here to provide required information about the university. If you want to end the coversation, type bye!")
while flag:
  # Ignore case and surrounding whitespace so e.g. "Bye! " still ends the chat.
  user_response = input().strip().lower()
  if user_response in ('bye', 'bye!'):
    flag=False
    print("PERA: Goodbye! Have a Nice day ")
  elif user_response in ('thanks', 'thank you'):
    print("PERA: No worries. How else I can assist you?")
  else:
    # Call greet() once: it draws a random reply, so calling it again to print
    # would draw a second, unrelated reply.
    greeting = greet(user_response)
    if greeting is not None:
      print("PERA: "+greeting)
    else:
      # NOTE(review): word_tokens / final_words are not read by any code visible
      # in this notebook; kept so the existing globals are still maintained.
      word_tokens=word_tokens+nltk.word_tokenize(user_response)
      final_words=list(set(word_tokens))
      # generate_response() expects the query to be the last entry of sentence_tokens.
      sentence_tokens.append(user_response)
      try:
        print("PERA: ", end="")
        print(generate_response(user_response))
      finally:
        # Remove exactly the entry appended above, even if response generation
        # raised. list.remove() would delete the first equal sentence, which
        # may be a genuine corpus sentence rather than the query.
        sentence_tokens.pop()


BOT: Hello! I'm PERA, Official ChatBot of University of Peradeniya. I am here to provide required information about the university. If you want to end the coversation, type bye!
