# Importing dependencies

In [1]:
!pip install ipython-autotime
%load_ext autotime
%matplotlib inline

Collecting ipython-autotime
  Downloading ipython-autotime-0.1.tar.bz2 (1.2 kB)
Building wheels for collected packages: ipython-autotime
  Building wheel for ipython-autotime (setup.py) ... [?25ldone
[?25h  Created wheel for ipython-autotime: filename=ipython_autotime-0.1-py3-none-any.whl size=1830 sha256=75dbd47341bfb840a3d27e8b6e8a93de6b6a7628fd77d2a8c7cb3cd25ff1add2
  Stored in directory: /root/.cache/pip/wheels/65/56/4a/4b967e4b9b62bd9d8d7ca789bba648c702d705487f28845bb2
Successfully built ipython-autotime
Installing collected packages: ipython-autotime
Successfully installed ipython-autotime-0.1
You should consider upgrading via the '/opt/conda/bin/python3.7 -m pip install --upgrade pip' command.[0m


In [2]:
import pandas as pd
import nltk 
import numpy as np
import re, unicodedata
from nltk.stem import wordnet 
from sklearn.feature_extraction.text import TfidfVectorizer 
from nltk import pos_tag 
from sklearn.metrics import pairwise_distances 
from nltk import word_tokenize 
from nltk.corpus import stopwords 
from tqdm.notebook import tqdm

time: 1.73 s


# Reading Data

In [22]:
# Directory that holds the FAQ spreadsheet.
data_path = '/kaggle/input/cloud-counselage-qa-data/'

# Read the sheet and forward-fill down each column, so an empty cell takes the
# value of the closest non-empty cell above it.
data = pd.read_excel(data_path + 'faq_data.xlsx').ffill(axis=0)
data.head()

Unnamed: 0,question,response_text
0,I am not able to access my Bitrix24 account?,Go to https://cloudcounselage24.bitrix24.com/ ...
1,What is the job profile? Will we be able to wo...,Your job profile is 'Technology - Intern'; if ...
2,How many workgroups will an intern be a part of?,Every intern should be a part of 2 workgroups....
3,How many workgroups should I be in?,Every intern should be a part of 2 workgroups....
4,Not able to access the LP1 page with my token,Please watch the videos shared with the invite...


time: 136 ms


# Helper Functions

In [23]:
# Compiled regex matching any NLTK English stopword as a whole word, together
# with the whitespace that follows it (so removal does not leave double gaps).
_stopword_alternation = r'|'.join(stopwords.words('english'))
pattern = re.compile(r'\b(' + _stopword_alternation + r')\b\s*')

def unicode_to_ascii(s):
    """Strip combining accent marks from ``s``.

    The string is decomposed with Unicode NFD normalization and every character
    whose category is ``Mn`` (non-spacing mark) is dropped. Characters that have
    no such decomposition are returned unchanged, so the result is not
    guaranteed to be pure ASCII.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept_chars)

def clean_text(text):
    """Reduce a sentence to lower-case alphanumeric content words.

    Steps, in order: convert to ``str``, lower-case and trim; remove combining
    accent marks with ``unicode_to_ascii``; surround ``? . ! , ¿`` with spaces;
    collapse runs of spaces / double quotes into one space; delete stopwords
    using the module-level ``pattern``; replace each run of non-alphanumeric
    characters with a single space; trim again.

    Returns the cleaned string, which may be empty.
    """
    lowered = str(text).lower().strip()
    cleaned = unicode_to_ascii(lowered)

    # Detach sentence punctuation from the neighbouring words.
    cleaned = re.sub(r"([?.!,¿])", r" \1 ", cleaned)
    # Squeeze repeated spaces (the character class also lists the double quote).
    cleaned = re.sub(r'[" "]+', " ", cleaned)

    # Drop stopwords together with the whitespace that follows them.
    cleaned = pattern.sub('', cleaned)

    # Whatever is not a letter or digit becomes a single space.
    cleaned = re.sub(r"[^a-zA-Z0-9]+", " ", cleaned)

    return cleaned.strip()


# Element-wise wrapper so clean_text can be applied to a whole NumPy array.
clean_text_vect = np.vectorize(clean_text)

time: 5.01 ms


In [24]:
def chunk_clean(array,chunk_size=256):
    """Apply ``clean_text_vect`` to ``array`` in slices of ``chunk_size`` items.

    A tqdm progress bar advances once per slice. The cleaned items from all
    slices are gathered in order and returned as a new NumPy array.
    """
    chunk_starts = range(0, len(array), chunk_size)
    cleaned_items = [
        item
        for start in tqdm(chunk_starts)
        for item in clean_text_vect(array[start:start + chunk_size])
    ]
    return np.array(cleaned_items)

time: 1.92 ms


In [25]:
# Single lemmatizer instance reused by every call to text_normalization.
lema = wordnet.WordNetLemmatizer()

# First letter of the POS tag -> part-of-speech code handed to the lemmatizer.
# Tags starting with any other letter are lemmatized as nouns ('n').
_LEMMA_POS_BY_TAG_INITIAL = {'V': 'v', 'J': 'a', 'R': 'r'}


def text_normalization(text):
    """Lemmatize every token of ``text`` according to its part-of-speech tag.

    The text is tokenized with ``nltk.word_tokenize`` and tagged with
    ``pos_tag``. Tags beginning with ``V``, ``J`` or ``R`` are lemmatized as
    verb, adjective or adverb respectively; everything else as a noun.

    Returns the lemmas joined by single spaces.
    """
    tagged_tokens = pos_tag(nltk.word_tokenize(text), tagset=None)
    lemmas = [
        lema.lemmatize(token, _LEMMA_POS_BY_TAG_INITIAL.get(tag[:1], 'n'))
        for token, tag in tagged_tokens
    ]
    return " ".join(lemmas)


# Element-wise wrapper so text_normalization can be applied to a NumPy array.
text_norm_vect = np.vectorize(text_normalization)

time: 3.15 ms


In [26]:
def chunk_text_normalize(array,chunk_size=256):
    """Apply ``text_norm_vect`` to ``array`` in slices of ``chunk_size`` items.

    A tqdm progress bar advances once per slice. The normalized items from all
    slices are gathered in order and returned as a new NumPy array.
    """
    normalized_items = []

    for start in tqdm(range(0, len(array), chunk_size)):
        stop = start + chunk_size
        normalized_items += list(text_norm_vect(array[start:stop]))

    return np.array(normalized_items)

time: 1.67 ms


# Data Preparation

In [27]:
# Step 1: stopword- and punctuation-free version of every question.
data['cleaned_data'] = chunk_clean(data['question'].values)
# Step 2: lemmatized version of the cleaned text; this column feeds the vectorizer.
data['norm_data'] = chunk_text_normalize(data['cleaned_data'].values)
data.head()

HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))




Unnamed: 0,question,response_text,cleaned_data,norm_data
0,I am not able to access my Bitrix24 account?,Go to https://cloudcounselage24.bitrix24.com/ ...,able access bitrix24 account,able access bitrix24 account
1,What is the job profile? Will we be able to wo...,Your job profile is 'Technology - Intern'; if ...,job profile able work tech chosen internship,job profile able work tech choose internship
2,How many workgroups will an intern be a part of?,Every intern should be a part of 2 workgroups....,many workgroups intern part,many workgroups intern part
3,How many workgroups should I be in?,Every intern should be a part of 2 workgroups....,many workgroups,many workgroups
4,Not able to access the LP1 page with my token,Please watch the videos shared with the invite...,able access lp1 page token,able access lp1 page token


time: 933 ms


# Vectorizing the data

In [28]:
# Learn the TF-IDF vocabulary and weights from the normalized questions.
word_vectorizer = TfidfVectorizer()
_faq_tfidf = word_vectorizer.fit_transform(data['norm_data'].values)
# Dense copy of the document-term matrix (one row per question).
faq_data = _faq_tfidf.toarray()

time: 19.8 ms


In [29]:
# Column labels for the TF-IDF matrix. get_feature_names() was deprecated in
# scikit-learn 1.0 and removed in 1.2, so prefer get_feature_names_out() and
# fall back to the old name only on releases that predate it.
if hasattr(word_vectorizer, 'get_feature_names_out'):
    _feature_names = list(word_vectorizer.get_feature_names_out())
else:
    _feature_names = word_vectorizer.get_feature_names()

# One row per FAQ question, one column per vocabulary term.
faq_data_features = pd.DataFrame(faq_data, columns=_feature_names)
vocab_text = list(faq_data_features.columns)
faq_data_features.head()

Unnamed: 0,14,21,22nd,2nd,able,abort,absolutely,abysmal,accept,access,...,yap,ye,yea,yeah,year,yeh,yep,yes,yet,yup
0,0.0,0.0,0.0,0.0,0.455634,0.0,0.0,0.0,0.0,0.455634,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.356341,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.451964,0.0,0.0,0.0,0.0,0.451964,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


time: 35.5 ms


In [30]:
# Smoke test: push one hand-written query through the same preprocessing and
# vectorizer that were used for the FAQ questions.
dummy = 'did not get appointment letter'
dummy = text_normalization(clean_text(dummy))
dummy_vect = word_vectorizer.transform([dummy]).toarray()

# Cosine similarity of every FAQ question (rows) against the single query.
cosine_similarity = 1 - pairwise_distances(faq_data_features, dummy_vect, metric='cosine')
# argmax() yields a row position, so pick the answer positionally with .iloc
# instead of by index label with .loc.
data['response_text'].iloc[cosine_similarity.argmax()]

"If you had not attended the live induction and have registered in the pre-recorded session after 4 PM, 31st March. Then you'll get your joining letter by 30th April 2020. If you have otherwise, please a mail to hrsupport@cloudcounselage.in or ping 'Cloud Counselage HR' in Bitrix24."

time: 21.4 ms


In [31]:
def predict_answer(text):
    """Return the stored response for the FAQ question most similar to ``text``.

    The query is passed through ``clean_text`` and ``text_normalization``,
    vectorized with the fitted ``word_vectorizer`` and compared with every row
    of ``faq_data_features`` by cosine similarity.

    Args:
        text: Raw user question.

    Returns:
        The ``response_text`` value of ``data`` at the row with the highest
        similarity. If several rows share the maximum, the first one is used.
    """
    normalized_query = text_normalization(clean_text(text))
    query_vector = word_vectorizer.transform([normalized_query]).toarray()

    similarities = 1 - pairwise_distances(faq_data_features, query_vector, metric='cosine')
    best_row = int(similarities.argmax())

    # best_row is a row position, not an index label, so select with .iloc;
    # .loc only gives the same row while ``data`` keeps its default 0..n-1 index.
    return data['response_text'].iloc[best_row]

time: 1.47 ms


In [32]:
def chat():
    """Run an interactive question/answer loop on standard input and output.

    Every line entered at the ``USER: `` prompt is sent to ``predict_answer``
    and the reply is printed. The loop stops when the user enters ``quit``
    (surrounding whitespace and letter case are ignored) or when input ends or
    is interrupted (EOF / Ctrl-C). Blank lines are skipped and the prompt is
    shown again.
    """
    print('NATUKAKA: Welcome User, I am a chatbot assistant\n')
    while True:
        try:
            text = input('USER: ')
        except (EOFError, KeyboardInterrupt):
            # No further input can arrive; finish as if the user had typed "quit".
            print()
            text = 'quit'

        command = text.strip()

        if command.lower() == 'quit':
            print('NATUKAKA: ','Bye, See you again soon','\n')
            break

        if not command:
            # Nothing to look up; ask again instead of querying with empty text.
            continue

        response = predict_answer(text)
        print('NATUKAKA: ',response,'\n')

time: 1.42 ms


# Testing the chatbot

In [34]:
chat()

NATUKAKA: Welcome User, I am a chatbot assistant

USER: hello
NATUKAKA:  Howdy. 

USER: Login issues with training
NATUKAKA:  Yes, you need to register for every module of training. Some of you are facing login issues, we have kept the training visible without login. Even then, to post a comment and give a quiz you'll have to login. In case you face difficulty to do so, please try to perform your quiz or post a comment by using a different browser or incognito mode. 

USER: I'm not able to log in for the LP1 task
NATUKAKA:  Ensure that you're using the right token. 

USER: How do I start my internship?
NATUKAKA:  Hope you and your family are in the best of health given the current scenario and you are making the most of your time through the 'Internship Program' by Cloud Counselage. 
According to the schedule of this internship program, some of you must have completed steps 1 and 2 of your internship and some of you must be in the process of completing step 2. 

Please note that as per