# Importing dependencies

In [1]:
!pip install ipython-autotime
%load_ext autotime
%matplotlib inline

Collecting ipython-autotime
  Downloading ipython-autotime-0.1.tar.bz2 (1.2 kB)
Building wheels for collected packages: ipython-autotime
  Building wheel for ipython-autotime (setup.py) ... [?25ldone
[?25h  Created wheel for ipython-autotime: filename=ipython_autotime-0.1-py3-none-any.whl size=1830 sha256=31a42bb9856f3c866dbecdb39b2e27da1ec2594f3d50c43f7fee9db589fa0ee5
  Stored in directory: /root/.cache/pip/wheels/65/56/4a/4b967e4b9b62bd9d8d7ca789bba648c702d705487f28845bb2
Successfully built ipython-autotime
Installing collected packages: ipython-autotime
Successfully installed ipython-autotime-0.1
You should consider upgrading via the '/opt/conda/bin/python3.7 -m pip install --upgrade pip' command.[0m


In [2]:
import pandas as pd
import nltk 
import numpy as np
import re, unicodedata
from nltk.stem import wordnet 
from sklearn.feature_extraction.text import TfidfVectorizer 
from nltk import pos_tag 
from sklearn.metrics import pairwise_distances 
from nltk import word_tokenize 
from nltk.corpus import stopwords 
from tqdm.notebook import tqdm
import random

time: 1.4 s


# Reading Data

In [3]:
# Directory of the Kaggle dataset that contains the FAQ spreadsheet
data_path = '/kaggle/input/cloud-counselage-qa-data/'

# Read the spreadsheet, then forward-fill missing cells along the row axis
# so every row carries a value taken from the nearest filled row above it.
data = pd.read_excel(data_path + 'faq_data.xlsx').ffill(axis=0)
data.head()

Unnamed: 0,question,response_text
0,I am not able to access my Bitrix24 account?,Go to https://cloudcounselage24.bitrix24.com/ ...
1,What is the job profile? Will we be able to wo...,Your job profile is 'Technology - Intern'; if ...
2,How many workgroups will an intern be a part of?,Every intern should be a part of 2 workgroups....
3,How many workgroups should I be in?,Every intern should be a part of 2 workgroups....
4,Not able to access the LP1 page with my token,Please watch the videos shared with the invite...


time: 408 ms


# Helper Functions

In [4]:
# Compiled regex matching any NLTK English stopword as a whole word,
# together with the whitespace that immediately follows it.
pattern = re.compile(
    r'\b({})\b\s*'.format('|'.join(stopwords.words('english')))
)

def unicode_to_ascii(s):
    """Remove combining marks (accents) from ``s``.

    The string is decomposed with NFD normalization and every character
    whose Unicode category is ``Mn`` (nonspacing mark) is dropped. Any
    other character, ASCII or not, is kept unchanged.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

def clean_text(text):
    """Lower-case ``text`` and reduce it to space-separated alphabetic words.

    Steps, in order: convert to ``str``, lower-case and trim; strip accents
    via ``unicode_to_ascii``; surround ``? . ! , ¿`` with spaces; collapse
    runs of spaces / double quotes into one space; delete stopwords with the
    module-level ``pattern``; replace every run of non-letters with a single
    space; trim the result.
    """
    lowered = str(text).lower().strip()
    text = unicode_to_ascii(lowered)

    # (regex, replacement) pairs applied one after another
    spacing_rules = (
        (r"([?.!,¿])", r" \1 "),  # put a space on both sides of punctuation
        (r'[" "]+', " "),          # squeeze repeated spaces / double quotes
    )
    for regex, replacement in spacing_rules:
        text = re.sub(regex, replacement, text)

    # drop the stopwords
    text = pattern.sub('', text)

    # anything that is not an ASCII letter (digits, punctuation) becomes a space
    text = re.sub(r"[^a-zA-Z]+", " ", text)

    return text.strip()


# Element-wise wrapper so clean_text can be applied to an array of strings
clean_text_vect = np.vectorize(clean_text)

time: 13.8 ms


In [5]:
def chunk_clean(array, chunk_size=256):
    """Apply ``clean_text_vect`` to ``array`` in slices of ``chunk_size``.

    A tqdm progress bar advances once per slice. The cleaned items of all
    slices are gathered, in order, into a single NumPy array.
    """
    starts = range(0, len(array), chunk_size)
    pieces = [clean_text_vect(array[lo:lo + chunk_size]) for lo in tqdm(starts)]

    # flatten the per-chunk results into one list before building the array
    return np.array([item for piece in pieces for item in piece])

time: 1.11 ms


In [6]:
# Single WordNet lemmatizer instance shared by every call below
lema = wordnet.WordNetLemmatizer()


def text_normalization(text):
    """Lemmatize each token of ``text`` and return the lemmas joined by spaces.

    The text is tokenized with ``nltk.word_tokenize`` and tagged with
    ``pos_tag``. The first letter of each tag selects the part of speech
    passed to the lemmatizer: ``V`` -> verb, ``J`` -> adjective,
    ``R`` -> adverb, anything else -> noun.
    """
    # first letter of the POS tag -> WordNet part-of-speech code
    wordnet_pos = {'V': 'v', 'J': 'a', 'R': 'r'}

    tagged = pos_tag(nltk.word_tokenize(text), tagset=None)
    return " ".join(
        lema.lemmatize(word, wordnet_pos.get(tag[:1], 'n'))
        for word, tag in tagged
    )


# Element-wise wrapper so text_normalization can be applied to an array of strings
text_norm_vect = np.vectorize(text_normalization)

time: 3.54 ms


In [7]:
def chunk_text_normalize(array, chunk_size=256):
    """Apply ``text_norm_vect`` to ``array`` in slices of ``chunk_size``.

    A tqdm progress bar advances once per slice. The normalized items of
    all slices are gathered, in order, into a single NumPy array.
    """
    normalized = []
    offsets = range(0, len(array), chunk_size)
    for offset in tqdm(offsets):
        batch = array[offset:offset + chunk_size]
        normalized += list(text_norm_vect(batch))

    return np.array(normalized)

time: 8.23 ms


# Data Preparation

In [8]:
# Stage 1: strip case, punctuation and stopwords from the raw questions
data['cleaned_data'] = chunk_clean(data['question'].values)

# Stage 2: lemmatize the cleaned questions
data['norm_data'] = chunk_text_normalize(data['cleaned_data'].values)

data.head()

HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))




Unnamed: 0,question,response_text,cleaned_data,norm_data
0,I am not able to access my Bitrix24 account?,Go to https://cloudcounselage24.bitrix24.com/ ...,able access bitrix account,able access bitrix account
1,What is the job profile? Will we be able to wo...,Your job profile is 'Technology - Intern'; if ...,job profile able work tech chosen internship,job profile able work tech choose internship
2,How many workgroups will an intern be a part of?,Every intern should be a part of 2 workgroups....,many workgroups intern part,many workgroups intern part
3,How many workgroups should I be in?,Every intern should be a part of 2 workgroups....,many workgroups,many workgroups
4,Not able to access the LP1 page with my token,Please watch the videos shared with the invite...,able access lp page token,able access lp page token


time: 2.73 s


## Vectorizing the data

In [9]:
# Learn the TF-IDF vocabulary and weights from the normalized questions,
# then densify the resulting matrix into a regular NumPy array.
word_vectorizer = TfidfVectorizer()
faq_data = word_vectorizer.fit_transform(data['norm_data'].values)
faq_data = faq_data.toarray()

time: 35 ms


In [10]:
# TfidfVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; use get_feature_names_out() when it exists and fall back
# to the old accessor on older installations.
if hasattr(word_vectorizer, 'get_feature_names_out'):
    feature_names = word_vectorizer.get_feature_names_out()
else:
    feature_names = word_vectorizer.get_feature_names()

# One row per FAQ question, one column per vocabulary term
faq_data_features = pd.DataFrame(faq_data, columns=list(feature_names))
vocab_text = list(faq_data_features.columns)
faq_data_features.head()

Unnamed: 0,able,abort,absolutely,abysmal,accept,access,account,acknowledgment,actually,add,...,yap,ye,yea,yeah,year,yeh,yep,yes,yet,yup
0,0.455634,0.0,0.0,0.0,0.0,0.455634,0.555086,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.356341,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.462346,0.0,0.0,0.0,0.0,0.462346,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


time: 25 ms


In [11]:
# Smoke test: push one hand-written query through the same pipeline as the
# FAQ questions and show the response of the most similar question.
dummy = 'did not get appointment letter'
dummy = text_normalization(clean_text(dummy))
dummy_vect = word_vectorizer.transform([dummy]).toarray()

cosine_similarity = 1 - pairwise_distances(faq_data_features, dummy_vect, metric='cosine')

# argmax() yields a row *position*, so index with .iloc; .loc would look the
# number up as an index *label* and only works while `data` keeps its
# default 0..n-1 index.
data['response_text'].iloc[cosine_similarity.argmax()]

"If you had not attended the live induction and have registered in the pre-recorded session after 4 PM, 31st March. Then you'll get your joining letter by 30th April 2020. If you have otherwise, please a mail to hrsupport@cloudcounselage.in or ping 'Cloud Counselage HR' in Bitrix24."

time: 65.7 ms


In [26]:
def predict_answer(text, threshold=0.20):
    """Return the stored response whose question is most similar to ``text``.

    ``text`` goes through ``clean_text`` and ``text_normalization``, is
    vectorized with the fitted ``word_vectorizer`` and compared to every FAQ
    question by cosine similarity.

    Parameters
    ----------
    text : str
        The user's message.
    threshold : float, default 0.20
        Minimum cosine similarity the best match must reach to be accepted.

    Returns
    -------
    The ``response_text`` of the best-matching question, or a randomly
    chosen "please rephrase" sentence when no question reaches ``threshold``.
    """
    query = text_normalization(clean_text(text))
    query_vect = word_vectorizer.transform([query]).toarray()

    # pairwise_distances returns one column per query; flatten it so that
    # argmax / indexing work on plain scalars instead of 1-element arrays.
    distances = pairwise_distances(faq_data_features, query_vect, metric='cosine')
    similarities = 1 - distances.ravel()

    best = int(similarities.argmax())
    if similarities[best] >= threshold:
        # `best` is a row position, hence .iloc rather than label-based .loc
        return data['response_text'].iloc[best]

    not_known_statements = ['Once again','Try once more','Say once again','Try another way','I did not understand','I did not get that']
    return random.choice(not_known_statements)

time: 1.65 ms


In [27]:
def chat():
    """Run an interactive question/answer loop on stdin/stdout.

    Each line typed after the ``USER:`` prompt is answered with
    ``predict_answer``. The loop ends when the user enters ``quit``
    (surrounding whitespace and letter case are ignored), when stdin is
    closed, or when the kernel is interrupted; in all of these cases the
    goodbye message is printed before returning.
    """
    print('NATU: Welcome User, I am a chatbot assistant\n')
    while True:
        try:
            text = input('USER: ')
        except (EOFError, KeyboardInterrupt):
            # closed stdin / interrupted kernel: leave as if 'quit' was typed
            print()
            text = 'quit'

        if text.strip().lower() == 'quit':
            print('NATU: ','Bye, See you again soon','\n')
            break

        response = predict_answer(text)
        print('NATU: ',response,'\n')

time: 7.7 ms


# Testing the chatbot

In [25]:
# Start the interactive loop; it keeps prompting until the user enters 'quit'.
chat()

NATU: Welcome User, I am a chatbot assistant

USER: hi
NATU:  Hey! 

USER: heya
NATU:  Howdy. 

USER: jkvufgiqudgiuqw
NATU:  Say once again 

USER: goudgouqfo
NATU:  Try another way 

USER: iyfiufiougigougo
NATU:  Say once again 

USER: quit
NATU:  Bye, See you again soon 

time: 2min 19s


# THE END