In [16]:
from gensim.models import Word2Vec
from sklearn import linear_model
import gensim
import operator
import string
import random
import numpy as np
import pickle
import pandas as pd
import re
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelBinarizer,OneHotEncoder
from nltk.stem import WordNetLemmatizer 
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import pairwise_distances,euclidean_distances,jaccard_similarity_score

In [23]:
# Load the preprocessed dialog pairs; get_response reads its answers from the `response` column.
# NOTE(review): the `Unnamed: 0` column in the preview below looks like a saved index —
# consider writing the CSV with index=False (or reading with index_col=0); confirm nothing relies on it.
df = pd.read_csv('dialog_preprocessed.csv')
df.head()

Unnamed: 0,context,response
0,tell me about your personality,Just think of me as the ace up your sleeve.
1,i want to know you good,I can help you work smarter instead of harder
2,define yourself,Just think of me as the ace up your sleeve.
3,describe yourself,Just think of me as the ace up your sleeve.
4,tell me about yourself,Just think of me as the ace up your sleeve.


In [19]:
# Load the precomputed vectors the question vector is compared against (a NumPy .npy file, not a pickle).
# get_response pairs row i of this array with row i of df['response'] by position.
# NOTE(review): presumably one averaged sentence vector per row of df — confirm the rows align.
model_vector = np.load('models/word2vec.npy')
model_vector

array([[ 0.183543  ,  0.51169   ,  0.141594  , ..., -0.08579808,
        -0.04257199,  0.725074  ],
       [ 0.14366301,  0.17370317,  0.1059045 , ..., -0.08443283,
        -0.06526467,  0.68887667],
       [-0.05459   , -0.0749    ,  0.08115001, ..., -0.12484501,
         0.3078965 ,  0.661035  ],
       ...,
       [ 0.6243306 , -0.24323   ,  0.13598   , ..., -0.42309999,
        -0.07090433,  0.10501   ],
       [ 0.34527499,  0.10816088,  0.00393126, ..., -0.308125  ,
        -0.02529137,  0.20571   ],
       [ 0.33720036,  0.1136714 ,  0.1968034 , ..., -0.16597879,
        -0.2729478 ,  0.00450979]])

In [20]:
# Load the GloVe word vectors used to convert the user's question into a vector.
# NOTE(review): load_word2vec_format expects a word2vec-style header line; presumably this
# file was converted from the raw GloVe format beforehand — confirm.
word_vector_path = "glove.6B.50d.txt"
vector_dim = 50  # dimensionality of the vectors in the file above; must match the length used by vectorize_sentence
word_vector = gensim.models.KeyedVectors.load_word2vec_format(word_vector_path, binary=False)

In [26]:
# Class for text normalization
class Normalize():
    '''
    Text normalization helpers (lowercasing, special-character removal, lemmatization and
    stop-word removal) for single strings and for text columns of a DataFrame.

    The string-level methods (`lowercase`, `replace_characters`, `lemmatize_string`,
    `remove_stopwords_string`, `preprocess_string`) work on one sentence. The column-level
    methods apply the matching string-level method to every value of the given columns.
    '''

    # First letter of the tag returned by nltk.pos_tag -> POS character passed to the lemmatizer.
    # Any other tag falls back to 'v' (verb).
    _WORDNET_POS = {'N': 'n', 'V': 'v', 'J': 'a', 'R': 'r'}

    # NLTK English stop words, loaded on first use and then shared by all instances
    _STOP_WORDS = None

    def __init__(self, sentences):
        '''
        Takes a DataFrame as the input. All columns should contain text, or at least the ones
        you wish to preprocess. The DataFrame is copied, so the caller's object is never modified.
        Any other input (e.g. `[]`) gives an empty DataFrame, which is enough when only the
        string-level methods are needed.
        '''
        if isinstance(sentences, pd.DataFrame):
            self.dataframe = sentences.copy()
        else:
            self.dataframe = pd.DataFrame([])

    @classmethod
    def _get_stop_words(cls):
        '''
        Returns the NLTK English stop words as a frozenset, reading the corpus only once.
        '''
        if cls._STOP_WORDS is None:
            cls._STOP_WORDS = frozenset(stopwords.words('english'))
        return cls._STOP_WORDS

    def _apply_to_columns(self, function, columns, inplace):
        '''
        Applies `function` to every value of each column in `columns` on a copy of the
        stored dataframe and returns that copy.
        `inplace`: If True, a copy of the result also replaces `self.dataframe`.
        '''
        dataframe = self.dataframe.copy()
        for column in columns:
            dataframe[column] = dataframe[column].apply(function)
        if inplace:
            self.dataframe = dataframe.copy()
        return dataframe

    # Raw string: same value as before, but without the invalid `\[` and `\/` escape sequences
    def replace_characters(self, string, characters = r'''[<>\[\/]-_'.?()]'''):
        '''
        Removes every character listed in `characters` from `string` and returns the result.
        `characters` is treated as a plain collection of single characters, not as a regex.
        '''
        for char in characters:
            string = string.replace(char, '')
        return string

    def lemmatize_string(self,sentence):
        '''
        Replace all words in a sentence by its lemma using Part-of-Speech Tags.
        The sentence is split on whitespace and the lemmas are joined with single spaces.
        '''
        lemmatizer = WordNetLemmatizer()
        lemmas = []
        for word, tag in nltk.pos_tag(sentence.split()):
            pos_character = self._WORDNET_POS.get(tag[:1], 'v')
            lemma = lemmatizer.lemmatize(word, pos=pos_character)
            # An adverb the lemmatizer left unchanged gets a second try as an adjective
            if pos_character == 'r' and lemma == word:
                lemma = lemmatizer.lemmatize(word, pos='a')
            lemmas.append(lemma)
        return ' '.join(lemmas)

    def lowercase(self, string):
        '''
        Convert each string to lowercase
        '''
        return string.lower()

    def remove_stopwords_string(self, sentence):
        '''
        Removes all stop words of the sentence using NLTK default English stopwords.
        The sentence is tokenized with nltk.word_tokenize and the kept tokens are joined
        with single spaces. The comparison is case-sensitive.
        '''
        stop_words = self._get_stop_words()
        return ' '.join(word for word in nltk.word_tokenize(sentence) if word not in stop_words)

    def lemmatize(self, columns, inplace=False):
        '''
        Lemmatize the columns of the dataframe and returns it
        `columns`: List of columns
        `inplace`: If True, the result is also stored on `self.dataframe`.
                   The transformed dataframe is returned in both cases.
        '''
        return self._apply_to_columns(self.lemmatize_string, columns, inplace)

    def remove_stopwords(self, columns, inplace=False):
        '''
        Removes the stopwords of the dataframe and returns it
        `columns`: List of columns
        `inplace`: If True, the result is also stored on `self.dataframe`.
                   The transformed dataframe is returned in both cases.
        '''
        return self._apply_to_columns(self.remove_stopwords_string, columns, inplace)

    def remove_special_characters(self, columns, inplace=False):
        '''
        Removes the special characters (the defaults of `replace_characters`) of the
        dataframe and returns it
        `columns`: List of columns
        `inplace`: If True, the result is also stored on `self.dataframe`.
                   The transformed dataframe is returned in both cases.
        '''
        return self._apply_to_columns(self.replace_characters, columns, inplace)

    def convert_to_lowercase(self, columns, inplace=False):
        '''
        Converts the sentences of the dataframe to lowercase and returns it
        `columns`: List of columns
        `inplace`: If True, the result is also stored on `self.dataframe`.
                   The transformed dataframe is returned in both cases.
        '''
        return self._apply_to_columns(self.lowercase, columns, inplace)

    def preprocess_string(self, string):
        '''
        Does the following steps
        1. Convert to lowercase
        2. Replace all special characters (default characters)
        3. Lemmatize all words by their POS tags
        '''
        lower = self.lowercase(string)
        replaced = self.replace_characters(lower)
        lemmatized = self.lemmatize_string(replaced)
        return lemmatized

    def preprocess(self, columns, inplace=False):
        '''
        Preprocesses the sentences of the dataframe (see `preprocess_string`) and returns it
        `columns`: List of columns
        `inplace`: If True, the result is also stored on `self.dataframe`.
                   The transformed dataframe is returned in both cases.
        '''
        return self._apply_to_columns(self.preprocess_string, columns, inplace)

In [27]:
def vectorize_sentence(word2vec_dict, sentence, vector_dim=50):
    '''
    Function to calculate the average vector of a sentence.

    `word2vec_dict`: Object that returns the vector of a word via `word2vec_dict[word]`
                     and raises KeyError for unknown words.
    `sentence`: Text whose whitespace-separated words are looked up.
    `vector_dim`: Length of every word vector (default 50).

    Returns the mean of the vectors of all known words. Unknown words are ignored.
    If no word is known (or the sentence is empty), a zero vector is returned.
    '''
    # Initialize vector with all zeroes
    vector = np.zeros(vector_dim)
    known_words = 0
    for word in sentence.split():
        try:
            # Add the word vector of each known word to our final vector
            vector = vector + word2vec_dict[word]
        except KeyError:
            # Unknown words add nothing to the sum, so they must not change the divisor either
            continue
        known_words += 1
    if known_words == 0: # Every word in the sentence is unknown, so return the zero vector
        return vector
    return vector / float(known_words) # Return mean of our vector

def get_response(question, threshold=0.5, fallback="Sorry, I did not understand that."):
    '''
    Function to return response for a question.

    `question`: Preprocessed question text; it is converted to a vector with
                `vectorize_sentence` and the global `word_vector`.
    `threshold`: Only candidates whose cosine similarity is strictly greater than this
                 value are considered.
    `fallback`: Value returned when no candidate exceeds `threshold` (for example when
                every word of the question is unknown).

    Returns the entry of `df['response']` whose row in `model_vector` has the highest
    cosine similarity to the question, or `fallback` if nothing is similar enough.
    '''
    question_vector = vectorize_sentence(word_vector, question)
    cosine_similarities = 1 - pairwise_distances([question_vector],model_vector, metric='cosine')[0]
    responses = df['response']
    # Similarities and responses are paired by position (row i of model_vector <-> row i of df)
    dataframe_values = list(zip(cosine_similarities, responses))
    similarity_df = pd.DataFrame(dataframe_values, columns=['similarity', 'response'])
    thresholded_df = similarity_df[similarity_df['similarity'] > threshold]
    # Without this guard, indexing the empty selection raised IndexError
    if thresholded_df.empty:
        return fallback
    # Only the single best match is needed, so take the maximum instead of sorting everything
    best_index = thresholded_df['similarity'].idxmax()
    return thresholded_df.loc[best_index, 'response']

In [31]:
# Interactive demo: read a question, normalize it with preprocess_string, then look up the best response
question = input('Type a question: ')
normalize = Normalize([])  # a non-DataFrame argument gives an empty dataframe; only the string-level helper is used here
question_preprocessed = normalize.preprocess_string(question)
get_response(question_preprocessed, threshold=0.5)

Type a question:  hello


'Hey!'