# Global Variables and Libraries

In [1]:
import operator
import pickle
import random
import re
import string

import gensim
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn import linear_model
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import cohen_kappa_score, roc_auc_score
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelBinarizer,OneHotEncoder

# Data Preprocessing

In [2]:
# Load the raw dialog data; the displayed output shows 'Context' and 'Text Response' columns
df = pd.read_excel('dialog_talk_agent.xlsx') # Read the Excel sheet into a DataFrame
df

Unnamed: 0,Context,Text Response
0,Tell me about your personality,Just think of me as the ace up your sleeve.
1,I want to know you better,I can help you work smarter instead of harder
2,Define yourself,
3,Describe yourself,
4,tell me about yourself,
...,...,...
1586,,
1587,I'll be back in a few minutes,I'll be waiting.
1588,I'll be back,All right. I'll be here.
1589,I'll get back to you in a moment,Till next time.


In [3]:
# Work on a plain NumPy array so the rows can be walked one by one in the imputation step
rows = df.to_numpy() # Convert all rows of the DataFrame to a NumPy array
rows

array([['Tell me about your personality',
        'Just think of me as the ace up your sleeve.'],
       ['I want to know you better',
        'I can help you work smarter instead of harder'],
       ['Define yourself', nan],
       ...,
       ["I'll be back", "All right. I'll be here."],
       ["I'll get back to you in a moment", 'Till next time.'],
       ['I promise to come back', 'Okay. You know where to find me.']],
      dtype=object)

# Imputing Responses

In [4]:
# Impute missing responses.
# Rows are grouped into contexts: a row whose context is NaN marks the start of a
# new group. Inside a group, a row without a response borrows (at random) one of
# the responses already seen earlier in the same group.

# Dedicated, seeded generator so that Restart & Run All reproduces the same
# imputation without touching the global `random` state.
response_rng = random.Random(42)

chat_responses = []  # Responses collected so far for the current context group
updated_rows = []    # (context, response) pairs with the gaps filled in
for row in rows:
    context = row[0]
    response = row[1]
    if pd.isnull(context):  # A NaN context means that the context group has changed
        chat_responses = []  # Resetting responses for the new context group
        continue
    if pd.isnull(response):
        if not chat_responses:
            # Nothing to borrow from yet (the group starts with an empty response);
            # skip the row instead of crashing in `choice` on an empty list.
            continue
        response = response_rng.choice(chat_responses)  # Fill with a random response of this group
    else:
        chat_responses.append(response)  # Remember it as a possible response for this group
    updated_rows.append((context, response))
updated_rows[:10]

[('Tell me about your personality',
  'Just think of me as the ace up your sleeve.'),
 ('I want to know you better',
  'I can help you work smarter instead of harder'),
 ('Define yourself', 'I can help you work smarter instead of harder'),
 ('Describe yourself', 'Just think of me as the ace up your sleeve.'),
 ('tell me about yourself', 'Just think of me as the ace up your sleeve.'),
 ('all about you', 'I can help you work smarter instead of harder'),
 ('tell me some stuff about you',
  'Just think of me as the ace up your sleeve.'),
 ('talk some stuff about you',
  'I can help you work smarter instead of harder'),
 ('talk about yourself', 'Just think of me as the ace up your sleeve.'),
 ('about yourself', 'I can help you work smarter instead of harder')]

In [6]:
# Build the new dataframe from the imputed (context, response) pairs and
# discard any row that still contains a missing value
df_updated = pd.DataFrame(updated_rows, columns=['context', 'response']).dropna()
df_updated

Unnamed: 0,context,response
0,Tell me about your personality,Just think of me as the ace up your sleeve.
1,I want to know you better,I can help you work smarter instead of harder
2,Define yourself,I can help you work smarter instead of harder
3,Describe yourself,Just think of me as the ace up your sleeve.
4,tell me about yourself,Just think of me as the ace up your sleeve.
...,...,...
1501,can we chat,Talking is what I do best.
1502,I'll be back in a few minutes,I'll be waiting.
1503,I'll be back,All right. I'll be here.
1504,I'll get back to you in a moment,Till next time.


In [7]:
# Class for text normalization
class Normalize():
    '''
    Text normalization helper operating on the text columns of a DataFrame.

    The instance keeps its own copy of the dataframe in `self.dataframe`. Every
    column-level method (`lemmatize`, `remove_stopwords`, `remove_special_characters`,
    `convert_to_lowercase`, `preprocess`) returns a transformed copy; with
    `inplace=True` the transformed result is additionally stored back into
    `self.dataframe`, so that consecutive calls build on each other.
    '''

    # Lazily filled cache of the NLTK English stop words (shared by all instances)
    _stop_words = None

    # First letter of the tag returned by `nltk.pos_tag` -> WordNet POS character.
    # Any other tag falls back to 'v' (verb).
    _POS_BY_TAG_PREFIX = {'N': 'n', 'V': 'v', 'J': 'a', 'R': 'r'}

    def __init__(self, sentences):
        '''
        Takes a Dataframe as the input. All columns should contain text, or at least the ones you wish to preprocess.
        The dataframe is copied, so the caller's object is never modified.
        Any input that is not a DataFrame results in an empty dataframe.
        '''
        if isinstance(sentences, pd.DataFrame):
            self.dataframe = sentences.copy()
        else:
            self.dataframe = pd.DataFrame([])

    def _apply_to_columns(self, func, columns, inplace):
        '''
        Applies `func` to every value of the given columns on a copy of the dataframe and returns that copy
        `func`: Callable taking one string and returning the transformed string
        `columns`: List of column names (a single column name given as a string is also accepted)
        `inplace`: If True, a copy of the result is also stored as the new internal dataframe
        '''
        if isinstance(columns, str):
            # A bare string would otherwise be iterated character by character
            columns = [columns]
        dataframe = self.dataframe.copy()
        for column in columns:
            dataframe[column] = dataframe[column].apply(func)
        if inplace:
            # Store a separate copy so the returned frame and the internal state stay independent
            self.dataframe = dataframe.copy()
        return dataframe

    def replace_characters(self, string, characters = r'''[<>\[\/]-_'.?()]'''):
        '''
        Removes every character listed in the `characters` parameter from the string and returns the cleaned text.
        `characters` is treated as a plain collection of characters, not as a regular expression.
        '''
        # Raw-string default: same characters as before, without invalid escape sequences
        for char in characters:
            string = string.replace(char, '')
        return string

    def lemmatize_string(self,sentence):
        '''
        Replace all words in a sentence by its lemma using Part-of-Speech Tags.
        The sentence is split on whitespace; words whose tag is neither noun, verb,
        adjective nor adverb are lemmatized as verbs.
        '''
        lemmatizer = WordNetLemmatizer()
        lemmas = []
        for word, tag in nltk.pos_tag(sentence.split()):
            pos_character = self._POS_BY_TAG_PREFIX.get(tag[:1], 'v')
            lemma = lemmatizer.lemmatize(word, pos=pos_character)
            if pos_character == 'r' and lemma == word:
                # Adverb lemma left the word unchanged: retry as an adjective
                lemma = lemmatizer.lemmatize(word, pos='a')
            lemmas.append(lemma)
        return ' '.join(lemmas)

    def lowercase(self, string):
        '''
        Convert each string to lowercase
        '''
        return string.lower()

    def remove_stopwords_string(self, sentence):
        '''
        Removes all stop words of the sentence using NLTK default English stopwords.
        The sentence is tokenized with `nltk.word_tokenize` and the remaining tokens are joined by single spaces.
        '''
        if Normalize._stop_words is None:
            # Load the stop words once and keep them in a set for fast membership tests
            Normalize._stop_words = frozenset(stopwords.words('english'))
        stop_words = Normalize._stop_words
        return ' '.join(word for word in nltk.word_tokenize(sentence) if word not in stop_words)

    def lemmatize(self, columns, inplace=False):
        '''
        Lemmatize the columns of the dataframe and returns it
        `columns`: List of columns
        `inplace`: If True, the result is also stored in the instance. The transformed dataframe is returned either way.
        '''
        return self._apply_to_columns(self.lemmatize_string, columns, inplace)

    def remove_stopwords(self, columns, inplace=False):
        '''
        Removes the stopwords of the dataframe and returns it
        `columns`: List of columns
        `inplace`: If True, the result is also stored in the instance. The transformed dataframe is returned either way.
        '''
        return self._apply_to_columns(self.remove_stopwords_string, columns, inplace)

    def remove_special_characters(self, columns, inplace=False):
        '''
        Removes the special characters of the dataframe and returns it
        `columns`: List of columns
        `inplace`: If True, the result is also stored in the instance. The transformed dataframe is returned either way.
        '''
        return self._apply_to_columns(self.replace_characters, columns, inplace)

    def convert_to_lowercase(self, columns, inplace=False):
        '''
        Converts the sentences of the dataframe to lowercase and returns it
        `columns`: List of columns
        `inplace`: If True, the result is also stored in the instance. The transformed dataframe is returned either way.
        '''
        return self._apply_to_columns(self.lowercase, columns, inplace)

    def preprocess_string(self, string):
        '''
        Does the following steps
        1. Convert to lowercase
        2. Replace all special characters (default characters)
        3. Lemmatize all words by their POS tags
        '''
        lower = self.lowercase(string)
        replaced = self.replace_characters(lower)
        lemmatized = self.lemmatize_string(replaced)
        return lemmatized

    def preprocess(self, columns, inplace=False):
        '''
        Preprocesses the sentences of the dataframe (see `preprocess_string`) and returns it
        `columns`: List of columns
        `inplace`: If True, the result is also stored in the instance. The transformed dataframe is returned either way.
        '''
        return self._apply_to_columns(self.preprocess_string, columns, inplace)

In [8]:
# Initialize the class and preprocess the 'context' column step by step.
# Each call uses inplace=True so that the next step works on the previous step's result;
# the return value of the last call is kept as the preprocessed dataframe.
normalize = Normalize(df_updated)
normalize.convert_to_lowercase(columns = ['context'], inplace=True)
normalize.remove_special_characters(columns = ['context'], inplace=True)
df_preprocessed = normalize.lemmatize(columns = ['context'], inplace=True)
df_preprocessed

Unnamed: 0,context,response
0,tell me about your personality,Just think of me as the ace up your sleeve.
1,i want to know you good,I can help you work smarter instead of harder
2,define yourself,I can help you work smarter instead of harder
3,describe yourself,Just think of me as the ace up your sleeve.
4,tell me about yourself,Just think of me as the ace up your sleeve.
...,...,...
1501,can we chat,Talking is what I do best.
1502,ill be back in a few minute,I'll be waiting.
1503,ill be back,All right. I'll be here.
1504,ill get back to you in a moment,Till next time.


In [159]:
# Same effect as the previous cell, done by calling the single `preprocess` function
# (lowercase -> remove special characters -> lemmatize) on a fresh Normalize instance
normalize = Normalize(df_updated)
df_preprocessed = normalize.preprocess(columns = ['context'], inplace=True)
df_preprocessed

Unnamed: 0,context,response
0,tell me about your personality,Just think of me as the ace up your sleeve.
1,i want to know you good,I can help you work smarter instead of harder
2,define yourself,Just think of me as the ace up your sleeve.
3,describe yourself,Just think of me as the ace up your sleeve.
4,tell me about yourself,Just think of me as the ace up your sleeve.
...,...,...
1501,can we chat,I'm always here to lend an ear.
1502,ill be back in a few minute,I'll be waiting.
1503,ill be back,All right. I'll be here.
1504,ill get back to you in a moment,Till next time.


In [160]:
# Save the preprocessed dataframe as CSV for later use (index=False leaves out the row index)
df_preprocessed.to_csv('dialog_preprocessed.csv', index=False)