In [48]:
import json
import re
from pathlib import Path

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from spellchecker import SpellChecker
# Fetch every NLTK resource this notebook relies on, not only the corpora:
# word_tokenize needs the Punkt sentence models and nltk.pos_tag needs the
# averaged perceptron tagger. Recent NLTK releases look those up under the
# "punkt_tab" / "averaged_perceptron_tagger_eng" names, older releases under
# "punkt" / "averaged_perceptron_tagger", so both spellings are requested.
# nltk.download() reports (and returns False for) a name it does not know
# instead of raising, so asking for both is safe.
for resource in (
    'stopwords',
    'wordnet',
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(resource)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\PPANGILINAN\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\PPANGILINAN\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## greeting

In [49]:
# Source 1: greeting utterances stored as CSV.
df1 = pd.read_csv("./datasets_pre/Intents_greeting.csv")

# Source 2: the greeting intents of Intent.json. JSON text is UTF-8 by
# specification, so decode it explicitly instead of relying on the platform
# default encoding (a legacy code page on Windows).
with open('./datasets_pre/Intent.json', encoding='utf-8') as f:
    data = json.load(f)
# 'text' holds a list of utterances per intent; explode() yields one row per utterance.
df2 = pd.DataFrame(data["intents"])[['text', 'intent']].explode('text')
df2 = df2[df2["intent"].isin(["Greeting", "CourtesyGreeting"])]

# Source 3: extra greetings typed in by hand. Duplicates and stray spaces are
# harmless here: preprocessing normalizes the text and duplicates are dropped later.
additional_greetings = {"text": [
    "Good morning", "Good afternoon", "Good evening", "hello", "morning", "mornin",
    "afternoon", "evening", "It's nice to meet you", "Pleased to meet you",
    "nice to meet you", "it's nice to see you again", "nice to see you again",
    "it's nice to see you", "nice to see you", "nice to see u", "nice to meet u",
    "How have you been?", "How do you do?", "Hey", "Hey man", "Hi", "hello",
    "How's it going?", "How are you doing?", "how are you?", "What's up?",
    "What's new?", "What's going on?", "How's everything?", " How are things?",
    "How's life?", "How's your day? ", "How's your day going?", " Good to see you",
    "Nice to see you", " Long time no see", "It's been a while ", "Yo!", "Howdy!",
    " Sup?", "Whazzup? ", " G'day mate!", " Hiya!", "Hello", "Bonjour", "Hola",
    "Salaam", "Guten tag", "Hello", "Sup", "Heyyy", "Ahoy!", "Hello stranger!",
    "Goodmorrow!", "What's crackin'?", "What's up buttercup?", "'Sup?", "Wassup?",
    "Wazzup?", "Hey there!", "Good day", "Namaste", "Ohayo", "Ni Hao", "Salaam",
]}
df3 = pd.DataFrame(additional_greetings)
df3['intent'] = "greeting"

# Stack the three sources and collapse every source label into one class.
greeting_df = pd.concat([df1, df2, df3], ignore_index=True)
greeting_df["intent"] = "greeting"
greeting_df.head()

Unnamed: 0,text,intent
0,Good morning,greeting
1,Hey,greeting
2,Hey there,greeting
3,"Hey there, what's up?",greeting
4,what's up?,greeting


In [51]:
# Words that must survive stopword removal even though NLTK's English
# stopword list contains them.
negation_set = {
    'no', 'nor', 'not', 't', 'can', "don't",
    'ain', 'aren', "aren't", 'couldn', "couldn't",
    'didn', "didn't", 'doesn', "doesn't",
    'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't",
    'isn', "isn't", 'ma', 'mightn', "mightn't",
    'mustn', "mustn't", 'needn', "needn't",
    'shan', "shan't", 'shouldn', "shouldn't",
    'wasn', "wasn't", 'weren', "weren't",
    'won', "won't", 'wouldn', "wouldn't",
}

# English stopwords minus the words kept above.
stopwords_eng = set(stopwords.words('english')).difference(negation_set)


def get_wordnet_pos(word):
    """Return the WordNet POS constant to pass to ``lemmatize()`` for ``word``.

    The word is tagged on its own with ``nltk.pos_tag`` and the first letter
    of the resulting tag is mapped to a WordNet constant: J -> adjective,
    N -> noun, V -> verb, R -> adverb. Any other tag maps to noun.

    Parameters
    ----------
    word : str
        A single token.

    Returns
    -------
    str
        One of ``wordnet.ADJ``, ``wordnet.NOUN``, ``wordnet.VERB`` or
        ``wordnet.ADV``. If tagging fails, the type of ``word`` is printed
        and ``wordnet.NOUN`` is returned.
    """
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    try:
        tag = nltk.pos_tag([word])[0][1][0].upper()
    except Exception:
        # Best-effort fallback for tokens the tagger cannot handle. Catching
        # Exception rather than using a bare ``except`` lets KeyboardInterrupt
        # and SystemExit stop a long ``.apply`` run as expected.
        print(type(word))
        return wordnet.NOUN

    return tag_dict.get(tag, wordnet.NOUN)

def preprocess_text(text):
    """Normalize a raw utterance into a space-joined string of lemmatized tokens.

    Steps, in order: lowercase; delete every character that is not a word
    character or whitespace (underscores are deleted as well); tokenize with
    ``word_tokenize``; drop tokens found in ``stopwords_eng``; lemmatize each
    remaining token with the POS returned by ``get_wordnet_pos``.

    Spell correction is intentionally left disabled. Timings recorded by the
    author, in seconds per step:
    {'cleaning': 0.129, 'spelling': 117.631, 'stopwords': 0.0, 'lemmatization': 0.658}

    Parameters
    ----------
    text : str
        Raw utterance. ``None`` and NaN (what pandas uses for a missing cell)
        are accepted and yield ``""``; any other non-string value is
        converted with ``str()`` first.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; ``""`` when nothing is left.
    """
    if not isinstance(text, str):
        # Missing cells reach this function because dropna() only runs after
        # .apply(); NaN is the only float that is not equal to itself.
        if text is None or (isinstance(text, float) and text != text):
            return ""
        text = str(text)

    # One pass replaces the former two: "[^\w\s]" covers punctuation and other
    # symbols, and "_" is listed separately because it counts as a word character.
    text = re.sub(r'[^\w\s]|_', '', text.lower())

    word_tokens = word_tokenize(text)

    # #correct misspelled (english) -- disabled, see timings in the docstring
    # spell = SpellChecker()
    # misspelled = spell.unknown(word_tokens)
    # word_tokens = [w if w not in misspelled else spell.correction(w) for w in word_tokens]

    # removing stopwords
    word_tokens = [w for w in word_tokens if w not in stopwords_eng]

    # lemmatization
    lemmatizer = WordNetLemmatizer()
    word_tokens = [lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in word_tokens]

    return ' '.join(word_tokens)

In [52]:
# Replace each raw utterance with its cleaned, lemmatized form.
greeting_df['text'] = greeting_df['text'].map(preprocess_text)
greeting_df.head()

Unnamed: 0,text,intent
0,good morning,greeting
1,hey,greeting
2,hey,greeting
3,hey whats,greeting
4,whats,greeting


In [53]:
# Drop exact duplicate rows and rows with missing values, then discard rows
# whose text became empty after preprocessing and renumber the index from 0.
greeting_df = greeting_df.drop_duplicates().dropna()
greeting_df = greeting_df[greeting_df['text'] != ''].reset_index(drop=True)
greeting_df.shape

(534, 2)

In [54]:
# to_csv() does not create missing parent folders, so make sure the output
# directory exists before writing (e.g. on a fresh checkout).
Path("./datasets_post").mkdir(parents=True, exist_ok=True)
greeting_df.to_csv("./datasets_post/greeting.csv", index=False)

## live_agent

In [55]:
# Source 1: live-agent utterances stored as CSV.
df1 = pd.read_csv("./datasets_pre/Intents_live_agent.csv")

# Source 2: the "contact_human_agent" rows of the Bitext dataset, with the
# utterance column renamed so both sources share the text/intent layout.
df2 = (
    pd.read_csv("./datasets_pre/Bitext_Sample_Customer_Service_Training_Dataset.csv")
    .rename(columns={"utterance": "text"})
)
df2 = df2.loc[df2["intent"] == "contact_human_agent", ["text", "intent"]]

# Stack both sources and collapse every source label into one class.
live_agent_df = pd.concat([df1, df2], ignore_index=True).assign(intent="live_agent")
live_agent_df.head()

Unnamed: 0,text,intent
0,I need to talk to a human.,live_agent
1,Get me a real person!,live_agent
2,Can I speak to someone?,live_agent
3,I want to chat with an agent.,live_agent
4,Connect me with a live rep now.,live_agent


In [56]:
# Replace each raw utterance with its cleaned, lemmatized form.
live_agent_df['text'] = live_agent_df['text'].map(preprocess_text)
live_agent_df.head()

Unnamed: 0,text,intent
0,need talk human,live_agent
1,get real person,live_agent
2,can speak someone,live_agent
3,want chat agent,live_agent
4,connect live rep,live_agent


In [57]:
# Drop exact duplicate rows and rows with missing values, then discard rows
# whose text became empty after preprocessing and renumber the index from 0.
live_agent_df = live_agent_df.drop_duplicates().dropna()
live_agent_df = live_agent_df[live_agent_df['text'] != ''].reset_index(drop=True)
live_agent_df.shape

(628, 2)

In [58]:
# to_csv() does not create missing parent folders, so make sure the output
# directory exists before writing (e.g. on a fresh checkout).
Path("./datasets_post").mkdir(parents=True, exist_ok=True)
live_agent_df.to_csv("./datasets_post/live_agent.csv", index=False)

## others

In [59]:
# Source 1: every intent of Intent.json except the two greeting intents.
# JSON text is UTF-8 by specification, so decode it explicitly instead of
# relying on the platform default encoding (a legacy code page on Windows).
with open('./datasets_pre/Intent.json', encoding='utf-8') as f:
    data = json.load(f)
# 'text' holds a list of utterances per intent; explode() yields one row per utterance.
df1 = pd.DataFrame(data["intents"])[['text', 'intent']].explode('text')
df1 = df1[~df1["intent"].isin(["Greeting", "CourtesyGreeting"])]

# Source 2: every Bitext row that is not a request for a human agent, with
# the utterance column renamed so both sources share the text/intent layout.
df2 = pd.read_csv("./datasets_pre/Bitext_Sample_Customer_Service_Training_Dataset.csv")
df2 = df2.rename(columns={"utterance": "text"})
df2 = df2.loc[df2["intent"] != "contact_human_agent", ["text", "intent"]]

# Stack both sources and collapse every source label into one class.
others_df = pd.concat([df1, df2], ignore_index=True)
others_df["intent"] = "others"
others_df.head()

Unnamed: 0,text,intent
0,My user is Adam,others
1,This is Adam,others
2,I am Adam,others
3,It is Adam,others
4,My user is Bella,others


In [60]:
# Replace each raw utterance with its cleaned, lemmatized form.
others_df['text'] = others_df['text'].map(preprocess_text)
others_df.head()

Unnamed: 0,text,intent
0,user adam,others
1,adam,others
2,adam,others
3,adam,others
4,user bella,others


In [61]:
# Drop exact duplicate rows and rows with missing values, then discard rows
# whose text became empty after preprocessing and renumber the index from 0.
others_df = others_df.drop_duplicates().dropna()
others_df = others_df[others_df['text'] != ''].reset_index(drop=True)
others_df.shape

(4545, 2)

In [62]:
# to_csv() does not create missing parent folders, so make sure the output
# directory exists before writing (e.g. on a fresh checkout).
Path("./datasets_post").mkdir(parents=True, exist_ok=True)
others_df.to_csv("./datasets_post/others.csv", index=False)