# Creation of Intents

#### Loading Data

In [84]:
import pandas as pd

In [85]:
pd.set_option('display.max_colwidth', None)

In [86]:
data = pd.read_csv("question_responce.csv").drop(columns='Unnamed: 0')
data = data.drop_duplicates()
data.head(5)

Unnamed: 0,responce,question
0,@115904 We'll be sure to pass along your kind words! #AATeam,@AmericanAir Erica on the lax team is amazing give her a raise ty
1,@115904 Our apologies for the delay in responding to you. Have you made it to LAX? Let us know if you still need assistance.,@AmericanAir Could you have someone on your lax team available to guide me to my gate ASAP
2,"@115905 Aww, that's definitely a future pilot in the making! #HappyHalloween",Ben Tennyson and an American Airlines pilot. 🎃 #trunkortreat #halloween #2017 #diycostume #parenting @americanair … https://t.co/f1nNHQ0iLa https://t.co/lDViDkRdB1
3,@115906 We're sorry for your frustration.,"@AmericanAir Right, but I earned those. I also shouldn’t have to pay to pass them to my own spouse. You need to change your program."
4,@115909 We're glad you got to kick back and enjoy a show while flying! Thanks for your kind words.,"Thank you, @AmericanAir for playing #ThisIsUs and for having great flight attendants on my flight back home!"


#### Preprocessing 

**Steps :**

- Tokenization: Split the text into sentences and the sentences into words. Lowercase the words and remove punctuation.
- All stopwords are removed.
- All words starting by '@' and '#' are removed
- Words are lemmatized — words in third person are changed to first person and verbs in past and future tenses are changed into present.
- bags of words are created

In [87]:
import gensim
import nltk
import re
import numpy as np
import json
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.corpus import stopwords, wordnet
from nltk.stem.porter import *

lemmatizer = nltk.stem.WordNetLemmatizer()
stemmer = SnowballStemmer("english")

np.random.seed(2018)
#nltk.download('wordnet')
#nltk.download('stopwords')
#nltk.download('words')
words = set(nltk.corpus.words.words())

Extra stopwords

In [88]:
extra_words = ['flight','fly','plane','thanks', 'thank','get','please']
full_stopwords = STOPWORDS.union(set(extra_words))

In [89]:
stop_words = stopwords.words('english')
stop_words.extend(extra_words)

#### Preprocessing

Remove unecessary items int the text

In [90]:
def question_preprocess(text):
    text = " ".join(filter(lambda x:x[0]!='@', text.split()))
    text = " ".join(filter(lambda x:x[0]!='#', text.split()))
    text = " ".join(filter(lambda x:x[0:4]!='http', text.split()))
    return text

In [91]:
data['question_clean'] = data['question'].apply(question_preprocess)
data['question_clean']

0                                                                          Erica on the lax team is amazing give her a raise ty
1                                                 Could you have someone on your lax team available to guide me to my gate ASAP
2                                                                              Ben Tennyson and an American Airlines pilot. 🎃 …
3       Right, but I earned those. I also shouldn’t have to pay to pass them to my own spouse. You need to change your program.
4                                         Thank you, for playing and for having great flight attendants on my flight back home!
                                                                 ...                                                           
1847                                and have nailed in the transatlantic WiFi service. I am able to join my daily scrum onboard
1848                                                                                Average price of tic

In [92]:
data['question_preproc'] = data['question_clean'].apply(gensim.utils.simple_preprocess)
data['question_preproc']

0                                                                             [erica, on, the, lax, team, is, amazing, give, her, raise, ty]
1                                             [could, you, have, someone, on, your, lax, team, available, to, guide, me, to, my, gate, asap]
2                                                                                        [ben, tennyson, and, an, american, airlines, pilot]
3       [right, but, earned, those, also, shouldn, have, to, pay, to, pass, them, to, my, own, spouse, you, need, to, change, your, program]
4                                        [thank, you, for, playing, and, for, having, great, flight, attendants, on, my, flight, back, home]
                                                                        ...                                                                 
1847                               [and, have, nailed, in, the, transatlantic, wifi, service, am, able, to, join, my, daily, scrum, onboard]
1848         

In [93]:
def remove_stopwords(text):
    return [[w for w in simple_preprocess(str(doc)) if w not in stop_words] for doc in text]

In [94]:
data["question_stop"] = remove_stopwords(data['question_preproc'])
data["question_stop"] = data["question_stop"].apply(" ".join)
data["question_stop"]

0                                                erica lax team amazing give raise ty
1                                    could someone lax team available guide gate asap
2                                                ben tennyson american airlines pilot
3                               right earned also pay pass spouse need change program
4                                                  playing great attendants back home
                                            ...                                      
1847                  nailed transatlantic wifi service able join daily scrum onboard
1848                                                     average price ticket one way
1849          really annoyed month since damaged bag claim never heard back done told
1850    terrible service wait ages trying call number almost two months gone response
1851                         charges patrons change flights every time airport closes
Name: question_stop, Length: 1852, dtype: object

Lemmatize

In [95]:
#lemmatize
def lemmat(text):
    result = []
    text
    for token in gensim.utils.simple_preprocess(text):
        if token not in stop_words and len(token) > 2:
            result.append(WordNetLemmatizer().lemmatize(token, pos='v'))
    return result


In [96]:
data ["question_lem"] = data["question_stop"].apply(lemmat)

In [97]:
data[["question","question_lem"]]

Unnamed: 0,question,question_lem
0,@AmericanAir Erica on the lax team is amazing give her a raise ty,"[erica, lax, team, amaze, give, raise]"
1,@AmericanAir Could you have someone on your lax team available to guide me to my gate ASAP,"[could, someone, lax, team, available, guide, gate, asap]"
2,Ben Tennyson and an American Airlines pilot. 🎃 #trunkortreat #halloween #2017 #diycostume #parenting @americanair … https://t.co/f1nNHQ0iLa https://t.co/lDViDkRdB1,"[ben, tennyson, american, airlines, pilot]"
3,"@AmericanAir Right, but I earned those. I also shouldn’t have to pay to pass them to my own spouse. You need to change your program.","[right, earn, also, pay, pass, spouse, need, change, program]"
4,"Thank you, @AmericanAir for playing #ThisIsUs and for having great flight attendants on my flight back home!","[play, great, attendants, back, home]"
...,...,...
1847,@AmericanAir and @172 have nailed in the transatlantic WiFi service. I am able to join my @172377 daily scrum onboard,"[nail, transatlantic, wifi, service, able, join, daily, scrum, onboard]"
1848,@AmericanAir Average price of ticket out: $2500 one way.,"[average, price, ticket, one, way]"
1849,@AmericanAir Really annoyed been over a month since my damaged bag claim never heard back! done as told...,"[really, annoy, month, since, damage, bag, claim, never, hear, back, do, tell]"
1850,@AmericanAir terrible service wait ages trying to call that number almost two months gone no response,"[terrible, service, wait, age, try, call, number, almost, two, months, go, response]"


#### Making bigrams and trigrams

In [98]:
bigram = gensim.models.Phrases(data["question_lem"], min_count=3, threshold=10)
bigram_mod = gensim.models.phrases.Phraser(bigram)

#example
for i in range(15,24):
    print(bigram_mod[data["question_lem"][i]])

['god', 'go', 'ten', 'minutes']
['give', 'grief', 'things', 'poorly', 'credit', 'guacamole', 'admirals_club', 'outstanding', 'addition']
['aware', 'fit', 'overhead', 'return_trip', 'luggage', 'go', 'pre', 'security', 'gate', 'chck', 'empty', 'overheads', 'flt']
['despite', 'fly', 'ewr', 'force', 'mia', 'check', 'guitar', 'amp', 'charge', 'gate', 'security', 'flt', 'full']
['fifteen', 'empty', 'business_class', 'seat', 'one', 'person', 'get', 'upgrade']
['still_wait', 'compensate', 'crap', 'put', 'honeymoon', 'almost', 'year', 'ago']
['really', 'hour', 'international', 'seat', 'back', 'entertainment', 'screen', 'mon']
['still', 'rather', 'screen', 'though', 'rather', 'cause', 'neck', 'problems', 'look', 'tablet']
['say', 'attenants', 'bdl', 'ord', 'awesome', 'gate_agents', 'bdl', 'also', 'need', 'shout', 'awesome', 'customer_service', 'treat', 'nonrev', 'well']


In [99]:
def bigrams(text):
    return [bigram_mod[doc] for doc in text]

In [102]:
data["question_lem_bigrams"] = bigrams(data["question_lem"])

In [103]:
data = data[['responce', 'question','question_lem_bigrams']]

#### prev preprocessing

In [61]:
def lemmatize(text):
    return WordNetLemmatizer().lemmatize(text, pos='v')

def preprocess(text):
    result = []
    text = " ".join(filter(lambda x:x[0]!='@', text.split()))
    text = " ".join(filter(lambda x:x[0]!='#', text.split()))
    text = " ".join(filter(lambda x:x[0:4]!='http', text.split()))
    text = " ".join(w for w in nltk.wordpunct_tokenize(text) \
        if w.lower() in words or not w.isalpha())
    for token in gensim.utils.simple_preprocess(text):
        if token not in stop_words and len(token) > 2:
            result.append(lemmatize(token))
    return result

In [62]:
data['question_prepro'] = data['question'].apply(preprocess)
data['question_prepro']

0                         [lax, team, amaze, give, raise]
1       [could, someone, lax, team, available, guide, ...
2                                            [ben, pilot]
3       [right, also, pay, pass, spouse, need, change,...
4                                     [great, back, home]
                              ...                        
1847    [transatlantic, service, able, join, daily, sc...
1848                   [average, price, ticket, one, way]
1849    [really, month, since, bag, claim, never, back...
1850    [terrible, service, wait, try, call, number, a...
1851                       [change, every, time, airport]
Name: question_prepro, Length: 1852, dtype: object

We need to clean the responce too

In [None]:
def response_preprocess(text):
    text = " ".join(filter(lambda x:x[0]!='@', text.split()))
    text = " ".join(filter(lambda x:x[0]!='#', text.split()))
    return text

In [None]:
data['responce'] = data['responce'].apply(response_preprocess)
data['responce']

0                                                                                                       We'll be sure to pass along your kind words!
1                               Our apologies for the delay in responding to you. Have you made it to LAX? Let us know if you still need assistance.
2                                                                                               Aww, that's definitely a future pilot in the making!
3                                                                                                                  We're sorry for your frustration.
4                                                         We're glad you got to kick back and enjoy a show while flying! Thanks for your kind words.
                                                                            ...                                                                     
1847                           We know staying connected is important, why not take your office to 35k fee

Drop empty questions

In [None]:
pd.set_option('display.max_colwidth', None)

In [None]:
data = data[data['question_prepro'].map(lambda d: len(d)) > 1]
data

Unnamed: 0,responce,question,question_prepro
0,We'll be sure to pass along your kind words!,@AmericanAir Erica on the lax team is amazing give her a raise ty,"[lax, team, amaze, give, raise]"
1,Our apologies for the delay in responding to you. Have you made it to LAX? Let us know if you still need assistance.,@AmericanAir Could you have someone on your lax team available to guide me to my gate ASAP,"[could, someone, lax, team, available, guide, gate]"
2,"Aww, that's definitely a future pilot in the making!",Ben Tennyson and an American Airlines pilot. 🎃 #trunkortreat #halloween #2017 #diycostume #parenting @americanair … https://t.co/f1nNHQ0iLa https://t.co/lDViDkRdB1,"[ben, pilot]"
3,We're sorry for your frustration.,"@AmericanAir Right, but I earned those. I also shouldn’t have to pay to pass them to my own spouse. You need to change your program.","[right, also, pay, pass, spouse, need, change, program]"
4,We're glad you got to kick back and enjoy a show while flying! Thanks for your kind words.,"Thank you, @AmericanAir for playing #ThisIsUs and for having great flight attendants on my flight back home!","[great, back, home]"
...,...,...,...
1847,"We know staying connected is important, why not take your office to 35k feet? We're so glad you're enjoying the WiFi!",@AmericanAir and @172 have nailed in the transatlantic WiFi service. I am able to join my @172377 daily scrum onboard,"[transatlantic, service, able, join, daily, scrum]"
1848,We've capped our fares for nonstop flights at $99 for Puerto Rico through the 8th of Oct. Book travel here: https://t.co/iJWiiSmxCO,@AmericanAir Average price of ticket out: $2500 one way.,"[average, price, ticket, one, way]"
1849,Please give our Baggage team a call at 800-866-4010 for updates that may be available.,@AmericanAir Really annoyed been over a month since my damaged bag claim never heard back! done as told...,"[really, month, since, bag, claim, never, back, do, tell]"
1850,"Our apologies for the hold. Our Central Baggage team will help at that number. If we can get an update, then please DM your bag file number.",@AmericanAir terrible service wait ages trying to call that number almost two months gone no response,"[terrible, service, wait, try, call, number, almost, two, go, response]"


#### Building LDA

In [104]:
import gensim.corpora as corpora
from pprint import pprint

In [105]:
id2word = corpora.Dictionary(data["question_lem_bigrams"])
id2word.filter_extremes(no_below=3)
corpus = [id2word.doc2bow(text) for text in data["question_lem_bigrams"]]

In [106]:
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=40)

In [107]:
lda_model[corpus[0]]

[(1, 0.8049957)]

display the topic's top 10 words

In [108]:
topwords = np.array(lda_model.show_topics(num_topics=40, num_words=10))
for i in enumerate(topwords[:,1]):
    print(i)

(0, '0.031*"check" + 0.014*"gate" + 0.011*"today" + 0.011*"give" + 0.011*"fly" + 0.008*"hey" + 0.008*"charlotte" + 0.008*"new" + 0.008*"yes" + 0.008*"try"')
(1, '0.021*"want" + 0.013*"return" + 0.013*"take" + 0.013*"travel" + 0.013*"give" + 0.010*"bag" + 0.010*"day" + 0.010*"change" + 0.010*"work" + 0.010*"know"')
(2, '0.022*"customer_service" + 0.021*"time" + 0.017*"would" + 0.017*"go" + 0.015*"book" + 0.013*"frequent_flyer" + 0.013*"chance" + 0.013*"know" + 0.013*"seat" + 0.013*"pay"')
(3, '0.041*"seat" + 0.016*"pay" + 0.013*"another" + 0.013*"fly" + 0.012*"say" + 0.011*"time" + 0.011*"get" + 0.010*"board" + 0.010*"help" + 0.009*"gate"')
(4, '0.030*"delay" + 0.024*"make" + 0.014*"get" + 0.011*"tell" + 0.011*"gate" + 0.011*"use" + 0.011*"minutes" + 0.011*"help" + 0.010*"today" + 0.010*"amp"')
(5, '0.017*"baggage" + 0.017*"miami" + 0.014*"could" + 0.014*"get" + 0.014*"make" + 0.014*"today" + 0.014*"still" + 0.010*"special" + 0.010*"amp" + 0.010*"ord"')
(6, '0.019*"check" + 0.017*"get" 

#### Try to extract topic from new text 

In [109]:
from operator import itemgetter

In [125]:
def get_highest_topic(text):
  text_cleaned = gensim.utils.simple_preprocess(question_preprocess(text))
  text_lem = bigram_mod[lemmat(" ".join([w for w in text_cleaned if w not in stop_words]))]

  result = lda_model[id2word.doc2bow(text_lem)]
  return max(result, key=itemgetter(1))[0]

get_highest_topic('@AmericanAir terrible service wait ages trying to call customer service almost two months gone no response')

19

In [None]:
'''
def get_highest_topic(text):
  text_cleaned = preprocess(text)
  result = lda_model[id2word.doc2bow(text_cleaned)]
  #print("Max topic topic : ", max(result, key=itemgetter(1))[0], 
  #     "\nWith : ", max(result, key=itemgetter(1))[1])
  return max(result, key=itemgetter(1))[0]

get_highest_topic('Ben Tennyson  Airlines pilot.')
'''


In [127]:
data['topic'] = data['question'].apply(get_highest_topic)
data.head(10)

Unnamed: 0,responce,question,question_lem_bigrams,topic
0,@115904 We'll be sure to pass along your kind words! #AATeam,@AmericanAir Erica on the lax team is amazing give her a raise ty,"[erica, lax, team, amaze, give, raise]",1
1,@115904 Our apologies for the delay in responding to you. Have you made it to LAX? Let us know if you still need assistance.,@AmericanAir Could you have someone on your lax team available to guide me to my gate ASAP,"[could, someone, lax, team, available, guide, gate, asap]",3
2,"@115905 Aww, that's definitely a future pilot in the making! #HappyHalloween",Ben Tennyson and an American Airlines pilot. 🎃 #trunkortreat #halloween #2017 #diycostume #parenting @americanair … https://t.co/f1nNHQ0iLa https://t.co/lDViDkRdB1,"[ben, tennyson, american_airlines, pilot]",19
3,@115906 We're sorry for your frustration.,"@AmericanAir Right, but I earned those. I also shouldn’t have to pay to pass them to my own spouse. You need to change your program.","[right, earn, also, pay, pass, spouse, need, change, program]",11
4,@115909 We're glad you got to kick back and enjoy a show while flying! Thanks for your kind words.,"Thank you, @AmericanAir for playing #ThisIsUs and for having great flight attendants on my flight back home!","[play, great, attendants, back, home]",29
5,@116142 We never want your experience to be anything less than perfect. We're sorry the Wi-Fi was slower than expected.,@AmericanAir's wifi makes Amtrak's wifi look pro. At least theirs is free. @5804 has decent wifi @ reasonable prices. Why not AA?,"[wifi, make, amtrak, wifi, look, pro, least, free, decent, wifi, reasonable, price]",26
6,"@116143 Thanks for the shout-out and for stopping by, Marc! We'll pass this on to our SFO Admirals Club managers.","Wonderful club! @americanair (@ American Airlines Admirals Club in San Francisco, CA) https://t.co/GWFwuGVgNA","[wonderful, club, american_airlines, admirals_club, san, francisco]",23
7,@116144 They'll be more than happy to walk you through it and help with the booking process.,"@AmericanAir already did...changed browsers, did all my techie tricks and still can't make a reservation after 1 hour!!","[already, change, browsers, techie, trick, still, make, reservation, hour]",5
8,@116144 We're sorry for the long wait. The next available agent will be with you as soon as possible.,@AmericanAir ........still....on....hold.....they made the booking. I am trying to pay.....so #frustrating.,"[still, hold, make, book, try, pay]",7
9,@116144 Fares sell in real time. We're sorry we weren't able to snag the lower fare for you.,@AmericanAir well now i am told the ticket cost is an additional $200/person to pay....because of the wait....omg,"[well, tell, ticket, cost, additional, person, pay, wait, omg]",17


#### Building Intents

In [128]:
intents = {"Greeting": ["Hello", "How are you doing?", "Greetings!", "How do you do?"],
          "Apology": ["No problem"],
          "Thanks": ["No problem", "You're welcome"],
          "Goodbye": ["It was nice speaking to you", "See you later", "Speak soon!"]
             }

In [129]:
def update_intents(df):
  for index, row in df.iterrows():
    if str(row['topic']) not in intents :
      intents[str(row['topic'])] = []
    if row['responce'] != "":
      intents[str(row['topic'])].append(str(row['responce']))

In [130]:
update_intents(data)

Save Intents in order to use it again

In [131]:
with open('intents.json', 'w') as fp:
    json.dump(intents, fp)

#### Doc2Vec

In [132]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [133]:
tagged_data = [TaggedDocument(d, [i]) for i, d in enumerate(data['responce'])]
model = Doc2Vec(tagged_data, window=1, min_count=2, epochs = 200)

#### First Results 

In [134]:
def chatbot(text):
  result = model.docvecs.most_similar(positive=[model.infer_vector(preprocess(text))], topn=1850)
  topic = get_highest_topic('text')
  for l in result:
    if data['topic'].iloc[int(l[0])] == topic:
      return data['responce'].iloc[int(l[0])]

In [None]:
#Preexisting questions and responses
list_word = ["@AmericanAir what’s going on with flight 301 DTW &gt; DFW? Delayed over 3 hours. Will it be canceled?", 
             "@AmericanAir awful service, more than 1 hour delay due to logbook maintenance issues. Where is preventive maintenance?",
             "@AmericanAir Yes. Refund my plane ticket😡 with your racist ass workers",
             "@AmericanAir I'm aware. Fits in overhead.This was a return trip. Had same luggage going. This was pre-security. Gate chck, ok, but Empty overheads on flt"]

list_resp = ["Looks like it has a maintenance delay and it's currently scheduled to take off at 10:10p.",
             "We do many checks pre and post departure. Please share your flight number if we can provide an update.",
             "We don't tolerate discrimination of any kind. Please DM your record locator and contact details (phone and email)",
             "We'd like to share this feedback. Please DM your record locator."]

for q,r in zip(list_word, list_resp):
  print("QUESTION :", q)
  print("EXPECTED :", r)
  print("PREDICTED :", chatbot(q),"\n")
  


QUESTION : @AmericanAir what’s going on with flight 301 DTW &gt; DFW? Delayed over 3 hours. Will it be canceled?
EXPECTED : Looks like it has a maintenance delay and it's currently scheduled to take off at 10:10p.
PREDICTED : @125892 Please DM us the details of your experience. 

QUESTION : @AmericanAir awful service, more than 1 hour delay due to logbook maintenance issues. Where is preventive maintenance?
EXPECTED : We do many checks pre and post departure. Please share your flight number if we can provide an update.
PREDICTED : @152570 We're working hard to include power on most our planes and either inflight entertainment, or streaming capability, David. 

QUESTION : @AmericanAir Yes. Refund my plane ticket😡 with your racist ass workers
EXPECTED : We don't tolerate discrimination of any kind. Please DM your record locator and contact details (phone and email)
PREDICTED : @141341 Usually, First Class or full fare tickets allow you to make voluntary changes and it's even refundable w

  result = model.docvecs.most_similar(positive=[model.infer_vector(preprocess(text))], topn=1850)


In [135]:
question = input('Enter your question : ')
chatbot(question)

  result = model.docvecs.most_similar(positive=[model.infer_vector(preprocess(text))], topn=1850)


'@126770 The cost to buy miles is preset and the chart is available here, Cristen: https://t.co/9zLqAIrtdx'