# Creation of Intents

#### Loading Data

In [1]:
import pandas as pd

In [2]:
pd.set_option('display.max_colwidth', None)

In [3]:
data = pd.read_csv("question_responce.csv").drop(columns='Unnamed: 0')
data = data.drop_duplicates()
data.head(5)

Unnamed: 0,responce,question
0,@115904 We'll be sure to pass along your kind words! #AATeam,@AmericanAir Erica on the lax team is amazing give her a raise ty
1,@115904 Our apologies for the delay in responding to you. Have you made it to LAX? Let us know if you still need assistance.,@AmericanAir Could you have someone on your lax team available to guide me to my gate ASAP
2,"@115905 Aww, that's definitely a future pilot in the making! #HappyHalloween",Ben Tennyson and an American Airlines pilot. 🎃 #trunkortreat #halloween #2017 #diycostume #parenting @americanair … https://t.co/f1nNHQ0iLa https://t.co/lDViDkRdB1
3,@115906 We're sorry for your frustration.,"@AmericanAir Right, but I earned those. I also shouldn’t have to pay to pass them to my own spouse. You need to change your program."
4,@115909 We're glad you got to kick back and enjoy a show while flying! Thanks for your kind words.,"Thank you, @AmericanAir for playing #ThisIsUs and for having great flight attendants on my flight back home!"


#### Preprocessing 

**Steps :**
- All words starting by '@' and '#' are removed
- Tokenization: Split the text into sentences and the sentences into words. Lowercase the words and remove punctuation.
- All stopwords are removed.
- Words are lemmatized — words in third person are changed to first person and verbs in past and future tenses are changed into present.
- bags of words are created

In [4]:
import gensim
import nltk
import numpy as np
import json
from gensim.utils import simple_preprocess
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

np.random.seed(2018)
#nltk.download('wordnet')
#nltk.download('stopwords')
#nltk.download('words')
words = set(nltk.corpus.words.words())

Extra stopwords

In [5]:
extra_words = ['flight','flights','fly','flying','plane','thanks', 'thank','ty','get','please','plz']
stop_words = stopwords.words('english')
stop_words.extend(extra_words)

In [6]:
stop_words

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

#### Preprocessing

Remove unecessary items int the text

In [7]:
def question_preprocess(text):
    text = " ".join(filter(lambda x:x[0]!='@', text.split()))
    text = " ".join(filter(lambda x:x[0]!='#', text.split()))
    text = " ".join(filter(lambda x:x[0:4]!='http', text.split()))
    return text

In [8]:
data['question_clean'] = data['question'].apply(question_preprocess)
data['question_clean']

0                                                                          Erica on the lax team is amazing give her a raise ty
1                                                 Could you have someone on your lax team available to guide me to my gate ASAP
2                                                                              Ben Tennyson and an American Airlines pilot. 🎃 …
3       Right, but I earned those. I also shouldn’t have to pay to pass them to my own spouse. You need to change your program.
4                                         Thank you, for playing and for having great flight attendants on my flight back home!
                                                                 ...                                                           
1847                                and have nailed in the transatlantic WiFi service. I am able to join my daily scrum onboard
1848                                                                                Average price of tic

In [9]:
data['question_preproc'] = data['question_clean'].apply(gensim.utils.simple_preprocess)
data['question_preproc']

0                                                                             [erica, on, the, lax, team, is, amazing, give, her, raise, ty]
1                                             [could, you, have, someone, on, your, lax, team, available, to, guide, me, to, my, gate, asap]
2                                                                                        [ben, tennyson, and, an, american, airlines, pilot]
3       [right, but, earned, those, also, shouldn, have, to, pay, to, pass, them, to, my, own, spouse, you, need, to, change, your, program]
4                                        [thank, you, for, playing, and, for, having, great, flight, attendants, on, my, flight, back, home]
                                                                        ...                                                                 
1847                               [and, have, nailed, in, the, transatlantic, wifi, service, am, able, to, join, my, daily, scrum, onboard]
1848         

In [10]:
def remove_stopwords(text):
    return [[w for w in simple_preprocess(str(doc)) if w not in stop_words] for doc in text]

In [11]:
data["question_stop"] = remove_stopwords(data['question_preproc'])
data["question_stop"] = data["question_stop"].apply(" ".join)
data["question_stop"]

0                                                   erica lax team amazing give raise
1                                    could someone lax team available guide gate asap
2                                                ben tennyson american airlines pilot
3                               right earned also pay pass spouse need change program
4                                                  playing great attendants back home
                                            ...                                      
1847                  nailed transatlantic wifi service able join daily scrum onboard
1848                                                     average price ticket one way
1849          really annoyed month since damaged bag claim never heard back done told
1850    terrible service wait ages trying call number almost two months gone response
1851                                 charges patrons change every time airport closes
Name: question_stop, Length: 1852, dtype: object

Lemmatize

In [12]:
#lemmatize with POS tag = V
def lemmat(text):
    result = []
    text
    for token in gensim.utils.simple_preprocess(text):
        if token not in stop_words and len(token) > 2:
            result.append(WordNetLemmatizer().lemmatize(token, pos='v'))
    return result


In [13]:
data ["question_lem"] = data["question_stop"].apply(lemmat)

In [14]:
data[["question","question_lem"]]

Unnamed: 0,question,question_lem
0,@AmericanAir Erica on the lax team is amazing give her a raise ty,"[erica, lax, team, amaze, give, raise]"
1,@AmericanAir Could you have someone on your lax team available to guide me to my gate ASAP,"[could, someone, lax, team, available, guide, gate, asap]"
2,Ben Tennyson and an American Airlines pilot. 🎃 #trunkortreat #halloween #2017 #diycostume #parenting @americanair … https://t.co/f1nNHQ0iLa https://t.co/lDViDkRdB1,"[ben, tennyson, american, airlines, pilot]"
3,"@AmericanAir Right, but I earned those. I also shouldn’t have to pay to pass them to my own spouse. You need to change your program.","[right, earn, also, pay, pass, spouse, need, change, program]"
4,"Thank you, @AmericanAir for playing #ThisIsUs and for having great flight attendants on my flight back home!","[play, great, attendants, back, home]"
...,...,...
1847,@AmericanAir and @172 have nailed in the transatlantic WiFi service. I am able to join my @172377 daily scrum onboard,"[nail, transatlantic, wifi, service, able, join, daily, scrum, onboard]"
1848,@AmericanAir Average price of ticket out: $2500 one way.,"[average, price, ticket, one, way]"
1849,@AmericanAir Really annoyed been over a month since my damaged bag claim never heard back! done as told...,"[really, annoy, month, since, damage, bag, claim, never, hear, back, do, tell]"
1850,@AmericanAir terrible service wait ages trying to call that number almost two months gone no response,"[terrible, service, wait, age, try, call, number, almost, two, months, go, response]"


#### Making bigrams and trigrams

In [15]:
bigram = gensim.models.Phrases(data["question_lem"], min_count=3, threshold=10)
bigram_mod = gensim.models.phrases.Phraser(bigram)

#example
for i in range(15,24):
    print(bigram_mod[data["question_lem"][i]])

# we obtain coherent bigrams as customer_service, business_class, etc..

['god', 'go', 'ten', 'minutes']
['give', 'grief', 'things', 'poorly', 'credit', 'guacamole', 'admirals_club', 'outstanding', 'addition']
['aware', 'fit', 'overhead', 'return_trip', 'luggage', 'go', 'pre', 'security', 'gate', 'chck', 'empty', 'overheads', 'flt']
['despite', 'ewr', 'force', 'mia', 'check', 'guitar', 'amp', 'charge', 'gate', 'security', 'flt', 'full']
['fifteen', 'empty', 'business_class', 'seat', 'one', 'person', 'get', 'upgrade']
['still_wait', 'compensate', 'crap', 'put', 'honeymoon', 'almost', 'year', 'ago']
['really', 'hour', 'international', 'seat', 'back', 'entertainment', 'screen', 'mon']
['still', 'rather', 'screen', 'though', 'rather', 'cause', 'neck', 'problems', 'look', 'tablet']
['say', 'attenants', 'bdl', 'ord', 'awesome', 'gate_agents', 'bdl', 'also', 'need', 'shout', 'awesome', 'customer_service', 'treat', 'nonrev', 'well']


In [16]:
def bigrams(text):
    return [bigram_mod[doc] for doc in text]

In [17]:
data["question_lem_bigrams"] = bigrams(data["question_lem"])

In [18]:
data = data[['responce', 'question','question_lem_bigrams']]

#### Building LDA

In [19]:
import gensim.corpora as corpora

In [20]:
id2word = corpora.Dictionary(data["question_lem_bigrams"])
id2word.filter_extremes(no_below=3)
corpus = [id2word.doc2bow(text) for text in data["question_lem_bigrams"]]

In [21]:
lda_model_10 = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=10)

In [22]:
lda_model_20 = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=20)

In [23]:
lda_model_30 = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=30)

In [24]:
lda_model_40 = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=40)

In [25]:
lda_model_80 = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=80)

In [26]:
print("lda_model_10: ",lda_model_10[corpus[0]])
print("lda_model_20: ",lda_model_20[corpus[0]])
print("lda_model_30: ",lda_model_30[corpus[0]])
print("lda_model_40: ",lda_model_40[corpus[0]])
print("lda_model_80: ",lda_model_80[corpus[0]])

lda_model_10:  [(0, 0.020005595), (1, 0.02000417), (2, 0.8199487), (3, 0.020005241), (4, 0.020006534), (5, 0.020008605), (6, 0.0200052), (7, 0.02000525), (8, 0.020005982), (9, 0.020004762)]
lda_model_20:  [(0, 0.010000518), (1, 0.010000518), (2, 0.010000518), (3, 0.010000518), (4, 0.010000518), (5, 0.010000518), (6, 0.010000518), (7, 0.010000518), (8, 0.010000518), (9, 0.010000518), (10, 0.010000518), (11, 0.010000518), (12, 0.31222904), (13, 0.0100005185), (14, 0.010000518), (15, 0.50776166), (16, 0.010000518), (17, 0.010000518), (18, 0.010000518), (19, 0.010000518)]
lda_model_30:  [(4, 0.80666256)]
lda_model_40:  [(10, 0.31332365), (32, 0.49666384)]
lda_model_80:  [(28, 0.8024955)]


our example corpus[0] is allegedly well classified in a topic by ~80%. but sometimes the lda_models hesitates between topics

display the topic's top 10 words

In [27]:
#example with 20 topics
topwords = np.array(lda_model_20.show_topics(num_topics=80, num_words=10))
for i in enumerate(topwords[:,1]):
    print(i)

(0, '0.014*"great" + 0.014*"never" + 0.012*"land" + 0.012*"later" + 0.012*"delay" + 0.010*"know" + 0.010*"would" + 0.010*"schedule" + 0.010*"make" + 0.010*"way"')
(1, '0.024*"delay" + 0.014*"really" + 0.012*"travel" + 0.011*"experience" + 0.010*"hey" + 0.010*"tell" + 0.010*"time" + 0.010*"lose" + 0.009*"service" + 0.009*"make"')
(2, '0.018*"day" + 0.014*"one" + 0.012*"gate" + 0.012*"make" + 0.010*"ticket" + 0.009*"pilot" + 0.009*"delay" + 0.008*"wait" + 0.007*"trip" + 0.007*"want"')
(3, '0.017*"know" + 0.012*"get" + 0.011*"issue" + 0.010*"seat" + 0.010*"ticket" + 0.009*"see" + 0.009*"still" + 0.009*"way" + 0.009*"one" + 0.008*"day"')
(4, '0.021*"seat" + 0.017*"time" + 0.015*"pay" + 0.015*"charge" + 0.015*"check" + 0.014*"use" + 0.011*"back" + 0.010*"way" + 0.010*"let" + 0.010*"work"')
(5, '0.015*"make" + 0.014*"pay" + 0.013*"another" + 0.011*"miami" + 0.011*"delay" + 0.010*"check" + 0.010*"get" + 0.010*"would" + 0.010*"stop" + 0.009*"gate"')
(6, '0.016*"try" + 0.016*"seat" + 0.014*"mak

#### pyLDAvis: Interactive Topic Modelling Exploration

In [28]:
#!pip install pyLDAvis.gensim

In [29]:
import pyLDAvis
import pyLDAvis.gensim_models
import matplotlib.pyplot as plt
%matplotlib inline

In [30]:
pyLDAvis.enable_notebook()
visual_10 = pyLDAvis.gensim_models.prepare(lda_model_10, corpus, id2word)
visual_20 = pyLDAvis.gensim_models.prepare(lda_model_20, corpus, id2word)
visual_30 = pyLDAvis.gensim_models.prepare(lda_model_30, corpus, id2word)
visual_40 = pyLDAvis.gensim_models.prepare(lda_model_40, corpus, id2word)
visual_80 = pyLDAvis.gensim_models.prepare(lda_model_80, corpus, id2word)

  default_term_info = default_term_info.sort_values(
  default_term_info = default_term_info.sort_values(
  default_term_info = default_term_info.sort_values(
  default_term_info = default_term_info.sort_values(
  default_term_info = default_term_info.sort_values(


In [31]:
visual_10

In [32]:
visual_20

In [33]:
visual_30

In [34]:
visual_40

In [35]:
visual_80

The less it overlaps and the better is the distribution into topics

#### Model coherence

In [36]:
# lda model's coherence score
from gensim.models.coherencemodel import CoherenceModel

coherence_model_lda_10 = CoherenceModel(model=lda_model_10, texts=data['question_lem_bigrams'], dictionary=id2word, coherence='c_v')
coherence_model_lda_20 = CoherenceModel(model=lda_model_20, texts=data['question_lem_bigrams'], dictionary=id2word, coherence='c_v')
coherence_model_lda_30 = CoherenceModel(model=lda_model_30, texts=data['question_lem_bigrams'], dictionary=id2word, coherence='c_v')
coherence_model_lda_40 = CoherenceModel(model=lda_model_40, texts=data['question_lem_bigrams'], dictionary=id2word, coherence='c_v')
coherence_model_lda_80 = CoherenceModel(model=lda_model_80, texts=data['question_lem_bigrams'], dictionary=id2word, coherence='c_v')
coherence_lda_10 = coherence_model_lda_10.get_coherence()
coherence_lda_20 = coherence_model_lda_20.get_coherence()
coherence_lda_30 = coherence_model_lda_30.get_coherence()
coherence_lda_40 = coherence_model_lda_40.get_coherence()
coherence_lda_80 = coherence_model_lda_80.get_coherence()
print('Coherence Score_10: ', coherence_lda_10)
print('Coherence Score_20: ', coherence_lda_20)
print('Coherence Score_30: ', coherence_lda_30)
print('Coherence Score_40: ', coherence_lda_40)
print('Coherence Score_80: ', coherence_lda_80)

Coherence Score_10:  0.24611084181772536
Coherence Score_20:  0.30770951317840295
Coherence Score_30:  0.3434900865558782
Coherence Score_40:  0.3697981172793267
Coherence Score_80:  0.4233193062896493


more topics is more coherent but we must pick a good fit between coherence and a number of topics where classification is good enough

#### Try to extract topic from new text 

In [37]:
from operator import itemgetter

In [38]:
def preprocess(text):
    text_cleaned = gensim.utils.simple_preprocess(question_preprocess(text))
    text_lem = bigram_mod[lemmat(" ".join([w for w in text_cleaned if w not in stop_words]))]
    return text_lem

In [39]:
def get_highest_topic(text):
  text_preprocessed = preprocess(text)
  result = lda_model_30[id2word.doc2bow(text_preprocessed)]
  return max(result, key=itemgetter(1))[0]

get_highest_topic('@AmericanAir terrible service wait ages trying to call customer service almost two months gone no response')

22

In [40]:
data['topic'] = data['question'].apply(get_highest_topic)
data.head(10)

Unnamed: 0,responce,question,question_lem_bigrams,topic
0,@115904 We'll be sure to pass along your kind words! #AATeam,@AmericanAir Erica on the lax team is amazing give her a raise ty,"[erica, lax, team, amaze, give, raise]",4
1,@115904 Our apologies for the delay in responding to you. Have you made it to LAX? Let us know if you still need assistance.,@AmericanAir Could you have someone on your lax team available to guide me to my gate ASAP,"[could, someone, lax, team, available, guide, gate, asap]",8
2,"@115905 Aww, that's definitely a future pilot in the making! #HappyHalloween",Ben Tennyson and an American Airlines pilot. 🎃 #trunkortreat #halloween #2017 #diycostume #parenting @americanair … https://t.co/f1nNHQ0iLa https://t.co/lDViDkRdB1,"[ben, tennyson, american_airlines, pilot]",29
3,@115906 We're sorry for your frustration.,"@AmericanAir Right, but I earned those. I also shouldn’t have to pay to pass them to my own spouse. You need to change your program.","[right, earn, also, pay, pass, spouse, need, change, program]",21
4,@115909 We're glad you got to kick back and enjoy a show while flying! Thanks for your kind words.,"Thank you, @AmericanAir for playing #ThisIsUs and for having great flight attendants on my flight back home!","[play, great, attendants, back, home]",2
5,@116142 We never want your experience to be anything less than perfect. We're sorry the Wi-Fi was slower than expected.,@AmericanAir's wifi makes Amtrak's wifi look pro. At least theirs is free. @5804 has decent wifi @ reasonable prices. Why not AA?,"[wifi, make, amtrak, wifi, look, pro, least, free, decent, wifi, reasonable, price]",0
6,"@116143 Thanks for the shout-out and for stopping by, Marc! We'll pass this on to our SFO Admirals Club managers.","Wonderful club! @americanair (@ American Airlines Admirals Club in San Francisco, CA) https://t.co/GWFwuGVgNA","[wonderful, club, american_airlines, admirals_club, san, francisco]",28
7,@116144 They'll be more than happy to walk you through it and help with the booking process.,"@AmericanAir already did...changed browsers, did all my techie tricks and still can't make a reservation after 1 hour!!","[already, change, browsers, techie, trick, still, make, reservation, hour]",24
8,@116144 We're sorry for the long wait. The next available agent will be with you as soon as possible.,@AmericanAir ........still....on....hold.....they made the booking. I am trying to pay.....so #frustrating.,"[still, hold, make, book, try, pay]",20
9,@116144 Fares sell in real time. We're sorry we weren't able to snag the lower fare for you.,@AmericanAir well now i am told the ticket cost is an additional $200/person to pay....because of the wait....omg,"[well, tell, ticket, cost, additional, person, pay, wait, omg]",14


#### Building Intents

In [41]:
#custom intents
intents = {"Greeting": ["Hello", "How are you doing?", "Greetings!", "How do you do?"],
          "Apology": ["No problem"],
          "Thanks": ["No problem", "You're welcome"],
          "Goodbye": ["It was nice speaking to you", "See you later", "Speak soon!"]
             }

In [42]:
def update_intents(df):
  for index, row in df.iterrows():
    if str(row['topic']) not in intents :
      intents[str(row['topic'])] = []
    if row['responce'] != "":
      intents[str(row['topic'])].append(str(row['responce']))

In [43]:
update_intents(data)

Save Intents in order to use it again

In [44]:
with open('intents.json', 'w') as fp:
    json.dump(intents, fp)

#### Doc2Vec

In [45]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [46]:
tagged_data = [TaggedDocument(d, [i]) for i, d in enumerate(data['responce'])]
model = Doc2Vec(tagged_data, window=1, min_count=2, epochs = 200)

#### First Results 

In [47]:
def chatbot(text):
  result = model.dv.most_similar(positive=[model.infer_vector(preprocess(text))], topn=1850)
  topic = get_highest_topic(text)
  for l in result:
    if data['topic'].iloc[int(l[0])] == topic:
      return data['responce'].iloc[int(l[0])]

In [48]:
#Preexisting questions and responses testing

list_word = ["@AmericanAir what’s going on with flight 301 DTW &gt; DFW? Delayed over 3 hours. Will it be canceled?", 
             "@AmericanAir awful service, more than 1 hour delay due to logbook maintenance issues. Where is preventive maintenance?",
             "@AmericanAir Yes. Refund my plane ticket😡 with your racist ass workers",
             "@AmericanAir I'm aware. Fits in overhead.This was a return trip. Had same luggage going. This was pre-security. Gate chck, ok, but Empty overheads on flt",
             "Thank you @AmericanAir for destroying not one but BOTH pieces of luggage we checked with you. Paid a checked baggage fee so our luggage could be ruined. Never again."]

list_resp = ["Looks like it has a maintenance delay and it's currently scheduled to take off at 10:10p.",
             "We do many checks pre and post departure. Please share your flight number if we can provide an update.",
             "We don't tolerate discrimination of any kind. Please DM your record locator and contact details (phone and email)",
             "We'd like to share this feedback. Please DM your record locator.",
             "We're sorry about that, Tyler. Please take your bags to the Baggage Service Office to file a claim."]

for q,r in zip(list_word, list_resp):
  print("QUESTION :", q)
  print("EXPECTED :", r)
  print("PREDICTED :", chatbot(q),"\n")
  


QUESTION : @AmericanAir what’s going on with flight 301 DTW &gt; DFW? Delayed over 3 hours. Will it be canceled?
EXPECTED : Looks like it has a maintenance delay and it's currently scheduled to take off at 10:10p.
PREDICTED : @157794 We want to reunite you quickly with your bags. Please work directly with our Baggage team. 

QUESTION : @AmericanAir awful service, more than 1 hour delay due to logbook maintenance issues. Where is preventive maintenance?
EXPECTED : We do many checks pre and post departure. Please share your flight number if we can provide an update.
PREDICTED : @123413 We'll update your claim via this link: https://t.co/Lts67cwTlN 

QUESTION : @AmericanAir Yes. Refund my plane ticket😡 with your racist ass workers
EXPECTED : We don't tolerate discrimination of any kind. Please DM your record locator and contact details (phone and email)
PREDICTED : @137446 We aren't expecting any cancellations during the holiday season. If that changes though, we'll be sure to notify you,

In [49]:
question = input('Enter your question : ')
chatbot(question)

"@116418 We'd like to share this feedback. Please DM your record locator."