### Dialogflow Analyses

This notebook shows how to automatically:
- authenticate with gcloud
- replicate a chosen agent into two clones
- split the training phrases / configurations / NLU settings
- modify an agent and handle API quota limits
- run a k-fold (in this case, k=2) evaluation on the clones
- gather statistics on various tests
- calculate training phrase / intent collisions and confidence scores
- calculate entity / synonym collisions and confidence scores
- automatically export an agent and apply cxlint analysis

In [None]:
#@title install deps
# Installs the third-party packages imported by the later cells (quiet mode).
# NOTE(review): both `tensorflow` and `tensorflow-cpu` are requested, and several
# names use dotted import-style spelling (`google.cloud.storage`, `google.cloud`,
# `google`, `google.colab`) — confirm these resolve to the intended PyPI distributions.
%pip install tensorflow tensorflow-cpu scann dfcx-scrapi gspread gspread-dataframe tabulate pandas oauth2client cxlint google.cloud.storage google.cloud google google.colab --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m225.6/225.6 MB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.6/9.6 MB[0m [31m68.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m248.8/248.8 kB[0m [31m19.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.0/53.0 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m231.8/231.8 MB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m524.1/524.1 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.5/10.5 MB[0m [31m83.3 MB/s[0m 

In [None]:
# @title auth with your user account
from google.colab import auth
from google.auth import default

# Project that owns the agents below; also passed to default() as the quota project.
project_id = "gbot-test-065"  # @param
# Interactive Colab sign-in for the current user.
auth.authenticate_user()
# `creds` is handed to every client constructed in the later cells.
creds, _ = default(quota_project_id=project_id)


In [None]:
# @title variables
import os
from pprint import pprint

# Agent that is exported and analysed; it is only read from in this notebook.
source_agent_id = f"projects/{project_id}/locations/us-central1/agents/f7ec8404-df72-4ae6-9aa0-34a677c70b15"  # @param
# Two pre-existing clone agents. The source export is restored into both of them and
# their training phrases are then split between them for the k-fold evaluation.
advanced_nlu_agent_id = f"projects/{project_id}/locations/us-central1/agents/8506c18c-6537-4984-8b78-345a5a16c08a"  # @param
standard_nlu_agent_id = f"projects/{project_id}/locations/us-central1/agents/6988c861-4916-4391-82a0-f7233176d39b"  # @param
##
# Optionally, you could modify the code to create the test agents on the fly (and delete them afterwards).
#

# GCS bucket used for all agent exports in this notebook.
gcs_path = "gs://keas-nlu-analysis-example"  # @param
# Object the source agent is exported to; both clones are restored from this same object.
gcs_path_blob = f"{gcs_path}/snlu-vs-anlu-export-blob"  # @param
# Make the project visible to libraries that read it from the environment.
os.environ["GOOGLE_CLOUD_PROJECT"] = project_id


In [None]:
# @title export source_agent to cloud storage
from dfcx_scrapi.core.agents import Agents
from dfcx_scrapi.core.operations import Operations
from google.longrunning.operations_pb2 import Operation as GOp
import time


def waitForOp(lro_id: str, poll_interval: float = 1.0):
    """Block until the long-running operation ``lro_id`` has finished.

    The operation is fetched once up front and then re-fetched every
    ``poll_interval`` seconds until its ``done`` flag is set. Every re-fetched
    status is pretty-printed so progress is visible in the cell output.

    Args:
        lro_id: Resource name of the long-running operation to wait for.
        poll_interval: Seconds to sleep between polls (defaults to 1 second).

    Raises:
        RuntimeError: If the finished operation reports a non-zero error code.
            Without this check a failed export/restore would look like success
            and the following cells would run against stale agent data.
    """
    ops_client = Operations(creds=creds)
    status: GOp = ops_client.get_lro(lro_id)
    while not status.done:
        time.sleep(poll_interval)
        status = ops_client.get_lro(lro_id)
        pprint(status)

    # `error` is an unset (all-default) Status message on success, so code 0 means OK.
    if status.error.code:
        raise RuntimeError(
            f"Operation {lro_id} failed with code {status.error.code}: {status.error.message}"
        )


# Start the export of the source agent to gcs_path_blob and wait for the operation to finish.
waitForOp(Agents(creds=creds).export_agent(agent_id=source_agent_id, gcs_bucket_uri=gcs_path_blob))


name: "projects/gbot-test-065/locations/us-central1/operations/20230829-13231693340600-64e929b7-0000-2fef-a60b-001a1143d982"

name: "projects/gbot-test-065/locations/us-central1/operations/20230829-13231693340600-64e929b7-0000-2fef-a60b-001a1143d982"
metadata {
  type_url: "type.googleapis.com/google.protobuf.Struct"
}
done: true
response {
  type_url: "type.googleapis.com/google.cloud.dialogflow.cx.v3beta1.ExportAgentResponse"
  value: "\n7gs://keas-nlu-analysis-example/snlu-vs-anlu-export-blob"
}



In [None]:
# @title replicate that blob into both cloned agents
# Restore the exported blob into each clone in turn (standard first, then advanced),
# waiting for every restore operation to finish before starting the next one.
for clone_agent_id in (standard_nlu_agent_id, advanced_nlu_agent_id):
    restore_lro = Agents(creds=creds).restore_agent(agent_id=clone_agent_id, gcs_bucket_uri=gcs_path_blob)
    waitForOp(restore_lro)


name: "projects/gbot-test-065/locations/us-central1/operations/20230829-13231693340618-64e92cd2-0000-28f5-a6fb-30fd38134aa4"
metadata {
  type_url: "type.googleapis.com/google.protobuf.Struct"
}
done: true
response {
  type_url: "type.googleapis.com/google.protobuf.Empty"
}

name: "projects/gbot-test-065/locations/us-central1/operations/20230829-13231693340620-64e9329a-0000-233e-baac-14c14eea03d4"

name: "projects/gbot-test-065/locations/us-central1/operations/20230829-13231693340620-64e9329a-0000-233e-baac-14c14eea03d4"
metadata {
  type_url: "type.googleapis.com/google.protobuf.Struct"
}
done: true
response {
  type_url: "type.googleapis.com/google.protobuf.Empty"
}



In [None]:
# @title force the aNLU flows to all be advanced NLU config
from dfcx_scrapi.core.flows import Flows

# Map of flow id => flow display name for the aNLU clone.
anlu_flows = Flows(creds=creds).get_flows_map(advanced_nlu_agent_id)
pprint(anlu_flows)

# Apply the same NLU settings to every flow of the aNLU clone.
# model_type=3 is the value this notebook uses for "advanced NLU"
# (presumably NluSettings.ModelType.MODEL_TYPE_ADVANCED — confirm against the API enum).
# The loop variable is deliberately not called `id`: at notebook top level that would
# rebind the builtin for every later cell.
flows_client = Flows(creds=creds)
for flow_id in anlu_flows:
    flows_client.update_nlu_settings(flow_id=flow_id, model_type=3, classification_threshold=0.3)


{'projects/gbot-test-065/locations/us-central1/agents/8506c18c-6537-4984-8b78-345a5a16c08a/flows/00000000-0000-0000-0000-000000000000': 'Default '
                                                                                                                                        'Start '
                                                                                                                                        'Flow'}


In [None]:
# @title split training phrases b/w sNLU and aNLU agents for k-fold analysis
from dfcx_scrapi.tools import nlu_util
from dfcx_scrapi.tools import dataframe_functions
from dfcx_scrapi.tools import levenshtein
from dfcx_scrapi.tools import search_util
from dfcx_scrapi.tools import stats_util
from dfcx_scrapi.core.intents import Intents
from dfcx_scrapi.core.agents import Agents
from pprint import pprint
import time

# Pause before the burst of update calls (original intent: start with fresh per-minute quota).
time.sleep(60 / 3.0)

# Each clone keeps every other training phrase of every intent:
# the sNLU clone keeps the even positions, the aNLU clone the odd positions.
for split_agent_id, kept_remainder in ((standard_nlu_agent_id, 0), (advanced_nlu_agent_id, 1)):
    for intent in Intents(creds=creds).list_intents(agent_id=split_agent_id):
        tps = intent.training_phrases
        intent.training_phrases = [
            phrase for position, phrase in enumerate(tps) if position % 2 == kept_remainder
        ]
        Intents(creds=creds).update_intent(intent.name, intent, training_phrases=intent.training_phrases)
        # Short pause between writes to stay under the API rate limit.
        time.sleep(1.0 / 3.0)


In [None]:
# @title Create method to k-fold training phrases for sNLU & aNLU
from dfcx_scrapi.core.conversation import DialogflowConversation
from dfcx_scrapi.core.pages import Pages
from dfcx_scrapi.core.flows import Flows
from dfcx_scrapi.core.intents import Intents
from dfcx_scrapi.core.transition_route_groups import TransitionRouteGroups
import pandas as pd
import itertools


def get_test_set(id, alt_id):
    """Build an intent-detection test set for agent ``id`` from agent ``alt_id``'s phrases.

    Every page of every flow in agent ``id`` is walked. For each intent-bearing
    route on the page (direct transition routes first, then the routes of each
    attached transition route group), one row is produced per training phrase
    that agent ``alt_id`` holds for the intent with the same display name.

    Args:
        id: Agent whose flows, pages and routes define where utterances are tested.
        alt_id: Agent whose training phrases supply the test utterances.

    Returns:
        A DataFrame with the columns ``flow_display_name``, ``page_display_name``,
        ``expected_intent`` and ``utterance``. The columns are present even when
        no rows were produced.
    """
    columns = ["flow_display_name", "page_display_name", "expected_intent", "utterance"]
    phrases = Intents(creds=creds).bulk_intent_to_df(alt_id)  # ['display_name', 'training_phrase']
    intents_map = Intents(creds=creds).get_intents_map(id)  # intent id => intent display name
    pages_client = Pages(creds=creds)
    route_groups_client = TransitionRouteGroups(creds=creds)

    records = []
    for flow in Flows(creds=creds).list_flows(id):
        for page in pages_client.list_pages(flow.name):
            # Direct routes first, then group routes, matching the original row order.
            routes = list(page.transition_routes)
            for group_id in page.transition_route_groups:
                group = route_groups_client.get_transition_route_group(group_id)
                routes.extend(group.transition_routes)

            for route in routes:
                if not route.intent:
                    continue
                intent_name = intents_map[route.intent]
                for utterance in phrases.loc[phrases.display_name == intent_name, "training_phrase"]:
                    records.append(
                        {
                            "flow_display_name": flow.display_name,
                            "page_display_name": page.display_name,
                            "expected_intent": intent_name,
                            "utterance": utterance,
                        }
                    )

    return pd.DataFrame(records, columns=columns)


In [None]:
# Test set for the sNLU clone: its own flows/pages/routes, with utterances taken from
# the training phrases kept in the aNLU clone (the fold the sNLU clone was not given).
snlu_test_set = get_test_set(standard_nlu_agent_id, advanced_nlu_agent_id)
snlu_test_set


Unnamed: 0,flow_display_name,page_display_name,expected_intent,utterance
0,Default Start Flow,Join Frequent Flyer,small_talk.i_don't_know,i forgot the flight number
1,Default Start Flow,Join Frequent Flyer,small_talk.i_don't_know,i lost it
2,Default Start Flow,Join Frequent Flyer,small_talk.i_don't_know,i have no idea
3,Default Start Flow,Join Frequent Flyer,small_talk.i_don't_know,the flight number isn t available
4,Default Start Flow,Join Frequent Flyer,small_talk.i_don't_know,i lost the flight number
...,...,...,...,...
878,Default Start Flow,Anything Else,flights.senior_discount,I want an old lady discount
879,Default Start Flow,Anything Else,flights.senior_discount,I am in newly retired. can I get a senior disc...
880,Default Start Flow,Anything Else,flights.senior_discount,How about a discount for over 65. Is that avai...
881,Default Start Flow,Anything Else,flights.senior_discount,Do senior spouses get a discount


In [None]:
# Test set for the aNLU clone: its own flows/pages/routes, with utterances taken from
# the training phrases kept in the sNLU clone (the fold the aNLU clone was not given).
anlu_test_set = get_test_set(advanced_nlu_agent_id, standard_nlu_agent_id)
anlu_test_set


Unnamed: 0,flow_display_name,page_display_name,expected_intent,utterance
0,Default Start Flow,Join Frequent Flyer,small_talk.i_don't_know,i never remember that kind of stuff
1,Default Start Flow,Join Frequent Flyer,small_talk.i_don't_know,i forgot it
2,Default Start Flow,Join Frequent Flyer,small_talk.i_don't_know,i don't have a clue
3,Default Start Flow,Join Frequent Flyer,small_talk.i_don't_know,i need you to find the flight number for me
4,Default Start Flow,Join Frequent Flyer,small_talk.i_don't_know,the flight number waslost
...,...,...,...,...
918,Default Start Flow,Anything Else,flights.senior_discount,I need a golden girls discount to buy the tick...
919,Default Start Flow,Anything Else,flights.senior_discount,How much is the elderly discount
920,Default Start Flow,Anything Else,flights.senior_discount,Do you have a senior citizen discount
921,Default Start Flow,Anything Else,flights.senior_discount,Can my whole family use the senior discount


In [None]:
# @title Testing sNLU against the training phrases we split off for aNLU
# Run every sNLU test utterance through intent detection on the sNLU clone.
dc = DialogflowConversation(creds=creds)
dc.agent_id = standard_nlu_agent_id
# chunk_size / rate_limit are passed straight to dfcx_scrapi's run_intent_detection.
snlu_results = dc.run_intent_detection(test_set=snlu_test_set, chunk_size=900, rate_limit=20)
snlu_results




Unnamed: 0,flow_display_name,page_display_name,expected_intent,utterance,target_page,match_type,confidence,parameters_set,detected_intent
0,Default Start Flow,Join Frequent Flyer,small_talk.i_don't_know,i forgot the flight number,Join Frequent Flyer,PARAMETER_FILLING,0.3,{'passenger_name': 'i forgot the flight number'},
1,Default Start Flow,Join Frequent Flyer,small_talk.i_don't_know,i lost it,Join Frequent Flyer,PARAMETER_FILLING,0.3,{'passenger_name': 'i lost it'},
2,Default Start Flow,Join Frequent Flyer,small_talk.i_don't_know,i have no idea,Join Frequent Flyer,PARAMETER_FILLING,0.3,{'passenger_name': 'i have no idea'},
3,Default Start Flow,Join Frequent Flyer,small_talk.i_don't_know,the flight number isn t available,Join Frequent Flyer,PARAMETER_FILLING,0.3,{'passenger_name': 'the flight number isn t av...,
4,Default Start Flow,Join Frequent Flyer,small_talk.i_don't_know,i lost the flight number,Join Frequent Flyer,PARAMETER_FILLING,0.3,{'passenger_name': 'i lost the flight number'},
...,...,...,...,...,...,...,...,...,...
878,Default Start Flow,Anything Else,flights.senior_discount,I want an old lady discount,Anything Else,INTENT,1.0,,flights.senior_discount
879,Default Start Flow,Anything Else,flights.senior_discount,I am in newly retired. can I get a senior disc...,Anything Else,INTENT,1.0,,flights.senior_discount
880,Default Start Flow,Anything Else,flights.senior_discount,How about a discount for over 65. Is that avai...,Anything Else,INTENT,1.0,,flights.senior_discount
881,Default Start Flow,Anything Else,flights.senior_discount,Do senior spouses get a discount,Anything Else,INTENT,1.0,,flights.senior_discount


In [None]:
# Rows where the detected intent differs from the expected one (sNLU failures).
snlu_results[snlu_results.expected_intent != snlu_results.detected_intent]


Unnamed: 0,flow_display_name,page_display_name,expected_intent,utterance,target_page,match_type,confidence,parameters_set,detected_intent
0,Default Start Flow,Join Frequent Flyer,small_talk.i_don't_know,i forgot the flight number,Join Frequent Flyer,PARAMETER_FILLING,0.3,{'passenger_name': 'i forgot the flight number'},
1,Default Start Flow,Join Frequent Flyer,small_talk.i_don't_know,i lost it,Join Frequent Flyer,PARAMETER_FILLING,0.3,{'passenger_name': 'i lost it'},
2,Default Start Flow,Join Frequent Flyer,small_talk.i_don't_know,i have no idea,Join Frequent Flyer,PARAMETER_FILLING,0.3,{'passenger_name': 'i have no idea'},
3,Default Start Flow,Join Frequent Flyer,small_talk.i_don't_know,the flight number isn t available,Join Frequent Flyer,PARAMETER_FILLING,0.3,{'passenger_name': 'the flight number isn t av...,
4,Default Start Flow,Join Frequent Flyer,small_talk.i_don't_know,i lost the flight number,Join Frequent Flyer,PARAMETER_FILLING,0.3,{'passenger_name': 'i lost the flight number'},
5,Default Start Flow,Join Frequent Flyer,small_talk.i_don't_know,i told you i dont have the flight nimber,Join Frequent Flyer,PARAMETER_FILLING,0.3,{'passenger_name': 'i told you i dont have the...,
6,Default Start Flow,Join Frequent Flyer,small_talk.i_don't_know,"sont have it, sorry",Join Frequent Flyer,PARAMETER_FILLING,0.3,"{'passenger_name': 'sont have it, sorry'}",
7,Default Start Flow,Join Frequent Flyer,small_talk.i_don't_know,i have no idea what the flight number is,Join Frequent Flyer,PARAMETER_FILLING,0.3,{'passenger_name': 'i have no idea what the fl...,
8,Default Start Flow,Join Frequent Flyer,small_talk.i_don't_know,i don;t have it,Join Frequent Flyer,PARAMETER_FILLING,0.3,{'passenger_name': 'i don;t have it'},
9,Default Start Flow,Join Frequent Flyer,small_talk.i_don't_know,I dont know the confirmation number,Join Frequent Flyer,PARAMETER_FILLING,0.3,{'passenger_name': 'I dont know the confirmati...,


In [None]:
# (mean, min, median, std) of the sNLU confidence scores.
snlu_confidence = snlu_results["confidence"]
s_facts = (
    snlu_confidence.mean(),
    snlu_confidence.min(),
    snlu_confidence.median(),
    snlu_confidence.std(),
)
s_facts


(0.9556058897707498, 0.30000001192092896, 1.0, 0.1706986775558901)

In [None]:
# @title Testing aNLU against the training phrases we split off for sNLU
# Run every aNLU test utterance through intent detection on the aNLU clone.
dc = DialogflowConversation(creds=creds)
dc.agent_id = advanced_nlu_agent_id
# chunk_size / rate_limit are passed straight to dfcx_scrapi's run_intent_detection.
anlu_results = dc.run_intent_detection(test_set=anlu_test_set, chunk_size=900, rate_limit=20)
anlu_results




Unnamed: 0,flow_display_name,page_display_name,expected_intent,utterance,target_page,match_type,confidence,parameters_set,detected_intent
0,Default Start Flow,Join Frequent Flyer,small_talk.i_don't_know,i never remember that kind of stuff,Join Frequent Flyer,PARAMETER_FILLING,0.3,{'passenger_name': 'i never remember that kind...,
1,Default Start Flow,Join Frequent Flyer,small_talk.i_don't_know,i forgot it,Join Frequent Flyer,PARAMETER_FILLING,0.3,{'passenger_name': 'i forgot it'},
2,Default Start Flow,Join Frequent Flyer,small_talk.i_don't_know,i don't have a clue,Join Frequent Flyer,PARAMETER_FILLING,0.3,{'passenger_name': 'i don't have a clue'},
3,Default Start Flow,Join Frequent Flyer,small_talk.i_don't_know,i need you to find the flight number for me,Join Frequent Flyer,PARAMETER_FILLING,0.3,{'passenger_name': 'i need you to find the fli...,
4,Default Start Flow,Join Frequent Flyer,small_talk.i_don't_know,the flight number waslost,Join Frequent Flyer,PARAMETER_FILLING,0.3,{'passenger_name': 'the flight number waslost'},
...,...,...,...,...,...,...,...,...,...
18,Default Start Flow,Anything Else,flights.senior_discount,I need a golden girls discount to buy the tick...,Anything Else,INTENT,1.0,,flights.senior_discount
19,Default Start Flow,Anything Else,flights.senior_discount,How much is the elderly discount,Anything Else,INTENT,1.0,,flights.senior_discount
20,Default Start Flow,Anything Else,flights.senior_discount,Do you have a senior citizen discount,Anything Else,INTENT,1.0,,flights.senior_discount
21,Default Start Flow,Anything Else,flights.senior_discount,Can my whole family use the senior discount,Anything Else,INTENT,1.0,,flights.senior_discount


In [None]:
# Rows where the detected intent differs from the expected one (aNLU failures).
anlu_results[anlu_results.expected_intent != anlu_results.detected_intent]


Unnamed: 0,flow_display_name,page_display_name,expected_intent,utterance,target_page,match_type,confidence,parameters_set,detected_intent
0,Default Start Flow,Join Frequent Flyer,small_talk.i_don't_know,i never remember that kind of stuff,Join Frequent Flyer,PARAMETER_FILLING,0.3,{'passenger_name': 'i never remember that kind...,
1,Default Start Flow,Join Frequent Flyer,small_talk.i_don't_know,i forgot it,Join Frequent Flyer,PARAMETER_FILLING,0.3,{'passenger_name': 'i forgot it'},
2,Default Start Flow,Join Frequent Flyer,small_talk.i_don't_know,i don't have a clue,Join Frequent Flyer,PARAMETER_FILLING,0.3,{'passenger_name': 'i don't have a clue'},
3,Default Start Flow,Join Frequent Flyer,small_talk.i_don't_know,i need you to find the flight number for me,Join Frequent Flyer,PARAMETER_FILLING,0.3,{'passenger_name': 'i need you to find the fli...,
4,Default Start Flow,Join Frequent Flyer,small_talk.i_don't_know,the flight number waslost,Join Frequent Flyer,PARAMETER_FILLING,0.3,{'passenger_name': 'the flight number waslost'},
5,Default Start Flow,Join Frequent Flyer,small_talk.i_don't_know,i never got the flight number,Join Frequent Flyer,PARAMETER_FILLING,0.3,{'passenger_name': 'i never got the flight num...,
6,Default Start Flow,Join Frequent Flyer,small_talk.i_don't_know,dont have it at all,Join Frequent Flyer,PARAMETER_FILLING,0.3,{'passenger_name': 'dont have it at all'},
7,Default Start Flow,Join Frequent Flyer,small_talk.i_don't_know,dont know,Join Frequent Flyer,PARAMETER_FILLING,0.3,{'passenger_name': 'dont know'},
8,Default Start Flow,Join Frequent Flyer,small_talk.i_don't_know,who knows,Join Frequent Flyer,PARAMETER_FILLING,0.3,{'passenger_name': 'who knows'},
9,Default Start Flow,Join Frequent Flyer,small_talk.i_don't_know,i dont know,Join Frequent Flyer,PARAMETER_FILLING,0.3,{'passenger_name': 'i dont know'},


In [None]:
# (mean, min, median, std) of the aNLU confidence scores.
anlu_confidence = anlu_results["confidence"]
a_facts = (
    anlu_confidence.mean(),
    anlu_confidence.min(),
    anlu_confidence.median(),
    anlu_confidence.std(),
)
a_facts


(0.9597596329831612, 0.30000001192092896, 1.0, 0.16293521494377372)

In [None]:
# @title Calculate some useful stats
# sNLU: a test fails when the detected intent is not the expected one.
snlu_mismatch = snlu_results.expected_intent != snlu_results.detected_intent
snlu_total = snlu_results["confidence"].count()
snlu_failed = snlu_results.loc[snlu_mismatch, "confidence"].count()
snlu_passed = snlu_total - snlu_failed
snlu_report = f"""
sNLU
===
confidence scores: mean ({s_facts[0]}), stddev ({s_facts[3]})
total tests: {snlu_total}
passing tests: {snlu_passed} ({snlu_passed / snlu_total * 100}%)
failed tests: {snlu_failed} ({snlu_failed / snlu_total * 100}%)
"""
print(snlu_report)

# aNLU: same bookkeeping on the other fold.
anlu_mismatch = anlu_results.expected_intent != anlu_results.detected_intent
anlu_total = anlu_results["confidence"].count()
anlu_failed = anlu_results.loc[anlu_mismatch, "confidence"].count()
anlu_passed = anlu_total - anlu_failed
anlu_report = f"""
aNLU
===
confidence scores: mean ({a_facts[0]}), stddev ({a_facts[3]})
total tests: {anlu_total}
passing tests: {anlu_passed} ({anlu_passed / anlu_total * 100}%)
failed tests: {anlu_failed} ({anlu_failed / anlu_total * 100}%)
"""
print(anlu_report)



sNLU
===
confidence scores: mean (0.9556058897707498), stddev (0.1706986775558901)
total tests: 883
passing tests: 826 (93.54473386183464%)
failed tests: 57 (6.455266138165346%)


aNLU
===
confidence scores: mean (0.9597596329831612), stddev (0.16293521494377372)
total tests: 923
passing tests: 870 (94.2578548212351%)
failed tests: 53 (5.742145178764897%)



In [None]:
# @title Calculate semantic similarity of DFCX Pages
#
# Use NLU sentence embeddings to determine how similar different utterances are. We use this information to perform the following analyses:
#
# - Find similar training phrases in different intents that will cause confusion for the NLU model.
# - Identify the most similar training phrases for a user-supplied set of utterances. This will explain where incorrect predictions are coming from on an eval set.
# - Identify clusters of utterances that are unlike any of the phrases in the training data. This can be used to search through utterances that produced NO_MATCH in the logs and identify missing intents/training phrases.

from dfcx_scrapi.tools.nlu_util import KonaEmbeddingModel, SheetsLoader, NaturalLanguageUnderstandingUtil
from dfcx_scrapi.core.conversation import DialogflowConversation
from dfcx_scrapi.core.pages import Pages
from dfcx_scrapi.core.flows import Flows
from dfcx_scrapi.core.intents import Intents
from dfcx_scrapi.core.transition_route_groups import TransitionRouteGroups
import pandas as pd


def find_similar_training_phrases_in_different_intents(agent_id):
    """Collect similar training phrases that sit in different intents, page by page.

    Every page of every flow in ``agent_id`` is inspected. A page is analysed
    when at least one of its intent routes (direct transition routes or routes
    of an attached transition route group) refers to an intent that has
    training phrases. Pages without any such intent are skipped because the
    NLU embedder fails when there are no training phrases.

    The analysis is page-scoped, so it is run once per qualifying page rather
    than once per qualifying route; this only avoids repeating identical rows.

    Args:
        agent_id: Agent whose pages are analysed.

    Returns:
        The concatenated per-page results of
        ``NaturalLanguageUnderstandingUtil.find_similar_training_phrases_in_different_intents``,
        or an empty DataFrame when no page qualified.
    """
    intents_map = Intents(creds=creds).get_intents_map(agent_id)  # intent id => intent display name
    phrases = Intents(creds=creds).bulk_intent_to_df(agent_id)  # ['display_name', 'training_phrase']
    # Display names of intents that own at least one non-null training phrase.
    intents_with_phrases = set(phrases.loc[phrases["training_phrase"].notna(), "display_name"])
    route_groups_client = TransitionRouteGroups(creds=creds)

    frames = []
    for flow in Flows(creds=creds).list_flows(agent_id):
        for page in Pages(creds=creds).list_pages(flow.name):
            routes = list(page.transition_routes)
            for group_id in page.transition_route_groups:
                group = route_groups_client.get_transition_route_group(group_id)
                routes.extend(group.transition_routes)

            page_has_phrases = any(
                route.intent and intents_map[route.intent] in intents_with_phrases
                for route in routes
            )
            if not page_has_phrases:
                continue

            embedder = NaturalLanguageUnderstandingUtil(
                creds=creds, agent_id=agent_id, flow_display_name=flow.display_name, page_display_name=page.display_name
            )
            frames.append(embedder.find_similar_training_phrases_in_different_intents())

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)


In [None]:
# Run the per-page similarity analysis on the untouched source agent.
source_agent_conflicting_tps = find_similar_training_phrases_in_different_intents(source_agent_id)


Loading training data...
Loading embedder...
Generating embeddings for training data...
Loading ScaNN searcher...
Loading training data...
Loading embedder...
Generating embeddings for training data...
Loading ScaNN searcher...
Loading training data...
Loading embedder...
Generating embeddings for training data...
Loading ScaNN searcher...
Loading training data...
Loading embedder...
Generating embeddings for training data...
Loading ScaNN searcher...
Loading training data...
Loading embedder...
Generating embeddings for training data...
Loading ScaNN searcher...
Loading training data...
Loading embedder...
Generating embeddings for training data...
Loading ScaNN searcher...
Loading training data...
Loading embedder...
Generating embeddings for training data...
Loading ScaNN searcher...
Loading training data...
Loading embedder...
Generating embeddings for training data...
Loading ScaNN searcher...
Loading training data...
Loading embedder...
Generating embeddings for training data...


In [None]:
# Alternative filter kept for reference (only rows whose two intents differ):
# conflicting_tps = source_agent_conflicting_tps[source_agent_conflicting_tps['Intent 1'] != source_agent_conflicting_tps['Intent 2']]
# Drop exact duplicate rows, then average the remaining columns per (phrase pair, intent pair).
conflicting_tps = source_agent_conflicting_tps.drop_duplicates().groupby(["Training phrase 1", "Training phrase 2", "Intent 1", "Intent 2"]).mean()
conflicting_tps


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Similarity
Training phrase 1,Training phrase 2,Intent 1,Intent 2,Unnamed: 4_level_1
I don't think so,I think so,small_talk.confirmation.no,small_talk.confirmation.yes,0.850586
I thank you,thank you,small_talk.thanks,small_talk.confirmation.no,0.832066
I'd like to buy a one way ticket,I'd like to buy a double ticket,flights.book_flight,flights.round_trip_ticket,0.832842
I'd like to buy a one way ticket,I'd like to buy a single ticket,flights.book_flight,flights.one_way_ticket,0.876263
I'd like to buy a one way ticket,i want a one way ticket,flights.book_flight,flights.one_way_ticket,0.884216
I'd like to buy a one way ticket,want a one way ticket,flights.book_flight,flights.one_way_ticket,0.802861
I'd like to buy a single ticket,I'd like to buy a double ticket,flights.one_way_ticket,flights.round_trip_ticket,0.87622
can i get a one way ticket to london,can i get a round trip ticket to london,flights.one_way_ticket,flights.round_trip_ticket,0.902953
do you sell one way tickets to Entebbe,do you sell round trip tickets to Entebbe,flights.one_way_ticket,flights.round_trip_ticket,0.853472
i want a one way ticket,i want a round trip ticket,flights.one_way_ticket,flights.round_trip_ticket,0.830298


In [None]:
# @title Entity/Synonym Analyses Source Data
from dfcx_scrapi.core.entity_types import EntityTypes

# Entity types of the source agent as a DataFrame; this is the input of the synonym analysis.
source_agent_entity_types = EntityTypes(creds=creds).entity_types_to_df(source_agent_id)
source_agent_entity_types.head(20000)  # can only render 20k rows in Colab


Unnamed: 0,display_name,entity_value,synonyms
0,airport-codes,ABQ,ABQ
1,airport-codes,ABQ,Albuquerque\t
2,airport-codes,ABQ,Albuquerque International Sunport
509,airport-codes,ABZ,ABZ
510,airport-codes,ABZ,Aberdeen
...,...,...,...
0,frequent_flyer_account,@sys.number-sequence,@sys.number-sequence
0,origin-city,@airport-codes,@airport-codes
1,origin-city,@sys.geo-city,@geo-city
2,origin-city,@sys.geo-country,@sys.geo-country


In [None]:
# @title Entity/Synonym Analyses
import numpy as np
from dfcx_scrapi.core.entity_types import EntityTypes
from dfcx_scrapi.tools.nlu_util import KonaEmbeddingModel, SheetsLoader, NaturalLanguageUnderstandingUtil
import scann
import pandas as pd


def find_similar_entities(types, num_neighbors=3, min_similarity=0.87):
    """Find pairs of synonyms whose sentence embeddings are highly similar.

    Each synonym is embedded, its ``num_neighbors`` nearest synonyms are looked
    up, and a pair is reported when its similarity exceeds ``min_similarity``
    and the two rows differ in entity name or synonym text. Each unordered
    pair of rows is reported once.

    Both the indexed vectors and the query vectors are L2-normalised, so the
    dot-product score that is compared with ``min_similarity`` is a cosine
    similarity regardless of the length of the raw embeddings.

    Args:
        types: DataFrame with at least the columns ``display_name`` (entity
            name) and ``synonyms`` (synonym text), one row per synonym.
        num_neighbors: Number of nearest neighbours retrieved per synonym.
        min_similarity: Exclusive lower bound on the similarity of reported pairs.

    Returns:
        DataFrame with the columns ``Synonym 1``, ``Entity 1``, ``Synonym 2``,
        ``Entity 2`` and ``Similarity``, sorted by descending similarity with a
        fresh index. Empty (but with those columns) when there is no input or
        no pair qualifies.
    """
    columns = ["Synonym 1", "Entity 1", "Synonym 2", "Entity 2", "Similarity"]
    allsynonyms = types["synonyms"].to_numpy(str)
    all_entities = types["display_name"].to_numpy(str)
    numsynonyms = len(allsynonyms)
    if numsynonyms == 0:
        # Nothing to embed or index.
        return pd.DataFrame(columns=columns)

    embedder = KonaEmbeddingModel()
    embeddings = embedder.embed(allsynonyms)
    normalized_dataset = embeddings / np.linalg.norm(embeddings, axis=1)[:, np.newaxis]
    searcher = scann.scann_ops_pybind.builder(normalized_dataset, num_neighbors, "dot_product").score_brute_force().build()

    # Row i of all_idx_1 is i repeated num_neighbors times, aligned with the neighbours in all_idx_2.
    all_idx_1 = np.tile(np.arange(numsynonyms)[:, None], num_neighbors)
    # Query with the normalised vectors so the score is a true cosine similarity.
    all_idx_2, similarities = searcher.search_batched(normalized_dataset)

    # Keep neighbours that are a different (entity, synonym) row; this also drops self-matches.
    different_entity_mask = (all_entities[all_idx_1] != all_entities[all_idx_2]) | (
        allsynonyms[all_idx_1] != allsynonyms[all_idx_2]
    )
    mismatch_mask = different_entity_mask & (similarities > min_similarity)
    mismatch_idx_1 = all_idx_1[mismatch_mask]
    mismatch_idx_2 = all_idx_2[mismatch_mask]
    mismatch_similarities = similarities[mismatch_mask]
    if mismatch_idx_1.size == 0:
        return pd.DataFrame(columns=columns)

    # Order every pair as (smaller index, larger index) so (a, b) and (b, a) collapse to one entry.
    lower_idx = np.minimum(mismatch_idx_1, mismatch_idx_2)
    upper_idx = np.maximum(mismatch_idx_1, mismatch_idx_2)
    (unique_idx_1, unique_idx_2), unique_index = np.unique([lower_idx, upper_idx], axis=1, return_index=True)
    unique_similarities = mismatch_similarities[unique_index]

    df = (
        pd.DataFrame(
            {
                "Synonym 1": allsynonyms[unique_idx_1],
                "Entity 1": all_entities[unique_idx_1],
                "Synonym 2": allsynonyms[unique_idx_2],
                "Entity 2": all_entities[unique_idx_2],
                "Similarity": unique_similarities,
            }
        )
        .sort_values("Similarity", ascending=False)
        .reset_index(drop=True)
    )

    return df


# Similar-synonym pairs for the source agent's entity types, with exact duplicate rows removed.
source_agent_entity_analyses = find_similar_entities(source_agent_entity_types).drop_duplicates()
source_agent_entity_analyses


Unnamed: 0,Synonym 1,Entity 1,Synonym 2,Entity 2,Similarity
0,@airport-codes,destination-city,@airport-codes,origin-city,1.0
1,A,flight-option,a,flight-option,1.0
2,@geo-city,destination-city,@geo-city,origin-city,1.0
3,@sys.date,departure-date,@sys.date,return-date,1.0
4,C,flight-option,c,flight-option,1.0
5,@sys.geo-country,destination-city,@sys.geo-country,origin-city,1.0
6,B,flight-option,b,flight-option,1.0
7,Holloman Air Force Base,airport-codes,Vandenberg Air Force Base,airport-codes,0.946092
8,Portland International Airport,airport-codes,Portland International Jetport Airport,airport-codes,0.944621
9,Holloman Air Force Base,airport-codes,Altus Air Force Base,airport-codes,0.941874


In [None]:
# @title Filter for synonyms clashing within the same entity
s = source_agent_entity_analyses
# Pairs whose two synonyms belong to the same entity.
same_entity_pairs = s[s["Entity 1"] == s["Entity 2"]]
clash_columns = same_entity_pairs[["Entity 1", "Synonym 1", "Synonym 2"]]
# One row per first synonym, holding the set of entities and the set of clashing synonyms.
clashes_by_synonym = clash_columns.groupby(["Synonym 1"], sort=False).agg(lambda x: set(x))
e1 = clashes_by_synonym.rename(columns={"Entity 1": "in Entities", "Synonym 2": "clashes with.."})

e1


Unnamed: 0_level_0,in Entities,clashes with..
Synonym 1,Unnamed: 1_level_1,Unnamed: 2_level_1
A,{flight-option},{a}
C,{flight-option},{c}
B,{flight-option},{b}
Holloman Air Force Base,{airport-codes},"{McChord Air Force Base, Tyndall Air Force Bas..."
Portland International Airport,{airport-codes},{Portland International Jetport Airport}
Dyess Air Force Base,{airport-codes},"{McChord Air Force Base, Holloman Air Force Base}"
Savannah Hilton Head International Airport,{airport-codes},{Savannah Hilton International Airport}
Roanoke–Blacksburg Regional Airport,{airport-codes},{Tri-Cities Regional Airport}
Altus Air Force Base,{airport-codes},{Whiteman Air Force Base}
Manchester-Boston Regional Airport,{airport-codes},"{Tri-Cities Regional Airport, Roanoke–Blacksbu..."


In [None]:
# @title Filter for synonym clashes with other entities
# Pairs whose synonyms belong to two different entities, collected per entity pair.
cross_entity_pairs = s[s["Entity 1"] != s["Entity 2"]]
e2 = cross_entity_pairs.groupby(["Entity 1", "Entity 2"], sort=False).agg(lambda x: set(x))
e2


Unnamed: 0_level_0,Unnamed: 1_level_0,Synonym 1,Synonym 2,Similarity
Entity 1,Entity 2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
destination-city,origin-city,"{@sys.geo-country, @airport-codes, @geo-city}","{@sys.geo-country, @airport-codes, @geo-city}","{0.9999997019767761, 1.0000004768371582, 1.000..."
departure-date,return-date,{@sys.date},{@sys.date},{1.0}


In [None]:
#@title CXLint (go/cxlint)
# Clear out any old agent data you have before downloading new data.
# Local directory the exported agent is downloaded to and unzipped in.
agent_local_path = './cxlint/agent'
# NOTE: the two shell commands below hard-code the same path as agent_local_path; keep them in sync.
!rm -rf ./cxlint/agent/
!mkdir -p ./cxlint/agent

# GCS object the agent is exported to for linting.
gcs_export_path = f'{gcs_path}/cxlint-export.zip' #@param
# File name handed to CxLint as its output file.
output_file_name = 'cxlint.txt' #@param

from cxlint.cxlint import CxLint
from cxlint.gcs_utils import GcsUtils
from pprint import pprint
from dfcx_scrapi.core.operations import Operations
import time
from google.longrunning.operations_pb2 import Operation as GOp
from google.cloud.dialogflowcx_v3beta1 import services
from google.cloud.dialogflowcx_v3beta1 import types
from google.cloud.dialogflowcx_v3beta1.types.agent import ExportAgentRequest
from dfcx_scrapi.core import scrapi_base
from dfcx_scrapi.core import environments

def export_agent(
    agent_id: str,
    gcs_bucket_uri: str,
    creds: "google.auth.credentials.Credentials",
    environment_display_name: "str | None" = None
) -> str:
    """Start a JSON-package export of an agent to GCS and return the operation name.

    Args:
        agent_id: Full resource name of the agent to export.
        gcs_bucket_uri: ``gs://`` URI the export is written to.
        creds: Credentials used for the Dialogflow CX clients.
        environment_display_name: Optional display name of the environment to
            export. When omitted (or empty) no environment is set on the request.

    Returns:
        The name of the long-running export operation; the export itself is
        not awaited here.

    Raises:
        ValueError: If ``environment_display_name`` is given but the agent has
            no environment with that display name.
    """
    scrapi = scrapi_base.ScrapiBase(creds=creds)
    request = ExportAgentRequest()
    request.name = agent_id
    request.agent_uri = gcs_bucket_uri
    # JSON package format: the export is later downloaded, unzipped and linted from a local directory.
    request.data_format = ExportAgentRequest.DataFormat.JSON_PACKAGE

    if environment_display_name:
        # reverse=True gives display name => environment id.
        environments_by_name = environments.Environments(creds=scrapi.creds).get_environments_map(
            agent_id=agent_id, reverse=True
        )
        environment_id = environments_by_name.get(environment_display_name)
        if not environment_id:
            # Single message string, so str(exc) reads as a sentence rather than a tuple.
            raise ValueError(f"Invalid environment_display_name. {environment_display_name} does not exist!")
        request.environment = environment_id

    client_options = scrapi._set_region(agent_id)
    client = services.agents.AgentsClient(
        credentials=scrapi.creds, client_options=client_options
    )
    response = client.export_agent(request)

    return response.operation.name

# Kick off the export of the source agent to GCS ...
lro_id = export_agent(agent_id=source_agent_id, gcs_bucket_uri=gcs_export_path, creds=creds)

# ... then poll the operation once per second, printing each refreshed status, until it is done.
o = Operations(creds=creds)
opStatus: GOp = o.get_lro(lro_id)
while True:
    if opStatus.done:
        break
    time.sleep(1)
    opStatus = o.get_lro(lro_id)
    pprint(opStatus)

# download/unzip agent and run cxlint
# Regex per resource kind, handed to CxLint as its naming conventions. ".*" accepts any
# name; only head intents are constrained here (their names must start with "head_intent").
naming_conventions = {
    "agent_name": ".*",
    "flow_name": ".*",
    "intent_head_name": "head_intent.*",
    "intent_confirmation_name": ".*",
    "intent_escalation_name": ".*",
    "intent_generic_name": ".*",
    "entity_type_name": ".*",
    "page_generic_name": ".*",
    "page_with_form_name": ".*",
    "page_with_webhook_name": ".*",
    "test_case_name": ".*",
    "webhook_name": ".*"
}

# Linter configuration. The commented-out keyword arguments are optional filters left
# here as examples; only the active arguments are passed.
cx = CxLint(
    verbose=True,
    # load_gcs=True,
    agent_id=source_agent_id,
    naming_conventions=naming_conventions,
    # resource_filter=['flows', 'entity_types', 'webhooks'],
    # intent_include_pattern='ci',
    # intent_exclude_pattern='ci_'
    # flow_include_list=['Internet Services Down'],
    # flow_exclude_list=['Bill Balance', 'Bill Confusion'],
    # intent_pattern='ci',
    # test_case_pattern='SA'
    output_file=output_file_name
)

# Download the exported archive, unzip it into agent_local_path and lint that directory.
gcs = GcsUtils(project_id=project_id)
gcs.creds = creds
agent_file = gcs.download_gcs(gcs_export_path, agent_local_path)
gcs.unzip(agent_file, agent_local_path)
cx.lint_agent(agent_local_path)

name: "projects/gbot-test-065/locations/us-central1/operations/20230829-14021693342972-64e9295e-0000-24b1-ba02-883d24f62e98"

name: "projects/gbot-test-065/locations/us-central1/operations/20230829-14021693342972-64e9295e-0000-24b1-ba02-883d24f62e98"
metadata {
  type_url: "type.googleapis.com/google.protobuf.Struct"
}
done: true
response {
  type_url: "type.googleapis.com/google.cloud.dialogflow.cx.v3beta1.ExportAgentResponse"
  value: "\n0gs://keas-nlu-analysis-example/cxlint-export.zip"
}

