# <center>Analyse exploratoire des données Microsoft Frames </center>

Import du fichier : 

In [2]:
import json

# Load the Frames corpus. The encoding is passed explicitly because open()
# otherwise falls back to the platform's locale encoding, which is not
# guaranteed to be UTF-8.
with open('./data/frames.json', encoding='utf-8') as inputfile:
    frames = json.load(inputfile)

Observation du contenu des différents niveaux de l'objet json : 

In [3]:
frames[1368].keys()

dict_keys(['user_id', 'turns', 'wizard_id', 'id', 'labels'])

In [4]:
frames[1368]['labels']

{'userSurveyRating': 5.0, 'wizardSurveyTaskSuccessful': True}

Il y a ici un rating de satisfaction du dialogue - voyons la distribution des scores : 

In [5]:
import pandas as pd
# One survey rating per dialogue, kept in corpus order.
ratings = pd.Series([dialogue['labels']['userSurveyRating'] for dialogue in frames])

In [6]:
ratings.isna().sum()

3

In [7]:
ratings.value_counts()

5.00    982
4.00    215
3.00     83
4.50     29
2.00     28
1.00     25
3.50      2
4.80      1
4.99      1
dtype: int64

Vu les notes de l'ensemble, on ne gardera pour notre modèle que les interactions ayant un rating d'au moins 4 : on retire ainsi les 138 interactions notées en dessous de 4, ainsi que les 3 interactions sans note, soit 141 interactions (environ 10 % de nos données).

Voyons maintenant les autres contenus : 

In [8]:
frames[1368]['turns'][0]

{'text': "I need to book a trip for the whole family from Tampa but we don't know where to go. Can you help us?",
 'labels': {'acts': [{'args': [{'val': 'book', 'key': 'intent'}],
    'name': 'inform'},
   {'args': [{'val': 'Tampa', 'key': 'or_city'},
     {'val': '-1', 'key': 'dst_city'}],
    'name': 'inform'}],
  'acts_without_refs': [{'args': [{'val': 'book', 'key': 'intent'}],
    'name': 'inform'},
   {'args': [{'val': 'Tampa', 'key': 'or_city'},
     {'val': '-1', 'key': 'dst_city'}],
    'name': 'inform'}],
  'active_frame': 1,
  'frames': [{'info': {'intent': [{'val': 'book', 'negated': False}],
     'or_city': [{'val': 'Tampa', 'negated': False}],
     'dst_city': [{'val': '-1', 'negated': False}]},
    'frame_id': 1,
    'requests': [],
    'frame_parent_id': None,
    'binary_questions': [],
    'compare_requests': []}]},
 'author': 'user',
 'timestamp': 1473717094526.0}

In [34]:
frames[1]['turns'][12]['labels']['acts'][0]['name']

'switch_frame'

In [12]:
frames[1368]['turns'][0]['labels']['acts_without_refs']

[{'args': [{'val': 'book', 'key': 'intent'}], 'name': 'inform'},
 {'args': [{'val': 'Tampa', 'key': 'or_city'},
   {'val': '-1', 'key': 'dst_city'}],
  'name': 'inform'}]

In [13]:
frames[1368]['turns'][0]['labels']['active_frame']

1

La section info semble contenir la synthèse des infos engrangées sur l'interaction : 

In [14]:
# Print each entry of the 'info' dict of the first frame attached to
# dialogue 0's opening turn.
first_frame_info = frames[0]['turns'][0]['labels']['frames'][0]['info']
for key in first_frame_info:
    value = first_frame_info[key]
    print( f'\n\nkey : {key}  value : {value}')



key : intent  value : [{'val': 'book', 'negated': False}]


key : budget  value : [{'val': '1700.0', 'negated': False}]


key : dst_city  value : [{'val': 'Atlantis', 'negated': False}]


key : or_city  value : [{'val': 'Caprica', 'negated': False}]


key : str_date  value : [{'val': 'august 13', 'negated': False}]


key : n_adults  value : [{'val': '8', 'negated': False}]


<br>La clé text contient la phrase envoyée par l'utilisateur au tour 0 : 

In [15]:
frames[0]['turns'][0]['text']

"I'd like to book a trip to Atlantis from Caprica on Saturday, August 13, 2016 for 8 adults. I have a tight budget of 1700."

La section labels contient des infos plus détaillées ; la différence entre `acts` et `acts_without_refs` n'est pour l'instant pas comprise : 

In [16]:
frames[0]['turns'][0]['labels'].keys()

dict_keys(['acts', 'acts_without_refs', 'active_frame', 'frames'])

In [63]:
frames[1]['turns'][0]['labels']['acts']

[{'args': [{'val': 'book', 'key': 'intent'}], 'name': 'inform'},
 {'args': [{'val': 'Mos Eisley', 'key': 'dst_city'},
   {'val': 'Gotham City', 'key': 'or_city'},
   {'val': '2100', 'key': 'budget'}],
  'name': 'inform'},
 {'args': [], 'name': 'greeting'}]

In [64]:
frames[1]['turns'][0]['labels']['acts_without_refs']

[{'args': [{'val': 'book', 'key': 'intent'}], 'name': 'inform'},
 {'args': [{'val': 'Mos Eisley', 'key': 'dst_city'},
   {'val': 'Gotham City', 'key': 'or_city'},
   {'val': '2100', 'key': 'budget'}],
  'name': 'inform'},
 {'args': [], 'name': 'greeting'}]

In [18]:
# Flatten the arguments of the second act of dialogue 0's first turn
# into a plain {key: val} mapping.
second_act_args = frames[0]['turns'][0]['labels']['acts'][1]['args']
args_dict = {arg['key']: arg['val'] for arg in second_act_args}
args_dict

{'dst_city': 'Atlantis',
 'or_city': 'Caprica',
 'str_date': 'Saturday, August 13, 2016',
 'n_adults': '8',
 'budget': '1700'}

In [19]:
# Same flattening as for 'acts', this time reading 'acts_without_refs'.
second_act_noref_args = frames[0]['turns'][0]['labels']['acts_without_refs'][1]['args']
args_noref_dict = {arg['key']: arg['val'] for arg in second_act_noref_args}
args_noref_dict

{'dst_city': 'Atlantis',
 'or_city': 'Caprica',
 'str_date': 'Saturday, August 13, 2016',
 'n_adults': '8',
 'budget': '1700'}

## Essais de préparation des données à l'apprentissage avec LUIS : 

Mise en forme des données pour un apprentissage basé seulement sur la compréhension de la 1ère demande : 

In [22]:
def get_example_label(statement, entity_name, value):
    """Locate an entity value inside an utterance, ignoring case.

    Args:
        statement: Utterance text to search.
        entity_name: Entity name attached to the returned span.
        value: Entity value to look for in ``statement``.

    Returns:
        A dict with ``entity_name``, ``start_char_index`` (offset of the
        first case-insensitive occurrence of ``value``) and
        ``end_char_index`` (start plus the length of the value, i.e. one
        past the last matched character). When ``value`` does not occur in
        ``statement`` both indices are ``-1``.
    """
    needle = value.lower()
    # A single lookup is enough; its result feeds both indices.
    start = statement.lower().find(needle)
    # find() signals "absent" with -1; adding the length to that sentinel
    # would produce a span that looks valid but points nowhere.
    end = start + len(needle) if start != -1 else -1
    return {
        'entity_name': entity_name,
        'start_char_index': start,
        'end_char_index': end
    }

In [77]:
from IPython.display import clear_output

v1_keys = ['or_city', 'dst_city', 'str_date', 'end_date', 'budget']

def build_training_dict(frames=frames, noref = False,
                        required_keys = v1_keys, min_rating = 4):
    """Build training examples from the first turn of each dialogue.

    Args:
        frames: List of dialogue records. The default is the ``frames``
            object that exists when this cell is executed.
        noref: When true, read the first turn's ``acts_without_refs``
            instead of its ``acts``.
        required_keys: Argument keys that are kept as entities.
        min_rating: Lowest ``userSurveyRating`` a dialogue needs in order
            to be kept. A missing rating is treated as 0.

    Returns:
        A ``(training_list, args_list)`` tuple. ``training_list`` holds one
        dict per kept dialogue with the keys ``text``, ``intent_name`` and
        ``entity_labels``. ``args_list`` holds, at the same index, the
        ``{key: val}`` dict collected for that dialogue.
    """
    training_list = []
    args_list = []
    acts_field = 'acts_without_refs' if noref else 'acts'
    total = len(frames)

    for index, dialogue in enumerate(frames):
        print(f'now_processing_interaction {index} of {total}')
        clear_output(wait=True)

        # Use a local fallback so the caller's corpus is left untouched.
        rating = dialogue['labels']['userSurveyRating'] or 0
        # Written as "not >=" so that a NaN rating is rejected as well.
        if not rating >= min_rating:
            continue

        first_turn = dialogue['turns'][0]
        text = first_turn['text']

        args_dict = {}
        for act in first_turn['labels'][acts_field]:
            for elt in act['args']:
                # Arguments lacking either a key or a val are ignored.
                if 'key' in elt and 'val' in elt and elt['key'] in required_keys:
                    args_dict[elt['key']] = elt['val']

        candidate_labels = [
            get_example_label(text, key, value) for key, value in args_dict.items()
        ]
        # A negative start means the value was not found in the text (for
        # instance the '-1' placeholder value); such a span cannot be used
        # as a character-offset label.
        entity_labels = [
            label for label in candidate_labels if label['start_char_index'] >= 0
        ]

        training_list.append({'text' : text,
                              'intent_name' : "book",
                              'entity_labels' : entity_labels
                             }
                            )
        # Exactly one entry per kept dialogue, aligned with training_list.
        args_list.append(args_dict)

    return training_list, args_list

In [78]:
trainlist, args = build_training_dict()

now_processing_interaction 1368 of 1369


In [80]:
trainlist[:5]

[{'text': "I'd like to book a trip to Atlantis from Caprica on Saturday, August 13, 2016 for 8 adults. I have a tight budget of 1700.",
  'intent_name': 'book',
  'entity_labels': [{'entity_name': 'dst_city',
    'start_char_index': 27,
    'end_char_index': 35},
   {'entity_name': 'or_city', 'start_char_index': 41, 'end_char_index': 48},
   {'entity_name': 'str_date', 'start_char_index': 52, 'end_char_index': 77},
   {'entity_name': 'budget', 'start_char_index': 117, 'end_char_index': 121}]},
 {'text': "Hi I'd like to go to Caprica from Busan, between Sunday August 21, 2016 and Wednesday August 31, 2016",
  'intent_name': 'book',
  'entity_labels': [{'entity_name': 'dst_city',
    'start_char_index': 21,
    'end_char_index': 28},
   {'entity_name': 'or_city', 'start_char_index': 34, 'end_char_index': 39},
   {'entity_name': 'str_date', 'start_char_index': 49, 'end_char_index': 71},
   {'entity_name': 'end_date',
    'start_char_index': 76,
    'end_char_index': 101}]},
 {'text': 'Hel

Voici donc à quoi pourrait ressembler notre jeu d'apprentissage : 

In [81]:
# First 1000 examples for training, everything after them for testing.
trainset, testset = trainlist[:1000], trainlist[1000:]

On va coder une classe permettant l'entraînement et l'évaluation du modèle sur la base de cette structure de données.