In [2]:
import os

In [29]:
import numpy as np
import pandas as pd
import json
import copy

def read_json(path):
    """
    Load and return the JSON document stored in the file at ``path``.

    The file is opened in text mode and closed as soon as parsing finishes
    (or fails), so no file handle is left dangling.
    """
    with open(path, 'r') as fn:
        return json.load(fn)

In [4]:
def dump_json(path, obj):
    """
    Serialize ``obj`` as JSON into the file at ``path``.

    The file is opened in write mode, so existing content is replaced.
    """
    with open(path, mode='w') as out_file:
        json.dump(obj, fp=out_file)

In [6]:
def delete_duplicates(json_list):
    """
    Keep only the first item for every distinct 'nl' value.

    ``json_list`` is a list of dicts shaped like this:
    json_list[
             {'nl': 'THIS_SENT_IS_THE_KEY', ...},
             {...},
        ...]

    Only the 'nl' values are compared; the rest of each item is ignored.
    The input list is not modified: a new list is returned that holds the
    same item objects in their original order. The number of kept items is
    printed as a side effect.

    The 'nl' values must be hashable (they are tracked in a set so that the
    membership test stays O(1) instead of rescanning a list).
    """
    seen_nls = set()
    res = []
    for item in json_list:
        nl = item['nl']
        if nl not in seen_nls:
            seen_nls.add(nl)
            res.append(item)
    print('unique items: {}'.format(len(res)))
    return res

In [7]:
def process_log(log, who='usr'):
    """
    Flatten the turns of one dialogue into per-turn records.

    Parameters
    ----------
    log : dict
        Dialogue with a 'turns' list. For ``who == 'usr'`` every turn is read
        through 'transcription' and ['semantics']['json']; for
        ``who == 'agt'`` through ['output']['transcript'] and
        ['output']['dialog-acts']. Every turn also needs 'turn-index'.
    who : str
        'usr' or 'agt' -- selects which of the two layouts above is used.

    Returns
    -------
    (turns_arr, dialog)
        ``turns_arr`` is a list with one dict per turn holding the keys
        'slots', 'values', 'acts', 'set_acts' (frozenset of 'acts') and 'nl'
        (the sentence text). ``dialog`` maps turn index -> sentence text.

    Raises
    ------
    ValueError
        If ``who`` is neither 'usr' nor 'agt'. The check runs before any turn
        is processed, so an invalid value is reported even for an empty log.
    """
    if who not in ('usr', 'agt'):
        raise ValueError('Wrong value of "who" param!')

    dialog = {}
    turns_arr = []
    for turn in log['turns']:
        if who == 'usr':
            nl = turn['transcription']
            semantics = turn['semantics']['json']
        else:
            nl = turn['output']['transcript']
            semantics = turn['output']['dialog-acts']
        dialog[turn['turn-index']] = nl

        slots, values, acts = [], [], []
        # every sentence may have more than 1 slot;
        for item in semantics:
            act = item['act']
            if act:
                acts.append(act)

            if item['slots']:
                # slots are stored as a list of pairs ([[name, value]]);
                # only the first pair of each act is used.
                first_pair = item['slots'][0]
                if act != 'request':
                    slots.append(first_pair[0])
                    values.append(first_pair[1])
                else:
                    # for requests the pair is ['slot', <requested slot name>]
                    slots.append(first_pair[1])
                    values.append("NONE")

        # Key order is kept as slots, values, acts, set_acts, nl so that a
        # DataFrame built from these records keeps its column order.
        turns_arr.append({
            'slots': slots,
            'values': values,
            'acts': acts,
            'set_acts': frozenset(acts),
            'nl': nl,
        })

    return turns_arr, dialog

In [131]:
def make_placeholders(df):
    """
    Replace slot values inside every 'nl' sentence with ``$slot$`` markers.

    ``df`` must have the columns 'nl' (sentence), 'slots' and 'values'
    (parallel sequences per row). The input frame is not changed; a deep copy
    with a rewritten 'nl' column is returned. After the replacement the word
    'noise' is removed from every sentence and whitespace is normalised.

    The new sentences are computed as plain strings and assigned back as a
    whole column. Writing into the Series yielded by ``DataFrame.iterrows()``
    is not guaranteed to reach the frame (pandas may hand out a copy), so the
    function does not rely on it.

    Raises KeyError when a request ('NONE' value) refers to a slot other than
    'signature' that is missing from the keyword table below.
    """
    # Surface forms tried, in order, when a slot is requested and therefore
    # has no concrete value to look for in the sentence.
    req_parse_vals = {
        'phone': ['phone number', 'phone', 'number', 'telephone'],
        'addr': ['address', 'addre', 'addrss', 'adddress'],
        'pricerange': ['price range', 'price ran', 'price of food', 'price code', 'prices', 'price', 'cost'],
        'area': ['area', 'part of town', 'location'],
        'food': ['food', 'serves'],
        'postcode': ['postcode', 'post code', 'zip code', 'postal code'],
        'name': ['name'],
    }

    def _delexicalize(nl, slots, values):
        # Apply the placeholder rules for every (slot, value) pair of one row.
        for slot, val in zip(slots, values):
            placeholder = "${}$".format(slot)
            if slot == 'addr' or slot == 'postcode':
                # these values are matched in title-cased form
                nl = nl.replace(val.title(), placeholder)
                if slot == 'postcode':
                    # yes, this is full of hacks, but there is no other way:
                    # the last 7 characters of the sentence are cut off
                    # (presumably leftovers of the postcode -- TODO confirm)
                    nl = nl[:-7]
            elif slot != 'this':
                if val != 'NONE' and val != 'dontcare':
                    nl = nl.replace(val, placeholder)
                elif val == 'dontcare':
                    for assump in ['any', 'you dont care']:
                        nl = nl.replace(assump, placeholder)
                elif slot != 'signature':
                    # request case -- we don't know the exact value of the
                    # slot, so the first matching keyword is replaced
                    for keyword in req_parse_vals[slot]:
                        if keyword in nl:
                            nl = nl.replace(keyword, placeholder)
                            break
        return nl

    pandas_df = copy.deepcopy(df)
    pandas_df['nl'] = [
        ' '.join(_delexicalize(nl, slots, values).replace('noise', '').split())
        for nl, slots, values in zip(pandas_df['nl'], pandas_df['slots'], pandas_df['values'])
    ]
    return pandas_df

In [164]:
def make_right_columns(df,who, wrong_acts = ['repeat', 'ack', 'restart']):
    """
    Expand the list-valued 'acts' / 'slots' columns into act1..act3 and
    slot1..slot3 and clean the table up.

    Steps, in order:
      * take the first three acts and slots of every row ('' when missing);
      * for ``who == 'usr'`` resolve the 'this' pseudo-slot, for
        ``who == 'agt'`` strip everything after a 'canthelp.exception' second
        act; in both cases blank act2/act3 when the matching slot is empty
        (any other ``who`` leaves the six columns untouched);
      * drop rows with an empty act1;
      * drop the 'acts', 'slots', 'set_acts' and 'values' columns;
      * drop rows where any of act1..act3 is listed in ``wrong_acts``;
      * replace '$pricerange$ly' with '$pricerange$' in 'nl';
      * drop rows with an empty 'nl' and duplicated rows.

    The original ``df`` is not changed; a new frame is returned. The shapes
    before and after are printed.

    The per-row fixes are computed on plain lists and assigned back as whole
    columns; writing into the rows yielded by ``DataFrame.iterrows()`` is not
    guaranteed to reach the frame.
    """
    def _first_three(seq):
        # First three entries of ``seq`` padded with '' up to length 3.
        head = list(seq[:3])
        return head + [''] * (3 - len(head))

    def _delete_unnecessary_acts(acts, slots):
        # if a secondary act has no slot, remove that act
        for pos in (1, 2):
            if acts[pos] != '' and slots[pos] == '':
                acts[pos] = ''

    def _replace_this_slot(acts, slots):
        # Apply the who-specific corrections to one row, in place.
        if who == 'usr':
            for pos in range(3):
                if slots[pos] != 'this':
                    continue
                if acts[pos] == 'inform':
                    # an 'inform' act on the 'this' slot becomes 'dontcare'
                    acts[pos] = 'dontcare'
                    slots[pos] = ''
                elif pos + 1 < 3 and slots[pos + 1] != '':
                    # if there is a slot after 'this', move it (and its act)
                    # into the place of 'this'
                    slots[pos], slots[pos + 1] = slots[pos + 1], ''
                    acts[pos], acts[pos + 1] = acts[pos + 1], ''
                else:
                    # no following slot (this includes the third position,
                    # which has no successor at all): just clear 'this'
                    slots[pos] = ''
            _delete_unnecessary_acts(acts, slots)
        elif who == 'agt':
            if acts[1] == 'canthelp.exception':
                acts[1] = acts[2] = ''
                slots[1] = slots[2] = ''
            _delete_unnecessary_acts(acts, slots)

    new_df = copy.deepcopy(df)
    print('shape before:', new_df.shape)

    fixed_rows = []
    for row_acts, row_slots in zip(df['acts'], df['slots']):
        acts = _first_three(row_acts)
        slots = _first_three(row_slots)
        _replace_this_slot(acts, slots)
        fixed_rows.append(acts + slots)

    new_columns = ['act1', 'act2', 'act3', 'slot1', 'slot2', 'slot3']
    for pos, column in enumerate(new_columns):
        new_df[column] = [fixed[pos] for fixed in fixed_rows]

    # Rows are filtered with boolean masks (not by index label), so a
    # non-unique index cannot remove unrelated rows.
    new_df = new_df[new_df['act1'] != '']  # delete rows without any info
    new_df = new_df.drop(['acts', 'slots', 'set_acts', 'values'], axis=1)
    has_wrong_act = new_df[['act1', 'act2', 'act3']].isin(list(wrong_acts)).any(axis=1)
    new_df = new_df[~has_wrong_act]
    new_df = new_df.assign(
        nl=[nl.replace('$pricerange$ly', '$pricerange$') for nl in new_df['nl']]
    )
    new_df = new_df[new_df['nl'] != '']

    new_df = new_df.drop_duplicates()
    print('shape after:', new_df.shape)
    return new_df

In [199]:
def canthelp_edit(df, ontology_path="../ontology_dstc2.json"):
    """
    Add the missing slots to rows whose act1 is 'canthelp', using the
    ontology file.

    For every such row the sentence in 'nl' is searched, in this order, for
    a value of ontology['informable']['pricerange'], then ['area'], then
    ['food']. The first value found per slot is replaced by ``$slot$`` in
    'nl', and the slot name plus the act 'canthelp' are written into the
    next free position (slot2/act2 first, then slot3/act3). Once both
    positions are taken the remaining slots are not searched.

    ``df`` is modified in place (columns 'nl', 'act2', 'act3', 'slot2',
    'slot3' are rewritten and duplicated rows are dropped); nothing is
    returned. The resulting shape is printed.

    The new cell values are collected in plain lists and assigned back as
    whole columns, because writing into the rows yielded by
    ``DataFrame.iterrows()`` is not guaranteed to reach the frame. Values are
    matched with ``in`` so that a value at the very start of the sentence is
    found too.
    """
    ontology = read_json(ontology_path)

    touched_columns = ['nl', 'act2', 'act3', 'slot2', 'slot3']
    updated = {column: list(df[column]) for column in touched_columns}

    for pos, act1 in enumerate(df['act1']):
        if act1 != 'canthelp':
            continue
        nl = updated['nl'][pos]
        vacant = 2  # number of the next free act/slot column pair
        for slot in ('pricerange', 'area', 'food'):
            if vacant > 3:
                # both secondary positions are already filled
                break
            for value in ontology['informable'][slot]:
                # skip empty ontology entries: '' is contained in any string
                if value and value in nl:
                    nl = nl.replace(value, '${}$'.format(slot))
                    updated['slot' + str(vacant)][pos] = slot
                    updated['act' + str(vacant)][pos] = 'canthelp'
                    vacant += 1
                    break
        updated['nl'][pos] = nl

    for column in touched_columns:
        df[column] = updated[column]
    df.drop_duplicates(inplace=True)
    print('Now shape:', df.shape)

# Main

In [200]:
# Build the user / agent utterance tables from the DSTC2 training dialogues
# and dump the raw dialogue text next to them.
src_path = '../data/dstc2_all/original_data/'
train = '../data/dstc2_all/original_data/scripts/config/dstc2_train.flist'
# NOTE(review): absolute, machine-specific output path -- kept unchanged so the
# file still lands where it did before; consider a path relative to the
# project (the other outputs of this notebook go to '../').
dialogs_path = "/Users/fogside/Projects/NLP_RF/dialogs.txt"

# One dialogue directory per line. The list is read with plain file I/O:
# pd.read_csv(..., sep='\n') is rejected by newer pandas versions, and the
# number of entries no longer has to be exactly 1612.
with open(train, 'r') as flist:
    trn_flist = [line.strip() for line in flist if line.strip()]

diaacts_usr = []
diaacts_agt = []


with open(dialogs_path, 'w') as diatext:
    for i,f in enumerate(trn_flist):
        label_path = os.path.join(src_path, f, 'label.json')
        log_path = os.path.join(src_path, f, 'log.json')

        label = read_json(label_path)
        log = read_json(log_path)

        acts_agt, dialog_agt = process_log(log, 'agt')
        acts_usr, dialog_usr = process_log(label, 'usr')

        diaacts_usr.extend(acts_usr)
        diaacts_agt.extend(acts_agt)

        # Dialogue header followed by alternating agent / user lines; this
        # relies on both dialogs being keyed 0..n-1 by turn index.
        diatext.write(">{}\n".format(i))
        for j in range(len(dialog_agt)):
            diatext.write("{}\n".format(dialog_agt[j]))
            diatext.write("{}\n".format(dialog_usr[j]))

diaacts_usr = delete_duplicates(diaacts_usr)
diaacts_agt = delete_duplicates(diaacts_agt)

usr_df = pd.DataFrame(diaacts_usr)
agt_df = pd.DataFrame(diaacts_agt)

agt_df = make_placeholders(agt_df)
usr_df = make_placeholders(usr_df)

usr_df = make_right_columns(usr_df, 'usr')
agt_df = make_right_columns(agt_df, 'agt')

# canthelp_edit works in place and returns nothing
canthelp_edit(agt_df)

unique items: 2912
unique items: 1679
shape before: (2912, 5)
shape after: (1940, 7)
shape before: (1679, 5)
shape after: (228, 7)
Now shape: (84, 7)


In [171]:
# Save the processed user table as CSV; the index is not written.
usr_df.to_csv("../usr_df_final.csv", index=False)

In [201]:
# Save the processed agent table as CSV; the index is not written.
agt_df.to_csv("../agt_df_final.csv", index=False)

# Test

In [48]:
# Print every 'nl' value of the rows where act1, act2 or act3 is 'expl-conf'.
# NOTE(review): `df_new` is not defined in any cell visible here -- presumably
# a previously saved table loaded elsewhere; confirm before a fresh run.
for nl in df_new[(df_new.act1=='expl-conf')|(df_new.act2=='expl-conf')|(df_new.act3=='expl-conf')].nl:
    print(nl)

Let me confirm , You are looking for a restaurant in the $pricerange$ price range right?
Did you say you are looking for a restaurant in the $area$ of town?
You are looking for a $food$ restaurant right?
You are looking for a restaurant serving any kind of food right?
Let me confirm , You are looking for a restaurant and you dont care about the price range right?
Ok , a restaurant in any part of town is that right?


In [50]:
# Print every 'nl' value of the rows where act1, act2 or act3 is 'impl-conf'.
# NOTE(review): `df_new` is not defined in any cell visible here -- presumably
# a previously saved table loaded elsewhere; confirm before a fresh run.
for nl in df_new[(df_new.act1=='impl-conf')|(df_new.act2=='impl-conf')|(df_new.act3=='impl-conf')].nl:
    print(nl)

There are restaurants serving $food$ in the $pricerange$ price range . What $area$ would you like?
There are restaurants serving $food$ food . What $area$ do you want?
There are restaurants in the $pricerange$ price range and the $area$ of town . What type of $food$ would you like?


In [46]:
# Union of all distinct values found in the act1, act2 and act3 columns.
# NOTE(review): `df_new` is not defined in any cell visible here -- confirm its origin.
set(df_new.act1)|set(df_new.act2)|set(df_new.act3)

{'',
 'canthelp',
 'canthelp.exception',
 'confirm-domain',
 'expl-conf',
 'impl-conf',
 'inform',
 'offer',
 'reqmore',
 'request',
 'select',
 'welcomemsg'}

In [44]:
# Preview the first rows of the table.
# NOTE(review): `df_new` is not defined in any cell visible here -- confirm its origin.
df_new.head()

Unnamed: 0,nl,act1,act2,act3,slot1,slot2,slot3
0,"Hello , welcome to the Cambridge restaurant sy...",welcomemsg,,,,,
1,What kind of $food$ would you like?,request,,,food,,
2,Sorry there is no swedish restaurant in the $p...,canthelp,,,pricerange,,
3,$name$ is a great restaurant serving $food$ fo...,offer,inform,inform,name,food,pricerange
4,Can I help you with anything else?,reqmore,,,,,


In [45]:
# Preview the first rows of the table.
# NOTE(review): `df_new_usr` is not defined in any cell visible here -- confirm its origin.
df_new_usr.head()

Unnamed: 0,nl,act1,act2,act3,slot1,slot2,slot3
0,hello and welcome to the cambridge,hello,,,,,
1,$pricerange$ly priced $food$ food,inform,inform,,food,pricerange,
2,how about $food$,reqalts,inform,,food,,
3,is there anything else,reqalts,,,,,
4,could i have the $addr$ and $phone$,request,request,,addr,phone,
