In [1]:
import openpyxl
import json
from collections import Counter
import pandas as pd
import re
from collections import Counter, defaultdict
import random
from copy import deepcopy
import pdb

In [2]:
def load_xlsx_data(filename, sheetname="Sheet1", header=True):
    """ Load xls data
    """
    workbook = openpyxl.load_workbook(filename)
    worksheet = workbook[sheetname]
    headers = []
    contents = []
    for idx, row in enumerate(worksheet.iter_rows()):
        row_values = [cell.value for cell in row]
        if idx == 0 and header:
            headers = row_values
            continue
        if header:
            row_values = dict(zip(headers, row_values))
        contents.append(row_values)
    return contents


RGX_INTENTS = re.compile('intents":(.*?])}]')


def find_dr(target, rgx=RGX_INTENTS):
    res = rgx.findall(target)
    if not res:
        return []
    targ_str = res[0]
    try:
        res = json.loads(targ_str)
    except:
        res = []
    return res
    

def convert_data(rec):
    rank_order = [x.split("|") for x in rec["rank_order"].split("\n") if x] if rec["rank_order"] else []
    dialog_root = find_dr(rec["dialog_root_responsebody"])
    try:
        nlu_model = json.loads(rec["nlu_model_nlp"])
    except:
        nlu_model = {}
    try:
        grammar_body = json.loads(rec["grammar_process_responsebody"])
    except:
        grammar_body = {}
    try:
        model_top1 = json.loads(rec["result"])
    except:
        model_top1 = []
    res = {
        "model_topn": nlu_model,
        "query": grammar_body.get("query", ""),
        "entities": grammar_body.get("entities", []),
        "grammar_topn": grammar_body.get("intents", []),
        "rank_order": [{"domain": x[0], "intent": x[1]} for x in rank_order if len(x) == 2],
        "use_nlu_model": rec["should_use_this"],
        "dialog_root": dialog_root,
        "traceid": rec["p_traceid"],
        "model_top1": model_top1
    }
    return res

In [3]:
# raw_data = load_xlsx_data("./data_0615.xlsx", sheetname="Sheet")
raw_data = load_xlsx_data("‪D:\Desktop\MINI-NLU-全量数据(6.15-6.21).xlsx", sheetname="Sheet")

OSError: [Errno 22] Invalid argument: '\u202aD:\\Desktop\\MINI-NLU-全量数据(6.15-6.21).xlsx'

In [4]:
recs = [x for x in raw_data if "怎么说呢" in x["grammar_process_responsebody"]]

In [5]:
recs[0]["result"]

'null'

In [6]:
raw_data[0].keys()

dict_keys(['dates', 'p_traceid', 'b_client_id', 'nlu_model_nlp', 'nlu_model_tags', 'grammar_process_lid', 'grammar_process_responsebody', 'grammar_process_tags', 'dialog_root_responsebody', 'dialog_root_tags', 'standard_log_lid', 'should_use_this', 'result', 'rank_order', 'standard_log_tags'])

In [7]:
raw_data[0]["result"]

'[{"intent_id":60298221125632,"intent_name":"next_song","grammar_pkg_id":60110064582656,"grammar_pkg_name":"music_2"},{"intent_id":60212523106304,"intent_name":"stop_music","grammar_pkg_id":60110064582656,"grammar_pkg_name":"music_2"},{"intent_id":60200175075328,"intent_name":"collect_music","grammar_pkg_id":60110064582656,"grammar_pkg_name":"music_2"},{"intent_id":60300637044736,"intent_name":"pause_music","grammar_pkg_id":60110064582656,"grammar_pkg_name":"music_2"},{"intent_id":60996413358080,"intent_name":"cancel_pause","grammar_pkg_id":60110064582656,"grammar_pkg_name":"music_2"},{"intent_id":75543887020032,"intent_name":"next_one","grammar_pkg_id":74049716486144,"grammar_pkg_name":"fm_2"},{"intent_id":76090908147712,"intent_name":"pause_book","grammar_pkg_id":74049716486144,"grammar_pkg_name":"fm_2"}]'

In [8]:
data = [convert_data(x) for x in raw_data]

In [13]:
data[0]

{'model_topn': [{'domain': 'music_2',
   'domain_confidence': 0.2065281867980957,
   'intent': 'cancel_pause',
   'intent_confidence': 0.6017264127731323,
   'slots': {},
   'slots_confidence': 0.9999412298202515},
  {'domain': 'fm_2',
   'domain_confidence': 0.19012224674224854,
   'intent': 'pause_book',
   'intent_confidence': 0.9841967225074768,
   'slots': {},
   'slots_confidence': 0},
  {'domain': 'music_2',
   'domain_confidence': 0.19012224674224854,
   'intent': 'next_song',
   'intent_confidence': 0.807354211807251,
   'slots': {},
   'slots_confidence': 0},
  {'domain': 'fm_2',
   'domain_confidence': 0.19012224674224854,
   'intent': 'next_one',
   'intent_confidence': 0.807354211807251,
   'slots': {},
   'slots_confidence': 0},
  {'domain': 'music_2',
   'domain_confidence': 0.1305018663406372,
   'intent': 'pause_music',
   'intent_confidence': 0.9904394149780273,
   'slots': {},
   'slots_confidence': 0.9999434947967529},
  {'domain': 'music_2',
   'domain_confidence':

In [10]:
def clean_str(target):
    if not target:
        return ""
    res = target.strip().lower()
    res = re.sub(r'^[^\w\s]', '', res)
    res = re.sub(r'[^\w\s]$', '', res)
    return res

print(clean_str(" 今天天气好不好？ "))


AWAKENS = ["小豹", "小雅"]

def all_awakens(target, awakens=AWAKENS):
    if len(target) % 2 != 0:
        return False
    parts = [target[i: i + 2] for i in range(0, len(target), 2)]
    parts = set(parts)
    return all(x in awakens for x in parts)



def get_top1_domain(rec):
    result = "other"
    for intent in rec.get("dialog_root", []):
        domain = intent["domain_name"]
        if domain.startswith("music_"):
            result = "music"
            break
        if domain.startswith("fm_"):
            result = "fm"
            break
        if domain != "other":
            result = "XXX"
    if result != "other":
        return result
    for intent in rec.get("model_topn", []):
        domain = intent["domain"]
        if domain.startswith("music_"):
            result = "music"
            break
        if domain.startswith("fm_"):
            result = "fm"
            break
        if domain != "other":
            result = "XXX"
    return result


def sample_records(data):
    valid_count = 0
    t2idx = defaultdict(list)
    qcounts = Counter()
    for idx, rec in enumerate(data):
        query = rec.get("query", "")
        norm_query = clean_str(query)
        rec["norm_query"] = norm_query
        if not norm_query:
            t2idx["empty"].append(idx)
            continue
        if all_awakens(norm_query):
            t2idx["awakens"].append(idx)
            continue
        qcounts[norm_query] += 1
        valid_count += 1
        domain = get_top1_domain(rec)
        t2idx[domain].append(idx)
    samples = []
    aggregates = 0
    currents = set()
    for domain, indexes in t2idx.items():
        if domain in ("empty", "awakens"):
            continue
        idxlen = len(indexes)
        aggregates += idxlen
        if aggregates >= valid_count:
            num = 2000 - len(samples)
        else:
            num = int(idxlen / valid_count * 2000)
        if not num:
            continue
        qset = set([data[x]["query"] for x in indexes])
        if len(qset) < num:
            pdb.set_trace()
        count = 0
        while count < num:
            idx = random.choice(indexes)
            rec = data[idx]
            query = rec["norm_query"]
            indexes.remove(idx)
            if query in currents:
                continue
            currents.add(query)
            rec = deepcopy(data[idx])
            rec = deepcopy(data[idx])
            rec["dtype"] = domain
            rec["dcount"] = idxlen
            rec["qcount"] = qcounts[query]
            samples.append(rec)
            count += 1
    return samples, t2idx, qcounts

今天天气好不好


In [83]:
samples, t2idx, qcounts = sample_records(data)

In [25]:
def get_domains(target):
    for intent in target:
        domain = intent.get("grammar_pkg_name", intent.get("domain_name", intent.get("domain")))
        if domain.startswith("music_"):
            yield "music"
            continue
        if domain.startswith("fm_"):
            yield "fm"
            continue
        if domain != "other":
            yield "XXX"
        else:
            yield "other"

def describe_data(data, target="dialog_root"):
    qcounts = Counter()
    for rec in data:
        qcounts["total"] += 1
        query = rec.get("query", "")
        norm_query = clean_str(query)
        rec["norm_query"] = norm_query
        if not norm_query:
            qcounts["empty"] += 1
            continue
        if all_awakens(norm_query):
            qcounts["awakens"] += 1
            continue
        qcounts["valid"] += 1
        domains = list(set(x for x in get_domains(rec.get(target, []))))
        if not domains:
            domains = ["other"]
        qcounts.update(domains)
    print(target, qcounts)
    return qcounts


_ = describe_data(data)
_ = describe_data(data, target="model_topn")

dialog_root Counter({'total': 9190, 'valid': 8673, 'XXX': 3988, 'other': 3322, 'fm': 1241, 'music': 1099, 'awakens': 378, 'empty': 139})
model_topn Counter({'total': 9190, 'valid': 8673, 'other': 4802, 'fm': 4010, 'music': 3999, 'XXX': 959, 'awakens': 378, 'empty': 139})


In [16]:
data[0]

{'model_topn': [{'domain': 'music_2',
   'domain_confidence': 0.2065281867980957,
   'intent': 'cancel_pause',
   'intent_confidence': 0.6017264127731323,
   'slots': {},
   'slots_confidence': 0.9999412298202515},
  {'domain': 'fm_2',
   'domain_confidence': 0.19012224674224854,
   'intent': 'pause_book',
   'intent_confidence': 0.9841967225074768,
   'slots': {},
   'slots_confidence': 0},
  {'domain': 'music_2',
   'domain_confidence': 0.19012224674224854,
   'intent': 'next_song',
   'intent_confidence': 0.807354211807251,
   'slots': {},
   'slots_confidence': 0},
  {'domain': 'fm_2',
   'domain_confidence': 0.19012224674224854,
   'intent': 'next_one',
   'intent_confidence': 0.807354211807251,
   'slots': {},
   'slots_confidence': 0},
  {'domain': 'music_2',
   'domain_confidence': 0.1305018663406372,
   'intent': 'pause_music',
   'intent_confidence': 0.9904394149780273,
   'slots': {},
   'slots_confidence': 0.9999434947967529},
  {'domain': 'music_2',
   'domain_confidence':

In [84]:
len(set(x["query"] for x in samples))

2000

In [85]:
def save_jsonl(data, filename, descs):
    with open(filename, "w") as fout:
        for desc in descs:
            line = json.dumps(desc, ensure_ascii=False)
            fout.write("#" + line + "\n")
        for rec in data:
            line = json.dumps(rec, ensure_ascii=False)
            fout.write(line)
            fout.write("\n")

In [86]:
save_jsonl(
    filename="2020-06-16.jsonl",
    data=samples,
    descs=[
        {k: len(v) for k, v in t2idx.items()},
        #qcounts
    ])