In [1]:
from sagemaker.huggingface import HuggingFaceModel
import sagemaker

# Execution role handed to the model below.
role = sagemaker.get_execution_role()

# Hub configuration (https://huggingface.co/models): which model to load
# and which task to serve it as.
hub = {
    'HF_MODEL_ID': 'MoritzLaurer/mDeBERTa-v3-base-mnli-xnli',
    'HF_TASK': 'zero-shot-classification',
}

# Hugging Face model object; framework versions are pinned explicitly.
huggingface_model = HuggingFaceModel(
    env=hub,
    role=role,
    transformers_version='4.6.1',
    pytorch_version='1.7.1',
    py_version='py36',
)

# Deploy to SageMaker Inference: a single instance of the given EC2 type.
predictor = huggingface_model.deploy(
    instance_type='ml.g4dn.2xlarge',
    initial_instance_count=1,
)

# First request against the new endpoint; the response is the cell output.
predictor.predict({
    'inputs': "Hi, I recently bought a device from your company but it is not working as advertised and I would like to get reimbursed!",
    'parameters': {
        'candidate_labels': ["refund", "legal", "faq"],
    },
})

--------!

{'sequence': 'Hi, I recently bought a device from your company but it is not working as advertised and I would like to get reimbursed!',
 'labels': ['refund', 'legal', 'faq'],
 'scores': [0.7119424343109131, 0.14697127044200897, 0.14108633995056152]}

In [5]:
def get_res(model, querys, candidate_labels, seg_len):
    """Return the top-ranked label for a query, batching large label sets.

    The candidates are split into chunks of at most ``seg_len`` labels. Each
    chunk is sent to ``model.predict`` and only its first returned label (the
    chunk "winner") is kept. The winners are then ranked against each other,
    repeating the chunking while more than ``seg_len`` of them remain, so no
    single request carries more than ``seg_len`` labels (except when
    ``seg_len`` is 1, where chunking cannot shrink the list and all labels
    are ranked in one final request).

    Args:
        model: Object exposing ``predict(payload)``. The return value is
            indexed as ``result['labels'][0]``, i.e. it is assumed to list
            the candidates best-first.
        querys: The value sent as ``'inputs'`` in every request.
        candidate_labels: Sequence of labels to choose from.
        seg_len: Maximum number of labels per request; must be >= 1.

    Returns:
        The label the model ranks first after the final round.

    Raises:
        ValueError: If ``seg_len`` is smaller than 1 or there are no
            candidate labels.
    """
    if seg_len < 1:
        raise ValueError("seg_len must be at least 1, got %r" % (seg_len,))
    remaining = list(candidate_labels)
    if not remaining:
        raise ValueError("candidate_labels must not be empty")

    def _top_label(labels):
        # One request; only the first-ranked label of the response is used.
        response = model.predict({'inputs': querys, "parameters": {"candidate_labels": labels}})
        return response['labels'][0]

    # A list of exactly seg_len labels fits in one request, hence ">".
    while len(remaining) > seg_len:
        winners = [
            _top_label(remaining[start:start + seg_len])
            for start in range(0, len(remaining), seg_len)
        ]
        # With seg_len == 1 every label wins its own chunk and the list never
        # shrinks; stop reducing and rank everything in the final request.
        stalled = len(winners) >= len(remaining)
        remaining = winners
        if stalled:
            break
    return _top_label(remaining)

In [8]:
# Query the endpoint with a one-element list of product titles and three
# candidate material names; the response is shown as the cell output.
predictor.predict({
    'inputs': ["得力(deli)小号金属外壳办公家用美工刀/裁纸刀 办公用品 2053"],
    'parameters': {
        'candidate_labels': ["小号美工刀", "切纸刀/21X16木质/8001/得力", '大桶装液体胶水7310'],
    },
})

{'sequence': '得力(deli)小号金属外壳办公家用美工刀/裁纸刀 办公用品 2053',
 'labels': ['小号美工刀', '大桶装液体胶水7310', '切纸刀/21X16木质/8001/得力'],
 'scores': [0.9952664971351624, 0.002810346195474267, 0.001923118019476533]}

In [111]:
from sagemaker.huggingface import HuggingFaceModel, HuggingFacePredictor
import sagemaker

# Build a predictor from an endpoint name instead of calling deploy().
huggingface_predictor = HuggingFacePredictor(endpoint_name='huggingface-pytorch-inference-2022-03-15-12-09-15-867')

# 模型匹配正确

In [37]:
print('---模型匹配---')
out = huggingface_predictor.predict({
    'inputs': "得力(deli)小号金属外壳办公家用美工刀/裁纸刀 办公用品 2053",
    'parameters': {
        'candidate_labels': [
            "小号美工刀",
            "切纸刀/21X16木质/8001/得力",
            "美工刀片/NO.2011/100*18*0.5mm/得力",
            "美工刀/2003/8刀头/得力",
        ],
    },
})
print("Sents:", out['sequence'])
print("Label: 小号美工刀")
# 'labels' and 'scores' are read as parallel lists and printed pairwise,
# in the order the endpoint returned them.
for i, pred in enumerate(out['labels']):
    print("pred:", pred, "score:", out['scores'][i])

---模型匹配---
Sents: 得力(deli)小号金属外壳办公家用美工刀/裁纸刀 办公用品 2053
Label: 小号美工刀
pred: 小号美工刀 score: 0.9296925663948059
pred: 美工刀/2003/8刀头/得力 score: 0.0624050609767437
pred: 美工刀片/NO.2011/100*18*0.5mm/得力 score: 0.006105995737016201
pred: 切纸刀/21X16木质/8001/得力 score: 0.0017964026192203164


In [45]:
print('---模型匹配---')
out = huggingface_predictor.predict({
    'inputs': "得力(deli)502强力胶 快干无色胶水 8g/支 10瓶装 办公用品 7144",
    'parameters': {
        'candidate_labels': [
            "大桶装液体胶水7310",
            "502强力胶8g/支",
            "液体胶水/7302/70ml/得力",
            "液体胶水/7302/50ml/得力",
        ],
    },
})
print("Sents:", out['sequence'])
print("Label: 502强力胶8g/支")
# 'labels' and 'scores' are read as parallel lists and printed pairwise,
# in the order the endpoint returned them.
for i, pred in enumerate(out['labels']):
    print("pred:", pred, "score:", out['scores'][i])

---模型匹配---
Sents: 得力(deli)502强力胶 快干无色胶水 8g/支 10瓶装 办公用品 7144
Label: 502强力胶8g/支
pred: 502强力胶8g/支 score: 0.9326900243759155
pred: 大桶装液体胶水7310 score: 0.03859197720885277
pred: 液体胶水/7302/70ml/得力 score: 0.015977470204234123
pred: 液体胶水/7302/50ml/得力 score: 0.01274050585925579


# 模型匹配正确但是label不正确

In [38]:

# Query the endpoint for one product title and print every candidate with its
# score next to the expected label from the data set.
print('---模型匹配---')
out=huggingface_predictor.predict({
    'inputs': "得力(deli)20cm办公通用直尺 测量绘图尺子 办公用品 6220",
    "parameters": {
        "candidate_labels": [
            "办公用尺",
            "外径千分尺(200-225mm)",
            # This label contains a literal backslash. It is spelled "\\"
            # because "\G" is not a valid escape sequence and triggers a
            # DeprecationWarning/SyntaxWarning; the runtime value is unchanged.
            "多功能数字显示角度测量仪(BOSCH \\GAM220)",
            "数显卡尺(0-200mm)",
            "有机直尺/30cm/得力"
        ]
    }
})
print("Sents:", out['sequence'])
print("Label: 有机直尺/30cm/得力")
# 'labels' and 'scores' are indexed in parallel, in the returned order.
for i in range(len(out['labels'])):
    print("pred:",out['labels'][i],"score:",out['scores'][i])

---模型匹配---
Sents: 得力(deli)20cm办公通用直尺 测量绘图尺子 办公用品 6220
Label: 有机直尺/30cm/得力
pred: 办公用尺 score: 0.9851844906806946
pred: 外径千分尺(200-225mm) score: 0.004220536909997463
pred: 多功能数字显示角度测量仪(BOSCH \GAM220) score: 0.0041100019589066505
pred: 有机直尺/30cm/得力 score: 0.003446540329605341
pred: 数显卡尺(0-200mm) score: 0.0030384077690541744


In [40]:

print('---模型匹配---')
out = huggingface_predictor.predict({
    'inputs': "晨光(M&G)文具G-5黑色0.5mm按动子弹头中性笔芯 签字笔替芯 水笔芯 K35/S01/S08适用 20支/盒",
    'parameters': {
        'candidate_labels': [
            "签字笔/K-35/按动0.5MM芯 黑色/晨光",
            "外径千分尺(200-225mm)",
            "中性笔芯/G-5/0.5MM芯 按动款蓝色/晨光",
            "中性笔芯/G-5/0.5MM芯 按动款红色/晨光",
            "中性笔芯/G-5/0.5MM芯 按动款黑色/晨光",
        ],
    },
})
print("Sents:", out['sequence'])
print("Label: 中性笔芯/G-5/0.5MM芯 按动款蓝色/晨光")
# 'labels' and 'scores' are read as parallel lists and printed pairwise,
# in the order the endpoint returned them.
for i, pred in enumerate(out['labels']):
    print("pred:", pred, "score:", out['scores'][i])

---模型匹配---
Sents: 晨光(M&G)文具G-5黑色0.5mm按动子弹头中性笔芯 签字笔替芯 水笔芯 K35/S01/S08适用 20支/盒
Label: 中性笔芯/G-5/0.5MM芯 按动款蓝色/晨光
pred: 中性笔芯/G-5/0.5MM芯 按动款黑色/晨光 score: 0.4832707643508911
pred: 签字笔/K-35/按动0.5MM芯 黑色/晨光 score: 0.2497372329235077
pred: 中性笔芯/G-5/0.5MM芯 按动款红色/晨光 score: 0.2118282914161682
pred: 中性笔芯/G-5/0.5MM芯 按动款蓝色/晨光 score: 0.05416397377848625
pred: 外径千分尺(200-225mm) score: 0.0009997383458539844


In [113]:

print('---模型匹配---')
out = huggingface_predictor.predict({
    'inputs': "齐心(Comix) 10个装 55mm牢固耐用粘扣档案盒/A4文件盒/资料盒 EA1002-10 蓝色",
    'parameters': {
        'candidate_labels': [
            "中性笔芯/G-5/0.5MM芯 按动款蓝色/晨光",
            "档案盒/HC-55/A4,2寸,75mm、PP/齐心",
            "档案盒/HC-55/A4,3寸,55mm、PP/齐心",
            "档案盒/HC-55/A5,3寸,55mm、PP/齐心",
            "档案盒/HC-55/A4,3寸,65mm、PP/齐心",
        ],
    },
})
print("Sents:", out['sequence'])
# The expected label printed here is not one of the candidates sent above.
print("Label: 档案盒/HC-75/A4,3寸,75mm、PP/齐心")
# 'labels' and 'scores' are read as parallel lists and printed pairwise,
# in the order the endpoint returned them.
for i, pred in enumerate(out['labels']):
    print("pred:", pred, "score:", out['scores'][i])

---模型匹配---
Sents: 齐心(Comix) 10个装 55mm牢固耐用粘扣档案盒/A4文件盒/资料盒 EA1002-10 蓝色
Label: 档案盒/HC-75/A4,3寸,75mm、PP/齐心
pred: 档案盒/HC-55/A4,3寸,55mm、PP/齐心 score: 0.49754753708839417
pred: 档案盒/HC-55/A5,3寸,55mm、PP/齐心 score: 0.25272226333618164
pred: 档案盒/HC-55/A4,3寸,65mm、PP/齐心 score: 0.12165962904691696
pred: 档案盒/HC-55/A4,2寸,75mm、PP/齐心 score: 0.12124229222536087
pred: 中性笔芯/G-5/0.5MM芯 按动款蓝色/晨光 score: 0.006828295532613993


In [48]:

print('---模型匹配---')
out = huggingface_predictor.predict({
    'inputs': "接触器",
    'parameters': {
        'candidate_labels': [
            "其他电器元件及附件",
            "其他日化用品",
            "其他电力电缆",
            "其他杂品",
            "护肤用品",
        ],
    },
})
print("Sents:", out['sequence'])
print("Label: 其他电力电缆")
# 'labels' and 'scores' are read as parallel lists and printed pairwise,
# in the order the endpoint returned them.
for i, pred in enumerate(out['labels']):
    print("pred:", pred, "score:", out['scores'][i])

---模型匹配---
Sents: 接触器
Label: 其他电力电缆
pred: 其他电力电缆 score: 0.3926471769809723
pred: 其他电器元件及附件 score: 0.20877528190612793
pred: 其他日化用品 score: 0.167989119887352
pred: 其他杂品 score: 0.15777894854545593
pred: 护肤用品 score: 0.07280945777893066


In [142]:
import pandas as pd
# Load the customer/material spreadsheet; `data` is read by later cells.
data = pd.read_excel('./data/客户物料数据.xlsx')
# Bare expression: show the column names as the cell output.
data.columns

Index(['客户主营业务', '商品名称', '平台分类', '物料编码', '物料名称', '物料品类'], dtype='object')

In [145]:
# List every distinct customer business line found in the sheet.
print(data['客户主营业务'].unique())

# Keep only the rows of one business line.
df = data[data['客户主营业务'] == '电力机车、城轨车辆']

# Product names are the queries, material names the expected labels, and
# the distinct material names form the candidate set.
queries = list(df["商品名称"])
labels = list(df["物料名称"])
candidate_labels = list(df["物料名称"].unique())

# Cell output: number of distinct candidate labels.
len(candidate_labels)

['电力机车、城轨车辆' '物业管理' '钢材、有色金属、各类建材、化工原料（不含危险品）、机械设备及零配件' '家居制造']


2038

# prompt推理

In [6]:
import time
import pandas as pd
data = pd.read_excel('./data/客户物料数据.xlsx')
zhbm = data['客户主营业务'].unique()
res = {}
model = predictor

# One result dict per business line, keyed by product name.
for i in zhbm:
    res[i] = {}
    print(i)
    df = data[data['客户主营业务'] == i]
    candidate_labels = list(df["物料名称"].unique())
    queries = list(df["商品名称"])
    labels = list(df["物料名称"])
    for j in range(len(queries)):
        # Register the (still empty) entry first, then fill it in.
        entry = {}
        res[i][queries[j]] = entry
        entry['prompt res'] = get_res(
            model=model,
            querys=queries[j],
            candidate_labels=candidate_labels,
            seg_len=500,
        )
        entry['label'] = labels[j]
        # Stop after the first query of this business line.
        break
    # Stop after the first business line.
    break
# import numpy as np
# np.save('prompt_final_res.npy', res) 

电力机车、城轨车辆


In [7]:
# Bare expression: display the current contents of `res` as the cell output.
res

{'电力机车、城轨车辆': {'得力(deli)小号金属外壳办公家用美工刀/裁纸刀 办公用品 2053': {'prompt res': '小号美工刀',
   'label': '小号美工刀'}}}

In [26]:
# Disabled statement kept for reference: candidate_labels.append('其他电力电缆')
# NOTE(review): the displayed value may depend on that append having been run
# once before it was commented out (hidden kernel state) — confirm.
candidate_labels[-1]

'其他电力电缆'

In [None]:
# Query the endpoint using only the first ten candidate labels.
out = predictor.predict({
    'inputs': "接触器",
    'parameters': {'candidate_labels': candidate_labels[:10]},
})


In [129]:
import numpy as np

# Load a saved object array and unwrap its single element with .item().
# SECURITY: allow_pickle=True unpickles the file, which can execute arbitrary
# code; only load 't50.npy' if it comes from a trusted source.
res=np.load('t50.npy',allow_pickle=True).item()

In [None]:
# res
# For each stored query, rank its stored 'pred' list with the endpoint and
# record the first returned label under 'prompt pred'.
for key, val in res.items():
    print(key)
    for k, v in val.items():
        candidate_labels = v['pred']
        queries = k
        out = predictor.predict({
            'inputs': queries,
            'parameters': {'candidate_labels': candidate_labels},
        })
        v['prompt pred'] = out['labels'][0]

# Written once, after every query has been processed.
np.save('prompt_res.npy', res)

电力机车、城轨车辆


In [132]:
import pandas as pd
data = pd.read_excel('./data/客户物料数据.xlsx')
data['prompt'] = ''
r = {}
i = 0

# Parallel lists, one element per (business line, query) pair in `res`.
customer = []
querys = []
labs = []
preds = []
prompt_preds = []
for key, val in res.items():
    for k, v in val.items():
        customer.append(key)
        querys.append(k)
        labs.append(v['labels'])
        # First element of the stored 'pred' list.
        preds.append(v['pred'][0])
        prompt_preds.append(v['prompt pred'])


In [136]:
# Column name -> parallel list collected from `res`.
R = {
    '客户主营业务': customer,
    '商品名称': querys,
    '物料名称': labs,
    'zero-shot预测': preds,
    'prompt预测': prompt_preds,
}
df = pd.DataFrame(R)
# Disabled export kept for reference: df.to_csv('物料结果.csv',index=False)
df.head()

Unnamed: 0,客户主营业务,商品名称,物料名称,zero-shot预测,prompt预测
0,电力机车、城轨车辆,得力(deli)小号金属外壳办公家用美工刀/裁纸刀 办公用品 2053,小号美工刀,切纸刀/21X16木质/8001/得力,美工刀/2003/8刀头/得力
1,电力机车、城轨车辆,得力(deli)20cm办公通用直尺 测量绘图尺子 办公用品 6220,有机直尺/30cm/得力,办公用尺,办公用尺
2,电力机车、城轨车辆,晨光(M&G)文具G-5黑色0.5mm按动子弹头中性笔芯 签字笔替芯 水笔芯 K35/S01...,中性笔芯/G-5/0.5MM芯 按动款蓝色/晨光,中性笔芯/G-5/0.5MM芯 按动款黑色/晨光,中性笔芯/G-5/0.5MM芯 按动款黑色/晨光
3,电力机车、城轨车辆,得力(deli)10只A4透明抽杆文件夹拉杆夹 简历报告夹商务会议资料产检报告收纳 33223白色,抽杆夹/Q310/A4，容纸量30张/齐心,A4透明抽杆夹,加厚透明文件袋
4,电力机车、城轨车辆,齐心(Comix) 10个装 55mm牢固耐用粘扣档案盒/A4文件盒/资料盒 EA1002-...,"档案盒/HC-75/A4,3寸,75mm、PP/齐心","档案盒/HC-55/A4,2寸,55mm、PP/齐心","档案盒/HC-55/A4,2寸,55mm、PP/齐心"


In [138]:
# Per business line: number of rows and how many rows each prediction column
# matches the expected material name exactly.
correct = {}
for i in range(len(df)):
    business = df.loc[i, '客户主营业务']
    stats = correct.setdefault(business, {'nums': 0, 'prompt true': 0, 'zs_true': 0})
    truth = df.loc[i, '物料名称']
    if truth == df.loc[i, 'zero-shot预测']:
        stats['zs_true'] += 1
    if truth == df.loc[i, 'prompt预测']:
        stats['prompt true'] += 1
    stats['nums'] += 1
correct

{'电力机车、城轨车辆': {'nums': 2985, 'prompt true': 1211, 'zs_true': 945},
 '物业管理': {'nums': 50535, 'prompt true': 27796, 'zs_true': 20979},
 '钢材、有色金属、各类建材、化工原料（不含危险品）、机械设备及零配件': {'nums': 24057,
  'prompt true': 12819,
  'zs_true': 10675},
 '家居制造': {'nums': 7369, 'prompt true': 4950, 'zs_true': 4713}}

In [141]:
# Overall totals across business lines, plus per-line accuracy ratios that
# are written back into `correct`.
total_num = 0
total_corr_prompt = 0
total_corr_zs = 0
for k, v in correct.items():
    total_num += v['nums']
    total_corr_prompt += v['prompt true']
    total_corr_zs += v['zs_true']
    v['zs_acc'] = v['zs_true'] / v['nums']
    v['pro_acc'] = v['prompt true'] / v['nums']
print(total_num, total_corr_prompt, total_corr_zs)
correct
# total_num
# total_corr_prompt
# total_corr_zs

84946 46776 37312


{'电力机车、城轨车辆': {'nums': 2985,
  'prompt true': 1211,
  'zs_true': 945,
  'zs_acc': 0.3165829145728643,
  'pro_acc': 0.40569514237855947},
 '物业管理': {'nums': 50535,
  'prompt true': 27796,
  'zs_true': 20979,
  'zs_acc': 0.4151380231522707,
  'pro_acc': 0.5500346294647274},
 '钢材、有色金属、各类建材、化工原料（不含危险品）、机械设备及零配件': {'nums': 24057,
  'prompt true': 12819,
  'zs_true': 10675,
  'zs_acc': 0.44373778941680175,
  'pro_acc': 0.5328594587853848},
 '家居制造': {'nums': 7369,
  'prompt true': 4950,
  'zs_true': 4713,
  'zs_acc': 0.6395711765504138,
  'pro_acc': 0.6717329352693717}}

In [131]:
# Display the stored entry for one business line and one product title.
res['电力机车、城轨车辆']['晨光(M&G)文具G-5黑色0.5mm按动子弹头中性笔芯 签字笔替芯 水笔芯 K35/S01/S08适用 20支/盒']

{'labels': '中性笔芯/G-5/0.5MM芯 按动款蓝色/晨光',
 'pred': ['中性笔芯/G-5/0.5MM芯 按动款黑色/晨光',
  '签字笔/K-35/按动0.5MM芯 黑色/晨光',
  '中性笔芯/G-5/0.5MM芯 按动款红色/晨光',
  '中性笔芯/G-5/0.5MM芯 按动款蓝色/晨光',
  '中性笔芯替换芯/0.5mm/黑色/MG-13/晨光',
  '中性笔芯/007/0.5MM芯 直筒黑色/晨光',
  '签字笔/K-35/按动0.5MM芯红色/晨光',
  '签字笔/K-35/按动0.5MM芯/蓝色/晨光',
  '中性笔芯/MG-6128/0.7MM芯 直筒黑色/晨光',
  '子弹头笔芯/S760/0.5mm/黑色/得力',
  '中性笔/S25/0.5mm黑色/得力',
  '中性笔芯/007/0.5MM芯 直筒蓝色/晨光',
  '中性笔芯/007/0.5MM芯 直筒红色/晨光',
  '全针管笔芯/A708/0.35mm笔芯/黑色/得力',
  '记号笔/MG-2110（大号）/黑色/晨光',
  '中性笔芯/MG-6128/0.7MM芯 直筒蓝色/晨光',
  '中性笔芯/MG-6128/0.7MM芯 直筒红色/晨光',
  '台笔/6791/0.35mm芯黑色/得力',
  '圆珠笔/41701/0.7MM 按动款黑色/晨光',
  '直液式走珠笔签字笔 0.5mm 5支/袋',
  '高级信纸/XG-B502/B5/174*248mm',
  '4色圆珠笔/MF-1006/0.5mm/晨光',
  '墨盒/T057黑色/用于爱普生ME1+/爱普生',
  '记号笔/MG-2110（大号）/蓝色/晨光',
  '墨盒/5BK黑色/用于佳能XI4000、5000/佳能',
  '签字笔/GP-1150/黑色/晨光',
  '记号笔/MG-2110（大号）/红色/晨光',
  '记号笔/MK804（小号）/黑色/齐心',
  '签字笔/UB-150/直液式0.5签字笔黑色/三菱',
  '202a硒鼓适用惠普M281FDW硒鼓 M254DW/NW M281FDN M280NW cf500a黑色带芯片打印机粉盒',
  'PG-815黑色墨盒(适用iP2780/iP2788/MP236/MP288）',
  

In [None]:
# NOTE(review): looks like a leftover scratch cell — consider deleting.
print(111)