In [1]:
import sys

# Relative location of the bert_fine_tune directory (presumably the source of
# the `pytorch_pretrained_bert.modeling_fine_tune` module imported later —
# TODO confirm).
BERT_FINE_TUNE_PATH = '../../bert_fine_tune/'

# Append the parent directory first, then the bert_fine_tune directory, so both
# become importable from this notebook.
sys.path.extend(['../', BERT_FINE_TUNE_PATH])

In [2]:
# Disabled imports from the local `textpair` package; none of these names is
# referenced by the cells visible in this notebook.
# from textpair.preprocess.dummy_preprocessor import DummyPreprocessor
# from textpair.analyze.bert_analyzer import BertAnalyzer
# from textpair.vectorize.bert_vectorizer import BertVectorizer
# from textpair.model.bert_model import BertModel
# from textpair.pair_ann import PairAnn
# from textpair.semantic.base_semantic import BaseSemantic

import logging

# Configure logging *before* importing the BERT package so that the INFO
# messages it emits at import time (and later while loading weights) are shown.
logging.basicConfig(level=logging.INFO)

import torch
from pytorch_pretrained_bert.modeling_fine_tune import BertForPairWiseClassification
from pytorch_pretrained_bert import BertTokenizer

INFO:pytorch_pretrained_bert.modeling:Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .
INFO:pytorch_pretrained_bert.modeling_fine_tune:Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .


In [3]:
# Directory holding the fine-tuned checkpoint. The loaders in the next cell read
# pytorch_model.bin, config.json and vocab.txt from it (see their logged paths).
FINE_TUNED_PATH = '/efs/fine_tune/atec_ccks_fine_tune_5/'

In [4]:
# Load the fine-tuned pairwise model and its tokenizer from the same directory,
# so the vocabulary matches the weights.
model = BertForPairWiseClassification.from_pretrained(FINE_TUNED_PATH)
tokenizer = BertTokenizer.from_pretrained(FINE_TUNED_PATH)

INFO:pytorch_pretrained_bert.modeling_fine_tune:loading weights file /efs/fine_tune/atec_ccks_fine_tune_5/pytorch_model.bin
INFO:pytorch_pretrained_bert.modeling_fine_tune:loading configuration file /efs/fine_tune/atec_ccks_fine_tune_5/config.json
INFO:pytorch_pretrained_bert.modeling_fine_tune:Model config {
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "type_vocab_size": 2,
  "vocab_size": 21128
}

INFO:pytorch_pretrained_bert.tokenization:loading vocabulary file /efs/fine_tune/atec_ccks_fine_tune_5/vocab.txt


In [5]:
def _encode_single(text, max_seq_length):
    """Encode one sentence as ``(token_ids, segment_ids)`` tensors of shape (1, seq_len)."""
    # Two positions are reserved for the [CLS] and [SEP] markers.
    pieces = tokenizer.tokenize(text)[:max_seq_length - 2]
    ids = tokenizer.convert_tokens_to_ids(['[CLS]'] + pieces + ['[SEP]'])
    # Each sentence is fed to the model on its own, so every position is segment 0.
    segments = [0] * len(ids)
    return torch.tensor([ids]), torch.tensor([segments])


def bert_sim(text1, text2, max_seq_length=512):
    """Score the similarity of two sentences with the fine-tuned pairwise model.

    Both sentences are tokenized independently as ``[CLS] ... [SEP]`` and passed
    to the module-level ``model`` in inference mode (no gradients).

    Args:
        text1: First sentence.
        text2: Second sentence.
        max_seq_length: Maximum number of tokens per sentence, including the
            ``[CLS]``/``[SEP]`` markers. Longer inputs are truncated. The
            default of 512 matches ``max_position_embeddings`` in the logged
            model config.

    Returns:
        A tuple ``(cos_sim, pos_prob, vec1, vec2)``. ``cos_sim`` and
        ``pos_prob`` are Python floats taken from the first two model outputs;
        in the recorded runs ``pos_prob`` equals ``(cos_sim + 1) / 2``.
        ``vec1`` and ``vec2`` are the third and fourth model outputs, returned
        untouched (presumably the two sentence embeddings — confirm against
        ``BertForPairWiseClassification``).

    Raises:
        ValueError: If ``max_seq_length`` is smaller than 2, which would leave
            no room for the ``[CLS]`` and ``[SEP]`` markers.
    """
    if max_seq_length < 2:
        raise ValueError('max_seq_length must be at least 2 to hold [CLS] and [SEP]')

    tokens_tensor1, segments_tensor1 = _encode_single(text1, max_seq_length)
    tokens_tensor2, segments_tensor2 = _encode_single(text2, max_seq_length)

    # Switch to evaluation mode on every call so the result does not depend on
    # whatever mode the model was left in by other cells.
    model.eval()
    with torch.no_grad():
        cos_sim, pos_prob, vec1, vec2 = model(
            tokens_tensor1, tokens_tensor2, segments_tensor1, segments_tensor2
        )
    return cos_sim.item(), pos_prob.item(), vec1, vec2

In [6]:
# Paraphrase pair: "I am very happy" vs. "I am very happy" (different adjective).
text1, text2 = "我很高兴", "我很开心"
cos_sim, pos_prob, vec1, vec2 = bert_sim(text1, text2)
(cos_sim, pos_prob)

(0.48099759221076965, 0.7404987812042236)

In [7]:
# "I am very happy" vs. "I am really, really happy".
text1, text2 = "我很高兴", "我特别特别开心"
cos_sim, pos_prob, vec1, vec2 = bert_sim(text1, text2)
(cos_sim, pos_prob)

(0.5408033132553101, 0.770401656627655)

In [8]:
# "I am very happy" vs. the longer "Actually, I feel that I am quite happy".
text1, text2 = "我很高兴", "我其实觉得自己很开心"
cos_sim, pos_prob, vec1, vec2 = bert_sim(text1, text2)
(cos_sim, pos_prob)

(0.3133081793785095, 0.6566541194915771)

In [9]:
# "I am really, really happy" vs. "Actually, I feel that I am quite happy".
text1, text2 = "我特别特别开心", "我其实觉得自己很开心"
cos_sim, pos_prob, vec1, vec2 = bert_sim(text1, text2)
(cos_sim, pos_prob)

(0.7104029059410095, 0.8552014827728271)

In [10]:
# Negation with a different adjective: "I am very happy" vs. "I am not happy".
text1, text2 = "我很高兴", "我不开心"
cos_sim, pos_prob, vec1, vec2 = bert_sim(text1, text2)
(cos_sim, pos_prob)

(-0.19968432188034058, 0.4001578390598297)

In [11]:
# "How to mount the licence plate on the front of the car" vs.
# "How to install the front licence plate".
text1, text2 = "车头如何放置车牌", "前牌照怎么装"
cos_sim, pos_prob, vec1, vec2 = bert_sim(text1, text2)
(cos_sim, pos_prob)

(0.6175767779350281, 0.8087884187698364)

In [12]:
# "How to mount the licence plate on the front of the car" vs.
# "How to apply for a Beijing licence plate".
text1, text2 = "车头如何放置车牌", "如何办理北京车牌"
cos_sim, pos_prob, vec1, vec2 = bert_sim(text1, text2)
(cos_sim, pos_prob)

(0.4713800251483917, 0.7356899976730347)

In [13]:
# "How to mount the licence plate on the front of the car" vs.
# "How to install the rear licence plate".
text1, text2 = "车头如何放置车牌", "后牌照怎么装"
cos_sim, pos_prob, vec1, vec2 = bert_sim(text1, text2)
(cos_sim, pos_prob)

(0.5527295470237732, 0.776364803314209)

In [14]:
# Negation with the same adjective: "I am very happy" vs. "I am not happy".
text1, text2 = "我很高兴", "我不高兴"
cos_sim, pos_prob, vec1, vec2 = bert_sim(text1, text2)
(cos_sim, pos_prob)

(0.10790378600358963, 0.5539519190788269)

In [15]:
# Identity check: the same sentence ("I am very happy") on both sides.
text1, text2 = "我很高兴", "我很高兴"
cos_sim, pos_prob, vec1, vec2 = bert_sim(text1, text2)
(cos_sim, pos_prob)

(0.9998999834060669, 0.9999499917030334)

In [16]:
# Long customer-service question about Tencent Credit / Weilidai credit limits
# vs. "There is no such app of yours in my wallet".
text1, text2 = (
    "为什么能开出腾讯信用却没有微粒贷朋友的没用腾讯信用却有30000的额度呢",
    "我钱包里没有你们这个应用",
)
cos_sim, pos_prob, vec1, vec2 = bert_sim(text1, text2)
(cos_sim, pos_prob)

(0.47118473052978516, 0.7355923652648926)

In [17]:
# Short chit-chat pair: "I don't know either" vs. "Okay".
text1, text2 = "我也不知道", "好吧"
cos_sim, pos_prob, vec1, vec2 = bert_sim(text1, text2)
(cos_sim, pos_prob)

(-0.26280340552330017, 0.3685982823371887)

In [18]:
# Related technical terms: "deep learning" vs. "machine learning".
text1, text2 = "深度学习", "机器学习"
cos_sim, pos_prob, vec1, vec2 = bert_sim(text1, text2)
(cos_sim, pos_prob)

(0.39961937069892883, 0.6998096704483032)