In [None]:
#coding=utf-8
import numpy as np
from sklearn.model_selection import train_test_split
from keras import layers as KL
from keras_contrib import layers as KCL

## Load image labels

In [None]:
def load_gt(path):
    r"""Parse a ground-truth label file into a per-image list of boxes.

    Every non-blank line is expected to hold two TAB-separated fields: an
    image name and a space-separated list of boxes. Each box is a name and a
    comma-separated list of numbers joined by the ``\001`` control character.

    Args:
        path: Path of the label file to read.

    Returns:
        dict mapping image name -> list of ``[box_name, [float, ...]]``
        entries in file order. Lines that repeat an image name extend the
        same list.

    Raises:
        IndexError: If a non-blank line has no second TAB field or a box has
            no ``\001`` separator.
        ValueError: If one of the numbers cannot be parsed as a float.
    """
    d = {}
    # The context manager closes the file even if parsing raises part-way.
    with open(path) as label_file:
        for line in label_file:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline at EOF) carry no data.
                continue
            lineL = line.split("\t")
            img_name = lineL[0]
            boxes = lineL[1].split(" ")
            img_boxes = d.setdefault(img_name, [])
            for box in boxes:
                boxL = box.split("\001")
                box_name = boxL[0]
                loc = [float(x) for x in boxL[1].split(",")]
                img_boxes.append([box_name, loc])
    return d

# Label file locations, relative to the notebook's working directory.
# The "componnet" spelling is kept because other cells reference these names.
componnet_path = "data/laowang_component_train.txt" 
# detail_path is not read by any cell visible in this notebook.
detail_path = "data/laowang_detail_train.txt"

# image name -> [[box_name, [float, ...]], ...], as returned by load_gt.
componnet_labels = load_gt(componnet_path)

### 构建transactions

In [None]:
# One transaction per image: its [box_name, loc] entries sorted by
# (loc[0], loc[1]). The dict keys are not needed, so .values() is used; it
# exists on both Python 2 and 3, unlike the Python-2-only .iteritems().
name_list = [
    sorted(boxes, key=lambda box: (box[1][0], box[1][1]))
    for boxes in componnet_labels.values()
]
transactions = name_list
# str.format prints the same text on Python 2 and 3 (a multi-argument
# print(...) is rendered as a tuple on Python 2).
print("transactions length: {}".format(len(transactions)))

# Same transactions reduced to the box names only.
trans = [[box[0] for box in boxes] for boxes in name_list]
print("trans length: {}".format(len(trans)))
# Preview a few transactions instead of dumping the whole list into the output.
trans[:5]

### 将所有方向和部件分开

In [None]:
def get_name_direct(part_name):
    """Split a component name at its last underscore.

    Returns a ``(direct, part)`` tuple: ``part`` is the text after the final
    ``'_'`` and ``direct`` is everything before it (``''`` when the name
    contains no underscore).
    """
    prefix, _sep, suffix = part_name.rpartition('_')
    return prefix, suffix

parts = []
directions = []
parts_labels = set()
direct_labels = set()
# Part sequences already kept, stored as hashable tuples so the duplicate
# check is a set lookup instead of a scan over the whole `parts` list.
seen_parts = set()
for x in trans:
    part_tmp, label_tmp = [], []
    for part_name in x:
        direct, part = get_name_direct(part_name)

        part_tmp.append(part)
        label_tmp.append(direct)

        # The vocabularies are collected from every transaction, including
        # the ones dropped as duplicates below.
        parts_labels.add(part)
        direct_labels.add(direct)
    key = tuple(part_tmp)
    if key in seen_parts:
        # NOTE(review): duplicates are detected on the part sequence only, so
        # a repeated sequence with different directions keeps just its first
        # labelling - confirm this is intended.
        continue
    seen_parts.add(key)
    parts.append(part_tmp)
    directions.append(label_tmp)

print("parts:{} directions:{}".format(len(parts), len(directions)))


In [None]:
# Id 0 is reserved for the '#' symbol: the sequences are zero-padded, the
# embedding masks zeros, and display_total skips '#'.
# Removing '#' before prepending it makes the cell safe to re-run: on a second
# run parts_labels is already the list built here, and prepending blindly
# would add another '#' and shift every id.
parts_labels = ['#'] + sorted(set(parts_labels) - {'#'})
# `idx` instead of `id` so the builtin id() is not shadowed.
parts2id_map = {label: idx for idx, label in enumerate(parts_labels)}

direct_labels = sorted(set(direct_labels))
direct2id_map = {label: idx for idx, label in enumerate(direct_labels)}

### 部件 <-> id  和   方向 <-> id

In [None]:
def part2id_func(x):
    """Return the integer id stored for part name ``x`` in ``parts2id_map``."""
    return parts2id_map[x]


def id2part_func(x):
    """Return the part name at index ``x`` of ``parts_labels``."""
    return parts_labels[x]


def direct2id_func(x):
    """Return the integer id stored for direction ``x`` in ``direct2id_map``."""
    return direct2id_map[x]


def id2direct_func(x):
    """Return the direction name at index ``x`` of ``direct_labels``."""
    return direct_labels[x]

### 将部件和方向全部转换为id表示

In [None]:
# Encode every part sequence and every direction sequence as a list of ids.
X = [[part2id_func(name) for name in seq] for seq in parts]
Y = [[direct2id_func(name) for name in seq] for seq in directions]

### 把训练集X, Y所有padding成最长长度

In [None]:
# Record the unpadded lengths, then right-pad every sequence with zeros up to
# the longest one so that X and Y become rectangular int32 arrays.
real_length = [len(seq) for seq in X]
max_len = max(real_length)


def _pad_to(seq, width):
    """Return ``seq`` as an int32 array right-padded with zeros to ``width``."""
    arr = np.array(seq, dtype=np.int32)
    return np.pad(arr, (0, width - len(arr)), 'constant')


X = np.array([_pad_to(seq, max_len) for seq in X], dtype=np.int32)
Y = np.array([_pad_to(seq, max_len) for seq in Y], dtype=np.int32)

### 训练数据生成器

In [None]:
def gen_train_batch(X, Y, batch_size=64):
    """Endlessly yield shuffled ``(x, y)`` mini-batches.

    The sample order is reshuffled at the start of every pass over the data.
    Samples beyond the last full batch of a pass are dropped. A data set
    smaller than ``batch_size`` is served as one short batch per pass, because
    a batch count of zero would make the loop spin forever without yielding.

    Args:
        X: Array indexed by sample along axis 0.
        Y: Array aligned with ``X`` along axis 0; it is indexed as
            ``y[:, :, np.newaxis]``, so it needs at least two dimensions.
        batch_size: Number of samples per batch (must be >= 1).

    Yields:
        ``(x, y)`` where ``x`` holds the selected rows of ``X`` and ``y`` the
        matching rows of ``Y`` with a new axis inserted at position 2.

    Raises:
        ValueError: On the first ``next()`` if ``X`` has no samples or
            ``batch_size`` is smaller than 1.
    """
    n_samples = X.shape[0]
    if n_samples == 0:
        raise ValueError("gen_train_batch needs at least one sample")
    if batch_size < 1:
        raise ValueError("batch_size must be >= 1, got {}".format(batch_size))
    # Floor division keeps the drop-the-remainder behaviour; max() guarantees
    # at least one batch per pass so next() can never hang.
    n_batches = max(1, n_samples // batch_size)
    while True:
        inx = np.arange(n_samples)
        np.random.shuffle(inx)
        for it in range(n_batches):
            # Index through the permutation instead of copying the whole
            # arrays into shuffled order on every pass.
            batch_idx = inx[it * batch_size:(it + 1) * batch_size]
            x = X[batch_idx, ...]
            y = Y[batch_idx, ...]
            yield x, y[:, :, np.newaxis]

# Smoke test: pull one batch and show its shapes.
# The builtin next() works on Python 2 and 3; generator.next() is Python 2 only.
x, y = next(gen_train_batch(X, Y))
print("x:{} y:{}".format(x.shape, y.shape))

###  划分训练集和测试集

In [None]:
# Hold out 20% of the sequences; the fixed random_state keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)
split_sizes = [arr.shape[0] for arr in (X_train, y_train, X_test, y_test)]
print("X_train:{} y_train:{}\nX_test:{} y_test:{}".format(*split_sizes))

### BiLSTM+CRF

In [None]:

from keras.models import Model

embedding_size = 128
# Take the sequence length from the padded data instead of hard-coding 38, so
# the model still matches X when the longest sequence in the data changes.
seq_len = X.shape[1]

inputs = KL.Input(shape=(seq_len,))
# mask_zero=True: id 0 is the padding value added when X was padded.
x = KL.Embedding(len(parts_labels), embedding_size, input_length=seq_len, mask_zero=True)(inputs)
x = KL.Bidirectional(KL.LSTM(512, return_sequences=True))(x)

# sparse_target=True: targets are integer class ids with a trailing axis of
# size 1, which is the shape gen_train_batch yields.
crf = KCL.CRF(len(direct_labels), sparse_target=True)

x = crf(x)
# Function-call form of print parses on both Python 2 and 3.
print(x.get_shape())

model = Model(inputs, x)
model.compile('adam', loss=crf.loss_function, metrics=[crf.accuracy])

### 训练模型

In [None]:
train_gen = gen_train_batch(X_train, y_train)
test_gen = gen_train_batch(X_test, y_test)

# A plain Python generator cannot be shared between worker processes: with
# use_multiprocessing=True and several workers each process gets its own copy
# and Keras warns that the data may be duplicated. Run it in a single worker.
# Arguments that were only restating Keras defaults (callbacks, class_weight,
# initial_epoch) or that apply to Sequence inputs only (shuffle) are omitted.
model.fit_generator(
    train_gen,
    steps_per_epoch=1000,
    epochs=50,
    verbose=1,
    validation_data=test_gen,
    validation_steps=100,
    max_queue_size=10,
    workers=1,
    use_multiprocessing=False,
)

### 保存模型

In [None]:
# Write the trained model to 'biLSTM_CRF.h5' in the working directory; the
# inference cell loads its weights back from this file.
model.save('biLSTM_CRF.h5')

### 测试模型输出

In [None]:
def display(x, dtype):
    """Translate a 2-D array of ids into nested lists of label names.

    ``dtype`` selects the lookup: ``'part'`` uses ``id2part_func`` and
    ``'direct'`` uses ``id2direct_func``. The result has one inner list per
    row of ``x``.
    """
    assert dtype in ['direct', 'part']
    assert len(x.shape) == 2, 'invalid shape for input' # n, parts_or_direction_number
    n_rows, n_cols = x.shape
    to_name = id2part_func if dtype == 'part' else id2direct_func
    return [
        [to_name(x[row, col]) for col in range(n_cols)]
        for row in range(n_rows)
    ]
    
def display_total(x, y):
    """Combine part ids and direction ids into readable component names.

    Both arrays are decoded with ``display``. Positions whose part is the
    ``"#"`` symbol are skipped; every other position becomes
    ``"<direction>_<part>"``, or just the part when the direction is empty.
    Returns one list of names per row.
    """
    part_rows = display(x, 'part')
    direct_rows = display(y, 'direct')
    merged = []
    for part_names, direct_names in zip(part_rows, direct_rows):
        merged.append([
            direct + "_" + part if direct else part
            for part, direct in zip(part_names, direct_names)
            if part != "#"
        ])
    return merged
            

In [None]:
# Take a few consecutive samples from a random position in the test set.
grabs = 1
test_size = X_test.shape[0]
# randint's upper bound is exclusive: the +1 keeps the last window reachable
# and avoids a ValueError when test_size == grabs.
randstart = np.random.randint(test_size - grabs + 1)
x = X_test[randstart:randstart+grabs,...]  # (grabs, sequence length)
y = y_test[randstart:randstart+grabs,...]  # (grabs, sequence length)
# Hand-written id sequence (zero-padded to length 38) fed to the prediction cell.
x1 = np.array([[ 2,  9, 14, 10,  8, 11, 18,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0]], dtype=np.int32)
display_total(x, y)

In [None]:
from keras_contrib import *
from keras.models import load_model
from keras_contrib.layers import CRF
from keras.models import Model

def build_model(seq_len=38, embedding_size=128, lstm_units=512):
    """Rebuild the Embedding -> BiLSTM -> CRF network used for training.

    The architecture mirrors the training cell so that saved weights can be
    loaded into it. The model is returned uncompiled.

    Args:
        seq_len: Length of the padded input sequences.
        embedding_size: Dimension of the part embedding vectors.
        lstm_units: Units of the LSTM wrapped by the Bidirectional layer.

    Returns:
        A Keras ``Model`` mapping part-id sequences to the CRF output.
    """
    inputs = KL.Input(shape=(seq_len,))
    # mask_zero=True: id 0 is the padding value.
    x = KL.Embedding(len(parts_labels), embedding_size, input_length=seq_len, mask_zero=True)(inputs)
    x = KL.Bidirectional(KL.LSTM(lstm_units, return_sequences=True))(x)
    crf = KCL.CRF(len(direct_labels), sparse_target=True)
    x = crf(x)
    return Model(inputs, x)


# Use the sequence length of the padded data rather than a fixed 38.
model = build_model(seq_len=X.shape[1])
model.load_weights('biLSTM_CRF.h5')
model

### 预测

In [None]:
# Predict the directions for the hand-written sequence x1.
y_hat = model.predict(x1)
# The class with the highest score at each position is the predicted direction id.
l = np.argmax(y_hat, axis=-1)
# Show the predictions next to the input they were computed from (x1);
# pairing them with the parts of `x` would label a different sample.
display_total(x1, l)

## 挖掘频繁模式

### 构建transactions

### 挖掘transaction中频繁模式

In [None]:
# NOTE(review): find_frequent_itemsets is not imported in any visible cell -
# it has to be in scope before this cell runs.
# NOTE(review): `transactions` as built earlier holds [box_name, loc] pairs;
# presumably the miner needs hashable items such as the plain names in
# `trans` - confirm which list is intended.
minsup = 5000
for itemset in find_frequent_itemsets(transactions, minsup):
    # Function-call form of print parses on both Python 2 and 3.
    print(itemset)

In [None]:
## 同时有左和右的部件数

In [None]:
# For every transaction that mentions both a "left" and a "right" component,
# collect the names of its left/right components.
transacs_with_lr = []
# The next two lists are initialised but not filled by this cell.
transacs_with_l=[]
transacs_with_r=[]
for ts in transactions:
    has_left = False
    has_right = False
    tmp = []
    for x in ts:
        # Items may be [box_name, loc] pairs (as built from the label file) or
        # plain names. The substring tests must run on the name: on a pair,
        # `"left" in x` would be a list-membership test and almost never match.
        name = x[0] if isinstance(x, (list, tuple)) else x
        is_left = "left" in name
        is_right = "right" in name
        has_left = has_left or is_left
        has_right = has_right or is_right
        if is_left or is_right:
            # Append once, even if a name contains both words.
            tmp.append(name)
    if has_left and has_right:
        transacs_with_lr.append(tmp)
print("with left and right transactions: {}".format(len(transacs_with_lr)))

In [None]:
### 挖掘同时有左右最常出现的部件数目

In [None]:
# Print every mined itemset that contains both a left and a right component
# and is equal to one of the collected transactions.
# NOTE(review): minsup = 1 presumably makes the miner enumerate every itemset
# that occurs at all, which can be very slow - confirm this is intended.
minsup = 1
for itemset in find_frequent_itemsets(transacs_with_lr, minsup):
    has_right = any("right" in x for x in itemset)
    has_left = any('left' in x for x in itemset)
    if has_right and has_left and itemset in transacs_with_lr:
        # Function-call form of print parses on both Python 2 and 3.
        print(itemset)