In [1]:
import numpy as np
import collections 
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable


In [7]:
#这是对Fasttext的一个简单尝试 包括tf版本\pytorch版本
def load_data(data_path):
    """
    载入数据
    """
    data= []
    labels = []
    max_sentence_len = 0
    with open(data_path, 'r',encoding='utf-8') as f:
        for line in f.read().splitlines():
            line_list = line.split('\t')
            one_data = line_list[1].split(' ')
            tmp_len = len(one_data)
            if tmp_len > max_sentence_len:
                max_sentence_len = tmp_len
            data.append(one_data)
            labels.append(int(line_list[2]))
    print("max sentence length: ", max_sentence_len)
    return data, labels

data_path = './data/AI_law/sample_seg_train.txt'
data, labels = load_data(data_path)


max sentence length:  20420


In [11]:
def bulid_vocabulary(data,min_count =3):
    count = [('<UNK>', -1), ('<PAD>', -1)]
    words =[]
    [words.extend(line) for line in data]
    counter = collections.Counter(words)
    counter_list = counter.most_common()
    for word,c in counter_list:
        #记录最少出现三次的词
        if c>min_count:
            count.append((word,c))
        # 同理也可以限制最多出现的词的数目
    dict_word2index = {word:c for word,c in enumerate(count)}
    dict_index2word ={c:word for word,c in enumerate(count)}
    print("vocab size:", len(count))
    print(count[-1])
    return count, dict_word2index, dict_index2word
count, dict_word2index, dict_index2word =bulid_vocabulary(data)

vocab size: 43727
('马福才', 4)


In [15]:
#data[:5]
def build_dataset(data, labels, dict_word2index, max_sentence_len=1000, label_size=8):
    """
    基于词表构建数据集（数值化）
    """
    dataset = []
    indices = np.arange(len(labels))
    np.random.shuffle(indices)
    new_labels = []
    for i in indices:
        new_labels.append(labels[i]-1) 
        new_line = []
        for word in data[i]:
            if word in dict_word2index:
                index = dict_word2index[word]
            else:
                index = 0    # UNK
            new_line.append(index)
        
        zero_num = max_sentence_len - len(new_line)
        while zero_num > 0:
            new_line.append(0)
            zero_num -= 1
        dataset.append(new_line[:max_sentence_len])
#     return dataset, new_labels
    return np.array(dataset, dtype=np.int64), np.array(new_labels, dtype=np.int64)

train_data, train_labels = build_dataset(data, labels, dict_word2index, max_sentence_len=1000)

In [16]:
def split_data(data, radio=0.7):
    """
    将训练集分给为训练集和检验集
    """
    split_index = int(len(data) * radio)
    new_data1 = data[ : split_index]
    new_data2 = data[split_index : ]
    return new_data1, new_data2
train_X, valid_X = split_data(train_data)
train_y, valid_y = split_data(train_labels)
print(train_X.shape)
print(train_y.shape)


(7000, 1000)
(7000,)


In [99]:
from torch.utils import data

class LawData(data.Dataset):
    def __init__(self,X,y):
        super(LawData,self).__init__()
        self.len = X.shape[0]
        self.x_data = X
        self.y_data = y
    def __getitem__(self, index):
        return self.x_data[index],self.y_data[index]
    def __len__(self):
        return self.len

batch_size = 32
num_workers = 0
dataset = LawData(train_X, train_y)
train_loader = data.DataLoader(dataset=dataset, 
                               batch_size=batch_size, 
                               shuffle=False,
                               num_workers=num_workers)
dataset = LawData(valid_X, valid_y)
valid_loader = data.DataLoader(dataset=dataset,
                              batch_size=batch_size,
                              shuffle=False,
                              num_workers=num_workers)

In [111]:
from torch import nn
import  tensorflow as tf
#用tensorflow和pytorch俩种方法试着实现
class FastTextpy(nn.Module):
    #这里的config我们可以定一个config结构 去传入
    def __init__(self,vocab_size,embedding_dim,num_classes):
        super(FastTextpy,self).__init__()
        self.embedding =nn.Embedding(vocab_size,embedding_dim)
        self.linear = nn.Linear(embedding_dim,num_classes)
    def forward(self, input):
        self.embeded = self.embedding(input)
        text_embed= torch.mean(self.embeded,dim=1)
        #print(text_embed.size())
        text_embed = text_embed.view(-1, text_embed.size(2))
        logits = self.linear(text_embed)
        return logits

vocab_size = 100000
embedding_size = 128
num_class = 8
fast_text = FastTextpy(vocab_size=vocab_size, embedding_dim=embedding_size,
                    num_classes=num_class)
print(fast_text)

FastTextpy(
  (embedding): Embedding(100000, 128)
  (linear): Linear(in_features=128, out_features=8, bias=True)
)


In [112]:
import torch.optim as optim

learning_rate = 0.001
loss_fun = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=fast_text.parameters(), lr=learning_rate)

In [58]:
#这个损失函数是替代loss_fun = nn.CrossEntropyLoss()
def log_softmax(input):
    return torch.log_softmax(input,dim =1)
def loss_fun(inputs,label):
    loss =0
    for i in range(len(label)):
        loss+=log_softmax(inputs)[i][label[i]]
    loss=loss/len(label)
    return -loss
    

In [113]:
epoch_num =3
for epoch in range(epoch_num):
    running_loss = 0.0
    for i,data in enumerate(train_loader):
        texts,labels =data
        inputs,labels =Variable(texts),Variable(labels)
        optimizer.zero_grad()
        outputs = fast_text(inputs)
        loss = loss_fun(outputs,labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.data
        if i % 100 == 99:
            print('[%d, %5d] loss: %.3f' %
                    (epoch + 1, i + 1, running_loss / 100))
            running_loss = 0.0

torch.Size([32, 128])


RuntimeError: Dimension out of range (expected to be in range of [-2, 1], but got 2)

In [103]:
#建立评分标准f1-score
from collections import Counter
#这是单一一个评分
def f1(predictions,true_labels,label):
    true_pos ,false_neg = 0,0
    false_pos =0
    for i in range(len(true_labels)):
        if predictions[i] ==label:
            if true_labels[i] ==label:
                true_pos+=1
            else:
                false_pos+=1
        else:
            if true_labels[i] ==label:
                false_neg+=1
    if true_pos ==0:
        precision,recall =0,0
    else:
        precision =true_pos/float(true_pos+false_pos)
        recall = true_pos/float(true_pos+false_pos)
    if precision ==0:
        f1 =0
    else:
        f1 =(2*precision*recall)/(precision+recall)
    return f1
def micro_avg_f1(predict_label, true_labels, num_class):
    true_labels = np.array(true_labels)
    count = Counter(true_labels)
    print(count)
    score = 0
    for i in range(num_class):
        score+=(count[i]*f1(predict_label,true_labels,i))
    score =score/float(len(true_labels))
    return score
        



In [107]:
true_labels = []
predicted_labels = []
num_class=8
for data in valid_loader:
    texts, labels = data
    outputs = fast_text(Variable(texts))
    _, predicted = torch.max(outputs.data, 1)
    true_labels.extend(labels)
    #predicted = [i for i in predicted]
    predicted_labels.extend(predicted)

print(true_labels[:10])
print(predicted_labels[:20])
print("Micro-Averaged F1:",micro_avg_f1(predicted_labels, true_labels, num_class))

[tensor(5), tensor(6), tensor(1), tensor(6), tensor(1), tensor(4), tensor(1), tensor(6), tensor(5), tensor(5)]
[tensor(6), tensor(6), tensor(6), tensor(6), tensor(6), tensor(6), tensor(6), tensor(6), tensor(6), tensor(6), tensor(6), tensor(6), tensor(6), tensor(6), tensor(6), tensor(6), tensor(6), tensor(6), tensor(6), tensor(6)]
Counter({6: 705, 1: 520, 5: 499, 0: 433, 2: 345, 4: 336, 3: 147, 7: 15})
Micro-Averaged F1: 0.055224999999999996


In [86]:
Counter(np.array(true_labels))

Counter({0: 433, 1: 520, 2: 345, 3: 147, 4: 336, 5: 499, 6: 705, 7: 15})

tensor(494.3237, grad_fn=<NegBackward>)

AttributeError: 'numpy.ndarray' object has no attribute 'append'