### import

In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import pickle as pkl
import os
import math
import torch
import torch.nn.functional as F
import time
from datetime import timedelta

## data preprocess

### spit evaluate set

In [2]:
df_data = pd.read_csv('./raw_data/train.tsv', sep='\t')
df_data.info()
df_data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156060 entries, 0 to 156059
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   PhraseId    156060 non-null  int64 
 1   SentenceId  156060 non-null  int64 
 2   Phrase      156060 non-null  object
 3   Sentiment   156060 non-null  int64 
dtypes: int64(3), object(1)
memory usage: 4.8+ MB


Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [14]:
labels = df_data['Sentiment'].unique()
labels = list(labels)
labels
with open(r'./data_set/labels.txt', 'w', encoding='utf-8') as f:
    for label in labels:
        f.write(str(label) + "\n")

[1, 2, 3, 4, 0]

2

2

2

2

2

In [15]:
shuffle_index = np.random.permutation(len(df_data))
shuffled_data = df_data[['Phrase','Sentiment']].iloc[shuffle_index]
shuffled_data.iloc[:int(0.7*len(df_data))].to_csv(r'./data_set/train.txt', sep='\t', encoding='utf-8', header=None, index=False)
shuffled_data.iloc[int(0.7*len(df_data))+1:int(0.9*len(df_data))].to_csv(r'./data_set/dev.txt', sep='\t', encoding='utf-8', header=None, index=False)
shuffled_data.iloc[int(0.9*len(df_data))+1:].to_csv(r'./data_set/test.txt', sep='\t', encoding='utf-8', header=None, index=False)

### build vocabulary（BOW） and dataset

In [2]:
from nltk import word_tokenize, pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
UNK, PAD = '<UNK>', '<PAD>'  # 未知字，padding符号

# 获取单词的词性
def get_wordnet_pos(tag):
    '''
    for WordNetLemmatizer.lemmatize()
    '''
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

def build_vocab(data_path, vocab_path):
    data = [_.strip().split('\t')[0] for _ in open(data_path, 'r', encoding='utf-8').readlines()]
    word_cnt = dict()
    for sentence in tqdm(data):
        tokens = word_tokenize(sentence.lower())              # 分词,同时大写换小写
        tagged_sent = pos_tag(tokens, tagset='universal')     # 词性标注
        wnl = WordNetLemmatizer()
        for word, tag in tagged_sent:
            lemmatized_word = wnl.lemmatize(word, pos=get_wordnet_pos(tag)) # 还原后的词
            word_cnt[lemmatized_word] = word_cnt.get(lemmatized_word, 0) + 1
    word_cnt = sorted(word_cnt.items(), key=lambda x:x[0], reverse=True)
    print(len(word_cnt))
    vocab = {_[0]: idx for idx, _ in enumerate(word_cnt)}
    vocab.update({UNK: len(vocab), PAD: len(vocab) + 1})
    pkl.dump(vocab, open(vocab_path, 'wb'))
    print("vocab build successed, size : %d" %len(vocab))
    return vocab

In [3]:
def build_dataset(config):
    '''
    变成[([],y),([],y),([],y),([],y)]
    '''
    if os.path.exists(config.vocab_path):
        vocab = pkl.load(open(config.vocab_path, 'rb'))
    else:
        vocab = build_vocab(config.train_path, config.vocab_path)
    print("vocab loaded sucessed, size : %d " %len(vocab))
    def load_data(file_path, output_path):
        if os.path.exists(output_path):
            data = pkl.load(open(output_path, 'rb'))
            print("%s loaded success, size: %d" %(output_path, len(data)))
            return data
        data = open(file_path, 'r', encoding='utf-8').readlines()
        lemmatized_data = list()
        for line in tqdm(data):
            try:
                x, y = line.strip().split('\t')
            except:
                print(line)
            tokens = word_tokenize(x.lower())              # 分词,同时大写换小写
            tagged_sent = pos_tag(tokens, tagset='universal')     # 词性标注
            wnl = WordNetLemmatizer()
            lemmatized_sentence = [wnl.lemmatize(word, pos=get_wordnet_pos(tag)) for word, tag in tagged_sent]
            lemmatized_data.append((lemmatized_sentence, y))
        pkl.dump(lemmatized_data, open(output_path, 'wb'))
        print("%s loaded success, size: %d" %(file_path, len(lemmatized_data)))
        return lemmatized_data
    test_set = load_data(config.test_path, config.test_set_path)
    dev_set = load_data(config.dev_path, config.dev_set_path)
    train_set = load_data(config.train_path, config.train_set_path)
    return vocab, train_set, dev_set, test_set

## units

### config

In [4]:
class Config(object):
    def __init__(self):
        self.train_path = r'./data_set/train.txt'
        self.dev_path = r'./data_set/dev.txt'
        self.test_path = r'./data_set/test.txt'
        self.vocab_path = r'./data_set/vocab.pkl'
        self.train_set_path = r'./data_set/train.pkl'
        self.dev_set_path = r'./data_set/dev.pkl'
        self.test_set_path = r'./data_set/test.pkl'
        self.class_list = [x.strip() for x in open(r'./data_set/labels.txt', encoding='utf-8').readlines()]
        self.label_num = len(self.class_list)
        self.batch_size = 32
        self.epoch = 20
        self.learning_rate = 1e-3
        self.log_step = 1000

### datasetIterater

In [5]:
class DatasetIterater(object):
    '''
    return the batch index of dataset
    '''
    def __init__(self, data_set, batch_size):
        self.data_set = data_set
        self.batch_size = batch_size
        self.n_batches = len(data_set) // batch_size
        self.residue = True if len(data_set) % self.n_batches != 0 else False
        self.index = 0
        
    def to_couple(self, raw_batch):
        x = [_[0] for _ in raw_batch]
        y = [_[1] for _ in raw_batch]
        return x,y
        
    def __next__(self):
        if self.index == self.n_batches and self.residue:
            raw_batch = self.data_set[self.index * self.batch_size: len(self.data_set)]
            batch = self.to_couple(raw_batch)
            self.index += 1
            return batch
        elif self.index >= self.n_batches:
            self.index = 0
            raise StopIteration
        else:
            raw_batch = self.data_set[self.index * self.batch_size: (self.index+1) * self.batch_size]
            batch = self.to_couple(raw_batch)
            self.index += 1
            return batch
           
    def __iter__(self):
        return self
    
    def __len__(self):
        return self.n_batches+1 if self.residue else self.n_batches

Test of config and dataload 

In [6]:
config = Config()
vocab, train_set, dev_set, test_set = build_dataset(config)
data_iter = DatasetIterater(data_set = train_set, batch_size = 8)
for batch in data_iter:
    if data_iter.index % 1000 == 0:
        data_iter.index
len(data_iter)

vocab loaded sucessed, size : 14670 
./data_set/test.pkl loaded success, size: 15605
./data_set/dev.pkl loaded success, size: 31211
./data_set/train.pkl loaded success, size: 109242


1000

2000

3000

4000

5000

6000

7000

8000

9000

10000

11000

12000

13000

13656

## model

### one hot Embedding

In [7]:
class OneHotEmbedding(object):
    def __init__(self, config):
        self.vocab = pkl.load(open(config.vocab_path, 'rb'))
        self.embedding_shape = (config.batch_size, len(self.vocab))
        self.embedding_length = len(self.vocab)
        
    def batch_vectorize(self, x):
        '''
        batch_data: batch_size * length
        '''
        vec = np.zeros((len(x), len(vocab)), dtype=int)
        for i, tokens in enumerate(x):
            vec[i,[vocab.get(token, vocab['<UNK>']) for token in tokens]] = 1
        return vec

Test of one-hot embedding

In [8]:
vocab, train_set, dev_set, test_set = build_dataset(config)
data_iter = DatasetIterater(data_set = train_set, batch_size = config.batch_size)
x, y  = next(data_iter)
embedding = OneHotEmbedding(config)
vec = embedding.batch_vectorize(x)
vec.shape
vec[0].nonzero()

vocab loaded sucessed, size : 14670 
./data_set/test.pkl loaded success, size: 15605
./data_set/dev.pkl loaded success, size: 31211
./data_set/train.pkl loaded success, size: 109242


(16, 14670)

(array([   56,   227,   269,  1295,  1452,  1543,  1682,  1724,  1941,
         2497,  2751,  3697,  3699,  4093,  4654,  4676,  5516,  5609,
         5719,  5720,  5739,  6241,  6464,  6885,  8714,  9302,  9617,
         9829, 10380, 13401, 13638, 14438, 14637, 14640], dtype=int64),)

### logistic model

$$
logistic = \frac{1}{1+exp(-w^Tx)}
$$

For binary classificaiton:
$$
loss = -\frac{1}{N}\sum^{N}_{n=1}(y^{(n)}log(\hat{y}^{(n)})+(1-y^{(n)})log(1-\hat{y}^{(n)}))
$$
For multi-classification:
$$
loss = -\frac{1}{N}\sum^{N}_{n=1}\sum^{C}_{c=1}(y^{(c)}log(\hat{y}^{(c)})
$$

Back Propagation:
$$
\frac{\partial{R(w)}}{\partial{w}} =
-\frac{1}{N}\sum^{N}_{n=1}x^{(n)}(y^{(n)}-\hat{y}^{(n)})
$$

In [9]:
class LogisticModel(object):
    def __init__(self, embedding, config):
        self.embedding = embedding
        self.weight = np.zeros((config.label_num, self.embedding.embedding_length))  # label_num*14760
        self.bias = np.zeros((1, config.label_num))
        self.learing_rate = config.learning_rate
        
    def forward(self, x):
        '''
        for multi-logistic, argmax adopted
        input: batch * embedding_length
        weight: embedding_length * label_num
        output: batch * label_num
        '''
        one_hot_embedding = self.embedding.batch_vectorize(x)
        o = one_hot_embedding.dot(self.weight.T) + self.bias
        out = 1/(1+np.exp(-o))
        return out
        
    def backward(self, x, y_true, lr):
        '''
        learing_rate
        x: batch*character_length    8*14670
        weight: label*num*character  5*14670
        bias: 1*label_num
        
        y_hat:  batch*label_num      8*5
        y_true: 1*label_num -> batch*label_num
        
        '''
        one_hot_embedding = self.embedding.batch_vectorize(x)
        o = one_hot_embedding.dot(self.weight.T) + self.bias
        y_hat = 1/(1+np.exp(-o))
        
        binary_labels = np.zeros(y_hat.shape)
        for i,label in enumerate(y_true):
            binary_labels[i,int(label)] = 1
        
        ### mini-batch gradient descent
        ex_x = np.expand_dims(one_hot_embedding, axis=2)
        ex_y = np.expand_dims((y_hat - binary_labels), axis=1)
        partial_w = np.einsum('ijk,ikn->ijn', ex_x, ex_y).mean(axis=0)
        partial_b = (y_hat - binary_labels).mean(axis=0)
#         print(ex_x.shape)
#         print(ex_y.shape)
#         print("partial_w shape:", partial_w.shape)
#         print("partial_b shape:", partial_b.shape)
        self.weight = self.weight - lr*partial_w.T
        self.bias = self.bias - lr*partial_b
        return 

In [10]:
def softmax(x, axis):
    return np.exp(x) / np.sum(np.exp(x), axis=axis, keepdims=True)   

def cross_entropy_loss(y_hat, y_true, sm=False):
    binary_labels = np.zeros(y_hat.shape)
    for i,label in enumerate(y_true):
        binary_labels[i,int(label)] = 1
    if sm:
        y_hat = softmax(y_hat, axis=1)
    eps = 1e-15
    y_hat = np.where(y_hat > 1-eps, 1-eps, y_hat)
    y_hat = np.where(y_hat < eps, eps, y_hat)
#     corss_entry = binary_labels*np.log(y_hat) + (1-binary_labels)*np.log(1-y_hat) # binary classification
    cross_entropy = binary_labels*np.log(y_hat)
    loss = -cross_entropy.sum()/len(y_true)
    return loss



In [11]:
## Test for logistic model
vocab, train_set, dev_set, test_set = build_dataset(config)
data_iter = DatasetIterater(data_set = train_set, batch_size = 8)
x, y_true = next(data_iter)
embedding = OneHotEmbedding(config)
model = LogisticModel(embedding, config)
out = model.forward(x)
loss = cross_entropy_loss(out, y_true, sm=False)
model.backward(x, y_true, config.learning_rate)

vocab loaded sucessed, size : 14670 
./data_set/test.pkl loaded success, size: 15605
./data_set/dev.pkl loaded success, size: 31211
./data_set/train.pkl loaded success, size: 109242


## train and eval

In [16]:
def get_time_dif(start_time):
    end_time = time.time()
    time_dif = end_time - start_time
    return timedelta(seconds=int(round(time_dif)))

def train(model, config, train_iter, dev_iter, test_iter):
    start_time = time.time()
    total_iter = 0
    acc_dev = 0
    for epoch in range(config.epoch):
        print('Epoch [{}/{}]'.format(epoch + 1, config.epoch))
        for i, (x, y_true) in enumerate(train_iter):
            out = model.forward(x)
            loss = cross_entropy_loss(out, y_true, sm=False)
            model.backward(x, y_true, config.learning_rate)
            total_iter += 1
            if total_iter % config.log_step == 0 or i+1 == len(train_iter):
                acc_train, loss_train = evalute(model, config, DatasetIterater(data_set = train_set, batch_size = config.batch_size))
                acc_dev, loss_dev = evalute(model, config, DatasetIterater(data_set = dev_set, batch_size = config.batch_size))
                improve = ""
                if acc_dev > best_dev_acc:
                    best_dev_acc = acc_dev
                    improve = "*"
                time_dif = get_time_d
                time_dif = get_time_dif(start_time)
                msg = 'Iter: {0:>6},  Train Loss: {1:>5.2},  Train Acc: {2:>6.2%},  Val Loss: {3:>5.2},  Val Acc: {4:>6.2%},  Time: {5} {6}'
                print(msg.format(total_iter, loss_train, acc_train, loss_dev, acc_dev, time_dif, improve))
    acc_test, loss_test = evalute(model, config, test_iter)
    msg = 'Train Loss: {0:>5.2},  Train Acc: {1:>6.2%}'
    print(msg.format(loss_test, acc_test))
    
def evalute(model, config, data_iter):
    loss_total = 0
    labels_predict = np.arange(0)
    labels_true = []
    for (x, y_true) in data_iter:
        out = model.forward(x)
        loss = cross_entropy_loss(out, y_true, sm=False)
        loss_total += loss
        labels_predict = np.concatenate((labels_predict, out.argmax(axis=1)), axis=0)
        labels_true += y_true
    try:
        assert len(labels_true) == labels_predict.size
    except AssertionError as e:
        print(len(x))
        print(out.shape)
        print("labels_true:", len(labels_true))
        print("labels_predict:",labels_predict.size)
        raise AssertionError
    labels_true = np.array([int(_) for _ in labels_true])
    loss = loss_total/len(data_iter)
    acc = (labels_true == labels_predict).mean()
    return acc, loss_total

## run

In [1]:
config = Config()
vocab, train_set, dev_set, test_set = build_dataset(config)
config.batch_size = 32
config.epoch = 50
config.learning_rate = 1e-2
train_iter = DatasetIterater(data_set = train_set, batch_size = config.batch_size)
dev_iter = DatasetIterater(data_set = dev_set, batch_size = config.batch_size)
test_iter = DatasetIterater(data_set = test_set, batch_size = config.batch_size)
embedding = OneHotEmbedding(config)
model = LogisticModel(embedding, config)
train(model, config, train_iter, dev_iter, test_iter)

NameError: name 'Config' is not defined