## import

In [88]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import pickle as pkl
import os

## data preprocess

### spit evaluate set

In [16]:
df_data = pd.read_csv('./raw_data/train.tsv', sep='\t')
df_data.info()
df_data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156060 entries, 0 to 156059
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   PhraseId    156060 non-null  int64 
 1   SentenceId  156060 non-null  int64 
 2   Phrase      156060 non-null  object
 3   Sentiment   156060 non-null  int64 
dtypes: int64(3), object(1)
memory usage: 4.8+ MB


Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [17]:
shuffle_index = np.random.permutation(len(df_data))
shuffled_data = df_data[['Phrase','Sentiment']].iloc[shuffle_index]
shuffled_data.iloc[:int(0.7*len(df_data))].to_csv(r'./data_set/train.txt', sep='\t', encoding='utf-8', header=None, index=False)
shuffled_data.iloc[int(0.7*len(df_data))+1:int(0.9*len(df_data))].to_csv(r'./data_set/dev.txt', sep='\t', encoding='utf-8', header=None, index=False)
shuffled_data.iloc[int(0.9*len(df_data))+1:].to_csv(r'./data_set/test.txt', sep='\t', encoding='utf-8', header=None, index=False)

### build vocabulary and dataset

In [224]:
a= {'b':2, 'a':3, 'c':1}
a.keys()
sorted(a.items(), key=lambda x:x[0], reverse=True)

dict_keys(['b', 'a', 'c'])

[('c', 1), ('b', 2), ('a', 3)]

In [225]:
from nltk import word_tokenize, pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
UNK, PAD = '<UNK>', '<PAD>'  # 未知字，padding符号

# 获取单词的词性
def get_wordnet_pos(tag):
    '''
    for WordNetLemmatizer.lemmatize()
    '''
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

def build_vocab(data_path, vocab_path):
    data = [_.strip().split('\t')[0] for _ in open(data_path, 'r', encoding='utf-8').readlines()]
    word_cnt = dict()
    for sentence in tqdm(data):
        tokens = word_tokenize(sentence.lower())              # 分词,同时大写换小写
        tagged_sent = pos_tag(tokens, tagset='universal')     # 词性标注
        wnl = WordNetLemmatizer()
        for word, tag in tagged_sent:
            lemmatized_word = wnl.lemmatize(word, pos=get_wordnet_pos(tag)) # 还原后的词
            word_cnt[lemmatized_word] = word_cnt.get(lemmatized_word, 0) + 1
    word_cnt = sorted(word_cnt.items(), key=lambda x:x[0], reverse=True)
    print(len(word_cnt))
    vocab = {_[0]: idx for idx, _ in enumerate(word_cnt)}
    vocab.update({UNK: len(vocab), PAD: len(vocab) + 1})
    pkl.dump(vocab, open(vocab_path, 'wb'))
    print("vocab build successed, size : %d" %len(vocab))
    return vocab

def build_dataset(config):
    '''
    变成[([],y),([],y),([],y),([],y)]
    '''
    if os.path.exists(config.vocab_path):
        vocab = pkl.load(open(config.vocab_path, 'rb'))
    else:
        vocab = build_vocab(config.train_path, config.vocab_path)
    print("vocab loaded sucessed, size : %d " %len(vocab))
    def load_data(file_path, output_path):
        if os.path.exists(output_path):
            data = pkl.load(open(output_path, 'rb'))
            print("%s loaded success, size: %d" %(output_path, len(data)))
            return data
        data = open(file_path, 'r', encoding='utf-8').readlines()
        lemmatized_data = list()
        for line in tqdm(data):
            try:
                x, y = line.strip().split('\t')
            except:
                print(line)
            tokens = word_tokenize(x.lower())              # 分词,同时大写换小写
            tagged_sent = pos_tag(tokens, tagset='universal')     # 词性标注
            wnl = WordNetLemmatizer()
#             lemmatized_sentence = list()
#             for word, tag in tagged_sent:
#                 lemmatized_word = wnl.lemmatize(word, pos=get_wordnet_pos(tag)) # 还原后的词
#                 lemmatized_sentence.append(lemmatized_word)
            lemmatized_sentence = [wnl.lemmatize(word, pos=get_wordnet_pos(tag)) for word, tag in tagged_sent]
            lemmatized_data.append((lemmatized_sentence, y))
        pkl.dump(lemmatized_data, open(output_path, 'wb'))
        print("%s loaded success, size: %d" %(file_path, len(lemmatized_data)))
        return lemmatized_data
    test_set = load_data(config.test_path, config.test_set_path)
    dev_set = load_data(config.dev_path, config.dev_set_path)
    train_set = load_data(config.train_path, config.train_set_path)
    return vocab, train_set, dev_set, test_set

In [226]:
config = Config()
vocab, train_set, dev_set, test_set = build_dataset(config)

  0%|          | 0/109242 [00:00<?, ?it/s]

14668
vocab build successed, size : 14670
vocab loaded sucessed, size : 14670 
./data_set/test.pkl loaded success, size: 15605
./data_set/dev.pkl loaded success, size: 31211
./data_set/train.pkl loaded success, size: 109242


## data loader

config

In [137]:
class Config(object):
    def __init__(self):
        self.train_path = r'./data_set/train.txt'
        self.dev_path = r'./data_set/dev.txt'
        self.test_path = r'./data_set/test.txt'
        self.vocab_path = r'./data_set/vocab.pkl'
        self.train_set_path = r'./data_set/train.pkl'
        self.dev_set_path = r'./data_set/dev.pkl'
        self.test_set_path = r'./data_set/test.pkl'

DatasetIterater

In [117]:
class DatasetIterater(object):
    '''
    return the batch index of dataset
    '''
    def __init__(self, data_set, batch_size):
        self.data_set = data_set
        self.batch_size = batch_size
        self.n_batches = len(data_set) // batch_size
        self.residue = True if len(data_set) % self.n_batches != 0 else False
        self.index = 0
        
    def to_couple(self, raw_batch):
        x = [_[0] for _ in raw_batch]
        y = [_[1] for _ in raw_batch]
        return x,y
        
    def __next__(self):
        if self.index == self.n_batches and self.residue:
            raw_batch = self.data_set[self.index * self.batch_size: len(data_set)]
            batch = self.to_couple(raw_batch)
            self.index += 1
            return batch
        elif self.index >= self.n_batches:
            self.index = 0
            return StopIteration
        else:
            raw_batch = self.data_set[self.index * self.batch_size: (self.index+1) * self.batch_size]
            batch = self.to_couple(raw_batch)
            self.index += 1
            return batch
           
    def __iter__(self):
        return self
    
    def __len__(self):
        return self.n_batches+1 if self.residue else self.n_batches

Embedding

In [213]:
class OneHotEmbedding(object):
    def __init__(self, config):
        vocab = pkl.load(open(config.vocab_path, 'rb'))
        
    def batch_vectorize(self, batches):
        '''
        data: batch * length 
        '''
        x, y = batches[0], batches[1]
        vec = np.zeros((len(x), len(vocab)), dtype=int)
        for i, tokens in enumerate(x):
            vec[i,[vocab.get(token, vocab['<UNK>']) for token in tokens]] = 1 # Raises ValueError if the value is not present. 
        return vec

In [227]:
vocab, train_set, dev_set, test_set = build_dataset(config)
data_iter = DatasetIterater(data_set = train_set, batch_size = 4)
iter_test = next(data_iter)
embedding = OneHotEmbedding(config)

vocab loaded sucessed, size : 14670 
./data_set/test.pkl loaded success, size: 15605
./data_set/dev.pkl loaded success, size: 31211
./data_set/train.pkl loaded success, size: 109242


In [228]:
vec = embedding.batch_vectorize(iter_test)
vec.shape

(4, 14670)

In [229]:
vec[0].nonzero()

(array([   56,   227,   269,  1295,  1452,  1543,  1682,  1724,  1941,
         2497,  2751,  3697,  3699,  4093,  4654,  4676,  5516,  5609,
         5719,  5720,  5739,  6241,  6464,  6885,  8714,  9302,  9617,
         9829, 10380, 13401, 13638, 14438, 14637, 14640], dtype=int64),)

## model

## train and eval

## run