## import

In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

## data preprocess

### spit evaluate set

In [9]:
df_data = pd.read_csv('./raw_data/train.tsv', sep='\t')
df_data.info()
df_data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156060 entries, 0 to 156059
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   PhraseId    156060 non-null  int64 
 1   SentenceId  156060 non-null  int64 
 2   Phrase      156060 non-null  object
 3   Sentiment   156060 non-null  int64 
dtypes: int64(3), object(1)
memory usage: 4.8+ MB


Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [55]:
shuffle_index = np.random.permutation(len(df_data))
shuffled_data = df_data[['Phrase','Sentiment']].iloc[shuffle_index]
shuffled_data.iloc[:int(0.7*len(df_data))].to_csv(r'./data_set/train.txt', sep='\t', encoding='utf-8', header=None)
shuffled_data.iloc[int(0.7*len(df_data))+1:int(0.9*len(df_data))].to_csv(r'./data_set/dev.txt', sep='\t', encoding='utf-8', header=None)
shuffled_data.iloc[int(0.9*len(df_data))+1:].to_csv(r'./data_set/test.txt', sep='\t', encoding='utf-8', header=None)

### build vocabulary

In [25]:
from nltk import word_tokenize, pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

# 获取单词的词性
def get_wordnet_pos(tag):
    '''
    for WordNetLemmatizer.lemmatize()
    '''
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return None

# sentence = 'football is a family of team sports that involve, to varying degrees, kicking a ball to score a goal.'

word_cnt = dict() # 同时统计一下词频
for sentence in tqdm(all_data['Phrase']):
    tokens = word_tokenize(sentence.lower())              # 分词,同时大写换小写
    tagged_sent = pos_tag(tokens, tagset='universal')     # 词性标注
    wnl = WordNetLemmatizer()
#     lemmas_sent = []
    for word, tag in tagged_sent:
        wordnet_pos = get_wordnet_pos(tag) or wordnet.NOUN
        lemmatized_word = wnl.lemmatize(word, pos=wordnet_pos) # 还原后的词
        if lemmatized_word in word_cnt:
            word_cnt[lemmatized_word] += 1
        else:
            word_cnt[lemmatized_word] = 1
len(word_cnt)

  0%|          | 0/156060 [00:00<?, ?it/s]

14755

In [32]:
len(word_cnt)
with open(r'./data_set/vocabulary.txt', 'w', encoding='utf-8') as f:
    for word in word_cnt.keys():
        word_num = f.write(word+'\n')

14755

### limit vocab size version(process unknown word)

## data loader

In [59]:
vocab_path = r'./data_set/vocabulary.txt'
vocab = open(vocab_path).readlines()

def get_one_hot_vector(sent, vocab):
    sent = all_data['Phrase'][10005] # test
    tokens = word_tokenize(sent.lower())
    tagged_sent = pos_tag(tokens, tagset='universal')     # 词性标注
    lemmatized_words = list()
    for word, tag in tagged_sent:
        wordnet_pos = get_wordnet_pos(tag) or wordnet.NOUN
        lemmatized_words.append(wnl.lemmatize(word, pos=wordnet_pos))
    ind = np.zeros(len(vocab), dtype=int)
    for word in lemmatized_words:
        ind[vocab.index(word)] = 1 # Raises ValueError if the value is not present. 
    return ind


config

In [None]:
class config(object):
    def __init__(self, vocab_path):
        self.vocab = vocab_path
        self.train_path = r'./data_set/train.txt'
        self.train_path = r'./data_set/dev.txt'
        self.train_path = r'./data_set/test.txt'

Embedding

In [None]:
class OneHotEmbedding(object):
    def __init__(self, config):
        vocab = open(config.vocab_path).readlines()
        
        
    def batch_vectorize(self,x):
        '''
        '''
        def get_one_hot_vector(self, sent, vocab):
            sent = all_data['Phrase'][10005] # test
            tokens = word_tokenize(sent.lower())
            tagged_sent = pos_tag(tokens, tagset='universal')     # 词性标注
            lemmatized_words = list()
            for word, tag in tagged_sent:
                wordnet_pos = get_wordnet_pos(tag) or wordnet.NOUN
                lemmatized_words.append(wnl.lemmatize(word, pos=wordnet_pos))
            ind = np.zeros(len(vocab), dtype=int)
            for word in lemmatized_words:
                ind[vocab.index(word)] = 1 # Raises ValueError if the value is not present. 
            return ind

DatasetIterater

In [14]:
class DatasetIterater(object):
    '''
    return the batch index of dataset
    '''
    def __init__(self, data_set, batch_size):
        self.data_set = data_set
        self.batch_size = batch_size
        self.n_batches = len(data_set) // batch_size
#         self.residue = False  # 记录batch数量是否为整数
#         if len(batches) % self.n_batches != 0:
#             self.residue = True
        self.residue = True if len(data_set) % self.n_batches != 0 else False
        self.index = 0
        
    def to_couple(self, data_slice):
        x = list()
        x = [_[0] for _ in data_slice]
        x = [_[0] for _ in data_slice]
        return 
        
    def __next__(self):
        if self.index == self.n_batches and self.residue:
            batch = self.data_set[self.index * self.batch_size: len(data_set)]
            self.index += 1
            return batch
        elif self.index >= self.n_batches:
            self.index = 0
            return StopIteration
        else:
            batch = self.data_set[self.index * self.batch_size: (self.index+1) * self.batch_size]
            self.index += 1
            return batch
           
    def __iter__(self):
        return self
    
    def __len__(self):
        return self.n_batches+1 if self.residue else self.n_batches

In [26]:
data_iter = DatasetIterater(data_set = df_data[['Phrase', 'Sentiment']], batch_size = 4)
df_test = next(data_iter)

In [30]:
list(df_test['Phrase'])

['A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .',
 'A series of escapades demonstrating the adage that what is good for the goose',
 'A series',
 'A']

In [21]:
len(data_iter)

39015

In [58]:
z = np.zeros(10, int)
z[[2,1,3]] = 1
z

array([0, 1, 1, 1, 0, 0, 0, 0, 0, 0])

## model

## train and eval