In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

## data preprocess

In [2]:
all_data = pd.read_csv('./data_set/train.tsv', sep='\t')
all_data.info()
all_data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156060 entries, 0 to 156059
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   PhraseId    156060 non-null  int64 
 1   SentenceId  156060 non-null  int64 
 2   Phrase      156060 non-null  object
 3   Sentiment   156060 non-null  int64 
dtypes: int64(3), object(1)
memory usage: 4.8+ MB


Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


### build word vocabulary

In [25]:
from nltk import word_tokenize, pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

# 获取单词的词性
def get_wordnet_pos(tag):
    '''
    for WordNetLemmatizer.lemmatize()
    '''
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return None

# sentence = 'football is a family of team sports that involve, to varying degrees, kicking a ball to score a goal.'

word_cnt = dict() # 同时统计一下词频
for sentence in tqdm(all_data['Phrase']):
    tokens = word_tokenize(sentence.lower())              # 分词,同时大写换小写
    tagged_sent = pos_tag(tokens, tagset='universal')     # 词性标注
    wnl = WordNetLemmatizer()
#     lemmas_sent = []
    for word, tag in tagged_sent:
        wordnet_pos = get_wordnet_pos(tag) or wordnet.NOUN
        lemmatized_word = wnl.lemmatize(word, pos=wordnet_pos) # 还原后的词
        if lemmatized_word in word_cnt:
            word_cnt[lemmatized_word] += 1
        else:
            word_cnt[lemmatized_word] = 1
len(word_cnt)

  0%|          | 0/156060 [00:00<?, ?it/s]

14755

In [32]:
len(word_cnt)
with open(r'./output/vocabulary.txt', 'w', encoding='utf-8') as f:
    for word in word_cnt.keys():
        word_num = f.write(word+'\n')

14755

### word vecotorize

In [59]:
vocab = list(word_cnt.keys())

def get_one_hot_vector(sent, vocab_size):
    sent = all_data['Phrase'][10005]
    tokens = word_tokenize(sent.lower())
    tagged_sent = pos_tag(tokens, tagset='universal')     # 词性标注
    lemmatized_words = list()
    for word, tag in tagged_sent:
        wordnet_pos = get_wordnet_pos(tag) or wordnet.NOUN
        lemmatized_words.append(wnl.lemmatize(word, pos=wordnet_pos))
    ind = np.zeros(vocab_size, dtype=int)
    for word in lemmatized_words:
        ind[vocab.index(word)] = 1 # Raises ValueError if the value is not present. 
    return ind

In [58]:
z = np.zeros(10, int)
z[[2,1,3]] = 1
z

array([0, 1, 1, 1, 0, 0, 0, 0, 0, 0])