In [3]:
import numpy as np
import torch
import json


In [4]:
# Placeholder token: make_indices appends it to the vocabulary, and
# convert_to_vector_representation maps any word missing from word2index to it.
unk = '<UNK>'

In [5]:
def _read_split(path):
    """Read one JSON split and return it as a list of ``(tokens, label)`` pairs.

    The file must contain a JSON array of objects with a ``"text"`` string and a
    numeric ``"stars"`` value.
    """
    # JSON text is UTF-8; pass the encoding explicitly so the result does not
    # depend on the platform's locale default (e.g. cp1252 on Windows).
    with open(path, encoding="utf-8") as split_f:
        records = json.load(split_f)
    # Whitespace-tokenize the text and shift the star rating down by one so the
    # labels are zero-based.
    return [(record["text"].split(), int(record["stars"] - 1)) for record in records]


def load_data(train_data, val_data, test_data):
    """Load the training, validation and test splits from JSON files.

    Args:
        train_data: Path to the training JSON file.
        val_data: Path to the validation JSON file.
        test_data: Path to the test JSON file.

    Returns:
        A tuple ``(tra, val, tes)``; each element is a list of
        ``(tokens, label)`` pairs where ``tokens`` is ``text.split()`` and
        ``label`` is ``int(stars - 1)``.

    Raises:
        OSError: If a file cannot be opened.
        json.JSONDecodeError: If a file is not valid JSON.
        KeyError: If a record lacks the ``"text"`` or ``"stars"`` key.
    """
    return _read_split(train_data), _read_split(val_data), _read_split(test_data)

In [6]:
def make_vocab(data):
    """Collect the distinct words that occur in ``data``.

    Args:
        data: Iterable of ``(document, label)`` pairs, where ``document`` is an
            iterable of words. The label is ignored.

    Returns:
        A set of strings corresponding to the vocabulary.
    """
    return {token for tokens, _label in data for token in tokens}

In [7]:
def make_indices(vocab):
    """Build word<->index lookup tables for ``vocab`` plus the unknown token.

    Words are indexed in sorted order and ``unk`` always receives the last
    index, so the indices are exactly ``0 .. len(word2index) - 1``.

    Args:
        vocab: Set of vocabulary words. ``unk`` is added to it in place.

    Returns:
        A tuple ``(vocab, word2index, index2word)``: the same set object (now
        containing ``unk``), a dict mapping word -> index, and a dict mapping
        index -> word.
    """
    # Leave out ``unk`` while sorting: if it is already present (the token
    # occurs in the data, or this function was called before on the same set)
    # appending it again would give it two slots, leaving word2index one entry
    # shorter than its largest index.
    vocab_list = sorted(word for word in vocab if word != unk)
    vocab_list.append(unk)  # the unknown token always goes last
    word2index = {word: index for index, word in enumerate(vocab_list)}
    index2word = dict(enumerate(vocab_list))
    vocab.add(unk)
    return vocab, word2index, index2word


def convert_to_vector_representation(data, word2index):
    """Turn tokenized documents into bag-of-words count vectors.

    Args:
        data: Iterable of ``(document, y)`` pairs, where ``document`` is an
            iterable of words.
        word2index: Dict mapping word -> vector position. It must contain an
            entry for ``unk``; words missing from the dict are counted at that
            position.

    Returns:
        A list of pairs ``(vector, y)`` where ``vector`` is a 1-D tensor of
        length ``len(word2index)`` holding per-word occurrence counts.

    Raises:
        KeyError: If ``word2index`` has no entry for ``unk``.
    """
    vocab_size = len(word2index)
    # Look up the fallback index once instead of once per token.
    unk_index = word2index[unk]
    vectorized_data = []
    for document, y in data:
        # Count in a plain dict first; touching the tensor once per token is
        # far slower than a single write per document.
        counts = {}
        for word in document:
            index = word2index.get(word, unk_index)
            counts[index] = counts.get(index, 0) + 1
        vector = torch.zeros(vocab_size)
        if counts:
            positions = torch.tensor(list(counts.keys()), dtype=torch.long)
            amounts = torch.tensor(list(counts.values()), dtype=vector.dtype)
            # accumulate=True adds into the vector, matching repeated ``+= 1``.
            vector.index_put_((positions,), amounts, accumulate=True)
        vectorized_data.append((vector, y))
    return vectorized_data


In [8]:
print("========== Loading data ==========")

# Each *_data value is a list of (document, y) pairs; y in {0,1,2,3,4}.
train_data, valid_data, test_data = load_data(
    "./training.json",
    "./validation.json",
    "./test.json",
)

# Set of distinct words seen in the training split only.
vocab = make_vocab(train_data)

# vocab now also contains <UNK>. word2index maps each word to 0, 1, 2, ... in
# sorted order with <UNK> last; index2word is the inverse mapping.
vocab, word2index, index2word = make_indices(vocab)




In [9]:
# Vocabulary size, including the <UNK> entry added by make_indices.
len(vocab)

65667

In [10]:
# Number of (document, y) pairs in the training split.
len(train_data)

8000

In [11]:
# Number of (document, y) pairs in the validation split.
len(valid_data)

800

In [12]:
# Number of (document, y) pairs in the test split.
len(test_data)

800

In [13]:
print("========== Vectorizing data ==========")

# NOTE: this rebinds train_data / valid_data / test_data from lists of
# (tokens, y) to lists of (count vector, y). Re-running this cell without first
# re-running the loading cell would feed already-vectorized data back in.
train_data, valid_data, test_data = (
    convert_to_vector_representation(split, word2index)
    for split in (train_data, valid_data, test_data)
)



In [17]:
# Shape of the first training example's count vector (length len(word2index)).
train_data[0][0].shape

torch.Size([65667])

In [18]:
# Shape of the first validation example's count vector (length len(word2index)).
valid_data[0][0].shape

torch.Size([65667])

In [19]:
# Shape of the first test example's count vector (length len(word2index)).
test_data[0][0].shape

torch.Size([65667])