In [2]:
# 古登堡计划：世界上第一个数字图书馆

In [3]:
import numpy as np
import torch
torch.set_printoptions(edgeitems=2, threshold=50)

In [4]:
# 加载傲慢与偏见
with open('../data/4/jane-austen/1342-0.txt', encoding='utf8') as f:
    text = f.read()
    lines = text.split('\n')
    line = lines[200]
line

'“Impossible, Mr. Bennet, impossible, when I am not acquainted with him'

In [5]:
# one-hot编码
letter_t = torch.zeros(len(line), 128) # 设定one-hot编码的长度为128
letter_t.shape

torch.Size([70, 128])

In [7]:
# 处理没法表示的字符
for i, letter in enumerate(line.lower().strip()):
    letter_index = ord(letter) if ord(letter) < 128 else 0
    letter_t[i][letter_index] = 1

In [9]:
# 处理停用词，即常见的无意义的词（特殊字符）并全部转换为小写
def clean_words(input_str):
    punctuation = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~'
    word_list = input_str.lower().replace('\n', ' ').split()
    word_list = [word.strip(punctuation) for word in word_list]
    return word_list

words_in_line = clean_words(line)
line, words_in_line

('“Impossible, Mr. Bennet, impossible, when I am not acquainted with him',
 ['“impossible',
  'mr',
  'bennet',
  'impossible',
  'when',
  'i',
  'am',
  'not',
  'acquainted',
  'with',
  'him'])

In [12]:
# 清理整篇并对单词进行排序
word_list = sorted(set(clean_words(text)))
# 把清理好的文本存成一个字典，字典存的是单词和单词所在的位置
word2index_dict = {word: i for (i, word) in enumerate(word_list)}
# 查询impossible的位置
len(word2index_dict), word2index_dict['impossible']

(8434, 3788)

In [13]:
word_t = torch.zeros(len(words_in_line), len(word2index_dict))

In [15]:
for i, word in enumerate(words_in_line):
    word_index = word2index_dict[word]
    word_t[i][word_index] = 1
    print('{:2} {:4}{}'.format(i, word_index, word))
print(word_t.shape)

 0 8274“impossible
 1 4860mr
 2  856bennet
 3 3788impossible
 4 7968when
 5 3700i
 6  415am
 7 5007not
 8  217acquainted
 9 8045with
10 3580him
torch.Size([11, 8434])
