做 word2vec 训练时，应该选用“长文章”这样的语料，因为文章内部的上下文相关性比较强。如果采用“评论”这样的语料，语义重复度太高，而且文本较短，强关联的词汇（vocabulary）跨度也短。

In [1]:
import torch
import torch.nn as nn

import string

import re
import nltk
from nltk.tokenize import word_tokenize

import collections

import pandas as pd

# data

In [2]:
# Read the corpus, one entry per line, stripping surrounding whitespace.
# The context manager closes the file even if reading raises.
with open("news.txt", "r") as file:
    raw_text = [line.strip() for line in file]
# Keep only the first few lines to reduce the amount of computation.
# Slicing (instead of indexing 0..4) also works when the file has fewer
# than five lines.
raw_text = raw_text[:5]

## data preprocessing

In [3]:
def dePunctuation(line):
    """Return ``line`` with every character of ``string.punctuation`` removed."""
    # Mapping a character to None in a translation table deletes it.
    strip_table = str.maketrans(dict.fromkeys(string.punctuation))
    return line.translate(strip_table)

def remove_stopwords(text):
    """Drop English stop words from ``text``.

    The text is split with NLTK's ``word_tokenize``. Tokens whose lower-cased
    form appears in NLTK's English stop-word list are discarded, and the
    remaining tokens are joined back together with single spaces.
    """
    english_stops = set(nltk.corpus.stopwords.words('english'))
    kept = []
    for token in word_tokenize(text):
        if token.lower() in english_stops:
            continue
        kept.append(token)
    return ' '.join(kept)


def remove_lowFrequency(text):
    """Drop low-frequency words from ``text``.

    The text is split with NLTK's ``word_tokenize``; tokens whose lower-cased
    form is listed in the module-level ``low_freq_vocab`` are discarded and
    the rest are joined with single spaces.

    Note: ``low_freq_vocab`` is a global that must be assigned before this
    function is called.
    """
    # Build a set once per call so each membership test is O(1) instead of a
    # linear scan of the list for every token.
    rare_words = set(low_freq_vocab)
    kept = [word for word in word_tokenize(text) if word.lower() not in rare_words]
    return ' '.join(kept)

def lowFrequency(text, threshold = 0):
    """Return the words whose relative frequency is below ``threshold``.

    ``text`` is an iterable of lines. Each line is split on whitespace and
    lower-cased; a word's relative frequency is its count divided by the
    total number of tokens. Words are returned in order of first appearance.
    """
    # Flatten all lines into one lower-cased token list.
    words = []
    for line in text:
        words.extend(token.lower() for token in line.split())
    total = len(words)
    return [
        word
        for word, occurrences in collections.Counter(words).items()
        if occurrences / total < threshold
    ]

def clean(text, type='word', stop = True):
    """Clean an iterable of lines at word or character level.

    With ``type='word'`` each line has punctuation removed, optionally has
    stop words removed (when ``stop`` is true), and then has low-frequency
    words removed; the result is a list of cleaned lines.
    With ``type='char'`` punctuation is removed and the result is a flat list
    of the remaining characters.
    Any other ``type`` yields an empty list. Empty strings are dropped from
    the result.
    """
    if type == 'word':
        cleaned_text = []
        for raw_line in text:
            stripped = dePunctuation(raw_line)
            if stop:
                stripped = remove_stopwords(stripped)
            cleaned_text.append(remove_lowFrequency(stripped))
    elif type == 'char':
        cleaned_text = [char for raw_line in text for char in dePunctuation(raw_line)]
    else:
        cleaned_text = []
    return [item for item in cleaned_text if item != '']

In [5]:
def context_center(clean_text, window_size=2):
    """Build ``(context, center)`` training pairs for CBOW.

    ``clean_text`` is an iterable of cleaned lines; they are joined and split
    on whitespace into one flat token sequence. For every position that has
    ``window_size`` tokens available on both sides, the context is the
    ``window_size`` tokens before it followed by the ``window_size`` tokens
    after it (the center itself excluded).

    Returns a list of ``(context_tokens, center_token)`` tuples.
    """
    flatten_text = ' '.join(clean_text).split()
    data = []
    for i, center in enumerate(flatten_text):
        # Skip positions that do not have a full window on both sides.
        if i - window_size < 0 or i + window_size >= len(flatten_text):
            continue
        # The right-hand slice ends at i + window_size inclusive. The earlier
        # range(i - window_size, i + window_size) stopped one token short on
        # the right, which made the window asymmetric.
        context = (flatten_text[i - window_size:i]
                   + flatten_text[i + 1:i + window_size + 1])
        data.append((context, center))

    return data

# model

## continuous bag of words

In [4]:
def make_context_vector(context, word_to_ix):
    """Convert context words into a 1-D ``torch.long`` tensor of vocabulary indices.

    Each word in ``context`` is looked up in ``word_to_ix``; a word that is
    missing from the mapping raises ``KeyError``.
    """
    indices = list(map(word_to_ix.__getitem__, context))
    return torch.tensor(indices, dtype=torch.long)

In [6]:
class CBOW(torch.nn.Module):
    """Continuous bag-of-words model: predict a center word from its context.

    The embeddings of the context words are summed, passed through one hidden
    linear layer with ReLU, and projected to log-probabilities over the
    vocabulary.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_size=128):
        """Create the layers.

        Args:
            vocab_size: number of rows of the embedding table and size of the
                output distribution.
            embedding_dim: length of each word vector.
            hidden_size: width of the hidden linear layer.
        """
        super().__init__()

        # Embedding table of shape (vocab_size, embedding_dim), randomly initialised.
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)

        # Hidden fully connected layer; the activation does not change the shape.
        self.linear = nn.Linear(embedding_dim, hidden_size)
        self.activation_function = nn.ReLU()

        # Output layer.
        self.linear_out = nn.Linear(hidden_size, vocab_size)
        # dim=-1 applies the log-softmax over the last dimension of the input.
        self.softmax_function = nn.LogSoftmax(dim=-1)

    def forward(self, inputs):
        """Return log-probabilities of shape (1, vocab_size) for one context.

        ``inputs`` is a tensor of context word indices.
        """
        # Sum the context embeddings along the first dimension and reshape to
        # a row vector (-1 lets view infer the column count). Using the tensor
        # sum instead of Python's built-in sum() also handles an empty
        # context: the built-in returns the integer 0, which has no .view().
        embeds = self.embeddings(inputs).sum(dim=0).view(1, -1)

        hidden = self.linear(embeds)
        hidden = self.activation_function(hidden)

        out = self.linear_out(hidden)
        out = self.softmax_function(out)
        return out

    def get_word_emdedding(self, word, vocab_index=None):
        """Return the embedding of ``word`` as a tensor of shape (1, embedding_dim).

        Args:
            word: the word to look up.
            vocab_index: optional word -> index mapping. When omitted, the
                module-level ``word_to_ix`` is used, as before.

        Raises:
            KeyError: if ``word`` is not in the mapping.
        """
        mapping = word_to_ix if vocab_index is None else vocab_index
        index = torch.tensor([mapping[word]])
        return self.embeddings(index).view(1, -1)

    # Correctly spelled alias; the original (misspelled) name is kept so that
    # existing callers keep working.
    get_word_embedding = get_word_emdedding

### train

data

In [7]:
# Window size handed to context_center when building the training pairs.
window_size = 4

# remove_lowFrequency (called inside clean) reads the module-level
# low_freq_vocab, so it has to be assigned before clean() runs.
low_freq_vocab = lowFrequency(raw_text, threshold = 0.0001)
clean_text = clean(raw_text, type = 'word', stop = False)

# Flatten the cleaned lines into a single list of tokens.
corpus = [word for line in clean_text for word in line.split()]
vocab = set(corpus)

# Enumerate the vocabulary in sorted order. Iteration order of a set of
# strings can differ between interpreter runs (string hash randomisation), so
# enumerating the set directly would give words different indices on each
# run and make saved weights impossible to map back to words.
word_to_ix = {word: ix for ix, word in enumerate(sorted(vocab))}
# Derive the inverse mapping from word_to_ix so the two can never disagree.
ix_to_word = {ix: word for word, ix in word_to_ix.items()}
data = context_center(clean_text, window_size)

model

In [8]:
# Hyper-parameters for the CBOW model built in the next cell.
EMDEDDING_DIM = 50  # length of each word embedding vector
vocab_size = len(vocab)  # number of distinct words in the cleaned corpus
hidden_size = 64  # width of the hidden linear layer

In [9]:
# Build the CBOW network from the hyper-parameters defined above.
model = CBOW(vocab_size, EMDEDDING_DIM, hidden_size)
# Negative log-likelihood loss; the model's forward pass ends in LogSoftmax.
loss_function = nn.NLLLoss()
# Optimise with stochastic gradient descent.
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)

In [10]:
# TRAINING
# The index tensors are the same in every epoch, so build them once instead
# of re-creating them on each pass over the data.
samples = [
    (make_context_vector(context, word_to_ix), torch.tensor([word_to_ix[center]]))
    for context, center in data
]
if not samples:
    # Without this check the loop below fails with an opaque AttributeError,
    # because total_loss would still be the integer 0 when .backward() is called.
    raise ValueError("no training pairs: the corpus is too short for the chosen window_size")

for epoch in range(50):
    total_loss = 0

    for context_vector, target in samples:
        # Predicted log-probability distribution over the vocabulary.
        log_probs = model(context_vector)
        # Accumulate the loss over the whole data set.
        total_loss += loss_function(log_probs, target)

    # Parameter update: one step per epoch on the summed loss.
    # Clear gradients left over from the previous step before back-propagating.
    optimizer.zero_grad()
    # Compute the gradient of the loss w.r.t. every model parameter and store
    # it in that parameter's .grad attribute.
    total_loss.backward()
    # Let the optimizer update the parameters from those gradients.
    optimizer.step()

### test

In [11]:
# Use the second corpus line as the probe text.
test_context = raw_text[1]
# Strip punctuation the same way the training text was cleaned, so that each
# token matches a key of word_to_ix. A raw token with attached punctuation
# (e.g. "media.") is not in the vocabulary and would raise KeyError in
# make_context_vector.
# NOTE(review): tokens removed by other cleaning steps (stop words,
# low-frequency words) would still be missing; confirm against the clean()
# settings used for training.
test_tokens = dePunctuation(test_context).split()[5:5 + window_size]

In [12]:
# TESTING
context_vector = make_context_vector(test_tokens, word_to_ix)
# Inference only: disable gradient tracking so no autograd graph is built.
with torch.no_grad():
    a = model(context_vector)

# Print result: the predicted word is the vocabulary entry with the highest
# log-probability.
print(f'Raw text: \n{test_context}\n')
print(f'Context: \n{test_tokens}\n')
print(f'Prediction: \n{ix_to_word[torch.argmax(a[0]).item()]}')

Raw text: 
The cryptocurrency and digital asset industry has once again been the focus of the worldwide media. This time, it is not about the promises of an inclusive future of finance but is related to a couple of court cases initiated or found to have come to a close in the past months.

Context: 
['industry', 'has', 'once', 'again']

Prediction: 
been


### word embedding

查看某一个词的词向量表示：它的维度等于 embedding size，比 vocabulary size 小很多。

In [13]:
# Look up the learned embedding vector of a single word.
word_embedding = model.get_word_emdedding('developments')
# Shape of the returned tensor; a notebook displays it as the cell output.
word_embedding.shape

torch.Size([1, 50])