## fastText训练词向量

### train_unsupervised方法参数

    input             # 训练文件路径(必选参数)
    model             #  无监督 fasttext 模型 {cbow, skipgram} [skipgram]
    lr                # 学习率 [0.05]
    dim               # 词向量维度 [100]
    ws                # 上下文窗口大小 [5]
    epoch             # 训练轮数 [5]
    minCount           # 词出现的最小次数 [5]
    minn              # char ngram的最小长度 [3]
    maxn              # char ngram的最大长度 [6]
    neg               # 负采样的数量 [5]
    wordNgrams          # word ngram的最大长度 [1]
    loss              # 损失函数 {ns, hs, softmax, ova} [ns](与文本分类softmax不同)
    bucket            # buckets数量 [2000000]
    thread            # threads数量 [number of cpus]
    lrUpdateRate        # 学习率更新的速率 [100]
    t                # 采样阈值 [0.0001]
    verbose           # verbose [2]

### 准备数据

In [3]:
import jieba
import pandas as pd
import random

# Maps each news category name to an integer id.
cate_dic = {
    'technology': 1,
    'car': 2,
    'entertainment': 3,
    'military': 4,
    'sports': 5,
}

# Read every category's CSV and drop rows that contain missing values.
df_technology = pd.read_csv("./raw_data/technology_news.csv", encoding='utf-8').dropna()
df_car = pd.read_csv("./raw_data/car_news.csv", encoding='utf-8').dropna()
df_entertainment = pd.read_csv("./raw_data/entertainment_news.csv", encoding='utf-8').dropna()
df_military = pd.read_csv("./raw_data/military_news.csv", encoding='utf-8').dropna()
df_sports = pd.read_csv("./raw_data/sports_news.csv", encoding='utf-8').dropna()

# Take up to 20,000 entries of the `content` column per category.
# technology and car skip their first 1,000 rows; the others start at row 0.
technology = df_technology['content'].values[1000:21000].tolist()
car = df_car['content'].values[1000:21000].tolist()
entertainment = df_entertainment['content'].values[:20000].tolist()
military = df_military['content'].values[:20000].tolist()
sports = df_sports['content'].values[:20000].tolist()

In [4]:
# Load the stop-word list from a headerless, tab-separated text file whose
# column is labelled 'stopword'. quoting=3 is csv.QUOTE_NONE, so quote
# characters in the file are read as ordinary text.
stopwords = pd.read_csv(
    "raw_data/stopwords.txt",
    sep="\t",
    names=['stopword'],
    index_col=False,
    quoting=3,
    encoding='utf-8',
)['stopword'].values

In [6]:
def preprocess_text(content_lines, sentences):
    """Segment each text with jieba and append the cleaned result to ``sentences``.

    Tokens of length 1 and tokens found in the module-level ``stopwords``
    collection are dropped; the remaining tokens are joined with single
    spaces.

    Args:
        content_lines: Iterable of raw texts to segment.
        sentences: List extended in place with one space-joined string per
            successfully segmented text.

    Returns:
        None. Results are delivered through ``sentences``.
    """
    # Build the lookup once per call: testing membership against the raw
    # stop-word array is a linear scan for every single token.
    stopword_set = set(stopwords)
    for line in content_lines:
        try:
            segs = jieba.lcut(line)
        except Exception:
            # Best effort: show the text that could not be segmented, skip it.
            print(line)
            continue
        kept = [seg for seg in segs if len(seg) > 1 and seg not in stopword_set]
        sentences.append(" ".join(kept))

# Build the unsupervised training corpus: one segmented text per list entry.
sentences = []

for corpus in (technology, car, entertainment, military, sports):
    preprocess_text(corpus, sentences)

print ("writing data to fasttext unsupervised learning format...")
# Write UTF-8 explicitly (the platform default encoding may differ) and use a
# context manager so the file is flushed and closed before training reads it.
with open('data\\unsupervised_train_data.txt', 'w', encoding='utf-8') as out:
    for sentence in sentences:
        out.write(sentence + "\n")
print ("done!")

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\蓝沛辉\AppData\Local\Temp\jieba.cache
Loading model cost 0.669 seconds.
Prefix dict has been built succesfully.


writing data to fasttext unsupervised learning format...
done!


### 训练

In [9]:
import fasttext

In [10]:
# Path of the corpus file passed to fasttext.train_unsupervised.
data = 'data\\unsupervised_train_data.txt'

In [11]:
# Skipgram model :
model_1 = fasttext.train_unsupervised(data, model='skipgram') # the input file is expected to be UTF-8 encoded text

In [None]:
# Cbow model : trained on the same corpus file as the skipgram model.
model_2 = fasttext.train_unsupervised(data, model='cbow')

In [None]:
# `model` is not defined until the load-model cell at the end; use the
# trained skip-gram model here.
print(model_1.words)   # words in the model's vocabulary

In [12]:
model_1['赛季'] # vector representation of the word '赛季'

array([-4.4787051e-03,  1.2972057e-03, -5.0250739e-03,  2.9631550e-03,
       -1.2483088e-03,  2.9456383e-04, -3.9305104e-04,  3.3398918e-03,
       -6.3427626e-03,  2.5978622e-03,  1.8252918e-04, -1.8181484e-03,
        5.6679072e-03, -2.6607672e-03,  4.0851950e-04, -3.4037714e-03,
        5.8991308e-03, -6.0255881e-03, -3.3152278e-03,  2.0057876e-03,
        3.5899791e-03,  5.8529209e-03, -2.5443512e-04,  8.3455059e-04,
       -3.6558888e-03, -3.8529690e-03, -1.3230797e-03,  3.0187019e-03,
        1.9300142e-03, -7.2127627e-03,  1.7569985e-03,  4.9687820e-03,
       -9.4492771e-06, -2.4825823e-03,  3.4017721e-04, -4.3410761e-03,
       -3.0617947e-03, -2.2544619e-03, -2.1202508e-03, -3.5221984e-03,
       -2.3038359e-03,  5.5500842e-03,  3.4701516e-04,  6.2678903e-03,
        4.2757262e-03,  6.1755865e-03,  2.1385388e-03, -3.4973975e-03,
        1.3128852e-03,  4.5425440e-03,  9.6920725e-05, -2.7969887e-03,
       -9.5862150e-04,  1.1946324e-03,  1.5304051e-04, -1.8393546e-03,
      

### 保存模型

In [13]:
# Save the trained skip-gram model object to disk (loading is done separately).
model_1.save_model("model\\model_skipgram.bin")

### 加载模型

In [None]:
# Load the model back from the same path it was saved to with save_model.
model = fasttext.load_model("model\\model_skipgram.bin")