In [30]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from plotly import graph_objects as go

import matplotlib
import plotly
import sklearn
import re

from IPython.display import display
from time import time

print("package版本信息：")
print("numpy:      ", np.__version__)
print("pandas:     ", pd.__version__)
print("matplotlib: ", matplotlib.__version__)
print("sklearn:    ", sklearn.__version__)
print("seaborn:    ", sns.__version__)
print("plotly:     ", plotly.__version__)

package版本信息：
numpy:       1.18.1
pandas:      1.0.1
matplotlib:  3.1.3
sklearn:     0.22.1
seaborn:     0.10.0
plotly:      4.14.1


# 前言

文本处理是属于NLP下的一个范畴，涉及面很广，这里只关注其中的两类任务：
1. 文本聚类
2. 文本分类

上述两类任务都需要对文本数据进行处理以提取特征，以下介绍文本处理过程中的一些通用流程和概念。

在文本分析的语境中，数据集通常被称为语料库（corpus），每个由单个文本表示的数据点被称为文档（ document）。

+ 词袋表示法  
舍弃输入文本中的大部分结构，如章节、段落、句子和格式，**只计算语料库中每个单词在每个文本中的出现频次** .  
通常计算步骤分为如下 3 个：
  1. 分词（tokenization）.将每个文档划分为出现在其中的单词（称为词例token），比如按空格和标点划分.
  2. 构建词表（vocabulary building）。收集一个词表，里面包含出现在任意文档中的所有词，并对它们进行编号（比如按字母顺序排序）.
  3. 编码（encoding）。对于每个文档，计算词表中每个单词在该文档中的出现频次.
  
  
  
+ Tf-idf表示法



+ 此外，对于单词处理，特别是分布（Tokenization）这一步，还可以做额外的处理.  
词表中通常同时包含某些单词的单数形式和复数形式，这个问题可以通过**用词干（word stem）表示每个单词来解决**，通常有两种方式：
  1. 基于规则的启发法，通常将其称为**词干提取（stemming）**
  2. 使用由已知单词形式组成的字典（明确的且经过人工验证的系统），通常称为**词形还原（lemmatization）**
  
  
完成上面这些步骤，通常有两个包可以使用：
+ NLTK
+ Spacy

# NLTK使用

NLTK的API结构参考这里 [NLTK Python Module Index](https://www.nltk.org/py-modindex.html).


1. 分词.  
NLTK提供了`nltk.tokenize`这个模块，主要是如下两个分词**函数**（不是类）
  + `nltk.tokenize.sent_tokenize(text, language='english')`  
用于分割句子
  + `nltk.tokenize.word_tokenize(text, language='english', preserve_line=False)`  
用于分割单词


2. 词干提取.  
NLTK提供了`nltk.stem`这个模块.
  + `nltk.stem.porter.PorterStemmer()`类，封装了Porter提取算法
  + `nltk.stem.snowball`模块里提供了非英语类词干的提取方法
  + `nltk.stem.wordnet.WordNetLemmatizer()`类，提供了词形还原的算法

In [1]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [2]:
mytext = "Hello Adam, how are you? I hope everything is going well. Today is a good day, see you dude."

In [3]:
sent_tokenize(mytext)

['Hello Adam, how are you?',
 'I hope everything is going well.',
 'Today is a good day, see you dude.']

In [4]:
word_tokenize(mytext)

['Hello',
 'Adam',
 ',',
 'how',
 'are',
 'you',
 '?',
 'I',
 'hope',
 'everything',
 'is',
 'going',
 'well',
 '.',
 'Today',
 'is',
 'a',
 'good',
 'day',
 ',',
 'see',
 'you',
 'dude',
 '.']

In [7]:
from nltk.stem import PorterStemmer

porter_stemmer = PorterStemmer()

print(porter_stemmer.stem('working'))
print(porter_stemmer.stem('works'))
print(porter_stemmer.stem('worked'))

work
work
work


In [10]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

print(lemmatizer.lemmatize('increases'))
print(lemmatizer.lemmatize('increased'))
print(lemmatizer.lemmatize('was'))

increase
increased
wa


# spaCy使用

spaCy的使用和NLTK完全不同，NLTK大体上还是面向过程的调用函数方式来完成任务，而spaCy是采用的pipeline方式完成.  

spaCy的使用步骤如下：
1. 载入语言模型.  
每个语言模型里，基于所选的语言，封装了一系列对文件进行处理的Pipeline.
```python
nlp = spacy.load('en_core_web_sm')
```
2. 将需要处理的文本传递给语言模型，生成一个`Doc`对象——它包含了一系列对文本进行处理的步骤，封装成一个Pipeline，其中第一步就是分词
```python
doc = nlp("Text to be process")
```
3. 调用`Doc`对象的各种方法，获取不同的内容.

In [71]:
import spacy
# 首先必须载入一个语言模型
nlp = spacy.load('en_core_web_sm')
nlp.__class__

spacy.lang.en.English

In [72]:
# 检查当前pipeline里的步骤名称
nlp.pipe_names

['tagger', 'parser', 'ner']

In [14]:
# 获取Doc对象
text = "Apple is looking at buying U.K. startup for $1 billion"
doc = nlp(text)
doc.__class__

spacy.tokens.doc.Doc

In [18]:
print("token.text", "token.lemma_")
for token in doc:
    print(token.text,  token.lemma_)

token.text token.lemma_
Apple Apple
is be
looking look
at at
buying buy
U.K. U.K.
startup startup
for for
$ $
1 1
billion billion


---

# Gensim使用

这里使用gensim生成一份词向量。

----

# 文本数据集

这里有 3 份数据集，分别是
1. IMDB电影评分数据集
2. sklearn自带的新闻数据集
3. 《python数据挖掘入门》里文本处理使用的tweet数据集

## IMDB电影评分数据集

## Sklearn自带的数据集

In [2]:
from sklearn.datasets import fetch_20newsgroups

In [7]:
cd ..

D:\Project-Workspace\Python-Projects\DataAnalysis


In [8]:
# 这里指定了数据集下载的文件夹，第一次使用时，会下载相关的数据集到这个文件夹里
data = fetch_20newsgroups(data_home='datasets/sklearn-news-data/', subset='all')
# print(data.__class__)

# 显示新闻数据集里文章有哪些分类
# display(data.target_names)

In [9]:
# 这里只需要部分分类的数据集
categories = [
    'sci.space',
    'rec.sport.baseball',
    'talk.politics.mideast',
    'talk.politics.guns'
]

# 训练集
train = fetch_20newsgroups(data_home='datasets/sklearn-news-data/', subset='train', categories=categories)
# 测试集
test = fetch_20newsgroups(data_home='datasets/sklearn-news-data/', subset='test', categories=categories)

# 查看数据集大小
print(len(train.data), train.target.shape)
print(len(test.data), test.target.shape)

2300 (2300,)
1531 (1531,)


In [10]:
train.data.__class__

list

In [11]:
train.data[0].__class__

str

In [12]:
# 随便看一下里面的一篇文章
train.data[1]

'From: tclock@orion.oac.uci.edu (Tim Clock)\nSubject: Re: Final Solution for Gaza ?\nNntp-Posting-Host: orion.oac.uci.edu\nOrganization: University of California, Irvine\nLines: 66\n\nIn article <1483500354@igc.apc.org> Center for Policy Research <cpr@igc.apc.org> writes:\n>\n>From: Center for Policy Research <cpr>\n>Subject: Final Solution for Gaza ?\n>\n>While Israeli Jews fete the uprising of the Warsaw ghetto,\n\n"fete"??? Since this word both formally and commonly refers to\npositive/joyous events, your misuse of it here is rather unsettling.\n \n>they repress by violent means the uprising of the Gaza ghetto \n>and attempt to starve the Gazans.\n\nI certainly abhor those Israeli policies and attitudes that are\nabusive towards the Palestinians/Gazans. Given that, however, there \n*is no comparison* between the reality of the Warsaw Ghetto and in \nGaza.  \n>\n>The right of the Gazan population to resist occupation is\n>recognized in international law and by any person with a sense

In [15]:
# 看看类别是否均衡
pd.Series(train.target).value_counts()

0    597
1    593
3    564
2    546
dtype: int64

# 朴素贝叶斯文本分类示例

In [19]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer

import spacy
# 首先必须载入一个语言模型
nlp = spacy.load('en_core_web_sm')

In [20]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [90]:
# 先取一部分子集看下效果
part = train.data[:10]

cnt_vec = CountVectorizer()
cnt_vec.fit(part)

part_ = pd.DataFrame(cnt_vec.transform(part).toarray(), columns=cnt_vec.get_feature_names())

In [21]:
# 使用spaCy对文件进行处理
# 这里使用spaCy的词形还原，但是分词器使用的是scikit-learn里提供的正则表达式
regexp = re.compile('(?u)\\b\\w\\w+\\b')
en_nlp = spacy.load('en_core_web_sm')
old_tokenizer = en_nlp.tokenizer
en_nlp.tokenizer = lambda string: old_tokenizer.tokens_from_list(regexp.findall(string))

# 自定义的分词函数
def custom_tokenizer(document):
    doc_spacy = en_nlp(document)
    return [token.lemma_ for token in doc_spacy]

In [99]:
lemma_vect = CountVectorizer(tokenizer=custom_tokenizer, min_df=5, max_df=0.95)
# lemma_vect.fit(part)
# part_ = pd.DataFrame(cnt_vec.transform(part).toarray(), columns=cnt_vec.get_feature_names())
lemma_vect.fit(train.data)

from spacy.tokens import Doc
doc = Doc(nlp.vocab, words=[...])
  


CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=0.95, max_features=None, min_df=5,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=<function custom_tokenizer at 0x000001F16CC768B8>,
                vocabulary=None)

In [100]:
train_token = lemma_vect.transform(train.data)
test_token = lemma_vect.transform(test.data)
train_y = train.target
test_y = test.target

from spacy.tokens import Doc
doc = Doc(nlp.vocab, words=[...])
  


In [102]:
train_token.shape

(2300, 8448)

In [104]:
test_token.shape

(1531, 8448)

In [101]:
from sklearn.naive_bayes import BernoulliNB, MultinomialNB

In [113]:
berNB = BernoulliNB()
multiNB = MultinomialNB()

berNB.fit(train_token, train_y)
multiNB.fit(train_token, train_y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [114]:
berNB_score = berNB.score(test_token, test_y)
multiNB_score = multiNB.score(test_token, test_y)

In [115]:
berNB_score

0.8883082952318746

In [117]:
multiNB_score

0.9738732854343566

In [118]:
berNB_y_pred = berNB.predict(test_token)
multiNB_y_pred = multiNB.predict(test_token)

In [119]:
from sklearn.metrics import accuracy_score

In [120]:
berNB_pred_score = accuracy_score(y_true=test_y, y_pred=berNB_y_pred)
multiNB_pred_score = accuracy_score(y_true=test_y, y_pred=multiNB_y_pred)

In [121]:
berNB_pred_score

0.8883082952318746

In [122]:
multiNB_pred_score

0.9738732854343566

----

# Word2Vec

In [1]:
import jieba
import re
from gensim.models import word2vec

In [22]:
# 使用spaCy对文件进行处理
import spacy
# 首先必须载入一个语言模型
nlp = spacy.load('en_core_web_sm')
# 这里使用spaCy的词形还原，但是分词器使用的是scikit-learn里提供的正则表达式
regexp = re.compile('(?u)\\b\\w\\w+\\b')
en_nlp = spacy.load('en_core_web_sm')
old_tokenizer = en_nlp.tokenizer
en_nlp.tokenizer = lambda string: old_tokenizer.tokens_from_list(regexp.findall(string))

# 自定义的分词函数
def custom_tokenizer(document):
    doc_spacy = en_nlp(document)
    return [token.lemma_ for token in doc_spacy]

In [26]:
# t = train.data[1]
# t_res = custom_tokenizer(t)

train_proc = [custom_tokenizer(doc) for doc in train.data]

from spacy.tokens import Doc
doc = Doc(nlp.vocab, words=[...])
  if __name__ == '__main__':


In [33]:
# %time
# 使用gensim训练模型
t1 = time()
model = word2vec.Word2Vec(train_proc,iter=50, size=100, window=3, sg=0, workers=4)
t2 = time()

print("runing time is :{:.2f}".format(t2-t1))

runing time is :11.42


In [37]:
# %time
# 使用gensim训练模型
t1 = time()
model = word2vec.Word2Vec(train_proc,iter=50, size=100, window=3, sg=0, workers=8)
t2 = time()

print("runing time is :{:.2f}".format(t2-t1))

runing time is :10.94


In [38]:
model

<gensim.models.word2vec.Word2Vec at 0x1e6237ec688>

In [42]:
model.wv

<gensim.models.keyedvectors.Word2VecKeyedVectors at 0x1e628df6d08>

In [41]:
'policy' in model.wv

True

In [47]:
model.wv['policy']

array([ 1.46654677e+00, -1.99778438e+00,  1.26974404e+00,  5.50841391e-01,
       -6.28954530e-01,  1.28450203e+00, -7.91259587e-01, -4.42545146e-01,
       -2.63815850e-01,  2.10831928e+00, -1.98553884e+00, -1.15344435e-01,
       -1.62778163e+00,  1.45624906e-01, -3.31313372e-01,  2.18228966e-01,
        1.20822871e+00, -1.04821241e+00, -2.85565615e+00, -2.82280177e-01,
       -1.79006910e+00, -2.54633689e+00,  1.30262002e-01,  1.86022115e+00,
        3.36442888e-01, -6.46334291e-01,  2.42829466e+00, -2.52690744e+00,
        2.36002254e+00,  6.97122872e-01, -4.39212620e-01,  9.54245448e-01,
        2.59631556e-02, -1.55718148e+00, -1.44868529e+00, -1.58921421e+00,
        1.29590106e+00, -4.33103770e-01,  1.45411468e+00,  2.47827798e-01,
       -1.26484489e+00,  1.31447625e+00,  1.01077892e-01, -4.03619200e-01,
        1.58041000e+00,  1.13859832e+00,  9.37864125e-01,  4.93863732e-01,
       -8.31795692e-01,  8.17711473e-01, -9.16759610e-01,  5.99537566e-02,
       -3.73655498e-01, -

In [48]:
model.wv.most_similar(positive='policy')

[('entity', 0.505190372467041),
 ('terrorism', 0.48177653551101685),
 ('regime', 0.45322495698928833),
 ('action', 0.4518165588378906),
 ('goverment', 0.4459748864173889),
 ('gazan', 0.4422627091407776),
 ('government', 0.4344480633735657),
 ('existance', 0.429462730884552),
 ('philosophy', 0.42827075719833374),
 ('Nazis', 0.4210732877254486)]

------

以下是别人的代码

In [2]:
#读取停用词
stop_words = []
with open("data\stop_words.txt", "r", encoding="utf-8") as f_reader:
    for line in f_reader:
        line = line.replace("\r","").replace("\n","").strip()
        stop_words.append(line)
print(len(stop_words))
stop_words = set(stop_words)
print(len(stop_words))

1893
1892


In [3]:
#文本预处理
sentecnces = []
rules = u"[\u4e00-\u9fa5]+"
pattern = re.compile(rules)
f_writer = open("data\分词后的天龙八部.txt", "w", encoding="utf-8")

with open("data\天龙八部.txt", "r" , encoding="utf-8") as f_reader:
    for line in f_reader:
        line = line.replace("\r","").replace("\n","").strip()
        if line == "" or line is None:
            continue
        line = " ".join(jieba.cut(line))
        seg_list = pattern.findall(line)
        word_list = []
        for word in seg_list:
            if word not in stop_words:
                word_list.append(word)
        if len(word_list) > 0:
            sentecnces.append(word_list)
            line = " ".join(word_list)
            f_writer.write(line + "\n")
            f_writer.flush()
f_writer.close()

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\bruce\AppData\Local\Temp\jieba.cache
Loading model cost 0.707 seconds.
Prefix dict has been built succesfully.


In [4]:
print(sentecnces[:10])

[['书名', '天龙八部'], ['作者', '金庸'], ['本文', '早安', '电子书', '网友', '分享', '版权', '原作者'], ['用于', '商业行为', '后果自负'], ['早安', '电子书'], ['金庸', '作品集', '三联', '版', '序'], ['小学', '时', '爱读', '课外书', '低年级', '时看', '儿童', '画报', '小朋友', '小学生', '内容', '小朋友', '文库', '似懂非懂', '阅读', '各种各样', '章回小说', '五六年', '级', '时', '看新', '文艺作品', '喜爱', '古典文学', '作品', '多于', '近代', '当代', '新文学', '个性', '使然', '朋友', '喜欢', '新文学', '不爱', '古典文学'], ['知识', '当代', '书报', '中', '寻求', '小学', '时代', '得益', '记忆', '最深', '爸爸', '哥哥', '购置', '邹韬奋', '所撰', '萍踪', '寄语', '萍踪', '忆语', '世界各地', '旅行', '记', '主编', '生活', '周报', '新', '旧', '童年时代', '深受', '邹先生', '生活', '书店', '之惠', '生活', '书店', '三联书店', '组成部分', '十多年', '前', '香港三联书店', '签', '合同', '中国', '大陆', '地区', '出版', '小说', '因事', '未果', '重', '行', '筹划', '三联书店', '独家', '出版', '中国', '大陆', '地区', '简体字', '感到', '欣慰', '回忆', '昔日', '心中', '充满', '温馨', '之意'], ['撰写', '这套', '总数', '三十六', '册', '作品集', '是从', '一九五五年', '七二年', '约', '十三', '四年', '包括', '十二部', '长篇小说', '两篇', '中篇小说', '一篇', '短篇小说', '一篇', '历史', '人物', '评传', '若干篇', '历史', '考据', '文字', '出版', '过程', '奇怪', '香港', '台湾',

In [25]:
#模型训练
model = word2vec.Word2Vec(sentecnces,iter=50, size=100, window=3, sg=0)

In [26]:
#选出10个与乔峰最相近的10个词
for e in model.most_similar(positive=["乔峰"],topn=10):
    print(e[0],e[1])

白世镜 0.46209681034088135
宋长老 0.4431273341178894
徐长老 0.40213707089424133
玄慈 0.3699125647544861
乔大爷 0.3584541380405426
鲍千灵 0.3493095338344574
谭公 0.3465454578399658
努儿海 0.3400297462940216
马夫人 0.33992263674736023
陈长老 0.3371778130531311


In [32]:
#加载预料
sentences2 = word2vec.Text8Corpus("data\分词后的天龙八部.txt")

In [33]:
print(sentences2)

<gensim.models.word2vec.Text8Corpus object at 0x000002203F199DA0>


In [68]:
#训练模型
model = word2vec.Word2Vec(sentences2)

In [69]:
#选出10个与乔峰最相近的10个词
for e in model.most_similar(positive=["乔峰"],topn=10):
    print(e[0],e[1])

阿紫道 0.996263861656189
姊夫 0.9961894750595093
冷冷的 0.9957616329193115
找 0.995625376701355
两位 0.9950875639915466
赵钱孙 0.9950235486030579
全冠清 0.9950124025344849
乔帮主 0.9949069023132324
姊姊 0.9948595762252808
叹 0.9947936534881592


In [70]:
#保存模型
model.save("data/天龙八部.model")

In [71]:
#加载模型
model2 = word2vec.Word2Vec.load("data/天龙八部.model")

In [72]:
#选出10个与乔峰最相近的10个词
for e in model2.most_similar(positive=["乔峰"],topn=10):
    print(e[0],e[1])

阿紫道 0.996263861656189
姊夫 0.9961894750595093
冷冷的 0.9957616329193115
找 0.995625376701355
两位 0.9950875639915466
赵钱孙 0.9950235486030579
全冠清 0.9950124025344849
乔帮主 0.9949069023132324
姊姊 0.9948595762252808
叹 0.9947936534881592


In [73]:
#计算两个词语的相似度
sim_value = model.similarity('乔峰','萧峰')
print(sim_value)

0.983976449658


In [74]:
#计算两个集合的相似度
list1 = ['乔峰','萧远山']
list2= ['慕容复','慕容博']
sim_value = model.n_similarity(list1,list2)
print(sim_value)

0.988961361533


In [80]:
#选出集合中不同类型的词语
list3 = ['段誉','阿紫','王语嫣','丁春秋']
print(model.doesnt_match(list3))

丁春秋


In [81]:
#查看词向量值
print(type(model['乔峰']))

<class 'numpy.ndarray'>


In [82]:
print(len(model['乔峰']))

100


In [83]:
print(model['乔峰'])

[-0.0956201   0.83770102 -0.50264329  0.17705031 -0.03397365  0.53726691
 -0.94604319 -0.62564951 -0.16479155 -0.56092721 -0.05749961  0.19491424
  0.23317778  0.69680882 -0.01325463  0.49103716 -0.2274354  -0.27408433
  0.30029318  0.30257019  0.38569745 -0.24391353 -0.20851924 -1.03797174
  0.1151455  -0.12836897 -0.25457156 -0.21057677 -0.59543329 -0.41599837
 -0.13542509  0.23070075 -0.0045849  -0.12233413  0.35966769  0.30875343
 -0.55712724 -0.05722067 -0.20944791  0.12286058 -0.49387416  0.26767832
 -0.03382351 -0.09347658  0.6634863   0.25116423 -0.17000762 -0.41160905
  0.26862589 -0.48492855 -0.2278533   0.45218593 -0.15033394  0.17578831
 -0.58941668 -0.0460459   0.04037135  0.27327055 -0.22823675  0.2203647
  0.05365429 -0.04600301 -0.7688188   0.13671628 -0.53027683 -0.1469509
  1.03760362  0.12982909 -0.22781004  0.54549789  0.05604199 -0.42564315
 -0.06961453 -0.16930862 -0.73733568 -0.40141526  0.83430684 -0.35497716
  0.45108098  0.59830898  0.56303334  0.40351576  0.0

----

# BERT