In [80]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from plotly import graph_objects as go

import matplotlib
import plotly
import sklearn
import re

from IPython.display import display

print("package版本信息：")
print("numpy:      ", np.__version__)
print("pandas:     ", pd.__version__)
print("matplotlib: ", matplotlib.__version__)
print("sklearn:    ", sklearn.__version__)
print("seaborn:    ", sns.__version__)
print("plotly:     ", plotly.__version__)

package版本信息：
numpy:       1.18.1
pandas:      1.0.1
matplotlib:  3.1.3
sklearn:     0.22.1
seaborn:     0.10.0
plotly:      4.14.1


# 前言

文本处理是属于NLP下的一个范畴，涉及面很广，这里只关注其中的两类任务：
1. 文本聚类
2. 文本分类

上述两类任务都需要对文本数据进行处理以提取特征，以下介绍文本处理过程中的一些通用流程和概念。

在文本分析的语境中，数据集通常被称为语料库（corpus），每个由单个文本表示的数据点被称为文档（ document）。

+ 词袋表示法  
舍弃输入文本中的大部分结构，如章节、段落、句子和格式，**只计算语料库中每个单词在每个文本中的出现频次** .  
通常计算步骤分为如下 3 个：
  1. 分词（tokenization）.将每个文档划分为出现在其中的单词（称为词例token），比如按空格和标点划分.
  2. 构建词表（vocabulary building）。收集一个词表，里面包含出现在任意文档中的所有词，并对它们进行编号（比如按字母顺序排序）.
  3. 编码（encoding）。对于每个文档，计算词表中每个单词在该文档中的出现频次.
  
  
  
+ Tf-idf表示法



+ 此外，对于单词处理，特别是分布（Tokenization）这一步，还可以做额外的处理.  
词表中通常同时包含某些单词的单数形式和复数形式，这个问题可以通过**用词干（word stem）表示每个单词来解决**，通常有两种方式：
  1. 基于规则的启发法，通常将其称为**词干提取（stemming）**
  2. 使用由已知单词形式组成的字典（明确的且经过人工验证的系统），通常称为**词形还原（lemmatization）**
  
  
完成上面这些步骤，通常有两个包可以使用：
+ NLTK
+ Spacy

# NLTK使用

NLTK的API结构参考这里 [NLTK Python Module Index](https://www.nltk.org/py-modindex.html).


1. 分词.  
NLTK提供了`nltk.tokenize`这个模块，主要是如下两个分词**函数**（不是类）
  + `nltk.tokenize.sent_tokenize(text, language='english')`  
用于分割句子
  + `nltk.tokenize.word_tokenize(text, language='english', preserve_line=False)`  
用于分割单词


2. 词干提取.  
NLTK提供了`nltk.stem`这个模块.
  + `nltk.stem.porter.PorterStemmer()`类，封装了Porter提取算法
  + `nltk.stem.snowball`模块里提供了非英语类词干的提取方法
  + `nltk.stem.wordnet.WordNetLemmatizer()`类，提供了词形还原的算法

In [1]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [2]:
mytext = "Hello Adam, how are you? I hope everything is going well. Today is a good day, see you dude."

In [3]:
sent_tokenize(mytext)

['Hello Adam, how are you?',
 'I hope everything is going well.',
 'Today is a good day, see you dude.']

In [4]:
word_tokenize(mytext)

['Hello',
 'Adam',
 ',',
 'how',
 'are',
 'you',
 '?',
 'I',
 'hope',
 'everything',
 'is',
 'going',
 'well',
 '.',
 'Today',
 'is',
 'a',
 'good',
 'day',
 ',',
 'see',
 'you',
 'dude',
 '.']

In [7]:
from nltk.stem import PorterStemmer

porter_stemmer = PorterStemmer()

print(porter_stemmer.stem('working'))
print(porter_stemmer.stem('works'))
print(porter_stemmer.stem('worked'))

work
work
work


In [10]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

print(lemmatizer.lemmatize('increases'))
print(lemmatizer.lemmatize('increased'))
print(lemmatizer.lemmatize('was'))

increase
increased
wa


# spaCy使用

spaCy的使用和NLTK完全不同，NLTK大体上还是面向过程的调用函数方式来完成任务，而spaCy是采用的pipeline方式完成.  

spaCy的使用步骤如下：
1. 载入语言模型.  
每个语言模型里，基于所选的语言，封装了一系列对文件进行处理的Pipeline.
```python
nlp = spacy.load('en_core_web_sm')
```
2. 将需要处理的文本传递给语言模型，生成一个`Doc`对象——它包含了一系列对文本进行处理的步骤，封装成一个Pipeline，其中第一步就是分词
```python
doc = nlp("Text to be process")
```
3. 调用`Doc`对象的各种方法，获取不同的内容.

In [71]:
import spacy
# 首先必须载入一个语言模型
nlp = spacy.load('en_core_web_sm')
nlp.__class__

spacy.lang.en.English

In [72]:
# 检查当前pipeline里的步骤名称
nlp.pipe_names

['tagger', 'parser', 'ner']

In [14]:
# 获取Doc对象
text = "Apple is looking at buying U.K. startup for $1 billion"
doc = nlp(text)
doc.__class__

spacy.tokens.doc.Doc

In [18]:
print("token.text", "token.lemma_")
for token in doc:
    print(token.text,  token.lemma_)

token.text token.lemma_
Apple Apple
is be
looking look
at at
buying buy
U.K. U.K.
startup startup
for for
$ $
1 1
billion billion


----

# 文本数据集

这里有 3 份数据集，分别是
1. IMDB电影评分数据集
2. sklearn自带的新闻数据集
3. 《python数据挖掘入门》里文本处理使用的tweet数据集

## IMDB电影评分数据集

## sklearn自带的数据集

In [31]:
from sklearn.datasets import fetch_20newsgroups

In [32]:
# 这里指定了数据集下载的文件夹，第一次使用时，会下载相关的数据集到这个文件夹里
data = fetch_20newsgroups(data_home='datasets/sklearn-news-data/', subset='all')
# print(data.__class__)

# 显示新闻数据集里文章有哪些分类
# display(data.target_names)

In [33]:
# 这里只需要部分分类的数据集
categories = [
    'sci.space',
    'rec.sport.baseball',
    'talk.politics.mideast',
    'talk.politics.guns'
]

# 训练集
train = fetch_20newsgroups(data_home='datasets/sklearn-news-data/', subset='train', categories=categories)
# 测试集
test = fetch_20newsgroups(data_home='datasets/sklearn-news-data/', subset='test', categories=categories)

# 查看数据集大小
print(len(train.data), train.target.shape)
print(len(test.data), test.target.shape)

2300 (2300,)
1531 (1531,)


In [36]:
train.data.__class__

list

In [37]:
train.data[0].__class__

str

In [25]:
# 随便看一下里面的一篇文章
train.data[1]

'From: tclock@orion.oac.uci.edu (Tim Clock)\nSubject: Re: Final Solution for Gaza ?\nNntp-Posting-Host: orion.oac.uci.edu\nOrganization: University of California, Irvine\nLines: 66\n\nIn article <1483500354@igc.apc.org> Center for Policy Research <cpr@igc.apc.org> writes:\n>\n>From: Center for Policy Research <cpr>\n>Subject: Final Solution for Gaza ?\n>\n>While Israeli Jews fete the uprising of the Warsaw ghetto,\n\n"fete"??? Since this word both formally and commonly refers to\npositive/joyous events, your misuse of it here is rather unsettling.\n \n>they repress by violent means the uprising of the Gaza ghetto \n>and attempt to starve the Gazans.\n\nI certainly abhor those Israeli policies and attitudes that are\nabusive towards the Palestinians/Gazans. Given that, however, there \n*is no comparison* between the reality of the Warsaw Ghetto and in \nGaza.  \n>\n>The right of the Gazan population to resist occupation is\n>recognized in international law and by any person with a sense

In [34]:
# 看看类别是否均衡
pd.Series(train.target).value_counts()

0    597
1    593
3    564
2    546
dtype: int64

# 朴素贝叶斯文本分类示例

In [38]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer

In [89]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [90]:
# 先取一部分子集看下效果
part = train.data[:10]

cnt_vec = CountVectorizer()
cnt_vec.fit(part)

part_ = pd.DataFrame(cnt_vec.transform(part).toarray(), columns=cnt_vec.get_feature_names())

In [93]:
# 使用spaCy对文件进行处理
# 这里使用spaCy的词形还原，但是分词器使用的是scikit-learn里提供的正则表达式
regexp = re.compile('(?u)\\b\\w\\w+\\b')
en_nlp = spacy.load('en_core_web_sm')
old_tokenizer = en_nlp.tokenizer
en_nlp.tokenizer = lambda string: old_tokenizer.tokens_from_list(regexp.findall(string))

# 自定义的分词函数
def custom_tokenizer(document):
    doc_spacy = en_nlp(document)
    return [token.lemma_ for token in doc_spacy]

In [99]:
lemma_vect = CountVectorizer(tokenizer=custom_tokenizer, min_df=5, max_df=0.95)
# lemma_vect.fit(part)
# part_ = pd.DataFrame(cnt_vec.transform(part).toarray(), columns=cnt_vec.get_feature_names())
lemma_vect.fit(train.data)

from spacy.tokens import Doc
doc = Doc(nlp.vocab, words=[...])
  


CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=0.95, max_features=None, min_df=5,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=<function custom_tokenizer at 0x000001F16CC768B8>,
                vocabulary=None)

In [100]:
train_token = lemma_vect.transform(train.data)
test_token = lemma_vect.transform(test.data)
train_y = train.target
test_y = test.target

from spacy.tokens import Doc
doc = Doc(nlp.vocab, words=[...])
  


In [102]:
train_token.shape

(2300, 8448)

In [104]:
test_token.shape

(1531, 8448)

In [101]:
from sklearn.naive_bayes import BernoulliNB, MultinomialNB

In [113]:
berNB = BernoulliNB()
multiNB = MultinomialNB()

berNB.fit(train_token, train_y)
multiNB.fit(train_token, train_y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [114]:
berNB_score = berNB.score(test_token, test_y)
multiNB_score = multiNB.score(test_token, test_y)

In [115]:
berNB_score

0.8883082952318746

In [117]:
multiNB_score

0.9738732854343566

In [118]:
berNB_y_pred = berNB.predict(test_token)
multiNB_y_pred = multiNB.predict(test_token)

In [119]:
from sklearn.metrics import accuracy_score

In [120]:
berNB_pred_score = accuracy_score(y_true=test_y, y_pred=berNB_y_pred)
multiNB_pred_score = accuracy_score(y_true=test_y, y_pred=multiNB_y_pred)

In [121]:
berNB_pred_score

0.8883082952318746

In [122]:
multiNB_pred_score

0.9738732854343566