# 处理文本数据

`
创建于20230313 创建者Stephen CUI
`

In [48]:
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import numpy as np

## 用字符串表示的数据类型

## 示例应用：电影评论的情感分析

In [3]:
reviews_train = load_files('Data/train/')
text_train, y_train = reviews_train.data, reviews_train.target
print('type of text_train: {}'.format(type(text_train)))
print('length of text_train: {}'.format(len(text_train)))
print('text_train[1]: \n {}'.format(text_train[1]))

type of text_train: <class 'list'>
length of text_train: 25000
text_train[1]: 
 b'Words can\'t describe how bad this movie is. I can\'t explain it by writing only. You have too see it for yourself to get at grip of how horrible a movie really can be. Not that I recommend you to do that. There are so many clich\xc3\xa9s, mistakes (and all other negative things you can imagine) here that will just make you cry. To start with the technical first, there are a LOT of mistakes regarding the airplane. I won\'t list them here, but just mention the coloring of the plane. They didn\'t even manage to show an airliner in the colors of a fictional airline, but instead used a 747 painted in the original Boeing livery. Very bad. The plot is stupid and has been done many times before, only much, much better. There are so many ridiculous moments here that i lost count of it really early. Also, I was on the bad guys\' side all the time in the movie, because the good guys were so stupid. "Executive Decis

In [5]:
text_train = [doc.replace(b'<br />', b' ') for doc in text_train]
print('Samples per class (training): {}'.format(np.bincount(y_train)))

Samples per class (training): [12500 12500]


In [11]:
reviews_test = load_files('Data/test/')
text_test, y_test = reviews_test.data, reviews_test.target
text_test = [doc.replace(b'<br />', b' ') for doc in text_test]
print('Samples per class (test): {}'.format(np.bincount(y_test)))
print('length of text test: {}'.format(len(text_test)))

Samples per class (test): [12500 12500]
length of text test: 25000


## 将文本数据表示为词袋

### 将词袋应用于测试数据集

In [7]:
bards_words =["The fool doth think he is wise,", 
              "but the wise man knows himself to be a fool"]

In [8]:
vect = CountVectorizer()
vect.fit(bards_words)

In [9]:
print("Vocabulary size: {}".format(len(vect.vocabulary_)))
print("Vocabulary content:\n {}".format(vect.vocabulary_))

Vocabulary size: 13
Vocabulary content:
 {'the': 9, 'fool': 3, 'doth': 2, 'think': 10, 'he': 4, 'is': 6, 'wise': 12, 'but': 1, 'man': 8, 'knows': 7, 'himself': 5, 'to': 11, 'be': 0}


In [14]:
bag_of_words = vect.transform(bards_words)
print('bag_of_words: {}'.format(repr(bag_of_words)))

bag_of_words: <2x13 sparse matrix of type '<class 'numpy.int64'>'
	with 16 stored elements in Compressed Sparse Row format>


In [13]:
print('Dense representation of bag_of_words:\n{}'.format(
    bag_of_words.toarray()
))

Dense representation of bag_of_words:
[[0 0 1 1 1 0 1 0 0 1 1 0 1]
 [1 1 0 1 0 1 0 1 1 1 0 1 1]]


### 将词袋应用于电影评论

In [15]:
vect = CountVectorizer().fit(text_train)
X_train = vect.transform(text_train)
X_train

<25000x74849 sparse matrix of type '<class 'numpy.int64'>'
	with 3431196 stored elements in Compressed Sparse Row format>

In [24]:
feature_names = vect.get_feature_names_out()
print('Number of features: {}'.format(len(feature_names)))
print('First 20 features: \n{}'.format(feature_names[:20]))
print('Features 20010 to 20030: \n{}'.format(feature_names[20010: 20030]))
print('Every 2000th feature: \n{}'.format(feature_names[::2000]))

Number of features: 74849
First 20 features: 
['00' '000' '0000000000001' '00001' '00015' '000s' '001' '003830' '006'
 '007' '0079' '0080' '0083' '0093638' '00am' '00pm' '00s' '01' '01pm' '02']
Features 20010 to 20030: 
['dratted' 'draub' 'draught' 'draughts' 'draughtswoman' 'draw' 'drawback'
 'drawbacks' 'drawer' 'drawers' 'drawing' 'drawings' 'drawl' 'drawled'
 'drawling' 'drawn' 'draws' 'draza' 'dre' 'drea']
Every 2000th feature: 
['00' 'aesir' 'aquarian' 'barking' 'blustering' 'bête' 'chicanery'
 'condensing' 'cunning' 'detox' 'draper' 'enshrined' 'favorit' 'freezer'
 'goldman' 'hasan' 'huitieme' 'intelligible' 'kantrowitz' 'lawful' 'maars'
 'megalunged' 'mostey' 'norrland' 'padilla' 'pincher' 'promisingly'
 'receptionist' 'rivals' 'schnaas' 'shunning' 'sparse' 'subset'
 'temptations' 'treatises' 'unproven' 'walkman' 'xylophonist']


In [26]:
scores = cross_val_score(LogisticRegression(max_iter=1_000), X_train, y_train)
round(np.mean(scores), 2)

In [38]:
param_grid = {'C': [.001, .01, .1, 1, 10]}
grid = GridSearchCV(LogisticRegression(max_iter=2_000), param_grid, cv=5, n_jobs=-1)
grid.fit(X_train, y_train)

In [39]:
print('Best cross-validation score: {:.2f}'.format(grid.best_score_))
print('Best parameters: {}'.format(grid.best_params_))

Best cross-validation score: 0.89
Best parameters: {'C': 0.1}


In [40]:
X_test = vect.transform(text_test)
grid.score(X_test, y_test)

0.87888

In [41]:
vect = CountVectorizer(min_df=5).fit(text_train)
X_train = vect.transform(text_train)
print('X_train with min_df: {}'.format(repr(X_train)))

X_train with min_df: <25000x27271 sparse matrix of type '<class 'numpy.int64'>'
	with 3354014 stored elements in Compressed Sparse Row format>


In [42]:
feature_names = vect.get_feature_names_out()
print('First 50 features: \n{}'.format(feature_names[:50]))
print('Features 20010 to 20030: \n{}'.format(feature_names[20010: 20030]))
print('Every 700th feature: \n{}'.format(feature_names[::700]))

First 50 features: 
['00' '000' '007' '00s' '01' '02' '03' '04' '05' '06' '07' '08' '09' '10'
 '100' '1000' '100th' '101' '102' '103' '104' '105' '107' '108' '10s'
 '10th' '11' '110' '112' '116' '117' '11th' '12' '120' '12th' '13' '135'
 '13th' '14' '140' '14th' '15' '150' '15th' '16' '160' '1600' '16mm' '16s'
 '16th']
Features 20010 to 20030: 
['repentance' 'repercussions' 'repertoire' 'repetition' 'repetitions'
 'repetitious' 'repetitive' 'rephrase' 'replace' 'replaced' 'replacement'
 'replaces' 'replacing' 'replay' 'replayable' 'replayed' 'replaying'
 'replays' 'replete' 'replica']
Every 700th feature: 
['00' 'affections' 'appropriately' 'barbra' 'blurbs' 'butchered' 'cheese'
 'commitment' 'courts' 'deconstructed' 'disgraceful' 'dvds' 'eschews'
 'fell' 'freezer' 'goriest' 'hauser' 'hungary' 'insinuate' 'juggle'
 'leering' 'maelstrom' 'messiah' 'music' 'occasional' 'parking'
 'pleasantville' 'pronunciation' 'recipient' 'reviews' 'sas' 'shea'
 'sneers' 'steiger' 'swastika' 'thrusting'

In [46]:
grid = GridSearchCV(LogisticRegression(max_iter=1_000), param_grid=param_grid, cv=5, n_jobs=-1)
grid.fit(X_train, y_train)

In [47]:
print('Best score-validation score: {:.3f}'.format(grid.best_score_))

Best score-validation score: 0.888


## 停用词

In [49]:
len(ENGLISH_STOP_WORDS)

318

In [52]:
list(ENGLISH_STOP_WORDS)[::10]

['etc',
 'their',
 'found',
 'were',
 'back',
 'go',
 'ever',
 'well',
 'you',
 'beyond',
 'its',
 'here',
 'latterly',
 'least',
 'whereby',
 'she',
 'beside',
 'among',
 'wherever',
 'mine',
 'hundred',
 'twelve',
 'across',
 'in',
 'anyone',
 'or',
 'ourselves',
 'down',
 'however',
 'please',
 'give',
 'am']

In [53]:
vect = CountVectorizer(min_df=5, stop_words='english').fit(text_train)
X_train = vect.transform(text_train)
print('X_train with stop words:\n{}'.format(repr(X_train)))

X_train with stop words:
<25000x26966 sparse matrix of type '<class 'numpy.int64'>'
	with 2149958 stored elements in Compressed Sparse Row format>


In [55]:
grid = GridSearchCV(LogisticRegression(max_iter=1_000), param_grid, cv=5, n_jobs=-1)
grid.fit(X_train, y_train)
print('Best cross-validation score: {:.2f}'.format(grid.best_score_))

Best cross-validation score: 0.88


使用停用词后的网格搜索性能略有下降——不至于担心，但鉴于从 27 000 多个特征中删
除 305 个不太可能对性能或可解释性造成很大影响，所以使用这个列表似乎是不值得的。
固定的列表主要对小型数据集很有帮助，这些数据集可能没有包含足够的信息，模型从数
据本身无法判断出哪些单词是停用词。

作为练习，你可以尝试另一种方法，即通过设置
`CountVectorizer` 的 `max_df` 选项来舍弃出现最频繁的单词，并查看它对特征数量和性能有
什么影响。

In [62]:
vect_practice = CountVectorizer(min_df=5, stop_words='english', max_df=50).fit(text_train)
X_train_practice = vect_practice.transform(text_train)
print('X_train with stop words:\n{}'.format(repr(X_train_practice)))
grid_practice = GridSearchCV(LogisticRegression(max_iter=1_000), param_grid, cv=5, n_jobs=-1)
grid_practice.fit(X_train_practice, y_train)
print('Best cross-validation score: {:.2f}'.format(grid_practice.best_score_))

X_train with stop words:
<25000x20879 sparse matrix of type '<class 'numpy.int64'>'
	with 324302 stored elements in Compressed Sparse Row format>
Best cross-validation score: 0.75


## 用tf-idf缩放数据

## 研究模型系数

## 多个单词的词袋（n元分词）

## 高级分词、词干提取与词形还原

## 主题建模与文档聚类

### 隐含狄利克雷分布