In [22]:
import pyprind
import pandas as pd
import os
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer #建立词袋模型
from sklearn.feature_extraction.text import TfidfTransformer #把tf转成tf-idf

## 获取IMDb电影评论数据集
- 首先初始化一个包含50000（文档的数量）次迭代的进度条对象pbar
- 对数据集进行随机重排（打乱样本顺序）
- 最后把处理过的数据存储为CSV文件

In [11]:
# Read every IMDb review file into a single DataFrame with two columns:
# 'review' (the raw text) and 'sentiment' (1 = positive, 0 = negative).
# pyprind's progress bar estimates the remaining processing time; it is
# sized for the 50000 documents in the dataset.
pbar = pyprind.ProgBar(50000)
labels = {'pos': 1, 'neg': 0}

# Collect rows in a plain list and build the DataFrame once at the end.
# Appending to a DataFrame inside the loop copies the whole frame on every
# iteration (quadratic time), and DataFrame.append no longer exists in
# pandas >= 2.0.
rows = []
for split in ('test', 'train'):
    for label in ('pos', 'neg'):
        path = 'data/aclImdb/%s/%s' % (split, label)
        for fname in os.listdir(path):
            # NOTE(review): the file is opened with the interpreter's default
            # text handling; confirm the reviews decode correctly on the
            # target platform.
            with open(os.path.join(path, fname), 'r') as infile:
                txt = infile.read()
            rows.append([txt, labels[label]])
            pbar.update()

df = pd.DataFrame(rows, columns=['review', 'sentiment'])

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:04:18


In [13]:
# Show the dimensions of the assembled DataFrame as (rows, columns).
df.shape

(50000, 2)

In [14]:
# Preview the first rows of the review/sentiment table.
df.head()

Unnamed: 0,review,sentiment
0,The Best Years of Our Life is often compared t...,1
1,This comic classic of English school girl anti...,1
2,I would like to know if anyone know how I can ...,1
3,I was five when the show made its debut in 195...,1
4,I saw this movie many years ago and it has nev...,1


In [15]:
# Shuffle the rows with a fixed seed so the permutation is repeatable,
# then persist the shuffled table (without the index column) as CSV.
np.random.seed(0)
shuffled_index = np.random.permutation(df.index)
df = df.reindex(shuffled_index)
df.to_csv('data/movie_data.csv', index=False)

In [16]:
# Read the dataset back from data/movie_data.csv and preview its first three rows.
df = pd.read_csv('data/movie_data.csv')
df.head(3)

Unnamed: 0,review,sentiment
0,A funny and scathing critique of Russian socie...,1
1,Making a film for under 1 Million might be a t...,0
2,One of the lamer wedding movies you'll see. Sm...,0


In [18]:
# Three toy documents used to illustrate the bag-of-words model.
docs = np.array([
    'The sun is shining',
    'The weather is sweet',
    'The sun is shining and the weather is sweet'
])

count = CountVectorizer()
# fit builds the bag-of-words vocabulary; transform converts the documents
# into sparse feature vectors. fit_transform does both in one call.
bag = count.fit_transform(docs)

In [19]:
print(count.vocabulary_) # print the learned vocabulary: each individual word is mapped to an integer index

{u'and': 0, u'weather': 6, u'sweet': 4, u'sun': 3, u'is': 1, u'the': 5, u'shining': 2}


In [20]:
# Dense view of the count matrix: one row per document, one column per vocabulary index.
print(bag.toarray())

[[0 1 1 1 0 1 0]
 [0 1 0 0 1 1 1]
 [1 2 1 1 1 2 1]]


In [23]:
# Limit NumPy's printed floats to two decimals so the matrix stays readable.
np.set_printoptions(precision=2)

# Re-weight the raw term counts of the toy documents into tf-idf values
# and print the result as a dense array.
tfidf = TfidfTransformer()
term_counts = count.fit_transform(docs)
tfidf_matrix = tfidf.fit_transform(term_counts)
print(tfidf_matrix.toarray())

[[ 0.    0.43  0.56  0.56  0.    0.43  0.  ]
 [ 0.    0.43  0.    0.    0.56  0.43  0.56]
 [ 0.4   0.48  0.31  0.31  0.31  0.48  0.31]]


解读:单词is在第三个文档中的词频最高（出现了2次），但转换为tf-idf之后，它得到的权重(0.48)相对于其词频而言较小。这是由于第一和第二个文档中也都包含单词is，它的逆文档频率最低，因此它不太可能包含有用或是有辨识度的信息。