In [1]:
import pyprind
import pandas as pd
import os
import re
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer #建立词袋模型
from sklearn.feature_extraction.text import TfidfTransformer #把tf转成tf-idf
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier
from nltk.stem.porter import PorterStemmer #词干提取
import nltk
from nltk.corpus import stopwords

## 获取IMDb电影评论数据集
- 首先初始化一个包含50000（文档的数量）次迭代的进度条对象pbar
- 对数据集进行随机处理
- 最后把处理过得数据存储为CSV文件

In [2]:
#使用pyprind的预测剩余处理时间
pbar = pyprind.ProgBar(50000)
labels = {'pos':1,'neg':0}
df = pd.DataFrame()
for s in ('test','train'):
    for l in ('pos','neg'):
        path = 'data/aclImdb/%s/%s' % (s,l)
        for file in os.listdir(path):
            with open(os.path.join(path, file), 'r') as infile:
                txt = infile.read()
            df = df.append([[txt, labels[l]]], ignore_index=True)
            pbar.update()
        
df.columns = ['review','sentiment']

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:04:40


In [3]:
df.shape

(50000, 2)

In [4]:
df.head()

Unnamed: 0,review,sentiment
0,The Best Years of Our Life is often compared t...,1
1,This comic classic of English school girl anti...,1
2,I would like to know if anyone know how I can ...,1
3,I was five when the show made its debut in 195...,1
4,I saw this movie many years ago and it has nev...,1


In [5]:
np.random.seed(0)
df = df.reindex(np.random.permutation(df.index))
df.to_csv('data/movie_data.csv', index=False)

In [6]:
df = pd.read_csv('data/movie_data.csv')
df.head(3)

Unnamed: 0,review,sentiment
0,A funny and scathing critique of Russian socie...,1
1,Making a film for under 1 Million might be a t...,0
2,One of the lamer wedding movies you'll see. Sm...,0


In [7]:
count = CountVectorizer()
docs = np.array([
    'The sun is shining',
    'The weather is sweet',
    'The sun is shining and the weather is sweet'
])
#fit是创建词袋模型，transform转换成为稀疏的特征向量
bag = count.fit_transform(docs)

In [8]:
print(count.vocabulary_) #输出相关词汇的内容，将单个单词映射为一个整数索引

{u'and': 0, u'weather': 6, u'sweet': 4, u'sun': 3, u'is': 1, u'the': 5, u'shining': 2}


In [9]:
print(bag.toarray())

[[0 1 1 1 0 1 0]
 [0 1 0 0 1 1 1]
 [1 2 1 1 1 2 1]]


In [10]:
tfidf = TfidfTransformer()
np.set_printoptions(precision=2)
print(tfidf.fit_transform(count.fit_transform(docs)).toarray())

[[ 0.    0.43  0.56  0.56  0.    0.43  0.  ]
 [ 0.    0.43  0.    0.    0.56  0.43  0.56]
 [ 0.4   0.48  0.31  0.31  0.31  0.48  0.31]]


解读:单词is在第三个文档中只得到了一个相对较小的tf-idf(0.48)，这是由于第一和第二个文档中都包含单词is，因此它不太可能包含有用或是有辨识度的信息。

## 清洗文本数据

In [11]:
#经过重排后，数据集中第一个文档的最后50个字符
df.loc[0,'review'][-50:]

'ents since the fall of the USSR should begin here.'

**输出中包含HTML标记、标点符号以及其他非字母字符。这些字符并未包含很多有用的语义。但是有时标点符号可以在某些NLP语境中提供有用及附加信息。但是在这个例子中，我们将去除标点符号，保留标签符号，如":)"**

In [12]:
def preprocessor(text):
    text = re.sub('<[^>]*>','',text) #移除电影评论中所有的HTML标记
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text) #匹配表情符号，并保存下来
    text = re.sub('[\W]+',' ',text.lower())+''.join(emoticons).replace('-','')
    return text

**后面会把保存下来的表情符号，追加到一个文档字符串后面，因为我们当前使用的是一元组模型，所有词序并不重要。**

In [13]:
preprocessor(df.loc[0,'review'][-50:])

'ents since the fall of the ussr should begin here '

In [14]:
preprocessor("</a>This :) is :( a test :-)!")

'this is a test :):(:)'

In [15]:
#使用preprocessor函数移除DataFrame中所有的电影评论信息
df['review'] = df['review'].apply(preprocessor)

## 标记文档

In [16]:
def tokenizer(text):
    return text.split()

In [17]:
tokenizer('runners like running and thus they run')

['runners', 'like', 'running', 'and', 'thus', 'they', 'run']

In [18]:
#进行词干提取
porter = PorterStemmer()
def  tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]
tokenizer_porter('runners like running and thus they run')

[u'runner', 'like', u'run', 'and', u'thu', 'they', 'run']

词干提取会把单词回复到其原始形式

In [19]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/parallels/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [20]:
stop = stopwords.words('english')
[w for w in tokenizer_porter('a runner likes running and runs a lot')[-10:] if w not in stop]

['runner', u'like', u'run', u'run', 'lot']

## 训练用于文档分类的逻辑斯蒂回归模型
训练一个逻辑斯蒂回归模型以将电影评论划分为正面评价和负面评价

In [21]:
X_train = df.loc[:25000,'review'].values
y_train = df.loc[:25000,'sentiment'].values
X_test = df.loc[25000:,'review'].values
y_test = df.loc[25000:, 'sentiment'].values

In [22]:
tfidf = TfidfVectorizer(strip_accents=None,lowercase=False,preprocessor=None)
param_grid = [{'vect__ngram_range':[(1,1)],
              'vect__stop_words':[stop,None],
              'vect__tokenizer':[tokenizer, tokenizer_porter],
              'clf__penalty':['l1','l2'],
              'clf__C':[1.0,10.0,100.0]},
              {'vect__ngram_range':[(1,1)],
               'vect__stop_words':[stop,None],
               'vect__tokenizer':[tokenizer, tokenizer_porter],
               'vect__use_idf':[False],
               'vect__norm':[None],
               'clf__penalty':['l1','l2'],
               'clf__C':[1.0,10.0,100.0]}
             ]

In [23]:
lr_tfidf = Pipeline([('vect',tfidf),('clf',LogisticRegression(random_state=0))])

In [None]:
gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid, scoring='accuracy',cv=5,verbose=1,n_jobs=-1)

In [None]:
gs_lr_tfidf.fit(X_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed: 60.8min
Exception in thread Thread-4:
Traceback (most recent call last):
  File "/home/parallels/anaconda2/lib/python2.7/threading.py", line 801, in __bootstrap_inner
    self.run()
  File "/home/parallels/anaconda2/lib/python2.7/threading.py", line 754, in run
    self.__target(*self.__args, **self.__kwargs)
  File "/home/parallels/anaconda2/lib/python2.7/multiprocessing/pool.py", line 326, in _handle_workers
    pool._maintain_pool()
  File "/home/parallels/anaconda2/lib/python2.7/multiprocessing/pool.py", line 230, in _maintain_pool
    self._repopulate_pool()
  File "/home/parallels/anaconda2/lib/python2.7/multiprocessing/pool.py", line 223, in _repopulate_pool
    w.start()
  File "/home/parallels/anaconda2/lib/python2.7/multiprocessing/process.py", line 130, in start
    self._popen = Popen(self)
  File "/home/parallels/anaconda2/lib/python2.7/multiprocessing/forking.py", line 121, in __init__
    self.pid = os.fork()
OSError:

网格搜索返回的最佳参数设置集合为:使用不含有停用词的常规标记生成器，同时在逻辑斯蒂回归中使用tf-idf，其中逻辑斯蒂回归分类器使用L2正则化，正则化强度C=10.0

In [None]:
print('CV Accuracy: %.3f' % gs_lr_tfidf.best_score_)

In [None]:
print('Test Accuracy: %.3f' % clf.score(X_test, y_test))

## 使用大数据——在线算法与外存学习
前面使用网格搜索进行调参时，计算成本太高。花费时间太长。但是，可以通过外存学习的技术来提高性能，应对大数据。

In [None]:
#定义一个生成器函数：stream_docs，每次读取且返回一个文档的内容
def stream_docs(path):
    with open(path,'r') as csv:
        next(csv)
        for line in csv:
            text, label=line[:-3], int(line[-2])
            yield text, label

In [None]:
next(stream_docs(path='./movie_data.csv'))

In [None]:
#定义一个get_minibatch函数，以stream_doc函数得到的文档数据流作为输入，并通过参数size返回指定数量的文档内容
def get_minibatch(doc_stream, size):
    docs ,y = [],[]
    try:
        for _ in range(size):
            text, label = next(doc_stream)
            docs.append(text)
            y.append(label)
    except StopIteration:
        return None, None
    return docs,y

In [None]:
#MurmurHash3算法https://sites.google.com/site/murmurhash/,处理文本信息的向量处理器HashingVectorizer
vect = HashingVectorizer(decode_error='ignore',
                        n_features=2**21,
                        preprocessor=None,
                        tokenizer=tokenizer)
clf = SGDClassifier(loss='log',random_state=1, n_iter=1)
doc_stream = stream_docs(path='./movie_data.csv')

In [None]:
pbar = pyprind.ProgBar(45)
classes = np.array([0,1])
for _ in range(45):
    X_train,y_train = get_minibatch(doc_stream, size=1000)
    if not X_train:
        break
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train,y_train,classes=classes)
    pbar().update()

In [None]:
X_test,y_test = get_minibatch(doc_stream, size=5000)
X_test = vect.transform(X_test)
print('Accuracy: %.3f' % clf.score(X_test, y_test))

In [None]:
dest = os.path.join('moviecclassifier','pkl_objects')
if not os.path.exists(dest):
    os.makedirs(dest)
    
pickle.dump(stop,open(os.path.join(dest,'stopwords.pkl'),'wb'),protocol=4)
pickle.dump(clf,open(os.path.join(dest, 'classifier.pkl'),'wb'),protocal=4)