# 将机器学习模型嵌入网络应用

## 序列化拟合scikit-learn评估器

一种模型持久化的解决方案是使用python内置的pickle模块，使用该模块可以通过序列化和反序列化，将python对象结构压缩为字节码，以存储分类器的当前状态，而且当需要再次加载分类器来分类新样本时，没有必要再把需要的模型在所有训练数据上重新训练一遍。

执行前需确保已训练了核心逻辑回归模型，并在当前python会话中已做好准备。

In [5]:
import numpy as np
import re
from nltk.corpus import stopwords

stop = stopwords.words('english')
def tokenizer(text):
    text = re.sub('<[^>]*>','',text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)',
                           text.lower())
    text = re.sub('[\W]+',' ',text.lower()) \
            + ' '.join(emoticons).replace('-','')
    tokenized = [w for w in text.split() if w not in stop]
    return tokenized

def stream_docs(path):
    with open(path, 'r',encoding='utf-8') as csv:
        next(csv) # skip header
        for line in csv:
            text, label = line[:-3],int(line[-2])
            yield text,label

next(stream_docs(path='movie_data.csv'))
def get_minibatch(doc_stream,size):
    docs,y = [],[]
    try:
        for _ in range(size):
            text, label = next(doc_stream)
            docs.append(text)
            y.append(label)
    except StopIteration:
        return None, None
    return docs, y

from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier

vect = HashingVectorizer(decode_error='ignore',
                         n_features=2**21,
                         preprocessor=None,
                         tokenizer=tokenizer)
clf = SGDClassifier(loss='log',random_state=1,max_iter=1)
doc_stream = stream_docs(path='movie_data.csv')

import pyprind
pbar = pyprind.ProgBar(45)
classes = np.array([0,1])
for _ in range(45):
    X_train, y_train = get_minibatch(doc_stream,size=1000)
    if not X_train:
        break
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train,y_train,classes=classes)
    pbar.update()

X_test, y_test = get_minibatch(doc_stream,size=5000)
X_test = vect.transform(X_test)
print('Accuracy: %.3f' % clf.score(X_test,y_test))

LookupError: 
**********************************************************************
  Resource [93mstopwords[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('stopwords')
  [0m
  Attempted to load [93mcorpora/stopwords[0m

  Searched in:
    - 'C:\\Users\\lewisbase/nltk_data'
    - 'D:\\ProgramFile\\Anaconda\\nltk_data'
    - 'D:\\ProgramFile\\Anaconda\\share\\nltk_data'
    - 'D:\\ProgramFile\\Anaconda\\lib\\nltk_data'
    - 'C:\\Users\\lewisbase\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************


In [4]:
import pickle
import os
dest = os.path.join('movieclassifier','pkl_objects')
if not os.path.exists(dest):
    os.makedirs(dest)
    
pickle.dump(stop,
            open(os.path.join(dest,'stopwords.pkl'),'wb'),
            protocol=4)
pickle.dump(clf,
            open(os.path.join(dest,'classifier.pkl'),'wb'),
            protocol=4)

NameError: name 'stop' is not defined

NameError: name 'X_topics' is not defined