In [1]:
import numpy as np
import re
from nltk.corpus import stopwords

def tokenizer(text):
    """Split a raw review into lowercase word tokens plus emoticons.

    Processing steps:

    1. HTML tags are removed.
    2. Emoticons such as ``:)``, ``;-(``, ``=D`` or ``:P`` are collected from
       the original-case text and their ``-`` "nose" is dropped.
    3. The text is lowercased and every run of non-word characters is
       collapsed into a single space.
    4. The emoticons are appended after the words, and every token found in
       the module-level ``stop`` collection is discarded.

    Parameters
    ----------
    text : str
        Raw document text, possibly containing HTML markup.

    Returns
    -------
    list of str
        Remaining word tokens in document order, followed by the emoticons.

    Notes
    -----
    ``stop`` is resolved when the function is *called*, so it must have been
    defined by then; otherwise a ``NameError`` is raised.
    """
    text = re.sub(r'<[^>]*>', '', text)
    # Search the original-case text: the pattern contains upper-case ``D`` and
    # ``P``, which can never match once the text has been lowercased.
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    words = re.sub(r'[\W]+', ' ', text.lower())
    # Join with an explicit space; without it the first emoticon is glued onto
    # the last word whenever the cleaned text ends in a word character.
    text = words + ' ' + ' '.join(emoticons).replace('-', '')
    return [w for w in text.split() if w not in stop]


def stream_docs(path):
    """Lazily yield ``(text, label)`` pairs from a review CSV file.

    The first line is treated as a header and skipped. Each remaining line is
    split at its *last* comma: everything before it is returned verbatim as
    the text (surrounding quotes and doubled quotes are not unescaped) and
    everything after it is parsed as the integer label.

    Parameters
    ----------
    path : str
        Location of a UTF-8 encoded file with one ``<text>,<label>`` record
        per line.

    Yields
    ------
    tuple of (str, int)
        The raw text field and its integer class label.

    Raises
    ------
    ValueError
        If the part after the last comma of a line is not an integer.
    """
    with open(path, 'r', encoding='utf-8') as handle:
        # Skip the header. The default keeps an empty file from leaking
        # StopIteration out of the generator (a RuntimeError under PEP 479).
        next(handle, None)
        for line in handle:
            record = line.rstrip('\n')
            if not record:
                continue  # ignore blank lines, e.g. a trailing empty line
            # Split on the last comma instead of slicing fixed offsets, so a
            # final line without a trailing newline is parsed correctly.
            text, _, label = record.rpartition(',')
            yield text, int(label)

In [2]:
# Sanity check: pull the first (text, label) pair from a throwaway generator.
# The stream used for training is created separately further down.
next(stream_docs(path='./movie_data.csv'))

('"I absolutely adore this movie! I had never heard of it when I saw it at the video store. I saw Kathy Bates was in it, so I figured it had to have some worth, you know? I watched it the first time just shaking my head . . . huh? Then it was the last scene and I found myself aching from smiling so hard. I clicked ""play movie"" and watched the whole thing again. It is without doubt the quirkiest movie I\'ve ever seen. But the more I watch it, the more I love it. It\'s absurd and crazy and sweet and dear. Kathy Bates is impeccable, but the rest of the cast is fabulous, too. What odd characters they all are! The midget is just too funny for words. And Julie Andrews and Barry Manilow are hysterical. It\'s just an all around funny, fabulous movie. I get cravings to see it again. Whoever is watching it for the first time, please stick it out to the end. It\'s well worth it!"',
 1)

In [3]:
def get_minibatch(doc_stream, size):
    """Pull up to ``size`` documents from an iterator of ``(text, label)``.

    Parameters
    ----------
    doc_stream : iterator
        Iterator yielding ``(text, label)`` pairs; it is advanced with
        ``next`` and therefore consumed by this call.
    size : int
        Maximum number of documents to collect.

    Returns
    -------
    tuple
        ``(docs, y)`` as two parallel lists. If the stream runs out part-way
        through, the shorter batch collected so far is returned so that no
        documents are lost. ``(None, None)`` is returned only when the stream
        was already exhausted and nothing could be read.
    """
    docs, y = [], []
    try:
        for _ in range(size):
            text, label = next(doc_stream)
            docs.append(text)
            y.append(label)
    except StopIteration:
        # Keep a partial final batch; only signal "no more data" when this
        # call could not read a single document.
        if not docs:
            return None, None
    return docs, y

In [4]:
import sklearn
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier

# Hash-based vectorizer that tokenizes with the custom tokenizer() above.
vect = HashingVectorizer(decode_error='ignore',
                         n_features=2**21,
                         preprocessor=None,
                         tokenizer=tokenizer)

# scikit-learn 1.1 renamed the logistic-regression loss from 'log' to
# 'log_loss' and 1.3 removed the old spelling. Pick whichever name the
# installed version accepts so the cell runs on old and new releases alike.
_sk_major, _sk_minor = (int(part) for part in sklearn.__version__.split('.')[:2])
logistic_loss = 'log_loss' if (_sk_major, _sk_minor) >= (1, 1) else 'log'

# max_iter=1: each partial_fit call later makes a single pass over its batch.
clf = SGDClassifier(loss=logistic_loss, random_state=1, max_iter=1)

# Single shared stream: the training loop and the evaluation cell both consume
# from it, so the evaluation documents are the ones left after training.
doc_stream = stream_docs(path='./movie_data.csv')

In [5]:
from nltk.corpus import stopwords
# tokenizer() tests every token against `stop`; a set makes each membership
# check constant-time instead of a linear scan over the stop-word list.
stop = set(stopwords.words('english'))

In [6]:
import pyprind
# Out-of-core training: stream fixed-size mini-batches from disk and update
# the classifier incrementally, one partial_fit call per batch.
n_batches = 45
batch_size = 1000

classes = np.array([0, 1])
pbar = pyprind.ProgBar(n_batches)

for _ in range(n_batches):
    X_train, y_train = get_minibatch(doc_stream, size=batch_size)
    if not X_train:
        # No batch was returned, so stop early.
        break
    X_train = vect.transform(X_train)
    # The full label set is passed on every call because a single batch is
    # not guaranteed to contain both classes.
    clf.partial_fit(X_train, y_train, classes=classes)
    pbar.update()

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:01:05


In [7]:
# Evaluate on the next 5000 documents of the stream, i.e. those the training
# loop above did not consume.
test_docs, y_test = get_minibatch(doc_stream, size=5000)
X_test = vect.transform(test_docs)
accuracy = clf.score(X_test, y_test)
print(f'Accuracy: {accuracy:.3f}')

Accuracy: 0.879


In [9]:
# Finally, train on the test data as well, so the model has seen every
# document. The accuracy printed above was measured before this update.
clf = clf.partial_fit(X_test, y_test)