# a lr_classifier for docs

dataset: http://ai.stanford.edu/~amaas/data/sentiment/  
tarを展開して使用

In [1]:
import pyprind
import pandas as pd
import os
pbar = pyprind.ProgBar(50000)
labels = {"pos":0, "neg":1}
df = pd.DataFrame()

In [7]:
# load
for s in ["train","test"]:
    for l in ["pos","neg"]:
        path ="./aclImdb/%s/%s" % (s,l)
        for file in os.listdir(path):
            with open(os.path.join(path,file),"r",encoding="utf-8") as infile:
                txt = infile.read()
                df = df.append([[txt,labels[l]]],ignore_index=True)
                pbar.update()
df.columns = ["review","sentiment"]

In [10]:
df.head()

Unnamed: 0,review,sentiment
31013,I expected this to be a lot better. I love Tim...,1
46866,I have seen it a few times and get completely ...,0
28326,"I only saw this movie once, and that was enoug...",1
42343,"i am an avid ff7 fan, for instance i have the ...",0
39952,This is my first Deepa Mehta film. I saw the f...,0


In [9]:
# ランダム化
import numpy as np
np.random.seed(0)
df = df.reindex(np.random.permutation(df.index))
df.to_csv("./movie_data.csv",index=False)

In [43]:
# BoW
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
count = CountVectorizer()
docs = np.array(["the sun is shining","The weather is sweet", 
                 "the sun is shining, the weather is sweet, and one and one is two"])
bag = count.fit_transform(docs)
print(count.vocabulary_)
print(bag.toarray())

{'and': 0, 'sweet': 5, 'two': 7, 'the': 6, 'sun': 4, 'is': 1, 'weather': 8, 'one': 2, 'shining': 3}
[[0 1 0 1 1 0 1 0 0]
 [0 1 0 0 0 1 1 0 1]
 [2 3 2 1 1 1 2 1 1]]


In [21]:
#HTMLタグおよびemorticonの処理
import re
def preprocessor(text):
    text = re.sub("<[^>]*>","",text)
    emoticons = re.findall("(?::|;|=)(?:-)?(?:\)|\(|D|P)",text)
    text = re.sub("[\W]+", " ", text.lower() + "".join(emoticons).replace("-",""))
    return text

text = df.loc[30,"review"][-500:]
print(text)
print(preprocessor(text))
df["review"] = df["review"].apply(preprocessor)

In [33]:
# tokenize
def tokenizer(text):
    return text.split()

In [44]:
# tokenize with steming
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()
def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]

In [45]:
# stopwords
# import nltk
# nltk.download("stopwords")
from nltk.corpus import stopwords
stop = stopwords.words("english")

In [47]:
# split_dataset
x_train = df.loc[:25000,"review"].values
y_train = df.loc[:25000,"sentiment"].values
x_test  = df.loc[25000:,"review"].values
y_test  = df.loc[25000:,"sentiment"].values

In [None]:
# training
# cross_validate
# param_grid_search
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(strip_accents=None,lowercase=False,preprocessor=None)
param_grid = [{"vect__ngram_range":[(1,1)],
              "vect__stop_words":[stop,None],
               "vect__tokenizer":[tokenizer,tokenizer_porter],
               "clf__penalty":["l1","l2"],
               "clf__C":[1.0,10.0,100.0]},
               {"vect__ngram_range":[(1,1)],
                "vect__stop_words":[stop,None],
                "vect__tokenizer":[tokenizer, tokenizer_porter],
                "vect__use_idf":[False],
                "vect__norm":[None],
                "clf__penalty":["l1","l2"],
                "clf__C":[1.0, 10.0, 100.0]}
              ]
lr_tfidf = Pipeline([("vect",tfidf),
                    ("clf",LogisticRegression(random_state=0))])
gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid,
                          scoring="accuracy",
                          cv=3,verbose=2,
                          n_jobs=2)
gs_lr_tfidf.fit(x_train, y_train)


In [61]:
gs_lr_tfidf.best_params_

{'clf__C': 10.0,
 'clf__penalty': 'l2',
 'vect__ngram_range': (1, 1),
 'vect__stop_words': None,
 'vect__tokenizer': <function __main__.tokenizer>}

In [64]:
gs_lr_tfidf.best_score_
clf = gs_lr_tfidf.best_estimator_
print("testSet:",clf.score(x_test,y_test))

testSet: 0.911924741052


## 大規模なデータ処理
### オンラインアルゴリズムとアウトオブコア学習

In [70]:
def stream_docs(path):
    with open(path,"r",encoding="utf-8") as csv:
        next(csv)
        for line in csv:
            text,label = line[:-3], int(line[-2])
            yield text,label

In [74]:
stream_docs(path="./movie_data.csv")

<generator object stream_docs at 0x1273abbf8>

In [90]:
def get_minibatch(doc_stream, size):
    docs, y = [], []
    try:
        for _ in range(size):
            text, label = next(doc_stream)
            docs.append(text)
            y.append(label)
    except StopIteration:
        return None,None
    return docs,y

In [96]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier
vect = HashingVectorizer(decode_error="ignore",n_features=2**21,preprocessor=None,tokenizer=tokenizer)
clf = SGDClassifier(loss="hinge",random_state=1,n_iter=1)
doc_stream = stream_docs(path="./movie_data.csv")

In [97]:
pbar = pyprind.ProgBar(45)
classes = np.asarray([0,1])
for _ in range(45):
    X_train, y_train = get_minibatch(doc_stream,size=1000)
    if not X_train:
        break
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train, y_train, classes=classes)
    pbar.update()

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:09


In [98]:
X_test,y_test = get_minibatch(doc_stream, size=5000)
X_test = vect.transform(X_test)
print("clf score:",clf.score(X_test,y_test))

clf score: 0.825


### tips
 - SGDclassifierはマルチクラスに対応している
 - clf.partial_fit(X,y,classes=classes)のとき、classes paramに全てのラベルを入れ込んでおく
 - n_jobs paramで複数CPUに分割できる