In [1]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
# Build a bag-of-words model with the CountVectorizer class.
# The three toy documents are declared first, then turned into a
# document-term count matrix.
docs = np.array([
    'The sun is shining',
    'The weather is sweet',
    'The sun is shining, the weather is sweet, and one and one is two',
])
count = CountVectorizer()
bag = count.fit_transform(docs)

In [2]:
# Print the learned vocabulary: a dict mapping each term to its column index.
print(count.vocabulary_)

{'sun': 4, 'and': 0, 'is': 1, 'sweet': 5, 'two': 7, 'shining': 3, 'one': 2, 'weather': 8, 'the': 6}


In [3]:
# Print the dense term-count matrix: one row per document, with columns
# ordered by the indices in count.vocabulary_.
print(bag.toarray())

[[0 1 0 1 1 0 1 0 0]
 [0 1 0 0 0 1 1 0 1]
 [2 3 2 1 1 1 2 1 1]]


In [4]:
from sklearn.feature_extraction.text import TfidfTransformer
np.set_printoptions(precision=2)
count = CountVectorizer()
docs = np.array([
    'The sun is shining',
    'The weather is sweet',
    'The sun is shining, the weather is sweet, and one and one is two'])
# Raw term counts (one row per document); computed once and reused below
# instead of fitting the vectorizer a second time.
bag = count.fit_transform(docs)
# TF (term frequency): how many times term t occurs in document d.
# DF (document frequency): how many documents contain term t.
# TF-IDF = TF * IDF. With smooth_idf=True scikit-learn uses
#     idf(t) = ln((1 + n_docs) / (1 + df(t))) + 1
# and norm='l2' then scales every document row to unit Euclidean length.
tfidf = TfidfTransformer(use_idf=True, norm='l2', smooth_idf=True)
print(tfidf.fit_transform(bag).toarray())

[[ 0.    0.43  0.    0.56  0.56  0.    0.43  0.    0.  ]
 [ 0.    0.43  0.    0.    0.    0.56  0.43  0.    0.56]
 [ 0.5   0.45  0.5   0.19  0.19  0.19  0.3   0.25  0.19]]


In [5]:
# Manually compute the TF-IDF of the term "is" in the last document.
# TF: number of times the term occurs in the document.
# DF: number of documents that contain the term.
# Smoothed IDF = log((1 + number of documents) / (1 + DF)); the +1 in the
# numerator and the denominator prevents a division by zero.
# TF-IDF = TF * (IDF + 1)
n_docs = 3
tf_is = 3
doc_freq_is = 3  # "is" appears in every one of the three documents
smoothed_ratio = (n_docs + 1) / (doc_freq_is + 1)
idf_is = np.log(smoothed_ratio)
tfidf_is = tf_is * (idf_is + 1)
print('tf-idf of term "is" = %.2f' % tfidf_is)

tf-idf of term "is" = 3.00


In [6]:
# The earlier TF-IDF values differ from the manual calculation because they
# were L2-normalized. With normalization turned off (norm=None), the entry
# for "is" matches the manually computed value.
tfidf = TfidfTransformer(norm=None, use_idf=True, smooth_idf=True)
term_counts = count.fit_transform(docs)
# Keep only the last document's row of unnormalized TF-IDF weights.
raw_tfidf = tfidf.fit_transform(term_counts).toarray()[-1]
raw_tfidf

array([ 3.39,  3.  ,  3.39,  1.29,  1.29,  1.29,  2.  ,  1.69,  1.29])

In [7]:
# L2 normalization divides the vector by its Euclidean length
# (the square root of the sum of its squared entries).
squared_sum = np.sum(raw_tfidf ** 2)
l2_tfidf = raw_tfidf / np.sqrt(squared_sum)
l2_tfidf

array([ 0.5 ,  0.45,  0.5 ,  0.19,  0.19,  0.19,  0.3 ,  0.25,  0.19])

In [8]:
import pandas as pd
# Load the movie-review data from a CSV file in the working directory.
df = pd.read_csv('./movie_data.csv')
# Show the last 50 characters of the 'review' value in the row labelled 0.
df.loc[0, 'review'][-50:]

"lease stick it out to the end. It's well worth it!"

In [9]:
import re
def preprocessor(text):
    """Clean one raw review string for bag-of-words tokenization.

    Steps:
      1. Remove HTML-style tags (``<...>``).
      2. Collect emoticons such as ``:)``, ``;-(``, ``=D`` or ``:P`` from the
         tag-free text, before punctuation is removed.
      3. Lowercase the text and collapse every run of non-word characters
         into a single space.
      4. Append the emoticons, with any ``-`` nose removed, to the end.

    Args:
        text: The raw document as a ``str``.

    Returns:
        The cleaned, lowercased text followed by the space-separated
        emoticons found in it.
    """
    # Raw strings keep the regex escapes (backslash-paren, backslash-W) away
    # from Python's own string-escape processing, which warns on them.
    text = re.sub(r'<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    cleaned = re.sub(r'[\W]+', ' ', text.lower())
    emoticon_suffix = ' '.join(emoticons).replace('-', '')
    # When the text ends in a word character there is no trailing space, so
    # insert one; otherwise the first emoticon would fuse with the last word.
    if emoticon_suffix and not cleaned.endswith(' '):
        cleaned += ' '
    return cleaned + emoticon_suffix

In [10]:
# Sanity check: clean the same 50-character review tail shown earlier.
preprocessor(df.loc[0, 'review'][-50:])

'lease stick it out to the end it s well worth it '

In [11]:
# Sanity check: the tag is removed and the emoticons are moved to the end.
preprocessor("</a>This :) is :( a test :-)!")

'this is a test :) :( :)'

In [12]:
# Apply the cleaning function to every review, overwriting the 'review'
# column in place.
df['review'] = df['review'].apply(preprocessor)

In [13]:
# Word stemming: reduce each word to its stem.
from nltk.stem.porter import PorterStemmer

# Shared Porter stemmer instance used by tokenizer_porter.
porter = PorterStemmer()

# Plain tokenizer: split the text on whitespace.
def tokenizer(text):
    """Return the whitespace-separated tokens of ``text`` as a list."""
    tokens = text.split()
    return tokens

# Tokenizer that splits on whitespace and stems each word.
def tokenizer_porter(text):
    """Split ``text`` on whitespace and return the Porter stem of each word."""
    stems = []
    for word in text.split():
        stems.append(porter.stem(word))
    return stems

In [14]:
# Example: plain whitespace tokenization keeps every word form unchanged.
tokenizer('runners like running and thus they run')

['runners', 'like', 'running', 'and', 'thus', 'they', 'run']

In [15]:
# Example: Porter stemming maps 'runners' -> 'runner' and 'running' -> 'run',
# but it also truncates 'thus' to 'thu'.
tokenizer_porter('runners like running and thus they run')

['runner', 'like', 'run', 'and', 'thu', 'they', 'run']

In [16]:
# Download the stop-word lists provided by the nltk library.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/kent/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [17]:
# Stem and tokenize a sentence, then drop English stop words.
from nltk.corpus import stopwords
stop = stopwords.words('english')
stemmed_tokens = tokenizer_porter('a runner likes running and runs a lot')[-10:]
[token for token in stemmed_tokens if token not in stop]

['runner', 'like', 'run', 'run', 'lot']

In [18]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV


stop = stopwords.words('english')
tfidf = TfidfVectorizer(strip_accents=None, lowercase=False, preprocessor=None)
# Two sub-grids are searched:
#   1. TF-IDF features with the vectorizer's default idf weighting and norm.
#   2. Raw term-frequency features (use_idf=False, norm=None).
# Both vary stop-word removal, the tokenizer, the penalty type and C.
param_grid = [{'vect__ngram_range': [(1, 1)],
               'vect__stop_words': [stop, None],
               'vect__tokenizer': [tokenizer, tokenizer_porter],
               'clf__penalty': ['l1', 'l2'],
               'clf__C': [1.0, 10.0, 100.0]},
              {'vect__ngram_range': [(1, 1)],
               'vect__stop_words': [stop, None],
               'vect__tokenizer': [tokenizer, tokenizer_porter],
               'vect__use_idf':[False],
               'vect__norm':[None],
               'clf__penalty': ['l1', 'l2'],
               'clf__C': [1.0, 10.0, 100.0]},
              ]
# Logistic regression is the classifier. The solver is pinned to 'liblinear'
# because the grid searches both 'l1' and 'l2' penalties, and scikit-learn
# releases whose default solver is 'lbfgs' reject penalty='l1'.
lr_tfidf = Pipeline([('vect', tfidf),
                     ('clf', LogisticRegression(random_state=0,
                                                solver='liblinear'))])
# 5-fold cross-validation scored by accuracy; n_jobs=-1 runs fits in parallel.
gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid, scoring='accuracy', cv=5, verbose=1, n_jobs=-1)

In [19]:
# Split by position: the first 25,000 rows train, the remaining rows test.
# .iloc slices are end-exclusive, whereas the label-based .loc[:25000]
# includes the row labelled 25000, which would then sit in both sets.
X_train = df['review'].iloc[:25000].values
y_train = df['sentiment'].iloc[:25000].values
X_test = df['review'].iloc[25000:].values
y_test = df['sentiment'].iloc[25000:].values

In [20]:
# Run the full grid search. This is slow: the recorded run below performed
# 240 fits and took about 225 minutes.
gs_lr_tfidf.fit(X_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 37.0min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 178.5min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed: 224.9min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=False, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
 ...nalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid=[{'clf__C': [1.0, 10.0, 100.0], 'vect__ngram_range': [(1, 1)], 'vect__tokenizer': [<function tokenizer at 0x7fb39f4de840>, <function tokenizer_porter at 0x7fb39f4de8c8>], 'vect__stop_words': [['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll"...n't", 'won', "won't", 'wouldn', "wouldn't"], None], 'vect__norm': [None], 'vect__use_idf': [False]}],
       pre_dispatch='2*n_jobs', refit=True, return_tr

In [22]:
# Report the best parameter combination and its cross-validation accuracy.
print('Best parameter set: %s ' % gs_lr_tfidf.best_params_)
print('CV Accuracy: %.3f' % gs_lr_tfidf.best_score_)

Best parameter set: {'clf__C': 10.0, 'vect__ngram_range': (1, 1), 'clf__penalty': 'l2', 'vect__stop_words': None, 'vect__tokenizer': <function tokenizer_porter at 0x7fb39f4de8c8>} 
CV Accuracy: 0.894


In [23]:
# Finally, evaluate the best estimator on the held-out test data.
# (The estimator is only scored here; it is not trained on the test set.)
clf = gs_lr_tfidf.best_estimator_
print('Test Accuracy: %.3f' % clf.score(X_test, y_test))

Test Accuracy: 0.893
