In [1]:
from IPython.display import Image
from IPython.core.display import HTML 

import warnings
warnings.filterwarnings('ignore')

## [DSO] Machine Learning Seminar Vol.7
2020-08-xx  
SKUE

In [2]:
Image(url="https://image.yodobashi.com/product/100/000/009/002/935/105/100000009002935105_10204.jpg")

+ STEP1
 + https://ai.stanford.edu/~amaas/data/sentiment/  
 からダウンロードして、ファイルを適当なフォルダに格納する。
 
+ STEP2  
 + そのディレクトリに移動して、ターミナルで  
 tar -zxf aclImdb_v1.tar.gz  
 を実行する。


In [7]:
import tqdm
import pandas as pd
import os

basepath ='../../../../Documents/dataset/aclImdb/'
labels = { 'pos':1, 'neg':0}

df = pd.DataFrame()

for s in  ('test', 'train'):
    for l in ('pos', 'neg'):
        path = os.path.join(basepath, s, l)
        for file in tqdm.tqdm( os.listdir(path) ):
            with open(os.path.join(path, file), 'r', encoding='utf-8') as infile:
                txt = infile.read()
            df = df.append([[txt, labels[l]]], ignore_index=True)
            
df.columns = ['review', 'sentiment']

100%|██████████| 12500/12500 [00:25<00:00, 491.15it/s]
100%|██████████| 12500/12500 [00:31<00:00, 391.16it/s]
100%|██████████| 12500/12500 [00:36<00:00, 346.05it/s]
100%|██████████| 12500/12500 [00:39<00:00, 316.99it/s]


In [8]:
import numpy as np
np.random.seed(0)

df = df.reindex(np.random.permutation(df.index))

In [11]:
df.to_csv('movie_data.csv', index=False, encoding='utf-8')

In [12]:
df = pd.read_csv('movie_data.csv', encoding='utf-8')
df.head(3)

Unnamed: 0,review,sentiment
0,My family and I normally do not watch local mo...,1
1,"Believe it or not, this was at one time the wo...",0
2,"After some internet surfing, I found the ""Home...",0


## BoWモデル

### 単語を特徴ベクトルに変換

In [17]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer()
docs = np.array([
                                'The sun is shining',
                                'The weather is sweet',
                                'The sun is shining, the weather is sweet, and one and one is two'
                            ])
bag = count.fit_transform(docs)

In [18]:
bag

<3x9 sparse matrix of type '<class 'numpy.int64'>'
	with 17 stored elements in Compressed Sparse Row format>

In [19]:
print(count.vocabulary_)

{'the': 6, 'sun': 4, 'is': 1, 'shining': 3, 'weather': 8, 'sweet': 5, 'and': 0, 'one': 2, 'two': 7}


In [20]:
print(bag.toarray())

[[0 1 0 1 1 0 1 0 0]
 [0 1 0 0 0 1 1 0 1]
 [2 3 2 1 1 1 2 1 1]]


### TF-IDFを使って単語の関連性を評価

In [21]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf = TfidfTransformer(use_idf = True, norm='l2', smooth_idf=True)
np.set_printoptions(precision=2)
print(tfidf.fit_transform(count.fit_transform(docs)).toarray())

[[0.   0.43 0.   0.56 0.56 0.   0.43 0.   0.  ]
 [0.   0.43 0.   0.   0.   0.56 0.43 0.   0.56]
 [0.5  0.45 0.5  0.19 0.19 0.19 0.3  0.25 0.19]]


### テキストデータのクレンジング

In [27]:
df.loc[0, 'review']

'My family and I normally do not watch local movies for the simple reason that they are poorly made, they lack the depth, and just not worth our time.<br /><br />The trailer of "Nasaan ka man" caught my attention, my daughter in law\'s and daughter\'s so we took time out to watch it this afternoon. The movie exceeded our expectations. The cinematography was very good, the story beautiful and the acting awesome. Jericho Rosales was really very good, so\'s Claudine Barretto. The fact that I despised Diether Ocampo proves he was effective at his role. I have never been this touched, moved and affected by a local movie before. Imagine a cynic like me dabbing my eyes at the end of the movie? Congratulations to Star Cinema!! Way to go, Jericho and Claudine!!'

In [26]:
import re
def preprocessor(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    text = (re.sub('[\W]+', ' ', text.lower()) + ''.join(emoticons).replace('-', '') )
    return text

In [28]:
preprocessor(df.loc[0, 'review'])

'my family and i normally do not watch local movies for the simple reason that they are poorly made they lack the depth and just not worth our time the trailer of nasaan ka man caught my attention my daughter in law s and daughter s so we took time out to watch it this afternoon the movie exceeded our expectations the cinematography was very good the story beautiful and the acting awesome jericho rosales was really very good so s claudine barretto the fact that i despised diether ocampo proves he was effective at his role i have never been this touched moved and affected by a local movie before imagine a cynic like me dabbing my eyes at the end of the movie congratulations to star cinema way to go jericho and claudine '

In [29]:
preprocessor("</a>This :) is :( a test :-)!")

'this is a test :):(:)'

In [30]:
df['review'] = df['review'].apply(preprocessor)

### 文書をトークン化する

In [32]:
def tokenizer(text):
    return text.split()

tokenizer('runners like running and thus they run')

['runners', 'like', 'running', 'and', 'thus', 'they', 'run']

In [37]:
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()
def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]

tokenizer_porter('runners like running and thus they run')

['runner', 'like', 'run', 'and', 'thu', 'they', 'run']

In [33]:
import nltk

In [38]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/Sakaue_2/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [39]:
from nltk.corpus import stopwords
stop = stopwords.words('english')
[w for w in tokenizer_porter('a runner likes running and runs a lot')[-10:] if w not in stop ]

['runner', 'like', 'run', 'run', 'lot']

### 文書を分類するロジスティック回帰モデルのトレーニング

In [41]:
df.head()

Unnamed: 0,review,sentiment
0,my family and i normally do not watch local mo...,1
1,believe it or not this was at one time the wor...,0
2,after some internet surfing i found the homefr...,0
3,one of the most unheralded great works of anim...,1
4,it was the sixties and anyone with long hair a...,0


In [40]:
X_train = df.loc[:25000, 'review'].values
y_train = df.loc[:25000, 'sentiment'].values

X_test = df.loc[25000:, 'review'].values
y_test = df.loc[25000:, 'sentiment'].values

In [43]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(strip_accents=None,
                                       lowercase=False,
                                       preprocessor=None)

param_grid = [{'vect__ngram_range':[(1,1)],
                            'vect__stop_words':[stop, None],
                            'vect__tokenizer':[tokenizer, tokenizer_porter],
                            'clf__penalty':['l1', 'l2'],
                            'clf__C':[1.0, 10.0, 100.0]},
                          {'vect__ngram_range':[(1,1)],
                            'vect__stop_words':[stop, None],
                            'vect__tokenizer':[tokenizer, tokenizer_porter],
                            'vect__use_idf':[False],
                            'vect__norm':[None],
                            'clf__penalty':['l1', 'l2'],
                            'clf__C':[1.0, 10.0, 100.0]}]

lr_tfidf = Pipeline([('vect', tfidf),
                                 ('clf', LogisticRegression(random_state=0))])

gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid,
                                                  scoring='accuracy',
                                                  cv=5,
                                                  verbose=1,
                                                  n_jobs=-1)

gs_lr_tfidf.fit(X_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  8.2min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed: 48.4min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed: 63.7min finished


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('vect',
                                        TfidfVectorizer(lowercase=False)),
                                       ('clf',
                                        LogisticRegression(random_state=0))]),
             n_jobs=-1,
             param_grid=[{'clf__C': [1.0, 10.0, 100.0],
                          'clf__penalty': ['l1', 'l2'],
                          'vect__ngram_range': [(1, 1)],
                          'vect__stop_words': [['i', 'me', 'my', 'myself', 'we',
                                                'our', 'ours', 'ourselves',
                                                'you', "you're", "you've",
                                                "you'll", "you'd",...
                                                'our', 'ours', 'ourselves',
                                                'you', "you're", "you've",
                                                "you'll", "you'd", 'your',
           

63分かかった。

In [44]:
print('Best parameter set: %s ' % gs_lr_tfidf.best_params_)

Best parameter set: {'clf__C': 10.0, 'clf__penalty': 'l2', 'vect__ngram_range': (1, 1), 'vect__stop_words': None, 'vect__tokenizer': <function tokenizer at 0x11def5ae8>} 


In [45]:
import pickle

# モデルを保存する
filename = 'gs_lr_tfidf.sav'
pickle.dump(gs_lr_tfidf, open(filename, 'wb'))

# 保存したモデルをロードする
loaded_model = pickle.load(open(filename, 'rb'))


# vectorizerを保存する
filename_tfidf = 'tfidf.sav'
pickle.dump(tfidf, open(filename_tfidf, 'wb'))

# 保存したvectorizerをロードする
loaded_tfidf = pickle.load(open(filename_tfidf, 'rb'))

## オンラインアルゴリズムとアウトオブコア学習

In [60]:
import numpy as np
import re

from nltk.corpus import stopwords
stop = stopwords.words('english')
def tokenizer(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
    text = re.sub('[\W]+', ' ', text.lower()) + '  '.join(emoticons).replace('-', '')
    tokenized = [w for w in text.split() if w not in stop ]
    return tokenized

In [61]:
def stream_docs(path):
    with open(path, 'r', encoding='utf-8') as csv:
        next(csv) # ヘッダーを読み飛ばす
        for line in csv:
            text, label = line[:-3], int(line[-2])
            yield text, label

In [62]:
next(stream_docs(path='movie_data.csv'))

('"My family and I normally do not watch local movies for the simple reason that they are poorly made, they lack the depth, and just not worth our time.<br /><br />The trailer of ""Nasaan ka man"" caught my attention, my daughter in law\'s and daughter\'s so we took time out to watch it this afternoon. The movie exceeded our expectations. The cinematography was very good, the story beautiful and the acting awesome. Jericho Rosales was really very good, so\'s Claudine Barretto. The fact that I despised Diether Ocampo proves he was effective at his role. I have never been this touched, moved and affected by a local movie before. Imagine a cynic like me dabbing my eyes at the end of the movie? Congratulations to Star Cinema!! Way to go, Jericho and Claudine!!"',
 1)

In [63]:
def get_minibatch(doc_stream, size):
    docs, y = [], []
    
    try:
        for _ in range(size):
            text, label = next(doc_stream)
            docs.append(text)
            y.append(label)
            
    except StopIteration:
        return None, None
    
    return docs, y

In [80]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier

vect = HashingVectorizer(decode_error='ignore',
                                             n_features=2**21,
                                             preprocessor=None,
                                             tokenizer=tokenizer)
clf = SGDClassifier(loss='log', random_state=1, n_iter_no_change=1) # 
doc_stream = stream_docs(path='movie_data.csv')

In [81]:
classes = np.array([0, 1])

for _ in tqdm.tqdm(range(45)):
    X_train, y_train = get_minibatch(doc_stream, size=1000)
    if not X_train:
        break
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train, y_train, classes=classes)

100%|██████████| 45/45 [00:24<00:00,  1.87it/s]


In [82]:
X_test, y_test = get_minibatch(doc_stream, size=5000)
X_test = vect.transform(X_test)

In [83]:
print('Accuracy: %.3f' % clf.score(X_test, y_test))

Accuracy: 0.866


In [85]:
# clf = clf.partial_fit(X_test, y_test)

## 潜在ディリクレ配分によるトピックモデルの構築

In [86]:
import pandas as pd
df = pd.read_csv('movie_data.csv', encoding='utf-8')

In [87]:
from sklearn.feature_extraction.text import CountVectorizer
count = CountVectorizer(stop_words='english',
                                             max_df=.1,
                                             max_features=5000)
X = count.fit_transform(df['review'].values)

In [89]:
from sklearn.decomposition import LatentDirichletAllocation
lda = LatentDirichletAllocation(n_components=10, # n_topics→n_components
                                                      random_state=123,
                                                      learning_method='batch')
X_topics = lda.fit_transform(X)

In [None]:
lda.components_.shape

In [None]:
n_top_words = 5
feature_names = count.get_feature_names()
for topic_idx, topic in enumerate(lda.components_):
    print("Topic %d:" % (topic_idx + 1))
    print(" ".join([feature_names[i] for i in topic.argsort() [:-n_top_word - 1:-1] ]))

In [None]:
horror = X_topics[:, 5].argsort()[::-1]
for iter_idx, movie_idx in enumerate(horror[:3]):
    print('\nHorror movie #%d:' % (iter_idx + 1) )
    print(df['review'][movie_idx][:300], '...')