import tarfile
# Unpack the gzip-compressed IMDb archive.
# NOTE(review): extractall() writes whatever member paths the archive contains,
# so only run this on an archive obtained from a trusted source.
with tarfile.open("aclImdb_v1.tar.gz", "r:gz") as tar:
    tar.extractall()

In [6]:
import pandas as pd
import os
import sys
import numpy as np

basepath = "aclImdb"
labels = {"pos": 1, "neg": 0}

# Gather every review as a (text, label) record and build the DataFrame once.
# Growing a frame row by row copies it on every iteration (quadratic cost), and
# DataFrame.append no longer exists in pandas 2.x.
records = []
for split in ("test", "train"):
    for label_name in ("pos", "neg"):
        review_dir = os.path.join(basepath, split, label_name)
        # sorted() keeps the read order deterministic across platforms.
        for fname in sorted(os.listdir(review_dir)):
            with open(os.path.join(review_dir, fname), "r", encoding = "utf-8") as infile:
                records.append((infile.read(), labels[label_name]))

df = pd.DataFrame(records, columns = ["review", "sentiment"])

# Shuffle the rows (fixed seed for reproducibility) and store them as CSV.
np.random.seed(0)
df = df.reindex(np.random.permutation(df.index))
df.to_csv("movie_data.csv", index = False, encoding = "utf-8")

In [7]:
# Reload the shuffled reviews from the CSV file and preview the first three rows.
df = pd.read_csv("movie_data.csv", encoding = "utf-8")
df.head(3)

Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0


In [8]:
# Bag-of-words model (toy example)
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer()
docs = np.array([
    "The sun is shining", 
    "The weather is sweet", 
    "The sun is shining, the weather is sweet, and one and one is two"])
# Learn the vocabulary from the three sentences and encode them as a term-count matrix.
bag = count.fit_transform(docs)

In [9]:
# Show the learned term -> column-index mapping, then the terms in column order.
print(count.vocabulary_)
print(count.get_feature_names_out())

{'the': 6, 'sun': 4, 'is': 1, 'shining': 3, 'weather': 8, 'sweet': 5, 'and': 0, 'one': 2, 'two': 7}
['and' 'is' 'one' 'shining' 'sun' 'sweet' 'the' 'two' 'weather']


In [10]:
# Print the count matrix as a dense array: one row per document, one column per term.
print(bag.toarray())

[[0 1 0 1 1 0 1 0 0]
 [0 1 0 0 0 1 1 0 1]
 [2 3 2 1 1 1 2 1 1]]


In [11]:
# TF-IDF: re-weight the raw term counts of the toy documents
from sklearn.feature_extraction.text import TfidfTransformer
tfidf = TfidfTransformer(use_idf = True, norm = "l2", smooth_idf = True)
# Two decimals keep the printed matrix readable.
np.set_printoptions(precision = 2)
# The count vectorizer is refit on docs here before the TF-IDF weighting is applied.
print(tfidf.fit_transform(count.fit_transform(docs)).toarray())

[[0.   0.43 0.   0.56 0.56 0.   0.43 0.   0.  ]
 [0.   0.43 0.   0.   0.   0.56 0.43 0.   0.56]
 [0.5  0.45 0.5  0.19 0.19 0.19 0.3  0.25 0.19]]


In [12]:
import re
def preprocessor(text):
    """Strip HTML tags, normalise the text, and append any emoticons found.

    The text is lower-cased and every run of non-word characters is collapsed
    into a single space. Emoticons (":)", ";-(", "=D", ...) are collected before
    that cleanup and appended at the end with their "-" nose removed.

    Args:
        text: Raw review text, possibly containing HTML markup.

    Returns:
        The cleaned text followed by the space-separated emoticons.
    """
    # Raw strings keep the regex escapes (\W, \(, \)) away from Python's own
    # string-escape handling.
    text = re.sub(r"<[^>]*>", "", text)
    emoticons = re.findall(r"(?::|;|=)(?:-)?(?:\)|\(|D|P)", text)
    cleaned = re.sub(r"[\W]+", " ", text.lower())
    # Without a separator the first emoticon would be glued onto the last word
    # whenever the cleaned text ends in a word character.
    if emoticons and not cleaned.endswith(" "):
        cleaned += " "
    return cleaned + " ".join(emoticons).replace("-", "")

In [13]:
# Try the preprocessor on the last 50 characters of the first review and on a
# hand-made string. Only the value of the last expression is shown as cell output.
preprocessor(df.loc[0, "review"][-50:])
preprocessor("</a>This :) is :( a test :-)!")

'this is a test :) :( :)'

In [14]:
# Clean every review; the raw text in df["review"] is overwritten by the cleaned text.
df["review"] = df["review"].apply(preprocessor)

In [15]:
# Tokenization
def tokenizer(text):
    """Split ``text`` on runs of whitespace and return the resulting tokens."""
    tokens = text.split()
    return tokens
tokenizer("runners like running and thus they run")

['runners', 'like', 'running', 'and', 'thus', 'they', 'run']

In [16]:
#Porterステミングで単語を原型化
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()
def tokenizer_porter(text):
    """Split ``text`` on whitespace and return the Porter stem of every token."""
    stems = []
    for token in text.split():
        stems.append(porter.stem(token))
    return stems
tokenizer_porter("runners like running and thus they run")

['runner', 'like', 'run', 'and', 'thu', 'they', 'run']

In [17]:
# Stop-word removal: download NLTK's stop-word corpus
import nltk
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sqrt_\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [18]:
from nltk.corpus import stopwords
# English stop-word list used for filtering below.
stop = stopwords.words("english")
# Stem the sentence, then drop every stem that appears in the stop-word list.
[w for w in tokenizer_porter("a runner likes running and runs a lot") if w not in stop]

['runner', 'like', 'run', 'run', 'lot']

In [19]:
# Split by position: the first 25,000 shuffled reviews train the model and the
# remaining ones test it. Positional slicing is deliberate: label-based .loc
# slices include their end label, so .loc[:25000] together with .loc[25000:]
# would place row 25000 in both the training and the test set.
n_train = 25000
X_train = df["review"].values[:n_train]
y_train = df["sentiment"].values[:n_train]
X_test = df["review"].values[n_train:]
y_test = df["sentiment"].values[n_train:]

In [21]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
# The reviews were already cleaned and lower-cased by preprocessor(), so the
# vectorizer's own lower-casing and preprocessing are switched off.
tfidf = TfidfVectorizer(strip_accents = None, lowercase = False, preprocessor = None)
# Two sub-grids, four candidates each:
#   1) default TF-IDF weighting, comparing the plain and the Porter tokenizer;
#   2) use_idf = False and norm = None, comparing with and without the
#      stop-word list.
# Both vary the regularisation parameter C of the logistic regression.
small_param_grid = [{"vect__ngram_range": [(1, 1)],
                     "vect__stop_words": [None],
                     "vect__tokenizer": [tokenizer, tokenizer_porter],
                     "clf__penalty": ["l2"],
                     "clf__C": [1.0, 10.0]},
                    {"vect__ngram_range": [(1, 1)],
                     "vect__stop_words": [stop, None],
                     "vect__tokenizer": [tokenizer],
                     "vect__use_idf": [False],
                     "vect__norm": [None],
                     "clf__penalty": ["l2"],
                     "clf__C": [1.0, 10.0]}]
# Vectorizer and classifier are chained so each CV fold refits the vectorizer
# together with the classifier.
lr_tfidf = Pipeline([("vect", tfidf), ("clf", LogisticRegression(solver = "liblinear"))])
# 5-fold cross-validated search scored by accuracy; verbose = 2 logs every fit.
gs_lr_tfidf = GridSearchCV(lr_tfidf, small_param_grid, scoring = "accuracy", cv = 5, verbose = 2, n_jobs = 1)
gs_lr_tfidf.fit(X_train, y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV] END clf__C=1.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x00000285ECFBCF70>; total time=   3.2s
[CV] END clf__C=1.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x00000285ECFBCF70>; total time=   3.5s
[CV] END clf__C=1.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x00000285ECFBCF70>; total time=   3.2s
[CV] END clf__C=1.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x00000285ECFBCF70>; total time=   3.1s
[CV] END clf__C=1.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__stop_words=None, vect__tokenizer=<function tokenizer at 0x00000285ECFBCF70>; total time=   3.2s
[CV] END clf__C=1.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__stop_words=None, vect__tokenizer=<functi

[CV] END clf__C=1.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__norm=None, vect__stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', '

[CV] END clf__C=10.0, clf__penalty=l2, vect__ngram_range=(1, 1), vect__norm=None, vect__stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('vect',
                                        TfidfVectorizer(lowercase=False)),
                                       ('clf',
                                        LogisticRegression(solver='liblinear'))]),
             n_jobs=1,
             param_grid=[{'clf__C': [1.0, 10.0], 'clf__penalty': ['l2'],
                          'vect__ngram_range': [(1, 1)],
                          'vect__stop_words': [None],
                          'vect__tokenizer': [<function tokenizer at 0x00000285ECFBCF70>,
                                              <function tokenizer_porter at 0x00000285F...
                          'vect__stop_words': [['i', 'me', 'my', 'myself', 'we',
                                                'our', 'ours', 'ourselves',
                                                'you', "you're", "you've",
                                                "you'll", "you'd", 'your',
                                 

In [22]:
# Report the parameter combination that the grid search selected.
print(f"Best parameter set: {gs_lr_tfidf.best_params_}")

Best parameter set: {'clf__C': 10.0, 'clf__penalty': 'l2', 'vect__ngram_range': (1, 1), 'vect__stop_words': None, 'vect__tokenizer': <function tokenizer at 0x00000285ECFBCF70>}


In [23]:
# Compare the grid search's best cross-validation score with the accuracy on
# the held-out test set.
print(f"CV accuracy: {gs_lr_tfidf.best_score_:.3f}")
# Best pipeline found by the search; note that the name clf is reused further below.
clf = gs_lr_tfidf.best_estimator_
print(f"Test accuracy: {clf.score(X_test, y_test):.3f}")

CV accuracy: 0.897
Test accuracy: 0.899


In [29]:
# Incremental (out-of-core) model building
stop = stopwords.words("english")
# Set membership is O(1); scanning the list for every token is O(len(stop)).
_STOP_WORDS = frozenset(stop)
def tokenizer(text):
    """Clean a raw review and return its tokens with stop words removed.

    This redefines the earlier whitespace-only ``tokenizer``: HTML tags are
    stripped, the text is lower-cased, runs of non-word characters collapse to
    a single space, emoticons are kept (without the "-" nose) as their own
    tokens, and English stop words are dropped.

    Args:
        text: Raw review text, possibly containing HTML markup.

    Returns:
        List of remaining tokens in document order, emoticons last.
    """
    # Raw strings keep the regex escapes away from Python's string-escape handling.
    text = re.sub(r"<[^>]*>", "", text)
    emoticons = re.findall(r"(?::|;|=)(?:-)?(?:\)|\(|D|P)", text)
    cleaned = re.sub(r"[\W]+", " ", text.lower())
    # The explicit " " separator stops the first emoticon from being fused with
    # the last word; extra whitespace is harmless because split() follows.
    cleaned = cleaned + " " + " ".join(emoticons).replace("-", "")
    return [w for w in cleaned.split() if w not in _STOP_WORDS]

In [30]:
def stream_docs(path):
    """Lazily yield ``(text, label)`` pairs from a two-column review CSV.

    Each data line is expected to look like ``<review text>,<label>``. The
    review text is returned exactly as stored in the file (CSV quoting is not
    removed), and the label is converted to ``int``.

    Args:
        path: Path of the CSV file; its first line is treated as the header.

    Yields:
        Tuples ``(text, label)``, one per non-empty data line.

    Raises:
        ValueError: If the part after the last comma is not an integer.
    """
    with open(path, "r", encoding = "utf-8") as infile:
        next(infile)  # skip the header
        for line in infile:
            row = line.rstrip("\r\n")
            if not row:
                continue  # tolerate blank lines
            # Split on the LAST comma instead of using fixed offsets, so a final
            # line without a trailing newline is parsed correctly as well.
            text, _, label = row.rpartition(",")
            yield text, int(label)

In [34]:
def get_minibatch(doc_stream, size):
    """Pull up to ``size`` documents from ``doc_stream``.

    Args:
        doc_stream: Iterator yielding ``(text, label)`` pairs, e.g. the
            generator returned by ``stream_docs``.
        size: Maximum number of documents to fetch.

    Returns:
        ``(docs, y)``: two parallel lists of texts and labels. When the stream
        runs out part-way through, the documents gathered so far are returned
        as a shorter final batch instead of being discarded. ``(None, None)``
        is returned only when the stream is already exhausted and nothing could
        be read.
    """
    docs, y = [], []
    for _ in range(size):
        try:
            text, label = next(doc_stream)
        except StopIteration:
            if not docs:
                return None, None
            break  # keep the partially filled final batch
        docs.append(text)
        y.append(label)
    return docs, y

In [46]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier
# Hashing vectorizer with 2**21 feature columns; tokenization is delegated to tokenizer().
vect = HashingVectorizer(decode_error = "ignore", n_features = 2**21, preprocessor = None, tokenizer = tokenizer)
# NOTE(review): loss = "log" looks like the legacy name of the logistic loss;
# newer scikit-learn releases expect "log_loss" — confirm against the installed version.
clf = SGDClassifier(loss = "log", random_state = 1)
# Generator over the CSV file; the training loop consumes it batch by batch.
doc_stream = stream_docs(path = "movie_data.csv")

In [47]:
# All class labels are passed to every partial_fit call below.
classes = np.array([0, 1])
# Train on up to 45 mini-batches of 1,000 documents each.
for _ in range(45):
    X_train, y_train = get_minibatch(doc_stream, size = 1000)
    if not X_train:
        # get_minibatch signalled that the stream is exhausted.
        break
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train, y_train, classes = classes)
    
# The next 5,000 documents from the same stream serve as the evaluation set.
X_test, y_test = get_minibatch(doc_stream, size = 5000)
X_test = vect.transform(X_test)
print(f"Accuracy: {clf.score(X_test, y_test):.3f}")

Accuracy: 0.868


In [48]:
# After evaluation, update the model with the evaluation documents as well.
clf = clf.partial_fit(X_test, y_test)

In [51]:
# Topic analysis with LDA (Latent Dirichlet Allocation)
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words features limited to 5,000 terms; max_df = .1 is presumably meant to
# drop terms that occur in more than 10% of the reviews — see the CountVectorizer docs.
count = CountVectorizer(stop_words = "english", max_df = .1, max_features = 5000)
X = count.fit_transform(df["review"].values)

from sklearn.decomposition import LatentDirichletAllocation
# Ten topics; fixed random_state so repeated runs give the same topics.
lda = LatentDirichletAllocation(n_components = 10, random_state = 123, learning_method = "batch")
X_topics = lda.fit_transform(X)

In [52]:
# One row per topic, one column per vocabulary term.
lda.components_.shape

(10, 5000)

In [59]:
# Print the n_top_words highest-valued terms of every topic.
n_top_words = 5
feature_names = count.get_feature_names_out()
for topic_idx, topic in enumerate(lda.components_):
    print(f"Topic {(topic_idx +1)}:")
    # argsort() is ascending, so the reversed tail slice yields the indices of
    # the n_top_words largest entries, largest first.
    print(" ".join([feature_names[i] for i in topic.argsort() [:-n_top_words -1:-1]]))

Topic 1:
horror original comedy black house
Topic 2:
worst minutes guy script money
Topic 3:
book dvd read version watched
Topic 4:
family performance father beautiful mother
Topic 5:
series episode tv kids comedy
Topic 6:
murder police wife john plays
Topic 7:
documentary camera effects audience sense
Topic 8:
music song songs musical role
Topic 9:
horror effects guy dead budget
Topic 10:
action war game fight american


In [60]:
# Order the reviews by their value in column 0 of X_topics (printed above as
# "Topic 1"), highest first, and show the first 300 characters of the top three.
horror = X_topics[:, 0].argsort()[::-1]
for iter_idx, movie_idx in enumerate(horror[:3]):
    print(f"\nHorror movie #{iter_idx + 1}:")
    print(df["review"][movie_idx][:300], "...")


Horror movie #1:
george zucco was like boris karloff in the fact no matter how poor the film he appeared in was he would always maintain a sense of dignity and turn in a fine performance the mad monster is no exception to that rule it is by all standards a poor if entertaining film the filmmakers obviously didn t kn ...

Horror movie #2:
 house of dracula is a good sequel to house of frankenstein there isn t as much action but the acting is just as good onslow stevens is the benevolent doctor who turns bad after receiving blood from dracula via a transfusion dracula was actually receiving the transfusion to overcome his affliction b ...

Horror movie #3:
castle of blood is a good example of the quality work in the horror genre being turned out in italy in the 60s the film has all of the right elements old dark house atmosphere a decent story and barbara steele steele makes most any film worth seeing the story concerns a haunted castle people have vi ...
