# Sprint 自然言語処理入門

In [1]:
# IMDBをカレントフォルダにダウンロード
!wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz

--2020-07-29 08:28:23--  http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Resolving ai.stanford.edu (ai.stanford.edu)... 171.64.68.10
Connecting to ai.stanford.edu (ai.stanford.edu)|171.64.68.10|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84125825 (80M) [application/x-gzip]
Saving to: ‘aclImdb_v1.tar.gz’


2020-07-29 08:28:29 (15.3 MB/s) - ‘aclImdb_v1.tar.gz’ saved [84125825/84125825]



In [2]:
# 解凍
!tar zxf aclImdb_v1.tar.gz

In [3]:
# aclImdb/train/unsupはラベル無しのため削除
!rm -rf aclImdb/train/unsup

In [4]:
# IMDBデータセットの説明を表示
!cat aclImdb/README

Large Movie Review Dataset v1.0

Overview

This dataset contains movie reviews along with their associated binary
sentiment polarity labels. It is intended to serve as a benchmark for
sentiment classification. This document outlines how the dataset was
gathered, and how to use the files provided. 

Dataset 

The core dataset contains 50,000 reviews split evenly into 25k train
and 25k test sets. The overall distribution of labels is balanced (25k
pos and 25k neg). We also include an additional 50,000 unlabeled
documents for unsupervised learning. 

In the entire collection, no more than 30 reviews are allowed for any
given movie because reviews for the same movie tend to have correlated
ratings. Further, the train and test sets contain a disjoint set of
movies, so no significant performance is obtained by memorizing
movie-unique terms and their associated with observed labels.  In the
labeled train/test sets, a negative review has a score <= 4 out of 10,
and a positive review has a scor

### データの読み込み

In [5]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import math
from sklearn.metrics import accuracy_score
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
import re

In [6]:
from sklearn.datasets import load_files
train_review = load_files('./aclImdb/train/', encoding='utf-8')
x_train, y_train = train_review.data, train_review.target
test_review = load_files('./aclImdb/test/', encoding='utf-8')
x_test, y_test = test_review.data, test_review.target
# ラベルの0,1 と意味の対応の表示
print(train_review.target_names)

['neg', 'pos']


In [7]:
# 中身の確認
print("x :{}".format(x_train[0]))

x :Zero Day leads you to think, even re-think why two boys/young men would do what they did - commit mutual suicide via slaughtering their classmates. It captures what must be beyond a bizarre mode of being for two humans who have decided to withdraw from common civility in order to define their own/mutual world via coupled destruction.<br /><br />It is not a perfect movie but given what money/time the filmmaker and actors had - it is a remarkable product. In terms of explaining the motives and actions of the two young suicide/murderers it is better than 'Elephant' - in terms of being a film that gets under our 'rationalistic' skin it is a far, far better film than almost anything you are likely to see. <br /><br />Flawed but honest with a terrible honesty.


In [9]:
mini_dataset = \
  ["This movie is very good.",
  "This film is a good",
  "Very bad. Very, very bad."]

In [10]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(token_pattern=r'(?u)\b\w+\b')
bow = (vectorizer.fit_transform(mini_dataset)).toarray()

df = pd.DataFrame(bow, columns=vectorizer.get_feature_names())
display(df)

Unnamed: 0,a,bad,film,good,is,movie,this,very
0,0,0,0,1,1,1,1,1
1,1,0,1,1,1,0,1,0
2,0,2,0,0,0,0,0,3


In [11]:
vectorizer = CountVectorizer(ngram_range=(2, 2), token_pattern=r'(?u)\b\w+\b')
bow_train = (vectorizer.fit_transform(mini_dataset)).toarray()
print(vectorizer.get_feature_names())
df = pd.DataFrame(bow_train, columns=vectorizer.get_feature_names())
display(df)

['a good', 'bad very', 'film is', 'is a', 'is very', 'movie is', 'this film', 'this movie', 'very bad', 'very good', 'very very']


Unnamed: 0,a good,bad very,film is,is a,is very,movie is,this film,this movie,very bad,very good,very very
0,0,0,0,0,1,1,0,1,0,1,0
1,1,0,1,1,0,0,1,0,0,0,0
2,0,1,0,0,0,0,0,0,2,0,1


## 【問題1】BoWのスクラッチ実装

以下の3文のBoWを求められるプログラムをscikit-learnを使わずに作成してください。1-gramと2-gramで計算してください。

>This movie is SOOOO funny!!!  
What a movie! I never  
best movie ever!!!!! this movie

In [12]:
mini_dataset = \
  ["This movie is SOOOO funny!!!",
  "What a movie! I never",
  "best movie ever!!!!! this movie"]

In [13]:
class BoW_1_gram():
  def __init__(self, token_pattern=r'(?u)\b\w+\b'):
    self.token_pattern = token_pattern
  
  def fit(self, sentence):
    """
    文を引数として受け取り、コーパスをndarray型で返す
    """
    sentence_num = len(sentence)
    
    #正規表現で分割
    splits = []
    for s in sentence:
      splits.append(re.findall(self.token_pattern, s))
    
    #小文字化
    lower = []
    for l in splits:
      lower.append([str.lower() for str in l])

    #flattenして重複排除
    labels = list(set(sum(lower, [])))

    copus = np.empty((sentence_num,len(labels)))
    for i ,label in enumerate(labels):
      for row, value in enumerate(lower):
        copus[row, i] =  value.count(label)
    
    self.labels = labels

    return copus

In [14]:
gram = BoW_1_gram()
result = gram.fit(mini_dataset)

# print(gram.labels)
print(result)

[[1. 0. 1. 1. 1. 0. 0. 1. 0. 0. 0.]
 [0. 1. 0. 0. 0. 1. 1. 1. 0. 0. 1.]
 [0. 0. 0. 0. 1. 0. 0. 2. 1. 1. 0.]]


In [15]:
class BoW_2_gram(BoW_1_gram):
    def fit(self, sentence):
        
      sentence_num = len(sentence)

      splits = []
      for s in sentence:
        splits.append(re.findall(self.token_pattern, s))
      
      lower = []
      for l in splits:
        lower.append([str.lower() for str in l])

      two_words = []
      for s in lower:
        temp = []
        for i in range(len(s)-1):
          temp.append(s[i]+" "+s[i+1])
        two_words.append(temp)

      labels = list(set(sum(two_words, [])))

      copus = np.empty((sentence_num,len(labels)))
      for i ,label in enumerate(labels):
        for row, value in enumerate(two_words):
          copus[row, i] =  value.count(label)
      
      self.labels = labels

      return copus

In [16]:
gram2 = BoW_2_gram()
result2 = gram2.fit(mini_dataset)

#print(gram2.labels)
print(result2)

[[1. 0. 0. 0. 1. 1. 1. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 1. 1. 1.]
 [0. 1. 0. 1. 0. 1. 0. 1. 0. 0. 0.]]


## 【問題2】TF-IDFの計算
IMDB映画レビューデータセットをTF-IDFによりベクトル化してください。NLTKのストップワードを利用し、最大の語彙数は5000程度に設定してください。テキストクリーニングやステミングなどの前処理はこの問題では要求しません。

In [20]:
vectorizer = CountVectorizer(stop_words=["is"], token_pattern=r'\b\w+\b')
bow_train = (vectorizer.fit_transform(mini_dataset)).toarray()
df = pd.DataFrame(bow_train, columns=vectorizer.get_feature_names())
display(df)

Unnamed: 0,a,best,ever,funny,i,movie,never,soooo,this,what
0,0,0,0,1,0,1,0,1,1,0
1,1,0,0,0,1,1,1,0,0,1
2,0,1,1,0,0,2,0,0,1,0


In [21]:
# はじめて使う場合はストップワードをダウンロード
import nltk
stop_words = nltk.download('stopwords')

from nltk.corpus import stopwords
nltk_stop_words = stopwords.words('english')
print("stop word : {}".format(nltk_stop_words)) # 'i', 'me', 'my', ...

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
stop word : ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all'

In [22]:
vectorizer = CountVectorizer(token_pattern=r'\b\w+\b', max_features = 5)
bow_train = (vectorizer.fit_transform(mini_dataset)).toarray()
df = pd.DataFrame(bow_train, columns=vectorizer.get_feature_names())
display(df)

Unnamed: 0,a,best,ever,movie,this
0,0,0,0,1,1
1,1,0,0,1,0
2,0,1,1,2,1


In [23]:
vectorizer = CountVectorizer(stop_words=nltk_stop_words, token_pattern=r'\b\w+\b', max_features = 5000)
bow_train = (vectorizer.fit_transform(x_train)).toarray()
print(bow_train.shape)

(25000, 5000)


# 【問題3】TF-IDFを用いた学習
問題2で求めたベクトルを用いてIMDB映画レビューデータセットの学習・推定を行なってください。モデルは2値分類が行える任意のものを利用してください。


ここでは精度の高さは求めませんが、最大の語彙数やストップワード、n-gramの数を変化させて影響を検証してみてください。



In [24]:
from __future__ import absolute_import, division, print_function, unicode_literals

# TensorFlow と tf.keras のインポート
import tensorflow as tf
from tensorflow import keras
import keras.backend as K

Using TensorFlow backend.


#### NNモデル

In [25]:
model = keras.Sequential([
    keras.layers.Dense(10000, activation='relu', input_shape=(5000,)),
    keras.layers.Dense(6000, activation='relu'),
    keras.layers.Dense(3000, activation='relu'),
    keras.layers.Dense(1000, activation='relu'),
    keras.layers.Dense(100, activation='relu'),
    keras.layers.Dense(2, activation='softmax')
])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 10000)             50010000  
_________________________________________________________________
dense_1 (Dense)              (None, 6000)              60006000  
_________________________________________________________________
dense_2 (Dense)              (None, 3000)              18003000  
_________________________________________________________________
dense_3 (Dense)              (None, 1000)              3001000   
_________________________________________________________________
dense_4 (Dense)              (None, 100)               100100    
_________________________________________________________________
dense_5 (Dense)              (None, 2)                 202       
Total params: 131,120,302
Trainable params: 131,120,302
Non-trainable params: 0
__________________________________________

In [26]:
model.compile(optimizer='adam', 
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
model.fit(bow_train, y_train, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f08171e0ba8>

In [27]:
vectorizer2 = CountVectorizer(stop_words=nltk_stop_words, token_pattern=r'\b\w+\b', max_features = 5000)
bow_test = (vectorizer2.fit_transform(x_test)).toarray()
loss, accuracy = model.evaluate(bow_test, y_test, verbose=0)
print(accuracy)


0.489439994096756


In [28]:
bow_test_same = (vectorizer.transform(x_test)).toarray()
loss2, accuracy2 = model.evaluate(bow_test_same, y_test, verbose=0)
print(accuracy2)

0.8624799847602844


#### TF-IDFモデル

In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer_tf = TfidfVectorizer(stop_words=nltk_stop_words, token_pattern=r'\b\w+\b', max_features = 5000)
bow_train_tf = (vectorizer_tf.fit_transform(x_train)).toarray()
test_tf = (vectorizer_tf.transform(x_test)).toarray()

In [30]:
#K.clear_session()

model2 = keras.Sequential([
    keras.layers.Dense(10000, activation='relu', input_shape=(5000,)),
    keras.layers.Dense(6000, activation='relu'),
    keras.layers.Dense(3000, activation='relu'),
    keras.layers.Dense(1000, activation='relu'),
    keras.layers.Dense(100, activation='relu'),
    keras.layers.Dense(2, activation='softmax')
])

model2.compile(optimizer='adam', 
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

model2.fit(bow_train_tf, y_train, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f0813724b38>

In [31]:
loss, accuracy = model2.evaluate(test_tf, y_test, verbose=0)
print(accuracy)

0.8669599890708923


## 【問題4】TF-IDFのスクラッチ実装 
以下の3文のTF-IDFを求められるプログラムをscikit-learnを使わずに作成してください。標準的な式と、scikit-learnの採用している式の2種類を作成してください。正規化は不要です。

In [34]:
test_dataset = [
                "This movie is SOOOO funny!!!", 
                "What a movie! I never", 
                "best movie ever!!!!! this movie"
]

#### 標準

In [32]:
class TFIDF_1_gram():
  def __init__(self, token_pattern=r'(?u)\b\w+\b'):
    self.token_pattern = token_pattern
  
  def fit(self, sentence):
    sentence_num = len(sentence)
    
    splits = []
    for s in sentence:
      splits.append(re.findall(self.token_pattern, s))
    
    lower = []
    for l in splits:
      lower.append([str.lower() for str in l])

    labels = list(set(sum(lower, [])))

    copus = np.empty((sentence_num,len(labels)))
    for i ,label in enumerate(labels):
      for row, value in enumerate(lower):
        copus[row, i] =  value.count(label)
    
    self.labels = labels
    self.copus = copus

    self.tf = np.zeros(copus.shape)
    for i, c in enumerate(list(copus)):
      for j, value in enumerate(c):
        self.tf[i, j] = value/sum(c)
  
    self.idf = np.log(sentence_num / np.count_nonzero(copus>0, axis=0))
    
    self.tfidf = self.tf * self.idf

    return copus * self.tfidf

In [35]:
tfidf = TFIDF_1_gram()
result = tfidf.fit(test_dataset)

print("copus\n", tfidf.copus)
print("tf\n",tfidf.tf)
print("idf\n",tfidf.idf)
print("tfidf\n", tfidf.tfidf )
print("result\n" ,result)

copus
 [[1. 0. 1. 1. 1. 0. 0. 1. 0. 0. 0.]
 [0. 1. 0. 0. 0. 1. 1. 1. 0. 0. 1.]
 [0. 0. 0. 0. 1. 0. 0. 2. 1. 1. 0.]]
tf
 [[0.2 0.  0.2 0.2 0.2 0.  0.  0.2 0.  0.  0. ]
 [0.  0.2 0.  0.  0.  0.2 0.2 0.2 0.  0.  0.2]
 [0.  0.  0.  0.  0.2 0.  0.  0.4 0.2 0.2 0. ]]
idf
 [1.09861229 1.09861229 1.09861229 1.09861229 0.40546511 1.09861229
 1.09861229 0.         1.09861229 1.09861229 1.09861229]
tfidf
 [[0.21972246 0.         0.21972246 0.21972246 0.08109302 0.
  0.         0.         0.         0.         0.        ]
 [0.         0.21972246 0.         0.         0.         0.21972246
  0.21972246 0.         0.         0.         0.21972246]
 [0.         0.         0.         0.         0.08109302 0.
  0.         0.         0.21972246 0.21972246 0.        ]]
result
 [[0.21972246 0.         0.21972246 0.21972246 0.08109302 0.
  0.         0.         0.         0.         0.        ]
 [0.         0.21972246 0.         0.         0.         0.21972246
  0.21972246 0.         0.         0.        

#### sklearn

In [36]:
class TFIDF_1_gram_SK():
  def __init__(self, token_pattern=r'(?u)\b\w+\b'):
    self.token_pattern = token_pattern
  
  def fit(self, sentence):
    sentence_num = len(sentence)
    
    splits = []
    for s in sentence:
      splits.append(re.findall(self.token_pattern, s))
    
    lower = []
    for l in splits:
      lower.append([str.lower() for str in l])

    labels = list(set(sum(lower, [])))

    copus = np.empty((sentence_num,len(labels)))
    for i ,label in enumerate(labels):
      for row, value in enumerate(lower):
        copus[row, i] =  value.count(label)
    
    self.labels = labels
    self.copus = copus

    #tf計算
    self.tf = copus
  
    self.idf = np.log(sentence_num + 1 / np.count_nonzero(copus>0, axis=0) + 1) + 1
    
    self.tfidf = self.tf * self.idf

    return copus * self.tfidf

In [37]:
tfidf2 = TFIDF_1_gram_SK()
result2 = tfidf2.fit(test_dataset)

print("copus\n", tfidf2.copus)
print("tf\n",tfidf2.tf)
print("idf\n",tfidf2.idf)
print("tfidf\n", tfidf2.tfidf )
print("result\n" ,result2)

copus
 [[1. 0. 1. 1. 1. 0. 0. 1. 0. 0. 0.]
 [0. 1. 0. 0. 0. 1. 1. 1. 0. 0. 1.]
 [0. 0. 0. 0. 1. 0. 0. 2. 1. 1. 0.]]
tf
 [[1. 0. 1. 1. 1. 0. 0. 1. 0. 0. 0.]
 [0. 1. 0. 0. 0. 1. 1. 1. 0. 0. 1.]
 [0. 0. 0. 0. 1. 0. 0. 2. 1. 1. 0.]]
idf
 [2.60943791 2.60943791 2.60943791 2.60943791 2.5040774  2.60943791
 2.60943791 2.46633707 2.60943791 2.60943791 2.60943791]
tfidf
 [[2.60943791 0.         2.60943791 2.60943791 2.5040774  0.
  0.         2.46633707 0.         0.         0.        ]
 [0.         2.60943791 0.         0.         0.         2.60943791
  2.60943791 2.46633707 0.         0.         2.60943791]
 [0.         0.         0.         0.         2.5040774  0.
  0.         4.93267414 2.60943791 2.60943791 0.        ]]
result
 [[2.60943791 0.         2.60943791 2.60943791 2.5040774  0.
  0.         2.46633707 0.         0.         0.        ]
 [0.         2.60943791 0.         0.         0.         2.60943791
  2.60943791 2.46633707 0.         0.         2.60943791]
 [0.         0.     

## 【問題5】コーパスの前処理
コーパスの前処理として、特殊文字（!など）やURLの除去、大文字の小文字化といったことを行なってください。また、単語（トークン）はリストで分割してください。

In [38]:
splits = []
for s in x_test:
  splits.append(re.findall(r'(?u)\b\w+\b', s))

lower = []
for l in splits:
  lower.append([str.lower() for str in l])

print(len(lower))

25000


## 【問題6】Word2Vecの学習
Word2Vecの学習を行なってください。



In [40]:
from gensim.models import Word2Vec
sentences = [['this', 'movie', 'is', 'very', 'good'], ['this', 'film', 'is', 'a', 'good'], ['very', 'bad', 'very', 'very', 'bad']]
model = Word2Vec(min_count=1, size=10) # 次元数を10に設定
model.build_vocab(sentences) # 準備
model.train(sentences, total_examples=model.corpus_count, epochs=model.iter) # 学習
print("語彙の一覧 : {}".format(model.wv.vocab.keys()))
for vocab in model.wv.vocab.keys():
  print("{}のベクトル : \n{}".format(vocab, model.wv[vocab]))

語彙の一覧 : dict_keys(['this', 'movie', 'is', 'very', 'good', 'film', 'a', 'bad'])
thisのベクトル : 
[ 0.0249938  -0.03765341  0.0195027   0.00524712 -0.00153644 -0.03273581
  0.02355515 -0.01225791  0.01114472 -0.00907992]
movieのベクトル : 
[ 0.00848707  0.02872956 -0.0244792  -0.03368572  0.0108247   0.04854515
  0.00619196 -0.02328659 -0.03074072  0.03229745]
isのベクトル : 
[-0.00286664 -0.00359677  0.00749168  0.0314682   0.01639835 -0.01668038
  0.00981281 -0.04795818  0.01957807 -0.00324648]
veryのベクトル : 
[-0.02707146  0.0428793   0.04621729 -0.04383265  0.0194873  -0.02036348
 -0.03272589 -0.04095151 -0.03248588 -0.00572642]
goodのベクトル : 
[-0.00238406  0.01335714  0.02903937 -0.00371578 -0.0458021   0.04129339
  0.02749515 -0.00501445 -0.03773468 -0.03934373]
filmのベクトル : 
[ 0.02758595  0.00256787 -0.02125228  0.00187987  0.01038363  0.01999831
 -0.04292028  0.01096623 -0.00348892  0.00390196]
aのベクトル : 
[ 0.00601908 -0.00562103  0.0379864  -0.00409364  0.03662813  0.00152588
  0.01260309 -0.0362146

  """


In [41]:
model = Word2Vec(min_count=1, size=10) 
model.build_vocab(lower) 
model.train(lower, total_examples=model.corpus_count, epochs=model.iter) 

  This is separate from the ipykernel package so we can avoid doing imports until


(22107793, 29600110)