## 例3-1. nグラムの計算

In [1]:
import pandas as pd
import json
from sklearn.feature_extraction.text import CountVectorizer

# Load at most the first 10,000 reviews (the file holds one JSON object per line).
# The encoding is passed explicitly because the review text contains non-ASCII
# characters, and zip() over a bounded range stops cleanly if the file has
# fewer lines instead of failing on the empty string readline() would return.
with open('data/yelp/yelp_academic_dataset_review.json', encoding='utf-8') as f:
    js = [json.loads(line) for _, line in zip(range(10000), f)]
review_df = pd.DataFrame(js)

# Build unigram (bag-of-words), bigram and trigram feature transformers with
# scikit-learn's CountVectorizer.
# By default CountVectorizer ignores single-character words, which is practical
# because it drops mostly meaningless tokens. Here the token pattern is
# overridden so that every word is kept.
TOKEN_PATTERN = r'(?u)\b\w+\b'
bow_converter = CountVectorizer(token_pattern=TOKEN_PATTERN)
bigram_converter = CountVectorizer(ngram_range=(2, 2), token_pattern=TOKEN_PATTERN)
trigram_converter = CountVectorizer(ngram_range=(3, 3), token_pattern=TOKEN_PATTERN)


def _feature_names(vectorizer):
    """Return the fitted vocabulary of *vectorizer* as a plain list of strings.

    ``get_feature_names`` was removed in scikit-learn 1.2 in favour of
    ``get_feature_names_out``; the old method is used only when the new one
    is not available. The result is converted to a list so that slicing
    displays the same way with either API.
    """
    if hasattr(vectorizer, 'get_feature_names_out'):
        return list(vectorizer.get_feature_names_out())
    return vectorizer.get_feature_names()


# Fit each transformer on the review text and collect its vocabulary
bow_converter.fit(review_df['text'])
words = _feature_names(bow_converter)

bigram_converter.fit(review_df['text'])
bigrams = _feature_names(bigram_converter)

trigram_converter.fit(review_df['text'])
trigrams = _feature_names(trigram_converter)

# Vocabulary sizes for unigrams, bigrams and trigrams
print(len(words), len(bigrams), len(trigrams))

# Inspect the first few unigrams
words[:10]

29222 368943 881620


['0', '00', '000', '007', '00a', '00am', '00pm', '01', '02', '03']

In [3]:
# Display the last ten entries of the bigram vocabulary list
bigrams[-10:]

['zuzu was',
 'zuzus room',
 'zweigel wine',
 'zwiebel kräuter',
 'zy world',
 'zzed in',
 'éclairs napoleons',
 'école lenôtre',
 'ém all',
 'òc châm']

In [4]:
# Display the first ten entries of the trigram vocabulary list
trigrams[:10]

['0 0 eye',
 '0 20 less',
 '0 39 oz',
 '0 39 pizza',
 '0 5 i',
 '0 50 to',
 '0 6 can',
 '0 75 oysters',
 '0 75 that',
 '0 75 to']

## 例3-2. 品詞タグ付けとチャンク化

In [2]:
import pandas as pd
import json

# Load at most the first 10 reviews (one JSON object per line).
# The encoding is explicit because the review text contains non-ASCII
# characters; zip() over a bounded range stops cleanly on a shorter file.
with open('data/yelp/yelp_academic_dataset_review.json', encoding='utf-8') as f:
    js = [json.loads(line) for _, line in zip(range(10), f)]
review_df = pd.DataFrame(js)

# First approach: spaCy
import spacy

# Load the English language model. The 'en' shortcut is tried first so that
# environments where it resolves keep their existing behaviour; spaCy v3
# removed shortcut links and raises OSError for it, in which case the
# full package name is used instead.
try:
    nlp = spacy.load('en')
except OSError:
    nlp = spacy.load('en_core_web_sm')

# Run the spaCy pipeline on every review, giving a pandas Series of parsed documents
doc_df = review_df['text'].apply(nlp)

# spaCy exposes the coarse-grained part-of-speech tag as .pos_
# and the fine-grained tag as .tag_
for token in doc_df[4]:
    print([token.text, token.pos_, token.tag_])

# spaCy also provides basic noun phrases through .noun_chunks
print(list(doc_df[4].noun_chunks))

['General', 'PROPN', 'NNP']
['Manager', 'PROPN', 'NNP']
['Scott', 'PROPN', 'NNP']
['Petello', 'PROPN', 'NNP']
['is', 'VERB', 'VBZ']
['a', 'DET', 'DT']
['good', 'ADJ', 'JJ']
['egg', 'NOUN', 'NN']
['!', 'PUNCT', '.']
['!', 'PUNCT', '.']
['!', 'PUNCT', '.']
['Not', 'ADV', 'RB']
['to', 'PART', 'TO']
['go', 'VERB', 'VB']
['into', 'ADP', 'IN']
['detail', 'NOUN', 'NN']
[',', 'PUNCT', ',']
['but', 'CCONJ', 'CC']
['let', 'VERB', 'VB']
['me', 'PRON', 'PRP']
['assure', 'VERB', 'VB']
['you', 'PRON', 'PRP']
['if', 'ADP', 'IN']
['you', 'PRON', 'PRP']
['have', 'VERB', 'VBP']
['any', 'DET', 'DT']
['issues', 'NOUN', 'NNS']
['(', 'PUNCT', '-LRB-']
['albeit', 'ADP', 'IN']
['rare', 'ADJ', 'JJ']
[')', 'PUNCT', '-RRB-']
['speak', 'VERB', 'VBP']
['with', 'ADP', 'IN']
['Scott', 'PROPN', 'NNP']
['and', 'CCONJ', 'CC']
['treat', 'VERB', 'VB']
['the', 'DET', 'DT']
['guy', 'NOUN', 'NN']
['with', 'ADP', 'IN']
['some', 'DET', 'DT']
['respect', 'NOUN', 'NN']
['as', 'ADP', 'IN']
['you', 'PRON', 'PRP']
['state', 'VERB'

In [7]:
# The same thing can be done with the TextBlob library
from textblob import TextBlob

# Original author's note: TextBlob tags with PatternTagger by default, which
# works well for this example, but NLTKTagger is recommended when the text
# contains ungrammatical sentences.
# NOTE(review): the default tagger may differ between TextBlob versions — confirm.
blob_df = review_df['text'].apply(TextBlob)

blob_df[4].tags

[('General', 'NNP'),
 ('Manager', 'NNP'),
 ('Scott', 'NNP'),
 ('Petello', 'NNP'),
 ('is', 'VBZ'),
 ('a', 'DT'),
 ('good', 'JJ'),
 ('egg', 'NN'),
 ('Not', 'RB'),
 ('to', 'TO'),
 ('go', 'VB'),
 ('into', 'IN'),
 ('detail', 'NN'),
 ('but', 'CC'),
 ('let', 'VB'),
 ('me', 'PRP'),
 ('assure', 'VB'),
 ('you', 'PRP'),
 ('if', 'IN'),
 ('you', 'PRP'),
 ('have', 'VBP'),
 ('any', 'DT'),
 ('issues', 'NNS'),
 ('albeit', 'IN'),
 ('rare', 'NN'),
 ('speak', 'NN'),
 ('with', 'IN'),
 ('Scott', 'NNP'),
 ('and', 'CC'),
 ('treat', 'VB'),
 ('the', 'DT'),
 ('guy', 'NN'),
 ('with', 'IN'),
 ('some', 'DT'),
 ('respect', 'NN'),
 ('as', 'IN'),
 ('you', 'PRP'),
 ('state', 'NN'),
 ('your', 'PRP$'),
 ('case', 'NN'),
 ('and', 'CC'),
 ('I', 'PRP'),
 ("'d", 'MD'),
 ('be', 'VB'),
 ('surprised', 'VBN'),
 ('if', 'IN'),
 ('you', 'PRP'),
 ('do', 'VBP'),
 ("n't", 'RB'),
 ('walk', 'VB'),
 ('out', 'RP'),
 ('totally', 'RB'),
 ('satisfied', 'JJ'),
 ('as', 'IN'),
 ('I', 'PRP'),
 ('just', 'RB'),
 ('did', 'VBD'),
 ('Like', 'IN'),
 ('

In [8]:
# Print the noun phrases TextBlob found in the review at index label 4
print(list(blob_df[4].noun_phrases))

['general manager', 'scott petello', 'good egg', 'scott', "n't walk", '... ..', 'mistakes', 'thanks', 'scott', 'awesome staff', '... ... ...']
