## 例3-1. nグラムの計算

In [2]:
import pandas as pd
import json
from sklearn.feature_extraction.text import CountVectorizer

# Load the first 10,000 reviews. The file is read as JSON Lines: each line
# is parsed on its own as one JSON document.
# The encoding is passed explicitly so decoding does not depend on the
# platform's locale default (assumes the dataset is UTF-8 - TODO confirm).
with open('../data/yelp_academic_dataset_review.json', encoding='utf-8') as f:
    # zip() stops at whichever input ends first, so a file with fewer than
    # 10,000 lines yields a shorter list instead of failing on an empty read.
    js = [json.loads(line) for _, line in zip(range(10000), f)]
review_df = pd.DataFrame(js)

# Build three scikit-learn CountVectorizer feature transformers in one pass:
# unigram (bag-of-words), bigram, and trigram.
# By default CountVectorizer ignores one-character words, which is practical
# because it filters out uninformative tokens. Here the token pattern is
# overridden so that every word is kept.
bow_converter, bigram_converter, trigram_converter = (
    CountVectorizer(ngram_range=(order, order), token_pattern=r'(?u)\b\w+\b')
    for order in (1, 2, 3)
)

# Fit each transformer on the review text and read back its vocabulary.
# fit() returns the fitted vectorizer, so the feature names are taken from
# the same expression.
words = bow_converter.fit(review_df['text']).get_feature_names_out()
bigrams = bigram_converter.fit(review_df['text']).get_feature_names_out()
trigrams = trigram_converter.fit(review_df['text']).get_feature_names_out()

# Report the vocabulary size for unigrams, bigrams, and trigrams.
print(len(words), len(bigrams), len(trigrams))

# Inspect the n-grams: first ten unigram features.
words[:10]

29222 368943 881620


array(['0', '00', '000', '007', '00a', '00am', '00pm', '01', '02', '03'],
      dtype=object)

In [3]:
# Display the last ten entries of the bigram feature-name array.
bigrams[-10:]

array(['zuzu was', 'zuzus room', 'zweigel wine', 'zwiebel kräuter',
       'zy world', 'zzed in', 'éclairs napoleons', 'école lenôtre',
       'ém all', 'òc châm'], dtype=object)

In [4]:
# Display the first ten entries of the trigram feature-name array.
trigrams[:10]

array(['0 0 eye', '0 20 less', '0 39 oz', '0 39 pizza', '0 5 i',
       '0 50 to', '0 6 can', '0 75 oysters', '0 75 that', '0 75 to'],
      dtype=object)

## 例3-2. 品詞タグ付けとチャンク化

In [9]:
import nltk

# Fetch the NLTK packages this example asks for.
for resource in ('punkt', 'averaged_perceptron_tagger'):
    nltk.download(resource)

from textblob import TextBlob

# TextBlob tags with PatternTagger by default.
# That works well for this example, but NLTKTagger is the recommended choice
# when the text contains grammatically incorrect sentences.
# Wrap every review text in a TextBlob (the result is a pandas Series).
blob_df = review_df['text'].apply(TextBlob)

# Show the (token, part-of-speech tag) pairs for the review at index 4.
blob_df[4].tags

[nltk_data] Downloading package punkt to /home/shion31/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/shion31/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


[('General', 'NNP'),
 ('Manager', 'NNP'),
 ('Scott', 'NNP'),
 ('Petello', 'NNP'),
 ('is', 'VBZ'),
 ('a', 'DT'),
 ('good', 'JJ'),
 ('egg', 'NN'),
 ('Not', 'RB'),
 ('to', 'TO'),
 ('go', 'VB'),
 ('into', 'IN'),
 ('detail', 'NN'),
 ('but', 'CC'),
 ('let', 'VB'),
 ('me', 'PRP'),
 ('assure', 'VB'),
 ('you', 'PRP'),
 ('if', 'IN'),
 ('you', 'PRP'),
 ('have', 'VBP'),
 ('any', 'DT'),
 ('issues', 'NNS'),
 ('albeit', 'IN'),
 ('rare', 'NN'),
 ('speak', 'NN'),
 ('with', 'IN'),
 ('Scott', 'NNP'),
 ('and', 'CC'),
 ('treat', 'VB'),
 ('the', 'DT'),
 ('guy', 'NN'),
 ('with', 'IN'),
 ('some', 'DT'),
 ('respect', 'NN'),
 ('as', 'IN'),
 ('you', 'PRP'),
 ('state', 'NN'),
 ('your', 'PRP$'),
 ('case', 'NN'),
 ('and', 'CC'),
 ('I', 'PRP'),
 ("'d", 'MD'),
 ('be', 'VB'),
 ('surprised', 'VBN'),
 ('if', 'IN'),
 ('you', 'PRP'),
 ('do', 'VBP'),
 ("n't", 'RB'),
 ('walk', 'VB'),
 ('out', 'RP'),
 ('totally', 'RB'),
 ('satisfied', 'JJ'),
 ('as', 'IN'),
 ('I', 'PRP'),
 ('just', 'RB'),
 ('did', 'VBD'),
 ('Like', 'IN'),
 ('

In [11]:
# Fetch the NLTK 'brown' package (presumably needed by TextBlob's noun-phrase
# extraction - TODO confirm), then print the noun phrases of the review at
# index 4 as a plain list.
nltk.download('brown')
print(list(blob_df[4].noun_phrases))

[nltk_data] Downloading package brown to /home/shion31/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.


['general manager', 'scott petello', 'good egg', 'scott', "n't walk", 'mistakes', 'thanks', 'scott', 'awesome staff']
