## 例3-1. nグラムの計算

In [2]:
import pandas as pd
import json
from sklearn.feature_extraction.text import CountVectorizer

# Load the first 10,000 reviews; the file is read as one JSON document per line.
# The encoding is passed explicitly (assumed UTF-8, the standard JSON encoding)
# so non-ASCII review text decodes the same way on every platform.
# zip(range(...), f) stops cleanly at end-of-file, whereas calling readline()
# a fixed number of times would feed '' to json.loads() on a shorter file.
with open('data/yelp_academic_dataset_review.json', encoding='utf-8') as f:
    js = [json.loads(line) for _, line in zip(range(10000), f)]
review_df = pd.DataFrame(js)

# Build unigram (bag-of-words), bigram and trigram feature transformers with
# scikit-learn's CountVectorizer.
# By default CountVectorizer ignores single-character words, which is a
# practical way to drop meaningless tokens; here the token pattern is
# overridden so that every word is kept.
ALL_WORDS_PATTERN = '(?u)\\b\\w+\\b'
bow_converter = CountVectorizer(token_pattern=ALL_WORDS_PATTERN)
bigram_converter = CountVectorizer(ngram_range=(2, 2), token_pattern=ALL_WORDS_PATTERN)
trigram_converter = CountVectorizer(ngram_range=(3, 3), token_pattern=ALL_WORDS_PATTERN)

# Fit each transformer on the review text and collect its vocabulary.
bow_converter.fit(review_df['text'])
words = bow_converter.get_feature_names_out()

bigram_converter.fit(review_df['text'])
bigrams = bigram_converter.get_feature_names_out()

trigram_converter.fit(review_df['text'])
trigrams = trigram_converter.get_feature_names_out()

# Vocabulary sizes for unigrams, bigrams and trigrams.
print(len(words), len(bigrams), len(trigrams))

# Inspect the first few unigrams.
words[:10]

29222 368943 881620


array(['0', '00', '000', '007', '00a', '00am', '00pm', '01', '02', '03'],
      dtype=object)

In [12]:
# Display the last ten entries of the bigram feature-name array.
bigrams[-10:]

array(['zuzu was', 'zuzus room', 'zweigel wine', 'zwiebel kräuter',
       'zy world', 'zzed in', 'éclairs napoleons', 'école lenôtre',
       'ém all', 'òc châm'], dtype=object)

In [6]:
# Display the first ten entries of the trigram feature-name array.
trigrams[:10]

array(['0 0 eye', '0 20 less', '0 39 oz', '0 39 pizza', '0 5 i',
       '0 50 to', '0 6 can', '0 75 oysters', '0 75 that', '0 75 to'],
      dtype=object)

## 例3-2. 品詞タグ付けとチャンク化

In [14]:
! pip install spacy

Collecting spacy
  Downloading spacy-3.5.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.6/6.6 MB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hCollecting spacy-legacy<3.1.0,>=3.0.11 (from spacy)
  Downloading spacy_legacy-3.0.12-py2.py3-none-any.whl (29 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0 (from spacy)
  Downloading spacy_loggers-1.0.4-py3-none-any.whl (11 kB)
Collecting murmurhash<1.1.0,>=0.28.0 (from spacy)
  Downloading murmurhash-1.0.9-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (21 kB)
Collecting cymem<2.1.0,>=2.0.2 (from spacy)
  Downloading cymem-2.0.7-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (35 kB)
Collecting preshed<3.1.0,>=3.0.2 (from spacy)
  Downloading preshed-3.0.8-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (128 kB)
[2K     [90m━━━━━━━━━

In [20]:
! python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.5.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [21]:
import pandas as pd
import json

# Load the first 10 reviews; the file is read as one JSON document per line.
# The encoding is passed explicitly (assumed UTF-8, the standard JSON encoding),
# and zip(range(...), f) stops cleanly if the file has fewer lines than requested.
with open('data/yelp_academic_dataset_review.json', encoding='utf-8') as f:
    js = [json.loads(line) for _, line in zip(range(10), f)]
review_df = pd.DataFrame(js)

# First approach: spaCy.
import spacy

# Load the English language model by its full package name
# (the short alias 'en' is not accepted by current spaCy releases).
nlp = spacy.load("en_core_web_sm")
import en_core_web_sm

# Run the spaCy pipeline over every review text; the result is a pandas
# Series holding one parsed document per review.
doc_df = review_df['text'].apply(nlp)

# spaCy exposes the coarse-grained part-of-speech tag as .pos_ and the
# fine-grained tag as .tag_ on each token.
for token in doc_df[4]:
    print([token.text, token.pos_, token.tag_])

# spaCy also provides basic noun phrases through .noun_chunks.
print(list(doc_df[4].noun_chunks))

['General', 'PROPN', 'NNP']
['Manager', 'PROPN', 'NNP']
['Scott', 'PROPN', 'NNP']
['Petello', 'PROPN', 'NNP']
['is', 'AUX', 'VBZ']
['a', 'DET', 'DT']
['good', 'ADJ', 'JJ']
['egg', 'NOUN', 'NN']
['!', 'PUNCT', '.']
['!', 'PUNCT', '.']
['!', 'PUNCT', '.']
['Not', 'PART', 'RB']
['to', 'PART', 'TO']
['go', 'VERB', 'VB']
['into', 'ADP', 'IN']
['detail', 'NOUN', 'NN']
[',', 'PUNCT', ',']
['but', 'CCONJ', 'CC']
['let', 'VERB', 'VB']
['me', 'PRON', 'PRP']
['assure', 'VERB', 'VB']
['you', 'PRON', 'PRP']
['if', 'SCONJ', 'IN']
['you', 'PRON', 'PRP']
['have', 'VERB', 'VBP']
['any', 'DET', 'DT']
['issues', 'NOUN', 'NNS']
['(', 'PUNCT', '-LRB-']
['albeit', 'ADV', 'RB']
['rare', 'ADJ', 'JJ']
[')', 'PUNCT', '-RRB-']
['speak', 'VERB', 'VBP']
['with', 'ADP', 'IN']
['Scott', 'PROPN', 'NNP']
['and', 'CCONJ', 'CC']
['treat', 'VERB', 'VB']
['the', 'DET', 'DT']
['guy', 'NOUN', 'NN']
['with', 'ADP', 'IN']
['some', 'DET', 'DT']
['respect', 'NOUN', 'NN']
['as', 'SCONJ', 'IN']
['you', 'PRON', 'PRP']
['state', 'V