In [42]:
import pandas as pd
import json
from sklearn.feature_extraction.text import CountVectorizer

In [15]:
# Load the first 10,000 reviews. Each line of the file is parsed as its own
# JSON document, so the file is consumed line by line.
N_REVIEWS = 10000
js = []
# `with` closes the file even if a line fails to parse; the explicit encoding
# keeps non-ASCII review text readable regardless of the platform default.
with open('data/yelp_academic_dataset_review.json', encoding='utf-8') as f:
    for i, line in enumerate(f):
        # Stop after N_REVIEWS lines. A shorter file simply ends the loop
        # instead of feeding an empty string to json.loads.
        if i >= N_REVIEWS:
            break
        js.append(json.loads(line))
review_df = pd.DataFrame(js)


In [16]:
# Create feature transformers for unigrams, bigrams and trigrams.
# With the default setting, single-character words are ignored. That is useful
# in practice because uninformative words are dropped, but for illustration
# purposes we explicitly include them in this example.
TOKEN_PATTERN = '(?u)\\b\\w+\\b'

bow_converter = CountVectorizer(token_pattern=TOKEN_PATTERN)
bigram_converter = CountVectorizer(ngram_range=(2,2),
                                   token_pattern=TOKEN_PATTERN)
trigram_converter = CountVectorizer(ngram_range=(3,3),
                                    token_pattern=TOKEN_PATTERN)


def _vocabulary(converter):
    """Return the fitted vocabulary of `converter` as a plain list of strings.

    `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in
    1.2 in favour of `get_feature_names_out()`, which returns a NumPy array.
    The array is converted to a list so that slicing and display behave as
    before; older scikit-learn versions fall back to the legacy method.
    """
    if hasattr(converter, 'get_feature_names_out'):
        return converter.get_feature_names_out().tolist()
    return converter.get_feature_names()


# Fit the transformers and check the size of the vocabularies

bow_converter.fit(review_df['text'])
words = _vocabulary(bow_converter)
bigram_converter.fit(review_df['text'])
bigrams = _vocabulary(bigram_converter)
trigram_converter.fit(review_df['text'])
trigrams = _vocabulary(trigram_converter)


In [17]:
# Vocabulary sizes in the order: unigrams, bigrams, trigrams
print(*map(len, (words, bigrams, trigrams)))

26558 314991 734771


In [18]:
# Take a look at the n-grams themselves, starting with the first ten unigrams.
words[:10]

['0', '00', '000', '003', '00am', '00p', '00pm', '01', '02', '04']

In [19]:
# Last ten bigrams of the vocabulary list (the output below shows that
# non-Latin-script review text ends up at the tail).
bigrams[-10:]

['結合bar和保齡球 氣氛非常好',
 '虾 蟹也都不错',
 '蟹也都不错 鸡的味道一般',
 '蠻特別的地方 結合bar和保齡球',
 '还吃过什么忘记了 总体菜色味道都还挺好的',
 '静岡のさわやかはいつ東京にお店出してくれるのだろう あ',
 '餐馆依水 景致不错',
 '马兰头 the',
 '马兰头拌豆腐 the',
 '鸡的味道一般 还吃过什么忘记了']

In [20]:
# First ten trigrams of the vocabulary list.
trigrams[0:10]

['0 1 stars',
 '0 25 oysters',
 '0 30 possible',
 '0 4 miles',
 '0 40 oz',
 '0 46 an',
 '0 50 for',
 '0 50 on',
 '0 75 i',
 '0 99 or']

In [22]:
# Last ten trigrams of the vocabulary list.
trigrams[-10:]

['生のポテトを使ったフライドポテトは好き嫌いあるかもですが ぼくは大好物 バーガーも',
 '結合bar和保齡球 氣氛非常好 就算沒有要打',
 '虾 蟹也都不错 鸡的味道一般',
 '蟹也都不错 鸡的味道一般 还吃过什么忘记了',
 '蠻特別的地方 結合bar和保齡球 氣氛非常好',
 '静岡のさわやかはいつ東京にお店出してくれるのだろう あ どこにも書いてないんですが',
 '餐馆依水 景致不错 在我看来只要不是特别惊艳或难吃',
 '马兰头 the dish',
 '马兰头拌豆腐 the vegetable',
 '鸡的味道一般 还吃过什么忘记了 总体菜色味道都还挺好的']

In [23]:
# Stemming: build a Porter stemmer from NLTK; it is used by the
# `stemmer.stem(...)` examples in the following cells.


import nltk
stemmer = nltk.stem.porter.PorterStemmer()

In [24]:
# Regular plural: the output below shows the trailing 's' removed.
stemmer.stem('flowers')

'flower'

In [25]:
# Irregular verb form: the output below shows 'gone' left unchanged
# (it is not reduced to 'go').
stemmer.stem('gone')

'gone'

In [26]:
# Plural ending in 'es': the output below shows it reduced to 'zero'.
stemmer.stem('zeroes')

'zero'

In [27]:
# A stem need not be a dictionary word: the output below is 'goe', not 'go'.
stemmer.stem('goes')

'goe'

In [29]:
# spaCy functionality
import spacy
# Load the language model by its package name
nlp = spacy.load('en_core_web_sm')

In [31]:
# We can create a pandas Series of spaCy NLP objects: `nlp` is applied to the
# text of every review, one row at a time.
# NOTE(review): despite the `_df` suffix, this is presumably a Series, not a
# DataFrame, since it results from applying a callable to a single column.
doc_df = review_df['text'].apply(nlp)

In [32]:
# spaCy gives us coarse-grained parts of speech via (.pos_) and fine-grained
# part-of-speech tags via (.tag_); compare e.g. 'AUX' vs. 'VBZ'/'MD' in the
# output below.
# Iterating over the parsed review at index 4 yields its individual tokens.
for token in doc_df[4]:
    print([token.text, token.pos_, token.tag_])

['The', 'DET', 'DT']
['food', 'NOUN', 'NN']
['is', 'AUX', 'VBZ']
['always', 'ADV', 'RB']
['great', 'ADJ', 'JJ']
['here', 'ADV', 'RB']
['.', 'PUNCT', '.']
['The', 'DET', 'DT']
['service', 'NOUN', 'NN']
['from', 'ADP', 'IN']
['both', 'CCONJ', 'CC']
['the', 'DET', 'DT']
['manager', 'NOUN', 'NN']
['as', 'ADV', 'RB']
['well', 'ADV', 'RB']
['as', 'ADP', 'IN']
['the', 'DET', 'DT']
['staff', 'NOUN', 'NN']
['is', 'AUX', 'VBZ']
['super', 'ADJ', 'JJ']
['.', 'PUNCT', '.']
['Only', 'ADV', 'RB']
['draw', 'VERB', 'VB']
['back', 'ADV', 'RB']
['of', 'ADP', 'IN']
['this', 'DET', 'DT']
['restaurant', 'NOUN', 'NN']
['is', 'AUX', 'VBZ']
['it', 'PRON', 'PRP']
["'s", 'AUX', 'VBZ']
['super', 'ADV', 'RB']
['loud', 'ADJ', 'JJ']
['.', 'PUNCT', '.']
['If', 'SCONJ', 'IN']
['you', 'PRON', 'PRP']
['can', 'AUX', 'MD']
[',', 'PUNCT', ',']
['snag', 'VERB', 'VB']
['a', 'DET', 'DT']
['patio', 'NOUN', 'NN']
['table', 'NOUN', 'NN']
['!', 'PUNCT', '.']


In [33]:
# spaCy also performs basic noun chunking for us.
print(list(doc_df[4].noun_chunks))

[The food, The service, both the manager, the staff, this restaurant, it, you, a patio table]


In [35]:
!pip3 install textblob

Collecting textblob
  Downloading textblob-0.17.1-py2.py3-none-any.whl (636 kB)
[K     |████████████████████████████████| 636 kB 4.0 MB/s eta 0:00:01
Installing collected packages: textblob
Successfully installed textblob-0.17.1
You should consider upgrading via the '/usr/local/opt/python@3.9/bin/python3.9 -m pip install --upgrade pip' command.[0m


In [36]:
# We can perform the same feature transformations with TextBlob
from textblob import TextBlob

In [37]:
# TextBlob's default tagger is the PatternTagger, which is fine for our example.
# The NLTK tagger can be selected instead; it works better for incomplete sentences.
# `TextBlob` is applied to the text of every review, one row at a time.
blob_df = review_df['text'].apply(TextBlob)

In [40]:
# Part-of-speech tags for the review at index 4 (the same review used in the
# spaCy example); the output below is a list of (word, tag) tuples.
blob_df[4].tags

[('The', 'DT'),
 ('food', 'NN'),
 ('is', 'VBZ'),
 ('always', 'RB'),
 ('great', 'JJ'),
 ('here', 'RB'),
 ('The', 'DT'),
 ('service', 'NN'),
 ('from', 'IN'),
 ('both', 'CC'),
 ('the', 'DT'),
 ('manager', 'NN'),
 ('as', 'RB'),
 ('well', 'RB'),
 ('as', 'IN'),
 ('the', 'DT'),
 ('staff', 'NN'),
 ('is', 'VBZ'),
 ('super', 'JJ'),
 ('Only', 'RB'),
 ('draw', 'VBZ'),
 ('back', 'RB'),
 ('of', 'IN'),
 ('this', 'DT'),
 ('restaurant', 'NN'),
 ('is', 'VBZ'),
 ('it', 'PRP'),
 ("'s", 'VBZ'),
 ('super', 'JJ'),
 ('loud', 'NN'),
 ('If', 'IN'),
 ('you', 'PRP'),
 ('can', 'MD'),
 ('snag', 'VB'),
 ('a', 'DT'),
 ('patio', 'NN'),
 ('table', 'NN')]

In [39]:
# Download the corpora used by TextBlob (the log below lists brown, punkt,
# wordnet, averaged_perceptron_tagger, conll2000 and movie_reviews).
# NOTE(review): this cell was executed as In [39], i.e. before the `.tags`
# cell (In [40]) that precedes it in the file. Presumably it should be moved
# above that cell so that Restart & Run All works; confirm.
!python3 -m textblob.download_corpora

[nltk_data] Downloading package brown to
[nltk_data]     /Users/nabizrahpoe/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/nabizrahpoe/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/nabizrahpoe/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/nabizrahpoe/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package conll2000 to
[nltk_data]     /Users/nabizrahpoe/nltk_data...
[nltk_data]   Unzipping corpora/conll2000.zip.
[nltk_data] Downloading package movie_reviews to
[nltk_data]     /Users/nabizrahpoe/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.
Finished.


In [41]:
# Noun phrases TextBlob extracts from the review at index 4
print(list(blob_df[4].noun_phrases))

['patio table']
