# インストール

In [1]:
!pip install nlpaug numpy matplotlib python-dotenv



In [2]:
!pip install torch==1.2.0 transformers==2.5.0



In [3]:
!pip install nltk==3.4.5



# モデルダウンロード

In [0]:
from nlpaug.util.file.download import DownloadUtil
DownloadUtil.download_word2vec(dest_dir='.')  # Download word2vec model
DownloadUtil.download_glove(model_name='glove.6B', dest_dir='.')  # Download GloVe model
DownloadUtil.download_fasttext(model_name='wiki-news-300d-1M', dest_dir='.')  # Download fasttext model

# インポート

In [0]:
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
import nlpaug.flow as nafc

from nlpaug.util import Action

In [2]:
text = 'I am a programmer. We do not worry about warnings. We only worry about errors.'
print(text)



# Character Augmenter
文字単位でのデータ拡張

### Keyboard Augmenter
概要：キーボードの打ち間違いを模したデータ拡張を行う。<br>
動作：置換<br>
例　：oとp、qとaなどのキー位置が近いもの

In [3]:
aug = nac.KeyboardAug()
augmented_text = aug.augment(text, n=1)
print("Original      :" + text)
print("Augmented Text:" + augmented_text)

Augmented Text:I am a programmer . We do not worry about 1arhings . We Inly worry about errors .


### OCR Augmenter
概要：OCRの検出ミスを模したデータ拡張を行う。<br>
動作：置換<br>
例　：O(オー)と0(ゼロ)、1(イチ)とI(アイ)など

In [4]:
aug = nac.OcrAug()
augmented_text = aug.augment(text, n=1)
print("Original      :" + text)
print("Augmented Text:" + augmented_text)

Augmented Text:I am a programmer . We do n0t worry about waknin9s . We on1y wurry about errors .


### Random Augmenter
概要：ランダムにデータ拡張を行う。<br>
動作：挿入、置換、入替、削除から選択可能

In [5]:
aug = nac.RandomCharAug()
augmented_text = aug.augment(text)
print("Original      :" + text)
print("Augmented Text:" + augmented_text)

Augmented Text:I am a programmer . We do not worry about war_ing& . We only Xorry about er5ors .


Random Augmenterにはactionパラメータで動作を指定可能。
* substitute：ランダムに文字を置換する（デフォルト動作）
* insert    ：ランダムに文字を挿入する
* swap      ：ランダムに文字を入れ替える
* delete    ：ランダムに文字を消去する

In [6]:
aug = nac.RandomCharAug(action='insert')
augmented_text = aug.augment(text)
print("Original      :" + text)
print("Augmented Text:" + augmented_text)



In [7]:
aug = nac.RandomCharAug(action='swap')
augmented_text = aug.augment(text)
print("Original      :" + text)
print("Augmented Text:" + augmented_text)



In [8]:
aug = nac.RandomCharAug(action='delete')
augmented_text = aug.augment(text)
print("Original      :" + text)
print("Augmented Text:" + augmented_text)



# Word Augmenter
単語単位でのデータ拡張

### Synonym Augmenter
概要：WordNetの類義語を用い単語を置き換える<br>
動作：置換

参考：https://ja.wikipedia.org/wiki/WordNet<br>
WordNet（ワードネット）は英語の概念辞書（意味辞書）である。<br>WordNetでは英単語がsynsetと呼ばれる同義語のグループに分類され、<br>簡単な定義や、他の同義語のグループとの関係が記述されている。<br><br>別途、PPDBをインストールすることでエンジンをPPDBに変更可能。<br>
→
aug = naw.SynonymAug(aug_src='ppdb', model_path=os.environ.get("MODEL_DIR") + 'ppdb-2.0-s-all')

In [9]:
import nltk
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [10]:
aug = naw.SynonymAug(aug_src='wordnet')
augmented_text = aug.augment(text)
print("Original      :" + text)
print("Augmented Text:" + augmented_text)

Augmented Text:I am a programmer . We manage non worry about monition . We merely worry astir errors .


### Antonym Augmenter
概要：WordNetの反意語を用い単語を置き換える<br>
動作：置換

In [11]:
aug = naw.AntonymAug()
augmented_text = aug.augment(text)
print("Original      :" + text)
print("Augmented Text:" + augmented_text)



### Contextual Word Embeddings Augmenter
概要：BERT、DistilBERT、RoBERTa、またはXLNetモデルを用いて、文意に適した単語でデータ拡張を行う。<br>
動作：挿入、置換から選択可能
* insert    ：単語を挿入する
* substitute：単語を置換する


In [12]:
# BERT：INSERT
aug = naw.ContextualWordEmbsAug(model_path='bert-base-uncased', action="insert")
augmented_text = aug.augment(text)
print("Original      :" + text)
print("Augmented Text:" + augmented_text)



In [13]:
# BERT：SUBSTITUTE
aug = naw.ContextualWordEmbsAug(model_path='bert-base-uncased', action="substitute")
augmented_text = aug.augment(text)
print("Original      :" + text)
print("Augmented Text:" + augmented_text)

Augmented Text:he am a scientist . we do not worry about cost . we also care about errors .


In [14]:
# DistilBERT：INSERT
aug = naw.ContextualWordEmbsAug(model_path='distilbert-base-uncased', action="insert")
augmented_text = aug.augment(text)
print("Original      :" + text)
print("Augmented Text:" + augmented_text)



In [15]:
# DistilBERT：SUBSTITUTE
aug = naw.ContextualWordEmbsAug(model_path='distilbert-base-uncased', action="substitute")
augmented_text = aug.augment(text)
print("Original      :" + text)
print("Augmented Text:" + augmented_text)

Augmented Text:i detect a pirate . we do not be near bugs . we only worry about errors .


In [16]:
# RoBERTa：INSERT
aug = naw.ContextualWordEmbsAug(model_path='roberta-base', action="insert")
augmented_text = aug.augment(text)
print("Original      :" + text)
print("Augmented Text:" + augmented_text)



In [17]:
# RoBERTa：SUBSTITUTE
aug = naw.ContextualWordEmbsAug(model_path='roberta-base', action="substitute")
augmented_text = aug.augment(text)
print("Original      :" + text)
print("Augmented Text:" + augmented_text)



In [18]:
# XLNet：INSERT
aug = naw.ContextualWordEmbsAug(model_path='xlnet-base-cased', action="insert")
augmented_text = aug.augment(text)
print("Original      :" + text)
print("Augmented Text:" + augmented_text)



In [19]:
# XLNet：SUBSTITUTE
aug = naw.ContextualWordEmbsAug(model_path='xlnet-base-cased', action="substitute")
augmented_text = aug.augment(text)
print("Original      :" + text)
print("Augmented Text:" + augmented_text)



### Random Word Augmenter
概要：ランダムに単語の入替/削除を行う<br>
動作：入替、削除から選択可能
* swap      ：ランダムに文字を入れ替える
* delete    ：ランダムに文字を消去する

In [20]:
aug = naw.RandomWordAug(action='swap')
augmented_text = aug.augment(text)
print("Original      :" + text)
print("Augmented Text:" + augmented_text)



In [21]:
aug = naw.RandomWordAug(action='delete')
augmented_text = aug.augment(text)
print("Original      :" + text)
print("Augmented Text:" + augmented_text)



### Spelling Augmenter
概要：スペルミス辞書を用いてデータ拡張を行う<br>
動作：置換

In [22]:
!wget https://raw.githubusercontent.com/makcedward/nlpaug/master/model/spelling_en.txt -P ./

--2020-05-27 16:30:30--  https://raw.githubusercontent.com/makcedward/nlpaug/master/model/spelling_en.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 543624 (531K) [text/plain]
Saving to: ‘./spelling_en.txt.1’


2020-05-27 16:30:31 (7.33 MB/s) - ‘./spelling_en.txt.1’ saved [543624/543624]



In [23]:
aug = naw.SpellingAug('./spelling_en.txt')
augmented_text = aug.augment(text, n=1)
print("Original      :" + text)
print("Augmented Text:" + augmented_text)



### Split Augmenter
概要：単語を分割する<br>
動作：分割

In [24]:
aug = naw.SplitAug()
augmented_text = aug.augment(text)
print("Original      :" + text)
print("Augmented Text:" + augmented_text)

Augmented Text:I am a programmer . We do not worry abo ut w arnings . We o nly worry a bout er rors .


### TF-IDF Augmenter
概要：tf-idf類似度に基づいたデータ拡張を行う<br>
動作：挿入、置換から選択可能<br>

参考：https://ja.wikipedia.org/wiki/Tf-idf<br>
tf-idfは、文書中に含まれる単語の重要度を評価する手法の1つであり、<br>主に情報検索やトピック分析などの分野で用いられている。

In [0]:
# tfidfaug_w2idf.txt作成
import sklearn.datasets
import re

import nlpaug.augmenter.word as naw
import nlpaug.model.word_stats as nmw

def _tokenizer(text, token_pattern=r"(?u)\b\w\w+\b"):
    token_pattern = re.compile(token_pattern)
    return token_pattern.findall(text)

train_data = sklearn.datasets.fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
train_x = train_data.data

train_x_tokens = [_tokenizer(x) for x in train_x]

# Train TF-IDF model
tfidf_model = nmw.TfIdf()
tfidf_model.train(train_x_tokens)
tfidf_model.save('.')

In [26]:
aug = naw.TfIdfAug(model_path='./', action="insert")
augmented_text = aug.augment(text)
print("Original      :" + text)
print("Augmented Text:" + augmented_text)



In [27]:
aug = naw.TfIdfAug(model_path='./', action="substitute")
augmented_text = aug.augment(text)
print("Original      :" + text)
print("Augmented Text:" + augmented_text)

Augmented Text:I Line a programmer . We do protruding _PQM about erlangen . We only worry fatigue errors .


### Word Embeddings Augmenter
概要：word2vec、GloVe、またはfasttext用いて、データ拡張を行う<br>
動作：挿入、置換から選択可能

GoogleのWord2Vec(GoogleNews-vectors-negative300.bin(約1.5GBあるため注意))を<br>以下リンクからダウンロードして任意の場所に格納する。<br>
https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit

In [0]:
aug = naw.WordEmbsAug(model_type='word2vec', model_path='./GoogleNews-vectors-negative300.bin', action="insert")
augmented_text = aug.augment(text)
print("Original      :" + text)
print("Augmented Text:" + augmented_text)

In [0]:
aug = naw.WordEmbsAug(model_type='word2vec', model_path='./GoogleNews-vectors-negative300.bin', action="substitute")
augmented_text = aug.augment(text)
print("Original      :" + text)
print("Augmented Text:" + augmented_text)

# Sentence Augmentation
文単位でのデータ拡張

### Contextual Word Embeddings for Sentence Augmenter
概要：XLNet、GPT2、またはDistilGPT2予測に従って文を挿入する<br>
機能：挿入


In [29]:
# XLNet
aug = nas.ContextualWordEmbsForSentenceAug(model_path='xlnet-base-cased')
augmented_texts = aug.augment(text, n=1)
print("Original      :" + text)
print("Augmented Text:" + augmented_text)

Augmented Text:I Line a programmer . We do protruding _PQM about erlangen . We only worry fatigue errors .


In [30]:
# GPT2
aug = nas.ContextualWordEmbsForSentenceAug(model_path='gpt2')
augmented_texts = aug.augment(text, n=1)
print("Original      :" + text)
print("Augmented Text:" + augmented_text)

Augmented Text:I Line a programmer . We do protruding _PQM about erlangen . We only worry fatigue errors .


In [31]:
# DistilGPT2
aug = nas.ContextualWordEmbsForSentenceAug(model_path='distilgpt2')
augmented_texts = aug.augment(text, n=1)
print("Original      :" + text)
print("Augmented Text:" + augmented_text)

Augmented Text:I Line a programmer . We do protruding _PQM about erlangen . We only worry fatigue errors .


# Contextual Word Embeddings Augmenter (Multilingual BERT)
BELT(多言語版)を用いた単語データ拡張。

In [40]:
aug = naw.ContextualWordEmbsAug(model_path='bert-base-multilingual-uncased', aug_p=0.1)
text_ja = '僕はプログラマーだからね。警告は気にしないんだ。気にするのはエラーだけだよ。'
augmented_text = aug.augment(text_ja)
print("Original      :" + text_ja)
print("Augmented Text:" + augmented_text)

Original      :僕はプログラマーだからね。警告は気にしないんだ。気にするのはエラーだけだよ。
Augmented Text:僕 は 長 。 警 告 は 気 にしないんた 。 気 にするのはエラーたけたよ 。


# Word Embeddings Augmenter (fasttext, Japanese)
Fasttextを用いた単語埋め込み表現でのデータ拡張（日本語版）

In [0]:
# https://github.com/taishi-i/nagisa
!pip install nagisa

In [0]:
# fastTextから学習済みモデルを取得する
# https://github.com/facebookresearch/fastText/blob/master/docs/crawl-vectors.md
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ja.300.vec.gz
!gunzip cc.ja.300.vec.gz

In [0]:
# cc.ja.300.vecの容量が大きいためRAMが不足する可能性あり
import nagisa

def tokenizer(x):
    return nagisa.tagging(text).words

text_ja = '僕はプログラマーだからね。警告は気にしないんだ。気にするのはエラーだけだよ。'
aug = naw.WordEmbsAug(model_type='fasttext', tokenizer=tokenizer, model_path='cc.ja.300.vec')
augmented_text = aug.augment(text_ja)
print("Original      :" + text_ja)
print("Augmented Text:" + augmented_text)