

![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/nlu/blob/master/examples/colab/component_examples/multilingual/japanese_ner_pos_and_tokenization.ipynb)

 
 # Detect Named Entities (NER), Part of Speech Tags (POS) and Tokenize in Japanese


In [None]:
# Environment setup: installs Java and the Python packages, then imports nlu
# so that every later cell can use it.
import os
# Refresh the apt package index; -qq plus the redirect discards normal output.
! apt-get update -qq > /dev/null   
# Install Java (headless OpenJDK 8) — presumably needed by pyspark, installed below.
! apt-get install -y openjdk-8-jdk-headless -qq > /dev/null
# Point JAVA_HOME at the OpenJDK 8 directory and put its bin/ first on PATH.
# NOTE(review): assumes the apt package installs to this exact path — confirm on the target image.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]
# Install nlu together with pyspark 2.4.7; stdout is discarded.
# NOTE(review): nlu is not version-pinned while pyspark is — confirm a compatible
# nlu release and pin it so the notebook stays reproducible.
! pip install nlu pyspark==2.4.7 > /dev/null   
# Must come after the pip install above, which provides the package.
import nlu


# Tokenize Japanese

In [None]:
# Split a Japanese sentence into word tokens.
# nlu is already imported by the setup cell at the top of the notebook,
# so it is not imported again here.
# 'ja.tokenize' is an alias for 'ja.segment_words' and gives you the same model.
pipe = nlu.load('ja.segment_words')

# Japanese for "Donald Trump and Angela Merkel don't share many opinions"
ja_data = ['ドナルド・トランプとアンゲラ・メルケルは多くの意見を共有していません']

# output_level='token' yields one output row per token.
df = pipe.predict(ja_data, output_level='token')
df

wordseg_gsd_ud download started this may take some time.
Approximate size to download 1 MB
[OK!]


Unnamed: 0_level_0,token
origin_index,Unnamed: 1_level_1
0,ドナルド
0,・
0,トランプ
0,と
0,アンゲラ
0,・
0,メルケル
0,は
0,多く
0,の


# Extract Part of Speech in Japanese


In [None]:
# Part-of-speech tagging for a Japanese sentence.
# Japanese for "Donald Trump and Angela Merkel don't share many opinions"
ja_data = ['ドナルド・トランプとアンゲラ・メルケルは多くの意見を共有していません']
pipe = nlu.load('ja.pos')

# Document-level output: a single row holding the list of POS tags.
df = pipe.predict(
    ja_data,
    output_level='document',
)
df

pos_ud_gsd download started this may take some time.
Approximate size to download 2.6 MB
[OK!]
wordseg_gsd_ud download started this may take some time.
Approximate size to download 1 MB
[OK!]


Unnamed: 0_level_0,document,pos
origin_index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,ドナルド・トランプとアンゲラ・メルケルは多くの意見を共有していません,"[PROPN, SYM, PROPN, ADP, PROPN, SYM, NOUN, ADP..."


# Extract Named Entities in Japanese


In [None]:
# Named-entity recognition on a Japanese sentence.
# Japanese for "Donald Trump and Angela Merkel don't share many opinions"
ja_data = ['ドナルド・トランプとアンゲラ・メルケルは多くの意見を共有していません']
pipe = nlu.load('ja.ner')

# Document-level output: one row whose `entities` column lists the detected entities.
# The translation cell further down reads `df.entities`, so the name `df` is kept.
df = pipe.predict(
    ja_data,
    output_level='document',
)
df

ner_ud_gsd_glove_840B_300d download started this may take some time.
Approximate size to download 19.2 MB
[OK!]
wordseg_gsd_ud download started this may take some time.
Approximate size to download 1 MB
[OK!]
glove_840B_300 download started this may take some time.
Approximate size to download 2.3 GB
[OK!]


Unnamed: 0_level_0,word_embeddings,entities,ner_confidence,entities_confidence,document
origin_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[トランプ, アンゲラ, メルケル]","[0.9988999962806702, 1.0, 0.9959999918937683, ...","[G, RSON, RSON]",ドナルド・トランプとアンゲラ・メルケルは多くの意見を共有していません


# Translate extracted Japanese named entities to English

In [None]:
# Translate the entities found by the NER cell from Japanese to English.
translate_pipe = nlu.load('ja.translate_to.en')

# Each row's entity list is joined into one '.'-separated string —
# presumably so the pipeline treats every entity as its own sentence; confirm.
joined_entities = df.entities.str.join('.')
en_entities = translate_pipe.predict(joined_entities.values.tolist())
en_entities

translate_ja_en download started this may take some time.
Approx size to download 380.5 MB
[OK!]


Unnamed: 0_level_0,sentence,translation
origin_index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,トランプ.,Cards.
0,アンゲラ.,Angelia.
0,メルケル,Merkel.
