

![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/nlu/blob/master/examples/colab/component_examples/multilingual/korean_ner_pos_and_tokenization.ipynb)

 
 # Detect Named Entities (NER), Part of Speech Tags (POS) and Tokenize in Korean

In [None]:
# Environment setup cell: install Java 8, point JAVA_HOME/PATH at it,
# then install NLU together with a pinned PySpark version and import NLU.
import os
# Refresh the apt package index quietly (stdout discarded)
! apt-get update -qq > /dev/null   
# Install a headless Java 8 JDK (presumably needed by PySpark -- confirm)
! apt-get install -y openjdk-8-jdk-headless -qq > /dev/null
# Make the installed JDK visible to processes started from this notebook
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]
# Install NLU and PySpark 2.4.7 (stdout discarded)
! pip install nlu pyspark==2.4.7 > /dev/null   
# Imported last so that the pip install above has already run
import nlu


# Tokenize in Korean

In [None]:
# Tokenize in Korean
import nlu
# nlu.load('ko.tokenize') is an alias that gives you the same model

# Load the Korean word-segmentation pipeline
pipe = nlu.load('ko.segment_words')

# Korean for 'Donald Trump and Angela Merkel do not share many opinions.'

ko_data = ['Donald Trump와 Angela Merkel은 많은 의견을 공유하지 않습니다.']
# output_level='token' requests one output row per token
df = pipe.predict(ko_data, output_level='token')
df

wordseg_kaist_ud download started this may take some time.
Approximate size to download 738.9 KB
[OK!]


Unnamed: 0_level_0,token
origin_index,Unnamed: 1_level_1
0,D
0,o
0,n
0,a
0,l
0,d
0,Trump와
0,A
0,n
0,g


# Extract Part of Speech in Korean

In [None]:
# Part-of-speech tagging for Korean text
# Korean for 'Eunkyung and Goo don't share much opinion'
ko_data = ['은경과 구는 많은 의견을 공유하지 않는다']
pipe = nlu.load('ko.pos')

# One output row per input document, with the POS tags collected in a list
df = pipe.predict(ko_data, output_level='document')
df

pos_ud_kaist download started this may take some time.
Approximate size to download 15.8 MB
[OK!]
wordseg_kaist_ud download started this may take some time.
Approximate size to download 738.9 KB
[OK!]


Unnamed: 0_level_0,pos,document
origin_index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,"[PROPN, NOUN, ADJ, NOUN, VERB, AUX, ADV]",은경과 구는 많은 의견을 공유하지 않는다


# Extract Named Entities in Korean

In [None]:
# Extract Korean named entities
pipe = nlu.load('ko.ner.kmou.glove_840B_300d')
# Korean for 'Eunkyung and Goo don't share much opinion'
ko_data = ['은경과 구는 많은 의견을 공유하지 않는다']

# Predict with one output row per document.
# NOTE(review): the previous comment here referred to metadata=True, but that
# argument is not passed to predict() below -- confirm whether it was intended.
df = pipe.predict(ko_data, output_level='document')
df

ner_kmou_glove_840B_300d download started this may take some time.
Approximate size to download 17.1 MB
[OK!]
glove_840B_300 download started this may take some time.
Approximate size to download 2.3 GB
[OK!]
wordseg_kaist_ud download started this may take some time.
Approximate size to download 738.9 KB
[OK!]


Unnamed: 0_level_0,entities,sentence,word_embeddings,ner_confidence,token,entities_confidence,ner,document
origin_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,[다],[은경과 구는 많은 의견을 공유하지 않는다],"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[1.0, 1.0, 0.9998000264167786, 1.0, 1.0, 0.999...","[은, 경과구는, 많은, 의견을, 공유하지, 않는, 다]",[PS],O,은경과 구는 많은 의견을 공유하지 않는다


# Translate Korean extracted named entities to English

In [None]:
# Load the Korean -> English translation pipeline
translate_pipe = nlu.load('ko.translate_to.en')

# Collapse each row's list of extracted entities into one '.'-separated string,
# then translate the resulting list of strings
joined_entities = df.entities.str.join('.')
en_entities = translate_pipe.predict(joined_entities.values.tolist())
en_entities

translate_ko_en download started this may take some time.
Approx size to download 397 MB
[OK!]


Unnamed: 0_level_0,translation,sentence
origin_index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,All of,다
