

![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/nlu/blob/master/examples/colab/component_examples/multilingual/chinese_ner_pos_and_tokenization.ipynb)

 
 # Detect Named Entities (NER), Part of Speech Tags (POS) and Tokenize in Chinese



# Install NLU

In [None]:
import os
! apt-get update -qq > /dev/null   
# Install java
! apt-get install -y openjdk-8-jdk-headless -qq > /dev/null
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]
! pip install nlu pyspark==2.4.7 > /dev/null   

import nlu


# Tokenize Chinese

In [None]:
# Word-segment (tokenize) a Chinese sentence
import nlu

# Chinese for "Donald Trump and Angela Merkel don't share many opinions"
zh_data = ['唐纳德特朗普和安吉拉·默克尔没有太多意见']

# nlu.load('zh.tokenize') is an alias that gives you the same model
pipe = nlu.load('zh.segment_words')

# Ask for token-level output so the displayed frame has one row per token
df = pipe.predict(
    zh_data,
    output_level='token',
)
df

wordseg_weibo download started this may take some time.
Approximate size to download 1.2 MB
[OK!]


Unnamed: 0_level_0,token
origin_index,Unnamed: 1_level_1
0,唐纳特
0,德
0,朗
0,普
0,和
0,安吉拉
0,·
0,默
0,克
0,尔


# Extract Chinese POS

In [None]:
# Part-of-Speech tagging for Chinese
zh_data = ['唐纳德特朗普和安吉拉·默克尔没有太多意见']
pipe = nlu.load('zh.pos')

# Document-level output: the displayed frame has one row per input text,
# with the POS tags collected into a list in the `pos` column
df = pipe.predict(
    zh_data,
    output_level='document',
)
df

pos_ud_gsd download started this may take some time.
Approximate size to download 3.4 MB
[OK!]
wordseg_weibo download started this may take some time.
Approximate size to download 1.2 MB
[OK!]


Unnamed: 0_level_0,pos,document
origin_index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,"[PROPN, PROPN, PROPN, NOUN, CONJ, PROPN, PUNCT...",唐纳德特朗普和安吉拉·默克尔没有太多意见


# Extract Chinese NER

In [None]:
# Named Entity Recognition for Chinese
zh_data = ['唐纳德特朗普和安吉拉·默克尔没有太多意见']
pipe = nlu.load('zh.ner')

# Document-level output: one row per input text; the detected entities are
# collected into a list in the `entities` column, which the translation cell reads
df = pipe.predict(
    zh_data,
    output_level='document',
)
df

ner_msra_bert_768d download started this may take some time.
Approximate size to download 19.2 MB
[OK!]
wordseg_weibo download started this may take some time.
Approximate size to download 1.2 MB
[OK!]
bert_base_chinese download started this may take some time.
Approximate size to download 367.6 MB
[OK!]


Unnamed: 0_level_0,word_embeddings,entities_confidence,entities,document,ner_confidence
origin_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,"[[-0.04935752600431442, -0.47514036297798157, ...","[R, R]","[唐纳德, 安吉拉]",唐纳德特朗普和安吉拉·默克尔没有太多意见,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."


# Translate Chinese extracted named entities to English

In [None]:
# Translate the Chinese named entities found by the NER cell into English
translate_pipe = nlu.load('zh.translate_to.en')

# Each row of `df` holds a list of entities; join every list into a single
# '.'-separated string and pass the resulting list of strings to the translator.
en_entities = translate_pipe.predict(
    df['entities']
    .str.join('.')
    .values
    .tolist()
)
en_entities

translate_zh_en download started this may take some time.
Approx size to download 396.8 MB
[OK!]


Unnamed: 0_level_0,translation,sentence
origin_index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Donald.,唐纳德.
0,Angela.,安吉拉
