# Extract general Malaya entities

<div class="alert alert-info">

This tutorial is available as an IPython notebook at [Malaya/example/general-malaya-entities](https://github.com/huseinzol05/Malaya/tree/master/example/general-malaya-entities).
    
</div>

<div class="alert alert-warning">

This module only uses regex to extract entities.
    
</div>

In [1]:
import logging

logging.basicConfig(level=logging.INFO)

In [2]:
%%time
import malaya

INFO:torch.distributed.nn.jit.instantiator:Created a temporary directory at /tmp/tmppjdv8tfx
INFO:torch.distributed.nn.jit.instantiator:Writing /tmp/tmppjdv8tfx/_remote_module_non_scriptable.py


CPU times: user 2.91 s, sys: 3.7 s, total: 6.61 s
Wall time: 2.13 s


  self.tok = re.compile(r'({})'.format('|'.join(pipeline)))
  self.tok = re.compile(r'({})'.format('|'.join(pipeline)))


## Load general Malaya entity model

This model is able to classify the following entity types:

1. date
2. money
3. temperature
4. distance
5. volume
6. duration
7. phone
8. email
9. url
10. time
11. datetime
12. local and generic foods; the available rules can be checked in `malaya.texts.entity.food`
13. local and generic drinks; the available rules can be checked in `malaya.texts.entity.food`

We can also plug in BERT or any other deep learning model by passing it as `malaya.entity.general_entity(model = model)`, as long as the model has a `predict` method that returns `[(string, label), (string, label)]`. This is optional.

In [3]:
entity = malaya.entity.general_entity()

### Examples

In [4]:
entity.predict('Husein baca buku Perlembagaan yang berharga 3k ringgit dekat kfc sungai petani minggu lepas, 2 ptg 2 oktober 2019 , suhu 32 celcius, sambil makan ayam goreng dan milo o ais')

{'date': {'2 oktober 2019': datetime.datetime(2019, 10, 2, 0, 0),
  'minggu lalu': datetime.datetime(2023, 10, 5, 15, 43, 46, 99837)},
 'money': {'3k ringgit': 'RM3000.0'},
 'temperature': ['32 celcius'],
 'distance': [],
 'volume': [],
 'duration': [],
 'phone': [],
 'email': [],
 'url': [],
 'time': {'2 PM': datetime.datetime(2023, 10, 12, 14, 0)},
 'datetime': {'2 ptg 2 oktober 2019': datetime.datetime(2019, 10, 2, 14, 0)},
 'food': ['ayam goreng'],
 'drink': ['milo o ais'],
 'weight': []}

In [5]:
entity.predict('contact Husein at husein.zol05@gmail.com')

{'date': {},
 'money': {},
 'temperature': [],
 'distance': [],
 'volume': [],
 'duration': [],
 'phone': [],
 'email': ['husein.zol05@gmail.com'],
 'url': [],
 'time': {},
 'datetime': {},
 'food': [],
 'drink': [],
 'weight': []}

In [6]:
entity.predict('tolong tempahkan meja makan makan nasi dagang dan jus apple, milo tarik esok dekat Restoran Sebulek')

{'date': {'esok': datetime.datetime(2023, 10, 13, 15, 43, 46, 144429)},
 'money': {},
 'temperature': [],
 'distance': [],
 'volume': [],
 'duration': [],
 'phone': [],
 'email': [],
 'url': [],
 'time': {},
 'datetime': {},
 'food': ['nasi dagang'],
 'drink': ['milo tarik', 'jus apple'],
 'weight': []}

### date

In [7]:
entity.predict('husein balik rumah pada 2/12/2022')

{'date': {'2/12/2022': datetime.datetime(2022, 2, 12, 0, 0)},
 'money': {},
 'temperature': [],
 'distance': [],
 'volume': [],
 'duration': [],
 'phone': [],
 'email': [],
 'url': [],
 'time': {},
 'datetime': {},
 'food': [],
 'drink': [],
 'weight': []}

In [8]:
entity.predict('husein balik rumah pada 2 jan 2022')

{'date': {'2 jan 2022': datetime.datetime(2022, 1, 2, 0, 0)},
 'money': {},
 'temperature': [],
 'distance': [],
 'volume': [],
 'duration': [],
 'phone': [],
 'email': [],
 'url': [],
 'time': {},
 'datetime': {},
 'food': [],
 'drink': [],
 'weight': []}

In [9]:
entity.predict('husein balik rumah pada 2022 mac 2')

{'date': {'2022 mac 2': datetime.datetime(2022, 3, 2, 0, 0)},
 'money': {},
 'temperature': [],
 'distance': [],
 'volume': [],
 'duration': [],
 'phone': [],
 'email': [],
 'url': [],
 'time': {},
 'datetime': {},
 'food': [],
 'drink': [],
 'weight': []}

### money

In [10]:
entity.predict('harga buku 2 ringgit')

{'date': {},
 'money': {'2 ringgit': 'RM2'},
 'temperature': [],
 'distance': [],
 'volume': [],
 'duration': [],
 'phone': [],
 'email': [],
 'url': [],
 'time': {},
 'datetime': {},
 'food': [],
 'drink': [],
 'weight': []}

In [11]:
entity.predict('harga buku rm2.50 sen')

{'date': {},
 'money': {'rm2.50 ': 'RM2.50'},
 'temperature': [],
 'distance': [],
 'volume': [],
 'duration': [],
 'phone': [],
 'email': [],
 'url': [],
 'time': {},
 'datetime': {},
 'food': [],
 'drink': [],
 'weight': []}

In [12]:
entity.predict('harga buku 5.34k ringgit')

{'date': {},
 'money': {'5.34k ringgit': 'RM5340.0'},
 'temperature': [],
 'distance': [],
 'volume': [],
 'duration': [],
 'phone': [],
 'email': [],
 'url': [],
 'time': {},
 'datetime': {},
 'food': [],
 'drink': [],
 'weight': []}

In [13]:
entity.predict('harga buku 5.34m ringgit')

{'date': {},
 'money': {'5.34m ringgit': 'RM5340000.0'},
 'temperature': [],
 'distance': ['5.34m'],
 'volume': [],
 'duration': [],
 'phone': [],
 'email': [],
 'url': [],
 'time': {},
 'datetime': {},
 'food': [],
 'drink': [],
 'weight': []}

In [14]:
entity.predict('harga buku 5.34b ringgit')

{'date': {},
 'money': {'5.34b ringgit': 'RM5340000000.0'},
 'temperature': [],
 'distance': [],
 'volume': [],
 'duration': [],
 'phone': [],
 'email': [],
 'url': [],
 'time': {},
 'datetime': {},
 'food': [],
 'drink': [],
 'weight': []}

In [15]:
entity.predict('harga buku rm 5.2')

{'date': {},
 'money': {'rm 5.2': 'RM5.2'},
 'temperature': [],
 'distance': [],
 'volume': [],
 'duration': [],
 'phone': [],
 'email': [],
 'url': [],
 'time': {},
 'datetime': {},
 'food': [],
 'drink': [],
 'weight': []}

### temperature

In [16]:
entity.predict('suhu harini 21.3c')

{'date': {},
 'money': {},
 'temperature': ['21.3c'],
 'distance': [],
 'volume': [],
 'duration': [],
 'phone': [],
 'email': [],
 'url': [],
 'time': {},
 'datetime': {},
 'food': [],
 'drink': [],
 'weight': []}

In [17]:
entity.predict('suhu harini 21.3    c')

{'date': {},
 'money': {},
 'temperature': ['21.3 c'],
 'distance': [],
 'volume': [],
 'duration': [],
 'phone': [],
 'email': [],
 'url': [],
 'time': {},
 'datetime': {},
 'food': [],
 'drink': [],
 'weight': []}

### distance

In [18]:
entity.predict('sejauh 10 batu')

{'date': {},
 'money': {},
 'temperature': [],
 'distance': ['10 batu'],
 'volume': [],
 'duration': [],
 'phone': [],
 'email': [],
 'url': [],
 'time': {},
 'datetime': {},
 'food': [],
 'drink': [],
 'weight': []}

In [19]:
entity.predict('sejauh 10.234    km')

{'date': {},
 'money': {},
 'temperature': [],
 'distance': ['10.234 km'],
 'volume': [],
 'duration': [],
 'phone': [],
 'email': [],
 'url': [],
 'time': {},
 'datetime': {},
 'food': [],
 'drink': [],
 'weight': []}

### volume

In [20]:
entity.predict('volume 21.2ml')

{'date': {},
 'money': {},
 'temperature': [],
 'distance': [],
 'volume': ['21.2ml'],
 'duration': [],
 'phone': [],
 'email': [],
 'url': [],
 'time': {},
 'datetime': {},
 'food': [],
 'drink': [],
 'weight': []}

### duration

In [21]:
entity.predict('duration 2jam')

{'date': {},
 'money': {},
 'temperature': [],
 'distance': [],
 'volume': [],
 'duration': ['2jam'],
 'phone': [],
 'email': [],
 'url': [],
 'time': {'2jam': datetime.datetime(2023, 10, 12, 13, 43, 49, 942979)},
 'datetime': {},
 'food': [],
 'drink': [],
 'weight': []}

In [22]:
entity.predict('duration sejam')

{'date': {},
 'money': {},
 'temperature': [],
 'distance': [],
 'volume': [],
 'duration': ['sejam'],
 'phone': [],
 'email': [],
 'url': [],
 'time': {},
 'datetime': {},
 'food': [],
 'drink': [],
 'weight': []}

### phone

In [23]:
entity.predict('no telepon 013-1111111')

{'date': {},
 'money': {},
 'temperature': [],
 'distance': [],
 'volume': [],
 'duration': [],
 'phone': ['013-1111111'],
 'email': [],
 'url': [],
 'time': {},
 'datetime': {},
 'food': [],
 'drink': [],
 'weight': []}

### email

In [24]:
entity.predict('email at husein@email.com')

{'date': {},
 'money': {},
 'temperature': [],
 'distance': [],
 'volume': [],
 'duration': [],
 'phone': [],
 'email': ['husein@email.com'],
 'url': [],
 'time': {},
 'datetime': {},
 'food': [],
 'drink': [],
 'weight': []}

### URL

In [25]:
entity.predict('website di https://huseinhouse.com')

{'date': {},
 'money': {},
 'temperature': [],
 'distance': [],
 'volume': [],
 'duration': [],
 'phone': [],
 'email': [],
 'url': ['https://huseinhouse.com'],
 'time': {},
 'datetime': {},
 'food': [],
 'drink': [],
 'weight': []}

### Time

In [26]:
entity.predict('pada pkul 2')

{'date': {},
 'money': {},
 'temperature': [],
 'distance': [],
 'volume': [],
 'duration': [],
 'phone': [],
 'email': [],
 'url': [],
 'time': {'pukul 2': datetime.datetime(2023, 10, 2, 0, 0)},
 'datetime': {},
 'food': [],
 'drink': [],
 'weight': []}

In [27]:
entity.predict('pada pkul 2.14')

{'date': {},
 'money': {},
 'temperature': [],
 'distance': [],
 'volume': [],
 'duration': [],
 'phone': [],
 'email': [],
 'url': [],
 'time': {'pukul 2.14': datetime.datetime(2023, 10, 12, 2, 14)},
 'datetime': {},
 'food': [],
 'drink': [],
 'weight': []}

In [28]:
entity.predict('pada pkul 2:58:59')

{'date': {},
 'money': {},
 'temperature': [],
 'distance': [],
 'volume': [],
 'duration': [],
 'phone': [],
 'email': [],
 'url': [],
 'time': {'2:58:59': datetime.datetime(2023, 10, 12, 2, 58, 59),
  'pukul 2:58:59': datetime.datetime(2023, 10, 12, 2, 58, 59)},
 'datetime': {},
 'food': [],
 'drink': [],
 'weight': []}

### datetime

In [29]:
entity.predict('saya gerak 12/02/2022 14:23:21')

{'date': {'12/02/2022': datetime.datetime(2022, 12, 2, 0, 0)},
 'money': {},
 'temperature': [],
 'distance': [],
 'volume': [],
 'duration': [],
 'phone': [],
 'email': [],
 'url': [],
 'time': {'14:23:21': datetime.datetime(2023, 10, 12, 14, 23, 21)},
 'datetime': {'12/02/2022 14:23:21': datetime.datetime(2022, 12, 2, 14, 23, 21)},
 'food': [],
 'drink': [],
 'weight': []}

In [30]:
entity.predict('saya gerak 12/02/2022 2pm')

{'date': {'12/02/2022': datetime.datetime(2022, 12, 2, 0, 0)},
 'money': {},
 'temperature': [],
 'distance': [],
 'volume': [],
 'duration': [],
 'phone': [],
 'email': [],
 'url': [],
 'time': {'2pm': datetime.datetime(2023, 10, 12, 14, 0)},
 'datetime': {'12/02/2022 2pm': datetime.datetime(2022, 12, 2, 14, 0)},
 'food': [],
 'drink': [],
 'weight': []}

### local and generic foods

In [31]:
entity.predict('nasi goreng pattaya 1')

{'date': {},
 'money': {},
 'temperature': [],
 'distance': [],
 'volume': [],
 'duration': [],
 'phone': [],
 'email': [],
 'url': [],
 'time': {},
 'datetime': {},
 'food': ['nasi goreng'],
 'drink': [],
 'weight': []}

In [32]:
entity.predict('ayam penyet 1')

{'date': {},
 'money': {},
 'temperature': [],
 'distance': [],
 'volume': [],
 'duration': [],
 'phone': [],
 'email': [],
 'url': [],
 'time': {},
 'datetime': {},
 'food': ['ayam penyet'],
 'drink': [],
 'weight': []}

### local and generic drinks

In [33]:
entity.predict('teh o ais 1')

{'date': {},
 'money': {},
 'temperature': [],
 'distance': [],
 'volume': [],
 'duration': [],
 'phone': [],
 'email': [],
 'url': [],
 'time': {},
 'datetime': {},
 'food': [],
 'drink': ['teh o ais'],
 'weight': []}

In [34]:
entity.predict('teh ice 1')

{'date': {},
 'money': {},
 'temperature': [],
 'distance': [],
 'volume': [],
 'duration': [],
 'phone': [],
 'email': [],
 'url': [],
 'time': {},
 'datetime': {},
 'food': [],
 'drink': ['teh ice'],
 'weight': []}

In [35]:
entity.predict('nescafe beng 1')

{'date': {},
 'money': {},
 'temperature': [],
 'distance': [],
 'volume': [],
 'duration': [],
 'phone': [],
 'email': [],
 'url': [],
 'time': {},
 'datetime': {},
 'food': [],
 'drink': ['nescafe beng'],
 'weight': []}

In [36]:
entity.predict('jus rambutan 1')

{'date': {},
 'money': {},
 'temperature': [],
 'distance': [],
 'volume': [],
 'duration': [],
 'phone': [],
 'email': [],
 'url': [],
 'time': {},
 'datetime': {},
 'food': [],
 'drink': ['jus rambutan'],
 'weight': []}