## 安裝套件

In [None]:
# Install the third-party packages imported below: pandarallel, spaCy
# (upgraded to the newest available release) and pyahocorasick, which
# provides the `ahocorasick` module.
!pip install pandarallel
!pip install --upgrade spacy
!pip install pyahocorasick

Collecting pandarallel
  Downloading https://files.pythonhosted.org/packages/f9/c9/2350222cec65593ab5f2f00f2e57dfd1fa4e697dbe92fcaff641485354e6/pandarallel-1.5.2.tar.gz
Building wheels for collected packages: pandarallel
  Building wheel for pandarallel (setup.py) ... [?25l[?25hdone
  Created wheel for pandarallel: filename=pandarallel-1.5.2-cp37-none-any.whl size=18386 sha256=8ae7facdea1d6ef470ce9f3c2ca59d84acf16248c9534e76bbf5d68562fabeaf
  Stored in directory: /root/.cache/pip/wheels/40/80/6d/d50fb72a8ce6a923fb10390fec9eaaa40b02d07a7ec05c9c05
Successfully built pandarallel
Installing collected packages: pandarallel
Successfully installed pandarallel-1.5.2
Collecting spacy
[?25l  Downloading https://files.pythonhosted.org/packages/3a/70/a0b8bd0cb54d8739ba4d6fb3458785c3b9b812b7fbe93b0f10beb1a53ada/spacy-3.0.5-cp37-cp37m-manylinux2014_x86_64.whl (12.8MB)
[K     |████████████████████████████████| 12.8MB 320kB/s 
[?25hCollecting pydantic<1.8.0,>=1.7.1
[?25l  Downloading https://fil

In [None]:
import warnings
# Suppress all warnings for the rest of the session.
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
# Never truncate long cell values when a frame is displayed.
pd.set_option('display.max_colwidth', None)

from pandarallel import pandarallel
# Set up pandarallel before `parallel_apply` is used on the training frame.
pandarallel.initialize()

import spacy
# spacy.require_gpu()
from spacy.training import Example
from spacy.util import minibatch
import random

import ahocorasick
from google.colab import drive
# Mount Google Drive; the CSV inputs and the saved model live under this path.
drive.mount('/content/drive')

INFO: Pandarallel will run on 2 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
Mounted at /content/drive


## 讀取資料

In [None]:
# Load the training set keyed by `id` and add two empty (NaN) target columns
# that are filled in from the combined 'POI/street' label afterwards.
df = (
    pd.read_csv('/content/drive/MyDrive/shopee21/train.csv')
    .set_index("id")
    .assign(POI=np.nan, street=np.nan)
)

def extract_entities(row):
    """Split the combined 'POI/street' label of one row into its two parts.

    The label is split on "/". Only when that yields exactly two pieces are
    they considered: the first is written to ``row['POI']`` and the second to
    ``row['street']``, each only if it is not blank after stripping
    whitespace (the stored value itself is not stripped). Any other number of
    pieces leaves the row untouched.

    Args:
        row: Mapping-like row that has a 'POI/street' string entry and accepts
            assignment to the 'POI' and 'street' keys.

    Returns:
        The same ``row`` object, modified in place.
    """
    pieces = row['POI/street'].split("/")

    # Anything other than "<poi>/<street>" is left as-is.
    if len(pieces) != 2:
        return row

    for column, value in zip(('POI', 'street'), pieces):
        if value.strip() != '':
            row[column] = value

    return row

# Fill the POI / street columns row by row, using pandarallel's parallel apply.
df = df.parallel_apply(extract_entities, axis=1)
# 'id' is the ISO 639-1 code for Indonesian; the pipeline starts with no components.
nlp = spacy.blank('id')  # create blank Language class

In [None]:
# Display the frame to check the derived POI / street columns.
df

Unnamed: 0_level_0,raw_address,POI/street,POI,street
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,jl kapuk timur delta sili iii lippo cika 11 a cicau cikarang pusat,/jl kapuk timur delta sili iii lippo cika,,jl kapuk timur delta sili iii lippo cika
1,"aye, jati sampurna",/,,
2,setu siung 119 rt 5 1 13880 cipayung,/siung,,siung
3,"toko dita, kertosono",toko dita/,toko dita,
4,jl. orde baru,/jl. orde baru,,jl. orde baru
...,...,...,...,...
299995,jend ahmad yani 331 kertasari ciamis,/jend ahmad yani,,jend ahmad yani
299996,"raya cila kko, cilandak timur kel.",/raya cila kko,,raya cila kko
299997,tanjung gusta jl. yaya 2 no 17,/,,
299998,jalan cipadu jaya taman asri gang bijaksana 3 rt02 03 no 57.,taman asri/,taman asri,


## 處理資料

In [None]:
from copy import deepcopy

def _build_aho(words):
    """Create an Aho-Corasick automaton containing every string in ``words``.

    Each word is stored with the payload ``(index, word)``, where ``index`` is
    the word's position in ``words``. This function does not call
    ``make_automaton()``; the returned automaton still has to be finalised by
    the caller before it can be used for searching.

    Args:
        words: Iterable of strings to register as search keys.

    Returns:
        The populated ``ahocorasick.Automaton``.
    """
    aho = ahocorasick.Automaton()
    for idx, key in enumerate(words):
        aho.add_word(key, (idx, key))

    return aho

## Format the data
def _find_spans(haystack, phrase, label):
    """Return the non-overlapping occurrences of ``phrase`` in ``haystack``.

    Args:
        haystack: Text to search.
        phrase: Exact string to look for.
        label: Entity label attached to every span found.

    Returns:
        List of ``(start, end, label)`` tuples with ``start`` inclusive and
        ``end`` exclusive, in the order the matches are reported.
    """
    aho = _build_aho([phrase])
    aho.make_automaton()

    spans = []
    next_free = 0  # first character index not covered by an accepted match
    for last, (_, word) in aho.iter(haystack):
        # The automaton reports the index of the match's final character.
        first = last - len(word) + 1
        if first < next_free:
            # Overlaps a match that was already accepted.
            continue

        spans.append((first, last + 1, label))
        next_free = last + 1

    return spans


def format_data(text, poi, street):
    """Build a spaCy ``Example`` with POI / STREET character spans for ``text``.

    Args:
        text: Raw address string.
        poi: POI string to locate in ``text``; ignored unless it is a ``str``
            (e.g. when it is NaN).
        street: Street string to locate in ``text``; ignored unless it is a
            ``str``.

    Returns:
        ``Example`` created from ``nlp.make_doc(text)`` (the module-level
        ``nlp`` pipeline) and the collected ``"entities"`` spans.
    """
    entities = []
    # Strings are immutable, so no copy is needed: ``masked`` is only ever
    # rebound to a new string, never modified in place.
    masked = text

    ## Handle POI
    if isinstance(poi, str):
        entities.extend(_find_spans(masked, poi, 'POI'))
        # Blank out every POI occurrence with spaces of equal length so that
        # character offsets stay valid and the street search below cannot
        # match inside text already labelled as POI.
        masked = masked.replace(poi, " " * len(poi))

    ## Handle street
    if isinstance(street, str):
        entities.extend(_find_spans(masked, street, 'STREET'))

    return Example.from_dict(nlp.make_doc(text), {"entities": entities})

In [None]:
print("Preparing Spacy examples...")

examples = []
# Walk the three needed columns directly instead of doing a `df.loc` lookup
# for every row.
for idx, raw_address, poi, street in zip(
    df.index, df['raw_address'], df['POI'], df['street']
):
    try:
        examples.append(format_data(raw_address, poi, street))
    except Exception as e:
        # Report which row failed, then stop the notebook here. Merely
        # breaking out of the loop would leave `examples` truncated (or
        # empty) and the cells below would train on it without complaint.
        print(idx)
        print("-" * 50)
        print(e)
        raise

Preparing Spacy examples...
0
--------------------------------------------------
name 'enumera' is not defined


## 訓練

In [None]:
def train_spacy(nlp, examples, iterations, batch_size=1000, dropout=0.2):
    """Train the ``ner`` component of ``nlp`` on ``examples``.

    Args:
        nlp: spaCy ``Language`` pipeline. A ``ner`` component is appended if
            the pipeline does not have one yet.
        examples: Iterable of spaCy ``Example`` objects. It is copied, so the
            caller's sequence is never reordered.
        iterations: Number of passes over the training data.
        batch_size: Number of examples per update step.
        dropout: Dropout rate passed to ``nlp.update``.

    Returns:
        The same ``nlp`` object after training.

    Raises:
        ValueError: If ``examples`` is empty.
    """
    # Private copy: shuffling below must not reorder the caller's list.
    examples = list(examples)
    if not examples:
        raise ValueError(
            "train_spacy() received no examples; nothing to train on."
        )

    # Add the built-in NER component unless the pipeline already has one.
    if 'ner' not in nlp.pipe_names:
        nlp.add_pipe('ner', last=True)

    # Disable every other component so that only NER is trained.
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.select_pipes(disable=other_pipes):
        # Passing the examples lets the components infer their labels
        # (POI / STREET) from the data during initialisation.
        optimizer = nlp.initialize(lambda: examples)
        for itn in range(iterations):
            print("Starting iteration " + str(itn))
            random.shuffle(examples)
            losses = {}
            for batch in minibatch(examples, batch_size):
                nlp.update(
                    batch,
                    drop=dropout,  # dropout - make it harder to memorise data
                    sgd=optimizer,  # callable to update weights
                    losses=losses)
            print(losses)
    return nlp

In [None]:
# Train on every prepared example; `train` is an alias of `examples`, not a copy.
train = examples

In [None]:
%%time
# Train the NER pipeline for 30 iterations; `%%time` reports how long the cell took.
ner_nlp = train_spacy(nlp, train, 30)

Starting iteration 0
{}
Starting iteration 1
{}
Starting iteration 2
{}
Starting iteration 3
{}
Starting iteration 4
{}
Starting iteration 5
{}
Starting iteration 6
{}
Starting iteration 7
{}
Starting iteration 8
{}
Starting iteration 9
{}
Starting iteration 10
{}
Starting iteration 11
{}
Starting iteration 12
{}
Starting iteration 13
{}
Starting iteration 14
{}
Starting iteration 15
{}
Starting iteration 16
{}
Starting iteration 17
{}
Starting iteration 18
{}
Starting iteration 19
{}
Starting iteration 20
{}
Starting iteration 21
{}
Starting iteration 22
{}
Starting iteration 23
{}
Starting iteration 24
{}
Starting iteration 25
{}
Starting iteration 26
{}
Starting iteration 27
{}
Starting iteration 28
{}
Starting iteration 29
{}
CPU times: user 49.1 ms, sys: 2.96 ms, total: 52 ms
Wall time: 57.1 ms


## 看預測結果

In [None]:
# Compare the expected labels with the model's predictions for ten training rows.
preview_rows = df.iloc[100:110]
for _, sample in preview_rows.iterrows():
    print(f"address: {sample['raw_address']}")
    print(f"expected poi: {sample['POI']}")
    print(f"expected street: {sample['street']}")
    print()

    # One line per predicted entity: "<text> - <label>".
    for ent in ner_nlp(sample['raw_address']).ents:
        print(ent.text, "-", ent.label_)

    print("-" * 50)

address: kedai tenun jep senn, kota bumi, kebon melati
expected poi: kedai tenun jepara sennaart
expected street: kota bumi

--------------------------------------------------
address: wadungasri dalam waru raya wad asri, 24 sidoarjo
expected poi: dalam waru
expected street: raya wad asri

--------------------------------------------------
address: bulusan tim barat iii, no 35 3 tembalang
expected poi: nan
expected street: tim barat iii

--------------------------------------------------
address: bakti jaya bukit perm vii 8 15315 setu
expected poi: nan
expected street: bukit perm vii

--------------------------------------------------
address: jl terusan buah batu no 185. samping indomaret. bandung.
expected poi: samping indomaret
expected street: jl terusan buah batu

--------------------------------------------------
address: setia indah, jati, no 12
expected poi: setia indah
expected street: jati

--------------------------------------------------
address: kepuhkiriman gg. bca 61256

## 將test資料預測

In [None]:
# Load the test set and key it by `id`.
df_test = pd.read_csv("/content/drive/MyDrive/shopee21/test.csv").set_index("id")

In [None]:
# One record per test address: its id plus the predicted text for each entity
# label. When a label is predicted more than once, the last occurrence wins.
submission = []
for address_id, address in df_test['raw_address'].items():
    record = {'id': address_id}
    record.update({ent.label_: ent.text for ent in ner_nlp(address).ents})
    submission.append(record)
    


In [None]:
# Turn the per-address records into a table.
submission = pd.DataFrame(submission)
for label in ('POI', 'STREET'):
    # A label column only exists if that label was predicted at least once;
    # create it empty otherwise so the concatenation below cannot KeyError.
    if label not in submission.columns:
        submission[label] = ''
    # Addresses without a prediction for this label hold NaN; use '' instead.
    submission[label] = submission[label].fillna('')
# Combine both predictions into the "<POI>/<street>" label format.
submission['POI/street'] = submission['POI'] + '/' + submission['STREET']

In [None]:
# Write only the id and the combined label, without the frame's index.
submission[['id', 'POI/street']].to_csv('submission1.csv', header=True, index=False)

In [None]:
# Persist the trained pipeline to the mounted Google Drive folder.
ner_nlp.to_disk("/content/drive/MyDrive/shopee21/custom_ner_address3")