In [58]:
import os

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
import transformers
from tqdm.notebook import tqdm
from tokenizers import BertWordPieceTokenizer

In [59]:
try:
    # TPU detection. No parameters necessary if TPU_NAME environment variable is
    # set: this is always the case on Kaggle.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    tpu = None

if tpu:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
else:
    # Default distribution strategy in Tensorflow. Works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()

print("REPLICAS: ", strategy.num_replicas_in_sync)

REPLICAS:  1


In [2]:
#Set the console bigger
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 200)
pd.set_option('display.width', 1000)
pd.set_option('display.float_format', '{:20,.2f}'.format)
pd.set_option('display.max_colwidth', None)

In [3]:
#Import training dataset (df)
train_set = pd.read_csv('C:/Users/pmh1407/Downloads/Shopee/train.csv',index_col=False)

#Import testing dataset or unlabelled dataset (test)
test_set = pd.read_csv('C:/Users/pmh1407/Downloads/Shopee/test.csv')

# Copy datasets
train_df= train_set.copy(deep=True)
test_df=test_set.copy(deep=True)

In [4]:
train_df.head(100)

Unnamed: 0,id,raw_address,POI/street
0,0,jl kapuk timur delta sili iii lippo cika 11 a cicau cikarang pusat,/jl kapuk timur delta sili iii lippo cika
1,1,"aye, jati sampurna",/
2,2,setu siung 119 rt 5 1 13880 cipayung,/siung
3,3,"toko dita, kertosono",toko dita/
4,4,jl. orde baru,/jl. orde baru
5,5,"raya samb gede, 299 toko bb kids",toko bb kids/raya samb gede
6,6,"kem mel raya, no 4 bojong rawalumbu rt 1 36 rawalumbu",/kem mel raya
7,7,tela keuramat kuta alam,/tela
8,8,gg. i wates magersari,/gg. i
9,9,bunga ncole ix 2,/bunga ncole ix


In [5]:
train_df.tail(100)

Unnamed: 0,id,raw_address,POI/street
299900,299900,cluster arcadia village jl.boulevard raya gading serpong klp.dua no.17,arcadia village/jl.boulevard raya gading serpong
299901,299901,"pt. lasarez dina, mas al mujtaba duren sawit",pt. lasarez dinamika/mas al mujtaba
299902,299902,"genteng gent sido, no 2 rt 2 6 60275 genteng",/gent sido
299903,299903,inti 67 babura sunggal,/inti
299904,299904,helvetia timur pem 94 medan helvetia,/pem
299905,299905,"toko karya tani, multa, no 83 42311",toko karya tani/multa
299906,299906,"taman makam pahlawan nasi maje,",taman makam pahlawan nasional majeluk/
299907,299907,"dr. sut, no 40 firman, sukoharjo",firman/dr. sut
299908,299908,merdeka ling jati 113 21136 siantar timur,/ling jati
299909,299909,"esem pulsa, setia luhur 96 dwi kora",/setia luhur


In [6]:
test_df.head(100)

Unnamed: 0,id,raw_address
0,0,s. par 53 sidanegara 4 cilacap tengah
1,1,"angg per, baloi indah kel. lubuk baja"
2,2,"asma laun, mand imog,"
3,3,"ud agung rej, raya nga sri wedari karanganyar"
4,4,"cut mutia, 35 baiturrahman"
5,5,pem dos dapur ala perum gar no a 12 suka jaya sukarami
6,6,"tb. mara, cisayong"
7,7,"pura taman beji tista,"
8,8,"mangkura tk pam, 90113 ujung pandang"
9,9,raya won wonotunggal wonotunggal


In [7]:
test_df.tail(100)

Unnamed: 0,id,raw_address
49900,49900,setia i jaticempaka pondok gede
49901,49901,wanam 11 88 rt 4 3 15157 karang tengah
49902,49902,"rambutan jl.t merd, 28 rt 8 6 13750 ciracas"
49903,49903,"dr. sut kedai kopi nusantar, kali rejo"
49904,49904,"ketua rt 13, ngeni wonotirto"
49905,49905,graha maha no 75 tanjung pagar kel.
49906,49906,"pmi kab. bojone, truno, kepatihan bojonegoro"
49907,49907,"mil peng,"
49908,49908,"canda kaos polos dan sab par bingung, no 74 pancoran mas"
49909,49909,"akademi angkatan laut, bumi moro surabaya"


In [8]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           50000 non-null  int64 
 1   raw_address  50000 non-null  object
dtypes: int64(1), object(1)
memory usage: 781.4+ KB


In [9]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300000 entries, 0 to 299999
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   id           300000 non-null  int64 
 1   raw_address  300000 non-null  object
 2   POI/street   300000 non-null  object
dtypes: int64(1), object(2)
memory usage: 6.9+ MB


In [10]:
train_df['POI'] = train_df['POI/street'].str.split('/', n=2, expand=True)[0]
train_df['street'] = train_df['POI/street'].str.split('/', n=2, expand=True)[1]
train_df['POI/street'].replace('/', np.nan, inplace=True)
# train_df['POI'].replace('', np.nan, inplace=True)
# train_df['street'].replace('', np.nan, inplace=True)
# train_df['POI'] = train_df['POI'].str.lower()
# train_df['street'] = train_df['street'].str.lower()

# # Remove everything except words and space
# df_cleaned['POI'] = list(map(lambda x: re.sub("'s", " ", x), df_cleaned['POI']))
# df_cleaned['POI'] = list(map(lambda x: re.sub("[^\w\s]", " ", x), df_cleaned['POI']))

# df_cleaned['street'] = list(map(lambda x: re.sub("'s", " ", x), df_cleaned['street']))
# df_cleaned['street'] = list(map(lambda x: re.sub("[^\w\s]", " ", x), df_cleaned['street']))

# # Remove words containing numbers
# df_cleaned['POI'] = list(map(lambda x: re.sub('\w*\d\w*','', x), df_cleaned['POI']))
# df_cleaned['street'] = list(map(lambda x: re.sub('\w*\d\w*','', x), df_cleaned['street']))

# # Remove unnecessary white space
# df_cleaned['POI'] = list(map(lambda x: re.sub(" +", " ", x.strip()), df_cleaned['POI']))
# df_cleaned['street'] = list(map(lambda x: re.sub(" +", " ", x.strip()), df_cleaned['street']))

# df_cleaned['POI'] = list(map(lambda x: re.sub("[\[\]\\0-9()\"$#%/@;:<>{}`+=~|.!?,-]", " ",x), df_cleaned['POI']))
# df_cleaned['street'] = list(map(lambda x: re.sub("[\[\]\\0-9()\"$#%/@;:<>{}`+=~|.!?,-]", " ",x), df_cleaned['street']))

def clean_words(address):
    address = str(address).lower()
    address = re.sub(r"[\[\]\\0-9()\"$#%/@;:<>{}`+=~|.!?,-]", "", address)
    address = re.sub(r"&", "", address)
    address = re.sub(r"'s", "", address)
    address = re.sub(r"[^\w\s]", "", address)
    address = re.sub(r"\w*\d\w*", "", address)
    address = address.strip()
    return address

In [11]:
train_df['POI'] = train_df['POI'].apply(lambda x:clean_words(x))
train_df['street'] = train_df['street'].apply(lambda x:clean_words(x))

In [12]:
train_df_clean = train_df.drop_duplicates(subset=["raw_address",'POI/street'])
print(train_df.shape)

(300000, 5)


In [13]:
def duplicated_values_data(data):
    dup=[]
    columns=data.columns
    for i in data.columns:
        dup.append(sum(data[i].duplicated()))
    return pd.concat([pd.Series(columns),pd.Series(dup)], axis=1, keys =["Columns", "Duplicate count"])

In [14]:
df_cleaned = train_df_clean.dropna()
duplicated_values_data(df_cleaned)

Unnamed: 0,Columns,Duplicate count
0,id,0
1,raw_address,0
2,POI/street,91049
3,POI,175422
4,street,182225


In [15]:
# def missing_value_of_data(data):
#     total = data.isnull().sum().sort_values(ascending=False)
#     percentage = round(total/data.shape[0]*100,2)
#     return pd.concat([total,percentage], axis=1, keys=['Total','Percentage'])

In [16]:
# missing_value_of_data(df_cleaned)

In [17]:
print(len(df_cleaned['POI'].unique()))
print(len(df_cleaned['street'].unique()))

92585
85782


In [18]:
df_cleaned

Unnamed: 0,id,raw_address,POI/street,POI,street
0,0,jl kapuk timur delta sili iii lippo cika 11 a cicau cikarang pusat,/jl kapuk timur delta sili iii lippo cika,,jl kapuk timur delta sili iii lippo cika
2,2,setu siung 119 rt 5 1 13880 cipayung,/siung,,siung
3,3,"toko dita, kertosono",toko dita/,toko dita,
4,4,jl. orde baru,/jl. orde baru,,jl orde baru
5,5,"raya samb gede, 299 toko bb kids",toko bb kids/raya samb gede,toko bb kids,raya samb gede
...,...,...,...,...,...
299994,299994,karawaci baru kakap raya 156 rt 1 rw 3 karawaci,/kakap raya,,kakap raya
299995,299995,jend ahmad yani 331 kertasari ciamis,/jend ahmad yani,,jend ahmad yani
299996,299996,"raya cila kko, cilandak timur kel.",/raya cila kko,,raya cila kko
299998,299998,jalan cipadu jaya taman asri gang bijaksana 3 rt02 03 no 57.,taman asri/,taman asri,


In [21]:
import random
import torch
import torch.optim as optim

from torch import nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import BertTokenizer, BertModel

SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x256f389d470>

In [22]:
model_name='cahya/bert-base-indonesian-522M'
tokenizer = BertTokenizer.from_pretrained(model_name, do_lower_case=True)
model = BertModel.from_pretrained(model_name)
text = "Silakan diganti dengan text apa saja."
encoded_input = tokenizer(text, return_tensors='pt')
encoded_input

{'input_ids': tensor([[    3, 23629,  1507,  6407,  1555, 16376,  3397,  2957,    17,     1]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [23]:
tokenizer.convert_tokens_to_ids(["[UNK]", "[CLS]", "ksduhvigawrhu", "[SEP]", "."])

[0, 3, 0, 1, 17]

In [24]:
encoded_input

{'input_ids': tensor([[    3, 23629,  1507,  6407,  1555, 16376,  3397,  2957,    17,     1]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [25]:
model

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(32000, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [30]:
class BERTTagger(nn.Module):
    def __init__(self,
                 bert,
                 output_dim, 
                 dropout):
        
        super().__init__()
        
        self.bert = bert
        
        embedding_dim = bert.config.to_dict()['hidden_size']
        
        self.fc = nn.Linear(embedding_dim, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text):
  
        #text = [sent len, batch size]
    
        text = text.permute(1, 0)
        
        #text = [batch size, sent len]
        
        embedded = self.dropout(self.bert(text)[0])
        
        #embedded = [batch size, seq len, emb dim]
                
        embedded = embedded.permute(1, 0, 2)
                    
        #embedded = [sent len, batch size, emb dim]
        
        predictions = self.fc(self.dropout(embedded))
        
        #predictions = [sent len, batch size, output dim]
        
        return predictions

In [31]:
bert_poi = BERTTagger(model, 2, 0.2)
bert_street = BERTTagger(model, 2, 0.2)
bert_poi

BERTTagger(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
 

In [32]:
for param in bert_poi.bert.parameters():
    param.requires_grad = False
    
for param in bert_street.bert.parameters():
    param.requires_grad = False

In [33]:
encoded_input["input_ids"]

tensor([[    3, 23629,  1507,  6407,  1555, 16376,  3397,  2957,    17,     1]])

In [34]:
poi_output = bert_poi(encoded_input["input_ids"]).max(axis=2)[1][:, 1:-1]
street_output = bert_street(encoded_input["input_ids"]).max(axis=2)[1][:, 1:-1]
poi_output

tensor([[0, 1, 0, 0, 1, 0, 1, 1]])

In [35]:
token_ids = torch.masked_select(encoded_input["input_ids"][:, 1:-1], poi_output.bool())
tokens = tokenizer.convert_ids_to_tokens(token_ids)
poi_sentence = tokenizer.convert_tokens_to_string(tokens)

token_ids = torch.masked_select(encoded_input["input_ids"][:, 1:-1], street_output.bool())
tokens = tokenizer.convert_ids_to_tokens(token_ids)
street_sentence = tokenizer.convert_tokens_to_string(tokens)

print(poi_sentence + "/" + street_sentence)

##kan text saja ./text apa


In [36]:
text = ["aaa bbb", "cccaaa asdas ddd"]
encoded_input = tokenizer(text, return_tensors='pt', padding=True)
encoded_input

{'input_ids': tensor([[    3, 15905,  1007, 11003,  1013,     1,     2,     2,     2],
        [    3,  8638,  2586, 25457,  1635,  8559, 21920,  1006,     1]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [37]:
text = ["[UNK] [SEP] [MASK] [PAD] [CLS]", "cccaaa asdas ddd"]
encoded_input = tokenizer(text, return_tensors='pt', padding=True)
encoded_input

{'input_ids': tensor([[    3,     0,     1,     4,     2,     3,     1,     2,     2],
        [    3,  8638,  2586, 25457,  1635,  8559, 21920,  1006,     1]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [38]:
bert_street(encoded_input["input_ids"]).max(dim=2)[1][:, 1:-1]

tensor([[0, 0, 1, 1, 1, 0, 1],
        [0, 1, 0, 0, 1, 0, 0]])

In [39]:
def subfinder(mylist, pattern):
    
    if len(mylist) == 0 or len(pattern) == 0:
        return -1
    
    start_idx = 0
    j = 0
    
    for i, _ in enumerate(mylist):

        if mylist[i] == pattern[j]:
            
            if j == 0:
                start_idx = i
            
            j += 1
            
        else:
            
            j = 0
        
        if j == len(pattern):
            return start_idx
        
    return -1

In [40]:
print(tokenizer(["siung"]))
print(tokenizer.convert_tokens_to_ids(["siung"]))
print(tokenizer.convert_ids_to_tokens([   3, 2139, 1563,    1]))

{'input_ids': [[3, 2139, 1563, 1]], 'token_type_ids': [[0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1]]}
[0]
['[CLS]', 'si', '##ung', '[SEP]']


In [41]:
def get_start_positions(row):
    
    raw_address_token_ids = tokenizer(row.raw_address, return_tensors='pt')['input_ids'][:, 1:-1].tolist()[0]
    poi_token_ids = tokenizer(row.POI, return_tensors='pt')['input_ids'][:, 1:-1].tolist()[0]
    street_token_ids = tokenizer(row.street, return_tensors='pt')['input_ids'][:, 1:-1].tolist()[0]
    poi_len = len(poi_token_ids)
    street_len = len(street_token_ids)
    start_poi_position = subfinder(raw_address_token_ids, poi_token_ids)
    start_street_position = subfinder(raw_address_token_ids, street_token_ids)
    
    return start_poi_position, poi_len, start_street_position, street_len
    
df_cleaned['start_POI_position'], df_cleaned['poi_len'], df_cleaned['start_street_position'], df_cleaned['street_len'] = zip(*df_cleaned.apply(get_start_positions, axis=1))
df_cleaned

Unnamed: 0,id,raw_address,POI/street,POI,street,start_POI_position,poi_len,start_street_position,street_len
0,0,jl kapuk timur delta sili iii lippo cika 11 a cicau cikarang pusat,/jl kapuk timur delta sili iii lippo cika,,jl kapuk timur delta sili iii lippo cika,-1,0,0,10
2,2,setu siung 119 rt 5 1 13880 cipayung,/siung,,siung,-1,0,2,2
3,3,"toko dita, kertosono",toko dita/,toko dita,,0,2,-1,0
4,4,jl. orde baru,/jl. orde baru,,jl orde baru,-1,0,-1,3
5,5,"raya samb gede, 299 toko bb kids",toko bb kids/raya samb gede,toko bb kids,raya samb gede,6,3,0,3
...,...,...,...,...,...,...,...,...,...
299994,299994,karawaci baru kakap raya 156 rt 1 rw 3 karawaci,/kakap raya,,kakap raya,-1,0,4,2
299995,299995,jend ahmad yani 331 kertasari ciamis,/jend ahmad yani,,jend ahmad yani,-1,0,0,3
299996,299996,"raya cila kko, cilandak timur kel.",/raya cila kko,,raya cila kko,-1,0,0,4
299998,299998,jalan cipadu jaya taman asri gang bijaksana 3 rt02 03 no 57.,taman asri/,taman asri,,4,2,-1,0


In [47]:
train_encodings = tokenizer(list(df_cleaned.raw_address.values), return_tensors='pt', padding=True)
train_encodings

{'input_ids': tensor([[   3, 6318, 2089,  ...,    2,    2,    2],
        [   3, 1673, 1005,  ...,    2,    2,    2],
        [   3, 5282, 3746,  ...,    2,    2,    2],
        ...,
        [   3, 2980, 7380,  ...,    2,    2,    2],
        [   3, 2384, 7479,  ...,    2,    2,    2],
        [   3, 5283, 1697,  ...,    2,    2,    2]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [43]:
class AddressExtractionDataset(Dataset):
    """Face Landmarks dataset."""

    def __init__(self, encoding, df, target_type):
        
        self.encoding = encoding
        self.df = df
        self.target_type = target_type

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        
        
        input_ids = self.encoding['input_ids'][idx]
        if self.target_type == "POI":
            target = "POI"
            start_position = self.df['start_POI_position'].iloc[idx]
            target_len = self.df['poi_len'].iloc[idx]
            
            #target_text = self.df['POI']
        elif self.target_type == "street":
            target = "street"
            start_position = self.df['start_street_position'].iloc[idx]
            target_len = self.df['street_len'].iloc[idx]
            
            #target_text = self.df['street']
        elif self.target_type == "test":
            return input_ids
        else:
            raise NotImplementedError
        
        labels = torch.zeros_like(input_ids)
        labels[start_position + 1:start_position + 1 + target_len] = 1

        return input_ids, labels

In [51]:
aed_poi = AddressExtractionDataset(train_encodings, df_cleaned, "POI")
aed_street = AddressExtractionDataset(train_encodings, df_cleaned, "street")
num_example = 50

count = 0
for input_ids, labels in aed_poi:
    count += 1

#     print(input_ids, labels)
    print(tokenizer.decode(input_ids.masked_select((labels > 0).bool())))
    if count > num_example:
        break
        
print("----------------")        
        
count = 0
for input_ids, labels in aed_street:
    count += 1

#     print(input_ids, labels)
    print(tokenizer.decode(input_ids.masked_select((labels > 0).bool())))
    if count > num_example:
        break



toko dita

toko bb kids




[CLS] cikahu
[CLS] yaya at


gudang areng

pangkalan lareh


[CLS] toko bang aj
stadion kobelete




[CLS] jln. tirta



komplek borneo lestari

perumahan bayur sarana indah






[CLS] mar tabl metro

[CLS] sd neg



[CLS] rumah makan

bsd city
fajar motor


[CLS] samping kiri

----------------
jl kapuk timur delta sili iii lippo cika
siung

[CLS] jl.
raya samb gede
kem mel raya
tela
[CLS] gg
bunga ncole ix
klap boj

abim ix
gang xiii

maru baru

pekap raya
tam tama barat v


[CLS] tam asri,
ramb i
bumi raya
seta
[CLS] jln. tirta
sadar i
[CLS] meruya selatan g
nogos
[CLS] jl
raya sege

[CLS] gg
[CLS] sal gg
kamp rawa bebek
selam vii
lin liwa
jln raya solok pad

roos timur ii

mulia
[CLS] gg
raya cira
raya jomb
raya babe
[CLS] pt. hugo
raya han
mah iv
mas ii

raya sorea


In [45]:
dataloader_poi = DataLoader(aed_poi, batch_size=16,
                        shuffle=True, num_workers=0)
dataloader_street = DataLoader(aed_street, batch_size=16,
                        shuffle=True, num_workers=0)

In [46]:
for inputs, labels in dataloader_poi:
    print(inputs, labels)
    break

tensor([[    3,  2676, 22964,     1,     2,     2,     2,     2,     2,     2,
             2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
             2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
             2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
             2,     2,     2,     2],
        [    3,  6318,    17,  6024, 11182,  1014,  2845,    17,  6564,    11,
          2636,  2384,  2332,  2569,  3589,  6024, 11182,  1014,    12,     1,
             2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
             2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
             2,     2,     2,     2],
        [    3,  8646,  8386, 13010,  6596,  5417,  1590, 30146,  7100, 14858,
             1,     2,     2,     2,     2,     2,     2,     2,     2,     2,
             2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
             2,     2,     2,     2,     2,     2,     

In [52]:
LEARNING_RATE = 5e-5
optimizer_poi = optim.Adam(bert_poi.parameters(), lr = LEARNING_RATE)
optimizer_street = optim.Adam(bert_street.parameters(), lr = LEARNING_RATE)
criterion = nn.CrossEntropyLoss(ignore_index = 2)

In [None]:
def train(model, dataloader, optimizer, criterion):
    
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    runnning_loss = 0
    interval = 100
    epoch_loss = 0
    #epoch_acc = 0
    
    model.train()
    
    for idx, (input_ids, labels) in enumerate(tqdm(dataloader, position=0, leave=True)):
                  
        input_ids = input_ids.to(device)
        labels = labels.to(device)
            
        optimizer.zero_grad()   
        
        predictions = model(input_ids)
        predictions = predictions.view(-1, predictions.shape[-1])
        labels = labels.view(-1)
        
        loss = criterion(predictions, labels)
        loss.backward()
        
        optimizer.step()
        
        runnning_loss += loss.item()
        
        if idx % interval == interval - 1:
            print("running_loss: {}".format(runnning_loss / interval))
            runnning_loss = 0
        
        epoch_loss += loss.item()
        #epoch_acc += acc.item()
        
    return epoch_loss / len(dataloader)#, epoch_acc / len(dataloader)

In [None]:
NUM_EPOCHS = 3

poi_losses = []
street_losses = []

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bert_poi = bert_poi.to(device)
bert_street = bert_street.to(device)

for epoch in range(NUM_EPOCHS):
    poi_loss = train(bert_poi, dataloader_poi, optimizer_poi, criterion)
    street_loss = train(bert_street, dataloader_street, optimizer_street, criterion)
    
    poi_losses.append(poi_loss)
    street_losses.append(street_loss)
    
    torch.save(bert_poi.state_dict(), "bert_poi_{}.pth".format(str(epoch)))
    torch.save(bert_street.state_dict(), "bert_street_{}.pth".format(str(epoch)))

In [None]:
train_encodings['input_ids'].shape[1]

In [None]:
test_encodings = tokenizer(list(test_df.raw_address.values), return_tensors='pt', padding=True, truncation=True, max_length=train_encodings['input_ids'].shape[1])

In [None]:
test_encodings

In [None]:
aed_test = AddressExtractionDataset(test_encodings, test_df, 'test')
dataloader_test = DataLoader(aed_test, batch_size=1, num_workers=4)

In [None]:
def clean_answer(answer):
    answer = answer.replace("[SEP]", "")
    answer = answer.replace("[UNK]", "")
    answer = answer.replace("[PAD]", "")
    answer = answer.replace("  ", " ")
    return answer.strip()