In [97]:
# Third-party and stdlib dependencies used throughout the notebook.
import numpy as np
import pandas as pd
import warnings
# NOTE(review): this silences *every* warning for the whole kernel session,
# including pandas warnings about chained assignment or mixed dtypes that
# would otherwise surface in the preprocessing cells below.
warnings.filterwarnings('ignore') 
from glob import glob

In [98]:
class config:
    """Notebook-wide settings, read by the loading cells."""

    # Glob pattern that matches every raw data file of the VNER dataset.
    DATASET = '/kaggle/input/vner-vlsp-2021/data/*'

# `Load and explore VNER dataset`

In [99]:
# glob() returns paths in arbitrary, filesystem-dependent order. Sorting makes
# `files[0]` and the concatenation order (and therefore the sentence numbering
# computed later) reproducible across runs.
files = sorted(glob(config.DATASET))
if not files:
    # Fail here with a clear message instead of an IndexError on `files[0]`.
    raise FileNotFoundError(f"No dataset files match {config.DATASET!r}")

In [100]:
# Column names for the header-less, tab-separated data files.
names = ['Word', 'pos', 'phrase', 'NER', 'NER1', 'NER2']

# Preview the first file.
df = pd.read_csv(
    files[0],
    sep='\t',
    quoting=3,               # 3 == csv.QUOTE_NONE: quote characters are ordinary text
    names=names,
    skip_blank_lines=False,  # blank lines become all-NaN rows (used later as sentence breaks)
    dtype=str,               # keep numeric-looking tokens such as "7" as text
    # Only truly empty fields count as missing. With pandas' default NA list,
    # tokens like "NA", "null" or "None" would be read as NaN and later be
    # mistaken for sentence separators.
    keep_default_na=False,
    na_values=[''],
)
df.head(10)

Unnamed: 0,Word,pos,phrase,NER,NER1,NER2
0,Mong ước,_,_,O,O,O
1,được,_,_,O,O,O
2,đến,_,_,O,O,O
3,trường,_,_,O,O,O
4,của,_,_,O,O,O
5,bé,_,_,O,O,O
6,gái,_,_,O,O,O
7,7,_,_,B-QUANTITY-AGE,O,O
8,tuổi,_,_,I-QUANTITY-AGE,O,O
9,,,,,,


In [101]:
# Read every file once and concatenate in a single call. Growing a DataFrame
# with pd.concat inside the loop re-copies all previously loaded rows on each
# iteration (quadratic in the number of files).
frames = [
    pd.read_csv(
        file,
        sep='\t',
        quoting=3,               # 3 == csv.QUOTE_NONE
        names=names,
        skip_blank_lines=False,  # blank lines become all-NaN rows (sentence breaks)
        dtype=str,               # keep numeric-looking tokens as text
        # Only empty fields are missing; tokens such as "NA" / "null" / "None"
        # must stay as words instead of turning into false sentence breaks.
        keep_default_na=False,
        na_values=[''],
    )
    for file in files
]
# Fall back to an empty frame with the expected columns when no file was found.
concatenated = (
    pd.concat(frames, ignore_index=True) if frames else pd.DataFrame(columns=names)
)
del frames  # release the per-file frames; only the combined frame is needed

In [117]:
# List the distinct labels found in the primary NER column of the combined data.
concatenated['NER'].unique()

array(['O', 'B-QUANTITY-AGE', 'I-QUANTITY-AGE', 'B-DATETIME',
       'I-DATETIME', 'B-ORGANIZATION', 'I-ORGANIZATION', 'B-LOCATION-GPE',
       'I-LOCATION-GPE', 'B-PERSON', 'B-ADDRESS', 'I-ADDRESS',
       'B-DATETIME-DURATION', 'I-DATETIME-DURATION', 'B-QUANTITY-NUM',
       'B-DATETIME-DATE', 'I-DATETIME-DATE', 'B-QUANTITY-ORD',
       'I-QUANTITY-ORD', 'B-PERSONTYPE', 'I-PERSON', 'I-PERSONTYPE',
       'B-QUANTITY-CUR', 'I-QUANTITY-CUR', 'B-EVENT-GAMESHOW',
       'I-EVENT-GAMESHOW', 'B-DATETIME-TIMERANGE', 'I-DATETIME-TIMERANGE',
       'B-PRODUCT', 'I-PRODUCT', 'B-MISCELLANEOUS', 'I-MISCELLANEOUS',
       'B-DATETIME-TIME', 'I-DATETIME-TIME', 'B-DATETIME-DATERANGE',
       'I-DATETIME-DATERANGE', 'B-QUANTITY-PER', 'I-QUANTITY-PER',
       'B-PHONENUMBER', 'I-PHONENUMBER', 'B-EMAIL', 'B-URL',
       'B-QUANTITY-DIM', 'I-QUANTITY-DIM', 'B-EVENT', 'I-EVENT',
       'B-LOCATION-STRUC', 'I-LOCATION-STRUC', 'B-QUANTITY', 'I-QUANTITY',
       'I-URL', 'B-LOCATION', 'I-LOCATION', 'I-QUAN

In [103]:
# Keep only the token and its top-level NER label. The explicit .copy() makes
# `dataset` an independent frame, so the `.loc` assignments in the
# preprocessing cells write to it directly rather than to a selection of
# `concatenated` (which triggers pandas' chained-assignment warning).
dataset = concatenated[['Word', 'NER']].copy()

In [104]:
# Preview the first rows of the two-column (Word, NER) frame.
dataset.head()

Unnamed: 0,Word,NER
0,Mong ước,O
1,được,O
2,đến,O
3,trường,O
4,của,O


# `Preprocessing Data`

In [105]:
# Missing values (the all-NaN rows produced by blank lines) are replaced by the
# string sentinel 'NAN' in both columns so that the string operations in the
# following cells can run on every row.
dataset.loc[:, 'Word'] = dataset['Word'].fillna('NAN')
dataset.loc[:, 'NER'] = dataset['NER'].fillna('NAN')

In [106]:
# Keep only rows whose NER label is an upper-case string (the 'NAN' sentinel
# rows pass this check as well).
dataset = dataset.loc[dataset['NER'].str.isupper()]

In [107]:
# The mask used to be written as
#     dataset['Word'].notna() & dataset['Word'].str.isupper() & dataset['Word'] == 'NAN'
# Because `&` binds tighter than `==`, Python evaluated it as
#     (notna & isupper & dataset['Word']) == 'NAN'
# i.e. a boolean Series compared with a string, which is False for every row,
# so this cell never removed anything.
#
# That no-op is what the rest of the notebook relies on: the 'NAN' rows are the
# sentence separators consumed by create_col_sentence, so they must NOT be
# dropped here. The mask is therefore spelled out as an explicit all-False
# Series, which keeps the result identical without depending on the
# operator-precedence accident.
uppercase_rows = pd.Series(False, index=dataset.index)
dataset = dataset[~uppercase_rows]
dataset.head(10)

Unnamed: 0,Word,NER
0,Mong ước,O
1,được,O
2,đến,O
3,trường,O
4,của,O
5,bé,O
6,gái,O
7,7,B-QUANTITY-AGE
8,tuổi,I-QUANTITY-AGE
9,NAN,NAN


In [108]:
# Row/column count after filtering; the 'NAN' separator rows are still included here.
dataset.shape

(1008205, 2)

In [109]:
def create_col_sentence(dataset: pd.DataFrame) -> pd.DataFrame:
    """Label every token with the sentence it belongs to.

    Rows whose ``Word`` equals the sentinel ``'NAN'`` are treated as sentence
    separators. Numbering starts at 1 and increases by one at every separator,
    so consecutive separators leave gaps in the numbering. Separator rows are
    removed from the result.

    Parameters
    ----------
    dataset : pd.DataFrame
        Frame with at least a ``Word`` column. It is not modified.

    Returns
    -------
    pd.DataFrame
        A new frame with an added ``'Sentence #'`` column holding labels of the
        form ``'Sentence: <n>'``, without separator rows, without rows
        containing missing values, and with a fresh 0-based index.
    """
    is_separator = dataset['Word'].eq('NAN')

    # Running count of separators seen so far (the separator row itself already
    # counts towards the next sentence), shifted so the first sentence is 1.
    sentence_ids = is_separator.cumsum() + 1

    # assign() returns a new frame, so the caller's DataFrame stays untouched.
    labelled = dataset.assign(
        **{'Sentence #': 'Sentence: ' + sentence_ids.astype(str)}
    )

    # Drop the separator rows (positional mask, independent of index labels).
    labelled = labelled.loc[~is_separator.to_numpy()]

    return labelled.dropna(axis=0).reset_index(drop=True)

In [110]:
# Attach sentence labels and drop the 'NAN' separator rows.
# NOTE(review): this cell is not safe to re-run on its own. Once the separators
# are gone, a second call finds no 'NAN' rows and relabels every token as
# 'Sentence: 1'. Re-run from the loading cells instead.
dataset = create_col_sentence(dataset)
dataset.head(10)

Unnamed: 0,Word,NER,Sentence #
0,Mong ước,O,Sentence: 1
1,được,O,Sentence: 1
2,đến,O,Sentence: 1
3,trường,O,Sentence: 1
4,của,O,Sentence: 1
5,bé,O,Sentence: 1
6,gái,O,Sentence: 1
7,7,B-QUANTITY-AGE,Sentence: 1
8,tuổi,I-QUANTITY-AGE,Sentence: 1
9,Giờ,B-DATETIME,Sentence: 2


In [111]:
# Rename the label column to 'Tag'. Rebinding the result (instead of
# inplace=True) keeps the step chainable and avoids mutating a frame that
# earlier cells have already displayed.
dataset = dataset.rename(columns={'NER': 'Tag'})

In [112]:
# Frequency of each tag. The tail of the output shows malformed labels
# (e.g. 'I', 'A', 'B-LOCATION-GPE HCM', 'B-ORGANIZATION NN&PTNT') that survive
# the upper-case filter.
# TODO(review): decide whether these should be cleaned before training.
dataset['Tag'].value_counts()

Tag
O                         833546
B-PERSON                   15409
I-ORGANIZATION             11552
B-ORGANIZATION              9981
B-LOCATION-GPE              8831
                           ...  
I                             16
B-LOCATION-GPE HCM             6
A                              1
B-LOCATION-GPE-GEO             1
B-ORGANIZATION NN&PTNT         1
Name: count, Length: 90, dtype: int64

In [113]:
# Sanity check: the processed object is still a pandas DataFrame.
type(dataset)

pandas.core.frame.DataFrame

# Save the processed dataset to a CSV file

In [114]:
# Persist the processed (Word, Tag, Sentence #) table without the row index.
dataset.to_csv(
    path_or_buf='/kaggle/working/processed_data.csv',
    index=False,
)

# Reload the saved file to verify it

In [115]:
# Reload the saved file exactly as written: every column stays text, and no
# token is reinterpreted as missing. With pandas' defaults, words such as "NA",
# "null" or "None" would come back as NaN.
dataframe = pd.read_csv(
    '/kaggle/working/processed_data.csv',
    dtype=str,
    keep_default_na=False,
)

In [116]:
# Preview the reloaded file to confirm it matches the frame that was saved.
dataframe.head(10)

Unnamed: 0,Word,Tag,Sentence #
0,Mong ước,O,Sentence: 1
1,được,O,Sentence: 1
2,đến,O,Sentence: 1
3,trường,O,Sentence: 1
4,của,O,Sentence: 1
5,bé,O,Sentence: 1
6,gái,O,Sentence: 1
7,7,B-QUANTITY-AGE,Sentence: 1
8,tuổi,I-QUANTITY-AGE,Sentence: 1
9,Giờ,B-DATETIME,Sentence: 2
