In [1]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore') 

from collections import Counter
import numpy as np
from sklearn.feature_extraction import DictVectorizer

from sklearn.linear_model import Perceptron
from sklearn.metrics import classification_report, f1_score, accuracy_score, log_loss

In [2]:
class config:
    DATA = '/kaggle/input/vner-vlsp-2021/processed_data.csv'

# `Load processed dataset`

In [3]:
def reduce_mem_usage(df):
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df


def import_data(file):
    df = pd.read_csv(file, parse_dates=True, keep_date_col=True)
    df = reduce_mem_usage(df)
    return df

In [4]:
dataset = import_data(config.DATA)

Memory usage of dataframe is 22.47 MB
Memory usage after optimization is: 9.10 MB
Decreased by 59.5%


In [6]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 981555 entries, 0 to 981554
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype   
---  ------      --------------   -----   
 0   Word        981555 non-null  category
 1   Tag         981555 non-null  category
 2   Sentence #  981555 non-null  category
dtypes: category(3)
memory usage: 9.1 MB


In [7]:
tag_counts = dataset['Tag'].value_counts()

tags_to_keep = tag_counts[tag_counts >= 2].index

dataset = dataset[dataset['Tag'].isin(tags_to_keep)]

In [8]:
X = dataset.drop('Tag', axis=1)

In [9]:
v = DictVectorizer(sparse=False)
X = v.fit_transform(X.to_dict('records'))
y = dataset.Tag.values

In [10]:
classes = np.unique(y)
classes = classes.tolist()
# classes

In [11]:
new_classes = classes.copy()
new_classes.pop()
# new_classes

'O'

In [12]:
unique, counts = np.unique(y, return_counts=True)
class_distribution = dict(zip(unique, counts))
print("Class distribution:", class_distribution)

Class distribution: {'B': 55, 'B-ADDRESS': 80, 'B-DATETIME': 3230, 'B-DATETIME-DATE': 1862, 'B-DATETIME-DATERANGE': 438, 'B-DATETIME-DURATION': 1546, 'B-DATETIME-SET': 43, 'B-DATETIME-TIME': 359, 'B-DATETIME-TIMERANGE': 421, 'B-EMAIL': 43, 'B-EVENT': 615, 'B-EVENT-CUL': 225, 'B-EVENT-GAMESHOW': 394, 'B-EVENT-NATURAL': 103, 'B-EVENT-SPORT': 596, 'B-IP': 65, 'B-LOCATION': 6250, 'B-LOCATION-GEO': 460, 'B-LOCATION-GPE': 8831, 'B-LOCATION-GPE HCM': 6, 'B-LOCATION-STRUC': 641, 'B-MISCELLANEOUS': 634, 'B-ORGANIZATION': 9981, 'B-ORGANIZATION-MED': 303, 'B-ORGANIZATION-SPORTS': 2017, 'B-ORGANIZATION-STOCK': 47, 'B-PERSON': 15409, 'B-PERSONTYPE': 5015, 'B-PHONENUMBER': 165, 'B-PRODUCT': 2774, 'B-PRODUCT-AWARD': 94, 'B-PRODUCT-COM': 1272, 'B-PRODUCT-LEGAL': 304, 'B-QUANTITY': 3345, 'B-QUANTITY-AGE': 649, 'B-QUANTITY-CUR': 1483, 'B-QUANTITY-DIM': 688, 'B-QUANTITY-NUM': 5945, 'B-QUANTITY-ORD': 1077, 'B-QUANTITY-PER': 1207, 'B-QUANTITY-TEM': 85, 'B-SKILL': 37, 'B-URL': 216, 'I': 16, 'I-ADDRESS': 566

In [15]:
def shuffle(matrix, target, test_proportion):
    ratio = int(matrix.shape[0]/test_proportion)
    X_train = matrix[ratio:,:]
    X_test = matrix[:ratio,:]
    Y_train = target[ratio:]
    Y_test = target[:ratio]
    return X_train, X_test, Y_train, Y_test

In [16]:
X_train, X_test, y_train, y_test = shuffle(X, y, 3)

In [18]:
X_train.shape, y_train.shape

((654368, 69313), (654368,))

In [19]:
X_test.shape, y_test.shape

((327184, 69313), (327184,))

# `Perceptron`

In [20]:
per = Perceptron(verbose=10, n_jobs=-1, max_iter=100, early_stopping=True, validation_fraction=0.2, n_iter_no_change=5)
per.fit(X_train, y_train)
# per.partial_fit(X_train, y_train, classes)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.


-- Epoch 1
-- Epoch 1
-- Epoch 1
-- Epoch 1
Norm: 44.70, NNZs: 1695, Bias: -4.000000, T: 654368, Avg. loss: 0.003449
Total training time: 97.84 seconds.
-- Epoch 2
Norm: 8.00, NNZs: 61, Bias: -3.000000, T: 654368, Avg. loss: 0.000180
Total training time: 98.09 seconds.
-- Epoch 2
Norm: 10.39, NNZs: 105, Bias: -2.000000, T: 654368, Avg. loss: 0.000139
Total training time: 98.88 seconds.
-- Epoch 2
Norm: 55.17, NNZs: 2476, Bias: -3.000000, T: 654368, Avg. loss: 0.005995
Total training time: 99.97 seconds.
-- Epoch 2
Norm: 66.93, NNZs: 2481, Bias: -5.000000, T: 1308736, Avg. loss: 0.002638
Total training time: 196.05 seconds.
-- Epoch 3
Norm: 9.17, NNZs: 75, Bias: -3.000000, T: 1308736, Avg. loss: 0.000214
Total training time: 196.34 seconds.
-- Epoch 3
Norm: 15.62, NNZs: 175, Bias: -3.000000, T: 1308736, Avg. loss: 0.000105
Total training time: 196.93 seconds.
-- Epoch 3
Norm: 78.83, NNZs: 3414, Bias: -3.000000, T: 1308736, Avg. loss: 0.005042
Total training time: 199.19 seconds.
-- Epoc

[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed: 19.6min


Norm: 115.71, NNZs: 3894, Bias: -5.000000, T: 3926208, Avg. loss: 0.001663
Total training time: 589.36 seconds.
Convergence after 6 epochs took 589.36 seconds
-- Epoch 1
Norm: 20.45, NNZs: 162, Bias: -5.000000, T: 3926208, Avg. loss: 0.000058
Total training time: 589.24 seconds.
Convergence after 6 epochs took 589.24 seconds
-- Epoch 1
Norm: 55.64, NNZs: 939, Bias: -5.000000, T: 3926208, Avg. loss: 0.000310
Total training time: 591.66 seconds.
Convergence after 6 epochs took 591.66 seconds
-- Epoch 1
Norm: 23.45, NNZs: 502, Bias: -3.000000, T: 654368, Avg. loss: 0.000818
Total training time: 98.36 seconds.
-- Epoch 2
Norm: 9.80, NNZs: 93, Bias: -1.000000, T: 654368, Avg. loss: 0.000047
Total training time: 98.63 seconds.
-- Epoch 2
Norm: 31.08, NNZs: 758, Bias: -4.000000, T: 654368, Avg. loss: 0.001253
Total training time: 97.81 seconds.
-- Epoch 2
Norm: 17.32, NNZs: 218, Bias: -2.000000, T: 654368, Avg. loss: 0.000336
Total training time: 99.11 seconds.
-- Epoch 2
Norm: 34.93, NNZs: 7

[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed: 29.5min


Norm: 75.31, NNZs: 1741, Bias: -5.000000, T: 3926208, Avg. loss: 0.000533
Total training time: 588.32 seconds.
Convergence after 6 epochs took 588.32 seconds
-- Epoch 1
Norm: 39.72, NNZs: 462, Bias: -7.000000, T: 3926208, Avg. loss: 0.000115
Total training time: 589.22 seconds.
Convergence after 6 epochs took 589.22 seconds
-- Epoch 1
Norm: 26.83, NNZs: 468, Bias: -4.000000, T: 654368, Avg. loss: 0.000666
Total training time: 99.01 seconds.
-- Epoch 2
Norm: 11.83, NNZs: 102, Bias: -2.000000, T: 654368, Avg. loss: 0.000179
Total training time: 98.89 seconds.
-- Epoch 2
Norm: 27.39, NNZs: 469, Bias: -2.000000, T: 654368, Avg. loss: 0.000717
Total training time: 97.14 seconds.
-- Epoch 2
Norm: 6.63, NNZs: 41, Bias: -3.000000, T: 654368, Avg. loss: 0.000044
Total training time: 97.79 seconds.
-- Epoch 2
Norm: 37.74, NNZs: 721, Bias: -4.000000, T: 1308736, Avg. loss: 0.000434
Total training time: 196.65 seconds.
-- Epoch 3
Norm: 15.68, NNZs: 141, Bias: -5.000000, T: 1308736, Avg. loss: 0.00

[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed: 49.1min


Norm: 5.83, NNZs: 20, Bias: -3.000000, T: 3926208, Avg. loss: 0.000024
Total training time: 585.92 seconds.
Convergence after 6 epochs took 585.92 seconds
-- Epoch 1
Norm: 200.98, NNZs: 9045, Bias: -5.000000, T: 3926208, Avg. loss: 0.004835
Total training time: 599.03 seconds.
-- Epoch 7
Norm: 195.08, NNZs: 8943, Bias: -5.000000, T: 3926208, Avg. loss: 0.004170
Total training time: 601.93 seconds.
-- Epoch 7
Norm: 30.69, NNZs: 739, Bias: -3.000000, T: 654368, Avg. loss: 0.001187
Total training time: 98.47 seconds.
-- Epoch 2
Norm: 28.07, NNZs: 691, Bias: -3.000000, T: 654368, Avg. loss: 0.001213
Total training time: 98.15 seconds.
-- Epoch 2
Norm: 211.01, NNZs: 9247, Bias: -5.000000, T: 4580576, Avg. loss: 0.004646
Total training time: 697.05 seconds.
-- Epoch 8
Norm: 205.85, NNZs: 9251, Bias: -5.000000, T: 4580576, Avg. loss: 0.004155
Total training time: 701.81 seconds.
-- Epoch 8
Norm: 44.79, NNZs: 1133, Bias: -3.000000, T: 1308736, Avg. loss: 0.000925
Total training time: 196.65 se

[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed: 65.8min


Norm: 192.18, NNZs: 14632, Bias: -1.000000, T: 1308736, Avg. loss: 0.006836
Total training time: 203.36 seconds.
-- Epoch 3
Norm: 15.62, NNZs: 106, Bias: -4.000000, T: 3271840, Avg. loss: 0.000009
Total training time: 488.09 seconds.
-- Epoch 6
Norm: 101.58, NNZs: 2821, Bias: -5.000000, T: 3271840, Avg. loss: 0.000715
Total training time: 494.05 seconds.
-- Epoch 6
Norm: 73.91, NNZs: 3632, Bias: -4.000000, T: 654368, Avg. loss: 0.006826
Total training time: 99.39 seconds.
-- Epoch 2
Norm: 219.51, NNZs: 16246, Bias: -3.000000, T: 1963104, Avg. loss: 0.004565
Total training time: 302.19 seconds.
-- Epoch 4
Norm: 16.55, NNZs: 111, Bias: -4.000000, T: 3926208, Avg. loss: 0.000009
Total training time: 586.59 seconds.
Convergence after 6 epochs took 586.59 seconds
-- Epoch 1
Norm: 107.42, NNZs: 2922, Bias: -5.000000, T: 3926208, Avg. loss: 0.000691
Total training time: 592.46 seconds.
-- Epoch 7
Norm: 104.12, NNZs: 4951, Bias: -5.000000, T: 1308736, Avg. loss: 0.004753
Total training time: 1

[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 88.4min


Norm: 152.99, NNZs: 7050, Bias: -5.000000, T: 3271840, Avg. loss: 0.002691
Total training time: 494.25 seconds.
-- Epoch 6
Norm: 45.06, NNZs: 1697, Bias: -2.000000, T: 654368, Avg. loss: 0.002402
Total training time: 99.18 seconds.
-- Epoch 2
Norm: 46.65, NNZs: 1249, Bias: -3.000000, T: 1308736, Avg. loss: 0.000925
Total training time: 196.15 seconds.
-- Epoch 3
Norm: 32.16, NNZs: 909, Bias: -2.000000, T: 654368, Avg. loss: 0.001375
Total training time: 98.04 seconds.
-- Epoch 2
Norm: 163.87, NNZs: 7435, Bias: -5.000000, T: 3926208, Avg. loss: 0.002638
Total training time: 592.93 seconds.
-- Epoch 7
Norm: 67.01, NNZs: 2493, Bias: -3.000000, T: 1308736, Avg. loss: 0.001631
Total training time: 198.12 seconds.
-- Epoch 3
Norm: 58.70, NNZs: 1501, Bias: -4.000000, T: 1963104, Avg. loss: 0.000636
Total training time: 293.30 seconds.
-- Epoch 4
Norm: 48.99, NNZs: 1411, Bias: -3.000000, T: 1308736, Avg. loss: 0.000949
Total training time: 196.84 seconds.
-- Epoch 3
Norm: 173.05, NNZs: 7755, B

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 115.0min


Norm: 47.39, NNZs: 619, Bias: -5.000000, T: 3271840, Avg. loss: 0.000159
Total training time: 491.45 seconds.
-- Epoch 6
Norm: 36.69, NNZs: 504, Bias: -3.000000, T: 3926208, Avg. loss: 0.000009
Total training time: 590.09 seconds.
Convergence after 6 epochs took 590.09 seconds
-- Epoch 1
Norm: 7.35, NNZs: 32, Bias: -3.000000, T: 3926208, Avg. loss: 0.000067
Total training time: 588.42 seconds.
Convergence after 6 epochs took 588.43 seconds
-- Epoch 1
Norm: 62.13, NNZs: 2653, Bias: -4.000000, T: 654368, Avg. loss: 0.005259
Total training time: 99.96 seconds.
-- Epoch 2
Norm: 50.66, NNZs: 647, Bias: -5.000000, T: 3926208, Avg. loss: 0.000150
Total training time: 589.46 seconds.
Convergence after 6 epochs took 589.46 seconds
-- Epoch 1
Norm: 47.58, NNZs: 1649, Bias: -2.000000, T: 654368, Avg. loss: 0.002439
Total training time: 99.20 seconds.
-- Epoch 2
Norm: 50.42, NNZs: 1404, Bias: -3.000000, T: 654368, Avg. loss: 0.002578
Total training time: 98.75 seconds.
-- Epoch 2
Norm: 88.07, NNZs

[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed: 139.2min


Norm: 80.76, NNZs: 2546, Bias: -4.000000, T: 1308736, Avg. loss: 0.002930
Total training time: 198.87 seconds.
-- Epoch 3
Norm: 41.67, NNZs: 968, Bias: -3.000000, T: 654368, Avg. loss: 0.001658
Total training time: 97.55 seconds.
-- Epoch 2
Norm: 33.14, NNZs: 512, Bias: -3.000000, T: 1308736, Avg. loss: 0.000336
Total training time: 197.20 seconds.
-- Epoch 3
Norm: 15.43, NNZs: 146, Bias: -4.000000, T: 654368, Avg. loss: 0.000209
Total training time: 98.60 seconds.
-- Epoch 2
Norm: 95.70, NNZs: 2916, Bias: -6.000000, T: 1963104, Avg. loss: 0.002578
Total training time: 296.83 seconds.
-- Epoch 4
Norm: 58.31, NNZs: 1327, Bias: -5.000000, T: 1308736, Avg. loss: 0.001068
Total training time: 196.61 seconds.
-- Epoch 3
Norm: 40.25, NNZs: 594, Bias: -5.000000, T: 1963104, Avg. loss: 0.000254
Total training time: 295.09 seconds.
-- Epoch 4
Norm: 21.49, NNZs: 215, Bias: -4.000000, T: 1308736, Avg. loss: 0.000122
Total training time: 197.12 seconds.
-- Epoch 3
Norm: 107.16, NNZs: 3192, Bias: -

[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed: 168.7min


Norm: 43.61, NNZs: 856, Bias: -4.000000, T: 1308736, Avg. loss: 0.000657
Total training time: 197.71 seconds.
-- Epoch 3
Norm: 60.73, NNZs: 1515, Bias: -5.000000, T: 1308736, Avg. loss: 0.000880
Total training time: 196.59 seconds.
-- Epoch 3
Norm: 240.29, NNZs: 12854, Bias: -5.000000, T: 3926208, Avg. loss: 0.011008
Total training time: 605.69 seconds.
-- Epoch 7
Norm: 5.48, NNZs: 30, Bias: -2.000000, T: 654368, Avg. loss: 0.000029
Total training time: 98.15 seconds.
-- Epoch 2
Norm: 52.74, NNZs: 994, Bias: -5.000000, T: 1963104, Avg. loss: 0.000515
Total training time: 295.84 seconds.
-- Epoch 4
Norm: 71.65, NNZs: 1749, Bias: -5.000000, T: 1963104, Avg. loss: 0.000619
Total training time: 294.61 seconds.
-- Epoch 4
Norm: 252.28, NNZs: 13199, Bias: -5.000000, T: 4580576, Avg. loss: 0.010995
Total training time: 706.63 seconds.
-- Epoch 8
Norm: 7.75, NNZs: 42, Bias: -3.000000, T: 1308736, Avg. loss: 0.000021
Total training time: 196.78 seconds.
-- Epoch 3
Norm: 59.48, NNZs: 1098, Bias:

[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed: 206.7min


Norm: 45.80, NNZs: 904, Bias: -6.000000, T: 1963104, Avg. loss: 0.000558
Total training time: 296.51 seconds.
-- Epoch 4
Norm: 76.79, NNZs: 1482, Bias: -6.000000, T: 3926208, Avg. loss: 0.000193
Total training time: 590.46 seconds.
Convergence after 6 epochs took 590.46 seconds
-- Epoch 1
Norm: 113.81, NNZs: 2581, Bias: -6.000000, T: 4580576, Avg. loss: 0.000465
Total training time: 685.64 seconds.
Convergence after 7 epochs took 685.64 seconds
-- Epoch 1
Norm: 27.57, NNZs: 619, Bias: -3.000000, T: 654368, Avg. loss: 0.001042
Total training time: 99.10 seconds.
-- Epoch 2
Norm: 53.10, NNZs: 1008, Bias: -4.000000, T: 2617472, Avg. loss: 0.000468
Total training time: 396.29 seconds.
-- Epoch 5
Norm: 30.72, NNZs: 539, Bias: -4.000000, T: 654368, Avg. loss: 0.000894
Total training time: 96.69 seconds.
-- Epoch 2
Norm: 19.39, NNZs: 178, Bias: -3.000000, T: 654368, Avg. loss: 0.000249
Total training time: 97.73 seconds.
-- Epoch 2
Norm: 42.10, NNZs: 900, Bias: -3.000000, T: 1308736, Avg. los

[Parallel(n_jobs=-1)]: Done  87 out of  87 | elapsed: 235.2min finished


In [21]:
y_pred_per = per.predict(X_test)

In [22]:
accuracy_score(y_test, y_pred_per)

0.8604577240940877

In [23]:
f1_score(y_test, y_pred_per, average='weighted', zero_division=0)

0.8495201254201924

In [24]:
print(classification_report(y_pred=per.predict(X_test), y_true=y_test, zero_division=0))

                       precision    recall  f1-score   support

                    B       0.00      0.00      0.00         5
            B-ADDRESS       0.00      0.00      0.00        32
           B-DATETIME       0.36      0.23      0.28      1181
      B-DATETIME-DATE       0.36      0.19      0.25       660
 B-DATETIME-DATERANGE       0.33      0.01      0.01       154
  B-DATETIME-DURATION       0.09      0.45      0.15       460
       B-DATETIME-SET       0.00      0.00      0.00        13
      B-DATETIME-TIME       0.09      0.16      0.11       115
 B-DATETIME-TIMERANGE       0.00      0.00      0.00       138
              B-EMAIL       0.00      0.00      0.00        15
              B-EVENT       0.24      0.12      0.16       147
          B-EVENT-CUL       0.00      0.00      0.00        91
     B-EVENT-GAMESHOW       0.39      0.12      0.19        88
      B-EVENT-NATURAL       0.00      0.00      0.00        35
        B-EVENT-SPORT       0.72      0.49      0.59  

In [25]:
f1_score(y_test, y_pred_per, average='weighted', labels=new_classes, zero_division=0)

0.7548397346005113

In [26]:
print(classification_report(y_pred=per.predict(X_test), y_true=y_test, labels=new_classes, zero_division=0))

                       precision    recall  f1-score   support

                    B       0.00      0.00      0.00         5
            B-ADDRESS       0.00      0.00      0.00        32
           B-DATETIME       0.36      0.23      0.28      1181
      B-DATETIME-DATE       0.36      0.19      0.25       660
 B-DATETIME-DATERANGE       0.33      0.01      0.01       154
  B-DATETIME-DURATION       0.09      0.45      0.15       460
       B-DATETIME-SET       0.00      0.00      0.00        13
      B-DATETIME-TIME       0.09      0.16      0.11       115
 B-DATETIME-TIMERANGE       0.00      0.00      0.00       138
              B-EMAIL       0.00      0.00      0.00        15
              B-EVENT       0.24      0.12      0.16       147
          B-EVENT-CUL       0.00      0.00      0.00        91
     B-EVENT-GAMESHOW       0.39      0.12      0.19        88
      B-EVENT-NATURAL       0.00      0.00      0.00        35
        B-EVENT-SPORT       0.72      0.49      0.59  

# `Save model`

In [54]:
import pickle

with open(f'/kaggle/working/perceptron_model.pkl', 'wb') as file:
    pickle.dump(per, file)