In [29]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore') 

from collections import Counter

import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import classification_report, f1_score, accuracy_score

In [30]:
class config:
    DATA = '/kaggle/input/vner-vlsp-2021/processed_data.csv'

# `Load processed dataset`

In [31]:
def reduce_mem_usage(df):
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df


def import_data(file):
    df = pd.read_csv(file, parse_dates=True, keep_date_col=True)
    df = reduce_mem_usage(df)
    return df

In [32]:
dataset = import_data(config.DATA)

Memory usage of dataframe is 22.47 MB
Memory usage after optimization is: 9.10 MB
Decreased by 59.5%


In [34]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 981556 entries, 120957 to 82936
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   Word        981556 non-null  category
 1   Tag         981556 non-null  category
 2   Sentence #  981556 non-null  category
dtypes: category(3)
memory usage: 9.1 MB


In [35]:
tag_counts = dataset['Tag'].value_counts()

tags_to_keep = tag_counts[tag_counts >= 2].index

dataset = dataset[dataset['Tag'].isin(tags_to_keep)]

In [36]:
X = dataset.drop('Tag', axis=1)

In [37]:
v = DictVectorizer(sparse=False)
X = v.fit_transform(X.to_dict('records'))
y = dataset.Tag.values

In [38]:
classes = np.unique(y)
classes = classes.tolist()
# classes

In [39]:
new_classes = classes.copy()
new_classes.pop()
# new_classes

'O'

In [40]:
unique, counts = np.unique(y, return_counts=True)
class_distribution = dict(zip(unique, counts))
print("Class distribution:", class_distribution)

Class distribution: {'B': 3, 'B-ADDRESS': 5, 'B-DATETIME': 145, 'B-DATETIME-DATE': 83, 'B-DATETIME-DATERANGE': 26, 'B-DATETIME-DURATION': 83, 'B-DATETIME-SET': 2, 'B-DATETIME-TIME': 16, 'B-DATETIME-TIMERANGE': 23, 'B-EVENT': 28, 'B-EVENT-CUL': 15, 'B-EVENT-GAMESHOW': 22, 'B-EVENT-NATURAL': 8, 'B-EVENT-SPORT': 33, 'B-IP': 5, 'B-LOCATION': 328, 'B-LOCATION-GEO': 20, 'B-LOCATION-GPE': 428, 'B-LOCATION-STRUC': 32, 'B-MISCELLANEOUS': 32, 'B-ORGANIZATION': 493, 'B-ORGANIZATION-MED': 19, 'B-ORGANIZATION-SPORTS': 94, 'B-PERSON': 787, 'B-PERSONTYPE': 226, 'B-PHONENUMBER': 7, 'B-PRODUCT': 123, 'B-PRODUCT-AWARD': 6, 'B-PRODUCT-COM': 64, 'B-PRODUCT-LEGAL': 12, 'B-QUANTITY': 189, 'B-QUANTITY-AGE': 32, 'B-QUANTITY-CUR': 66, 'B-QUANTITY-DIM': 21, 'B-QUANTITY-NUM': 276, 'B-QUANTITY-ORD': 53, 'B-QUANTITY-PER': 61, 'B-QUANTITY-TEM': 3, 'B-SKILL': 2, 'B-URL': 18, 'I': 2, 'I-ADDRESS': 30, 'I-DATETIME': 157, 'I-DATETIME-DATE': 71, 'I-DATETIME-DATERANGE': 86, 'I-DATETIME-DURATION': 138, 'I-DATETIME-SET': 15

In [41]:
def shuffle(matrix, target, test_proportion):
    ratio = int(matrix.shape[0]/test_proportion)
    X_train = matrix[ratio:,:]
    X_test = matrix[:ratio,:]
    Y_train = target[ratio:]
    Y_test = target[:ratio]
    return X_train, X_test, Y_train, Y_test

In [42]:
X_train, X_test, y_train, y_test = shuffle(X, y, 3)

In [43]:
def filter_classes(matrix, target, min_class_count = 2):
    class_counts = Counter(target)
    classes_to_keep = {cls for cls, count in class_counts.items() if count >= min_class_count}
    
    mask = np.isin(target, list(classes_to_keep))
    
    return matrix[mask], target[mask]


In [44]:
X_train, y_train = filter_classes(X_train, y_train)

In [45]:
# from dask_ml.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [46]:
X_train.shape, y_train.shape

((654368, 69313), (654368,))

In [47]:
X_test.shape, y_test.shape

((327184, 69313), (327184,))

# `Passive Aggressive Classifier`

In [48]:
pa = PassiveAggressiveClassifier(verbose=10, n_jobs=-1, max_iter=100, early_stopping=True, validation_fraction=0.2, n_iter_no_change=5)
pa.fit(X_train, y_train)
# # pa.partial_fit(X_train, y_train, classes)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.


-- Epoch 1-- Epoch 1

-- Epoch 1
-- Epoch 1
Norm: 2.05, NNZs: 320, Bias: -1.375000, T: 26172, Avg. loss: 0.000439
Total training time: 2.63 seconds.
Norm: 1.91, NNZs: 227, Bias: -2.242187, T: 26172, Avg. loss: 0.000477
Total training time: 2.78 seconds.
Norm: 8.57, NNZs: 2339, Bias: -0.996837, T: 26172, Avg. loss: 0.006727
Total training time: 2.79 seconds.
-- Epoch 2
Norm: 10.90, NNZs: 3050, Bias: -1.250174, T: 26172, Avg. loss: 0.011638
Total training time: 3.15 seconds.
-- Epoch 2-- Epoch 2

-- Epoch 2
Norm: 2.51, NNZs: 423, Bias: -1.914055, T: 52344, Avg. loss: 0.000194
Total training time: 6.26 seconds.
Norm: 2.50, NNZs: 319, Bias: -2.746093, T: 52344, Avg. loss: 0.000306
Total training time: 6.69 seconds.
Norm: 11.54, NNZs: 2987, Bias: -1.176935, T: 52344, Avg. loss: 0.003256
Total training time: 6.64 seconds.
-- Epoch 3
Norm: 15.27, NNZs: 3620, Bias: -2.207533, T: 52344, Avg. loss: 0.006310
Total training time: 6.97 seconds.
-- Epoch 3
-- Epoch 3
-- Epoch 3
Norm: 3.27, NNZs: 423

[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   43.7s


-- Epoch 1
Norm: 7.94, NNZs: 1569, Bias: -2.017857, T: 130860, Avg. loss: 0.000202
Total training time: 16.43 seconds.
Norm: 10.41, NNZs: 1317, Bias: -2.637008, T: 157032, Avg. loss: 0.000223
Total training time: 20.32 seconds.
Convergence after 6 epochs took 20.75 seconds
-- Epoch 6
Norm: 15.16, NNZs: 2231, Bias: -2.387663, T: 157032, Avg. loss: 0.000475
Total training time: 20.96 seconds.
-- Epoch 1
Convergence after 6 epochs took 21.30 seconds
-- Epoch 1
Norm: 5.04, NNZs: 1240, Bias: -1.921873, T: 26172, Avg. loss: 0.002016
Total training time: 3.66 seconds.
Norm: 8.11, NNZs: 1572, Bias: -1.989931, T: 157032, Avg. loss: 0.000110
Total training time: 20.16 seconds.
-- Epoch 2
Convergence after 6 epochs took 20.79 seconds
Norm: 5.40, NNZs: 1420, Bias: -1.563476, T: 26172, Avg. loss: 0.002395
Total training time: 3.84 seconds.
-- Epoch 1
-- Epoch 2
Norm: 4.65, NNZs: 974, Bias: -1.218262, T: 26172, Avg. loss: 0.001526
Total training time: 3.61 seconds.
-- Epoch 2
Norm: 6.56, NNZs: 1259,

[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  1.1min


Convergence after 6 epochs took 21.20 seconds
-- Epoch 1
-- Epoch 1
Norm: 7.19, NNZs: 1586, Bias: -1.809224, T: 157032, Avg. loss: 0.000052
Total training time: 20.57 seconds.
Convergence after 6 epochs took 20.86 seconds
Norm: 2.78, NNZs: 439, Bias: -1.906235, T: 26172, Avg. loss: 0.000719
Total training time: 3.51 seconds.
-- Epoch 2
-- Epoch 1
Norm: 3.07, NNZs: 520, Bias: -1.500000, T: 26172, Avg. loss: 0.000707
Total training time: 2.89 seconds.
Norm: 5.99, NNZs: 1758, Bias: -1.818359, T: 26172, Avg. loss: 0.002830
Total training time: 3.48 seconds.
-- Epoch 2
-- Epoch 2
Norm: 3.82, NNZs: 639, Bias: -2.049419, T: 52344, Avg. loss: 0.000364
Total training time: 6.65 seconds.
-- Epoch 3
Norm: 3.83, NNZs: 614, Bias: -1.179688, T: 52344, Avg. loss: 0.000208
Total training time: 5.83 seconds.
Norm: 17.79, NNZs: 6716, Bias: -1.000000, T: 26172, Avg. loss: 0.025324
Total training time: 3.41 seconds.
-- Epoch 3
Norm: 7.57, NNZs: 2274, Bias: -1.462256, T: 52344, Avg. loss: 0.001002
Total tr

[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:  1.8min


-- Epoch 1
Norm: 9.66, NNZs: 2006, Bias: -1.927945, T: 157032, Avg. loss: 0.000127
Total training time: 19.14 seconds.
Convergence after 6 epochs took 19.43 seconds
Norm: 9.61, NNZs: 1664, Bias: -1.829924, T: 104688, Avg. loss: 0.000588
Total training time: 13.16 seconds.
Norm: 30.36, NNZs: 7392, Bias: -1.705023, T: 157032, Avg. loss: 0.001247
Total training time: 20.66 seconds.
-- Epoch 5
-- Epoch 1
-- Epoch 7
Norm: 22.13, NNZs: 8524, Bias: -1.000000, T: 26172, Avg. loss: 0.035082
Total training time: 3.55 seconds.
-- Epoch 2
Norm: 9.94, NNZs: 1670, Bias: -1.978047, T: 130860, Avg. loss: 0.000299
Total training time: 16.36 seconds.
Norm: 30.79, NNZs: 7400, Bias: -1.864715, T: 183204, Avg. loss: 0.001287
Total training time: 23.80 seconds.
Norm: 4.74, NNZs: 1030, Bias: -1.780832, T: 26172, Avg. loss: 0.001637
Total training time: 2.83 seconds.
-- Epoch 6
Convergence after 7 epochs took 24.28 seconds
-- Epoch 2
-- Epoch 1
Norm: 26.93, NNZs: 9271, Bias: -1.867223, T: 52344, Avg. loss: 0.

[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  2.3min


-- Epoch 1
Norm: 10.55, NNZs: 3503, Bias: -1.015730, T: 26172, Avg. loss: 0.008742
Total training time: 3.46 seconds.
-- Epoch 2
Norm: 2.45, NNZs: 318, Bias: -1.000000, T: 52344, Avg. loss: 0.000000
Total training time: 6.05 seconds.
-- Epoch 3Norm: 20.78, NNZs: 5662, Bias: -1.404333, T: 78516, Avg. loss: 0.003761
Total training time: 9.83 seconds.

-- Epoch 4
Norm: 2.35, NNZs: 319, Bias: -1.500000, T: 26172, Avg. loss: 0.000420
Total training time: 2.94 seconds.
-- Epoch 2
Norm: 13.65, NNZs: 3854, Bias: -1.066351, T: 52344, Avg. loss: 0.003789
Total training time: 6.77 seconds.
-- Epoch 3
Norm: 2.45, NNZs: 318, Bias: -1.000000, T: 78516, Avg. loss: 0.000000
Total training time: 9.17 seconds.
-- Epoch 4
Norm: 22.11, NNZs: 5685, Bias: -2.178895, T: 104688, Avg. loss: 0.002707
Total training time: 13.01 seconds.
-- Epoch 5
Norm: 2.90, NNZs: 419, Bias: -1.437500, T: 52344, Avg. loss: 0.000129
Total training time: 6.23 seconds.
-- Epoch 3
Norm: 15.24, NNZs: 4074, Bias: -1.297924, T: 78516,

[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  3.2min


-- Epoch 1
Norm: 25.78, NNZs: 5551, Bias: -2.098543, T: 104688, Avg. loss: 0.004052
Total training time: 14.12 seconds.
Norm: 13.04, NNZs: 2121, Bias: -2.205830, T: 104688, Avg. loss: 0.001071
Total training time: 13.09 seconds.
-- Epoch 5
-- Epoch 5
Norm: 8.48, NNZs: 1769, Bias: -2.031165, T: 157032, Avg. loss: 0.000151
Total training time: 20.27 seconds.
Convergence after 6 epochs took 20.56 seconds
-- Epoch 1
Norm: 13.43, NNZs: 2124, Bias: -2.465326, T: 130860, Avg. loss: 0.000488
Total training time: 16.08 seconds.
Norm: 7.64, NNZs: 2666, Bias: -1.738281, T: 26172, Avg. loss: 0.004571
Total training time: 3.28 seconds.
Norm: 26.85, NNZs: 5563, Bias: -2.298515, T: 130860, Avg. loss: 0.002496
Total training time: 17.33 seconds.
-- Epoch 2
-- Epoch 6
-- Epoch 6
Norm: 2.35, NNZs: 322, Bias: -1.500000, T: 26172, Avg. loss: 0.000420
Total training time: 2.80 seconds.
-- Epoch 2
Norm: 10.18, NNZs: 3283, Bias: -1.471295, T: 52344, Avg. loss: 0.002340
Total training time: 6.45 seconds.
Norm

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  4.0min


Norm: 15.41, NNZs: 2875, Bias: -1.925296, T: 78516, Avg. loss: 0.003152
Total training time: 9.29 seconds.
Norm: 14.42, NNZs: 4013, Bias: -1.724419, T: 130860, Avg. loss: 0.000390
Total training time: 17.04 seconds.
-- Epoch 1
-- Epoch 4
-- Epoch 6
Norm: 10.80, NNZs: 2600, Bias: -1.854634, T: 26172, Avg. loss: 0.010542
Total training time: 3.31 seconds.
-- Epoch 2
Norm: 3.73, NNZs: 690, Bias: -1.623968, T: 26172, Avg. loss: 0.001211
Total training time: 2.98 seconds.
-- Epoch 2
Norm: 14.61, NNZs: 4022, Bias: -1.616189, T: 157032, Avg. loss: 0.000216
Total training time: 20.42 seconds.
Norm: 16.55, NNZs: 2905, Bias: -2.244660, T: 104688, Avg. loss: 0.001783
Total training time: 12.81 seconds.
Norm: 15.23, NNZs: 2819, Bias: -2.344445, T: 52344, Avg. loss: 0.006525
Total training time: 6.50 seconds.
Convergence after 6 epochs took 20.97 seconds
-- Epoch 5
-- Epoch 3
-- Epoch 1
Norm: 4.92, NNZs: 799, Bias: -1.686068, T: 52344, Avg. loss: 0.000519
Total training time: 6.58 seconds.
-- Epoch

[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:  5.0min


-- Epoch 5
-- Epoch 1
Norm: 3.59, NNZs: 614, Bias: -1.880455, T: 78516, Avg. loss: 0.000135
Total training time: 10.44 seconds.
Norm: 2.80, NNZs: 418, Bias: -1.093719, T: 157032, Avg. loss: 0.000000
Total training time: 22.18 seconds.
-- Epoch 4
Convergence after 6 epochs took 22.58 seconds
-- Epoch 1
Norm: 18.64, NNZs: 6659, Bias: -1.616864, T: 130860, Avg. loss: 0.000811
Total training time: 17.96 seconds.
-- Epoch 6
Norm: 12.27, NNZs: 4559, Bias: -1.001067, T: 26172, Avg. loss: 0.011871
Total training time: 3.49 seconds.
-- Epoch 2
Norm: 3.93, NNZs: 614, Bias: -1.767828, T: 104688, Avg. loss: 0.000105
Total training time: 13.81 seconds.
-- Epoch 5
Norm: 7.26, NNZs: 2163, Bias: -1.337330, T: 26172, Avg. loss: 0.003917
Total training time: 3.24 seconds.
Norm: 18.90, NNZs: 6661, Bias: -1.699322, T: 157032, Avg. loss: 0.000400
Total training time: 21.44 seconds.
-- Epoch 2
Convergence after 6 epochs took 22.02 seconds
Norm: 15.62, NNZs: 5316, Bias: -1.848158, T: 52344, Avg. loss: 0.0052

[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:  6.0min


-- Epoch 1
Norm: 4.63, NNZs: 940, Bias: -2.020443, T: 52344, Avg. loss: 0.000510
Total training time: 7.39 seconds.
-- Epoch 3
Norm: 3.74, NNZs: 609, Bias: -1.492188, T: 104688, Avg. loss: 0.000038
Total training time: 14.92 seconds.
Norm: 21.27, NNZs: 5444, Bias: -1.936648, T: 104688, Avg. loss: 0.002575
Total training time: 14.58 seconds.
-- Epoch 5
-- Epoch 5
Norm: 7.73, NNZs: 2445, Bias: -1.698495, T: 26172, Avg. loss: 0.004844
Total training time: 3.13 seconds.
Norm: 5.40, NNZs: 948, Bias: -1.716659, T: 78516, Avg. loss: 0.000318
Total training time: 10.50 seconds.
-- Epoch 2
-- Epoch 4
Norm: 3.80, NNZs: 611, Bias: -1.492188, T: 130860, Avg. loss: 0.000019
Total training time: 18.18 seconds.
Norm: 22.25, NNZs: 5464, Bias: -1.808480, T: 130860, Avg. loss: 0.001977
Total training time: 17.74 seconds.
-- Epoch 6
-- Epoch 6
Norm: 5.54, NNZs: 950, Bias: -1.955081, T: 104688, Avg. loss: 0.000099
Total training time: 13.53 seconds.
Norm: 9.88, NNZs: 2758, Bias: -1.407826, T: 52344, Avg. 

[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:  7.8min finished


In [49]:
y_pred_pa = pa.predict(X_test)

In [50]:
accuracy_score(y_test, y_pred_pa)

0.8572563883115295

In [51]:
f1_score(y_test, y_pred_pa, average='weighted', zero_division=0)

0.8322958498116843

In [52]:
print(classification_report(y_pred=y_pred_pa, y_true=y_test, zero_division=0))

                       precision    recall  f1-score   support

                    B       0.00      0.00      0.00         5
            B-ADDRESS       0.00      0.00      0.00        32
           B-DATETIME       0.36      0.23      0.28      1181
      B-DATETIME-DATE       0.36      0.19      0.25       660
 B-DATETIME-DATERANGE       0.33      0.01      0.01       154
  B-DATETIME-DURATION       0.09      0.45      0.15       460
       B-DATETIME-SET       0.00      0.00      0.00        13
      B-DATETIME-TIME       0.09      0.16      0.11       115
 B-DATETIME-TIMERANGE       0.00      0.00      0.00       138
              B-EMAIL       0.00      0.00      0.00        15
              B-EVENT       0.24      0.12      0.16       147
          B-EVENT-CUL       0.00      0.00      0.00        91
     B-EVENT-GAMESHOW       0.39      0.12      0.19        88
      B-EVENT-NATURAL       0.00      0.00      0.00        35
        B-EVENT-SPORT       0.72      0.49      0.59  

In [53]:
f1_score(y_test, y_pred_pa, average='weighted', labels=new_classes, zero_division=0)

0.55217237543659

In [54]:
print(classification_report(y_pred=y_pred_pa, y_true=y_test, labels=new_classes, zero_division=0))

                       precision    recall  f1-score   support

                    B       0.00      0.00      0.00         5
            B-ADDRESS       0.00      0.00      0.00        32
           B-DATETIME       0.36      0.23      0.28      1181
      B-DATETIME-DATE       0.36      0.19      0.25       660
 B-DATETIME-DATERANGE       0.33      0.01      0.01       154
  B-DATETIME-DURATION       0.09      0.45      0.15       460
       B-DATETIME-SET       0.00      0.00      0.00        13
      B-DATETIME-TIME       0.09      0.16      0.11       115
 B-DATETIME-TIMERANGE       0.00      0.00      0.00       138
              B-EMAIL       0.00      0.00      0.00        15
              B-EVENT       0.24      0.12      0.16       147
          B-EVENT-CUL       0.00      0.00      0.00        91
     B-EVENT-GAMESHOW       0.39      0.12      0.19        88
      B-EVENT-NATURAL       0.00      0.00      0.00        35
        B-EVENT-SPORT       0.72      0.49      0.59  

# `Save model`

In [55]:
import pickle

with open(f'/kaggle/working/passive_aggressive_model.pkl', 'wb') as file:
    pickle.dump(pa, file)