In [41]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore') 
from collections import Counter
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import classification_report, f1_score, accuracy_score, log_loss

In [42]:
class config:
    DATA = '/kaggle/input/vner-vlsp-2021/processed_data.csv'

# `Load processed dataset`

In [43]:
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df


def import_data(file):
    """create a dataframe and optimize its memory usage"""
    df = pd.read_csv(file, parse_dates=True, keep_date_col=True)
    df = reduce_mem_usage(df)
    return df

In [44]:
dataset = import_data(config.DATA)

Memory usage of dataframe is 22.47 MB
Memory usage after optimization is: 9.10 MB
Decreased by 59.5%


In [46]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9816 entries, 120957 to 774467
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   Word        9816 non-null   category
 1   Tag         9816 non-null   category
 2   Sentence #  9816 non-null   category
dtypes: category(3)
memory usage: 9.1 MB


In [47]:
tag_counts = dataset['Tag'].value_counts()

tags_to_keep = tag_counts[tag_counts >= 2].index

dataset = dataset[dataset['Tag'].isin(tags_to_keep)]

In [48]:
X = dataset.drop('Tag', axis=1)

In [49]:
v = DictVectorizer(sparse=False)
X = v.fit_transform(X.to_dict('records'))
y = dataset.Tag.values

In [50]:
classes = np.unique(y)
classes = classes.tolist()
# classes

In [51]:
new_classes = classes.copy()
new_classes.pop()
# new_classes

'O'

In [52]:
unique, counts = np.unique(y, return_counts=True)
class_distribution = dict(zip(unique, counts))
print("Class distribution:", class_distribution)

Class distribution: {'B-DATETIME': 29, 'B-DATETIME-DATE': 17, 'B-DATETIME-DATERANGE': 4, 'B-DATETIME-DURATION': 24, 'B-DATETIME-TIME': 2, 'B-DATETIME-TIMERANGE': 7, 'B-EVENT': 7, 'B-EVENT-CUL': 3, 'B-EVENT-GAMESHOW': 5, 'B-EVENT-NATURAL': 3, 'B-EVENT-SPORT': 4, 'B-LOCATION': 74, 'B-LOCATION-GEO': 3, 'B-LOCATION-GPE': 84, 'B-LOCATION-STRUC': 5, 'B-MISCELLANEOUS': 6, 'B-ORGANIZATION': 107, 'B-ORGANIZATION-MED': 3, 'B-ORGANIZATION-SPORTS': 19, 'B-PERSON': 155, 'B-PERSONTYPE': 41, 'B-PHONENUMBER': 3, 'B-PRODUCT': 32, 'B-PRODUCT-COM': 10, 'B-PRODUCT-LEGAL': 5, 'B-QUANTITY': 40, 'B-QUANTITY-AGE': 5, 'B-QUANTITY-CUR': 14, 'B-QUANTITY-DIM': 4, 'B-QUANTITY-NUM': 52, 'B-QUANTITY-ORD': 9, 'B-QUANTITY-PER': 14, 'B-URL': 3, 'I-ADDRESS': 8, 'I-DATETIME': 25, 'I-DATETIME-DATE': 9, 'I-DATETIME-DATERANGE': 13, 'I-DATETIME-DURATION': 28, 'I-DATETIME-SET': 3, 'I-DATETIME-TIME': 7, 'I-DATETIME-TIMERANGE': 19, 'I-EVENT': 21, 'I-EVENT-CUL': 2, 'I-EVENT-GAMESHOW': 17, 'I-EVENT-SPORT': 12, 'I-LOCATION': 22, '

In [53]:
def shuffle(matrix, target, test_proportion):
    ratio = int(matrix.shape[0]/test_proportion)
    X_train = matrix[ratio:,:]
    X_test = matrix[:ratio,:]
    Y_train = target[ratio:]
    Y_test = target[:ratio]
    return X_train, X_test, Y_train, Y_test

In [54]:
X_train, X_test, y_train, y_test = shuffle(X, y, 3)

In [55]:
def filter_classes(matrix, target, min_class_count = 2):
    class_counts = Counter(target)
    classes_to_keep = {cls for cls, count in class_counts.items() if count >= min_class_count}
    
    mask = np.isin(target, list(classes_to_keep))
    
    return matrix[mask], target[mask]

In [56]:
X_train, y_train = filter_classes(X, y)

In [58]:
X_train.shape, y_train.shape

((654368, 69313), (654368,))

In [59]:
X_test.shape, y_test.shape

((327184, 69313), (327184,))

# `Linear classifiers with SGD training`

In [60]:
sgd = SGDClassifier(max_iter=10, verbose=10, n_jobs=-1, early_stopping=True, validation_fraction=0.2, n_iter_no_change=5)
sgd.fit(X_train, y_train)
# sgd.partial_fit(X_train, y_train, classes)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.


-- Epoch 1
-- Epoch 1
-- Epoch 1
-- Epoch 1
Norm: 3.20, NNZs: 9, Bias: -5.804261, T: 7848, Avg. loss: 0.004013
Total training time: 0.23 seconds.
Norm: 6.97, NNZs: 37, Bias: -1.433061, T: 7848, Avg. loss: 0.012145
Total training time: 0.27 seconds.
Norm: 8.31, NNZs: 55, Bias: -2.375032, T: 7848, Avg. loss: 0.018692
Total training time: 0.26 seconds.
Norm: 8.31, NNZs: 59, Bias: -4.088670, T: 7848, Avg. loss: 0.016153
Total training time: 0.25 seconds.
-- Epoch 2
-- Epoch 2
-- Epoch 2
-- Epoch 2
Norm: 6.78, NNZs: 55, Bias: -1.084312, T: 15696, Avg. loss: 0.003287
Total training time: 0.54 seconds.
Norm: 2.93, NNZs: 9, Bias: -3.355973, T: 15696, Avg. loss: 0.001741
Total training time: 0.54 seconds.
-- Epoch 3
Norm: 7.62, NNZs: 84, Bias: -1.040606, T: 15696, Avg. loss: 0.007439
Total training time: 0.57 seconds.
Norm: 7.14, NNZs: 77, Bias: -2.014687, T: 15696, Avg. loss: 0.006380
Total training time: 0.56 seconds.
-- Epoch 3
-- Epoch 3
-- Epoch 3
Norm: 5.85, NNZs: 65, Bias: -1.236630, T: 

[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    4.2s


Norm: 3.82, NNZs: 41, Bias: -1.280943, T: 39240, Avg. loss: 0.000731
Total training time: 1.79 seconds.
-- Epoch 1
-- Epoch 6
Norm: 2.22, NNZs: 15, Bias: -1.089422, T: 39240, Avg. loss: 0.000093
Total training time: 1.63 seconds.
-- Epoch 6
Norm: 4.04, NNZs: 57, Bias: -1.184909, T: 39240, Avg. loss: 0.000480
Total training time: 1.96 seconds.
-- Epoch 6
Norm: 3.57, NNZs: 11, Bias: -2.105281, T: 7848, Avg. loss: 0.003805
Total training time: 0.31 seconds.
Norm: 2.26, NNZs: 19, Bias: -1.315865, T: 47088, Avg. loss: 0.000134
Total training time: 1.94 seconds.
-- Epoch 2
Norm: 3.68, NNZs: 44, Bias: -1.062826, T: 47088, Avg. loss: 0.000576
Total training time: 2.20 seconds.
Convergence after 6 epochs took 1.99 seconds
-- Epoch 7
Norm: 3.80, NNZs: 63, Bias: -1.194400, T: 47088, Avg. loss: 0.000356
Total training time: 2.38 seconds.
-- Epoch 1
Convergence after 6 epochs took 2.43 seconds
-- Epoch 1
Norm: 3.49, NNZs: 16, Bias: -1.144739, T: 15696, Avg. loss: 0.000915
Total training time: 0.74 

[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    7.0s


Norm: 2.20, NNZs: 9, Bias: -1.964272, T: 47088, Avg. loss: 0.000322
Total training time: 2.21 seconds.
Norm: 2.54, NNZs: 9, Bias: -2.573217, T: 15696, Avg. loss: 0.000911
Total training time: 0.52 seconds.
-- Epoch 1
Convergence after 6 epochs took 2.30 seconds
Norm: 12.19, NNZs: 468, Bias: -1.131672, T: 39240, Avg. loss: 0.005449
Total training time: 1.86 seconds.
-- Epoch 3
-- Epoch 6
-- Epoch 1
Norm: 17.14, NNZs: 226, Bias: -2.195123, T: 7848, Avg. loss: 0.042285
Total training time: 0.35 seconds.
Norm: 2.44, NNZs: 9, Bias: -1.587364, T: 23544, Avg. loss: 0.000389
Total training time: 0.94 seconds.
Norm: 4.23, NNZs: 14, Bias: -2.375874, T: 7848, Avg. loss: 0.003047
Total training time: 0.16 seconds.
-- Epoch 2
-- Epoch 4
Norm: 11.82, NNZs: 519, Bias: -1.567845, T: 47088, Avg. loss: 0.004849
Total training time: 2.25 seconds.
-- Epoch 2Convergence after 6 epochs took 2.30 seconds

-- Epoch 1
Norm: 14.87, NNZs: 322, Bias: -1.278360, T: 15696, Avg. loss: 0.013924
Total training time: 0

[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   10.7s


-- Epoch 3
-- Epoch 3
Norm: 2.51, NNZs: 6, Bias: -2.347420, T: 31392, Avg. loss: 0.000437
Total training time: 1.38 seconds.
-- Epoch 5
Norm: 12.38, NNZs: 111, Bias: -1.467932, T: 7848, Avg. loss: 0.021794
Total training time: 0.24 seconds.
-- Epoch 2
Norm: 6.70, NNZs: 82, Bias: -1.192427, T: 23544, Avg. loss: 0.001713
Total training time: 0.94 seconds.
-- Epoch 4
Norm: 19.81, NNZs: 835, Bias: -1.027827, T: 23544, Avg. loss: 0.010256
Total training time: 1.04 seconds.
-- Epoch 4
Norm: 2.41, NNZs: 7, Bias: -2.092549, T: 39240, Avg. loss: 0.000350
Total training time: 1.76 seconds.
Norm: 10.31, NNZs: 152, Bias: -1.643969, T: 15696, Avg. loss: 0.007878
Total training time: 0.60 seconds.
Norm: 6.28, NNZs: 92, Bias: -1.211998, T: 31392, Avg. loss: 0.001140
Total training time: 1.24 seconds.
-- Epoch 6
-- Epoch 3
-- Epoch 5
Norm: 18.30, NNZs: 974, Bias: -1.042957, T: 31392, Avg. loss: 0.008824
Total training time: 1.40 seconds.
-- Epoch 5
Norm: 9.66, NNZs: 191, Bias: -1.531935, T: 23544, Avg

[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   15.0s


Convergence after 6 epochs took 2.18 seconds
Norm: 3.92, NNZs: 14, Bias: -5.122722, T: 7848, Avg. loss: 0.005329
Total training time: 0.34 seconds.
-- Epoch 1
-- Epoch 2
Norm: 7.15, NNZs: 41, Bias: -1.336900, T: 7848, Avg. loss: 0.006983
Total training time: 0.34 seconds.
Norm: 9.31, NNZs: 193, Bias: -1.463828, T: 23544, Avg. loss: 0.005824
Total training time: 1.02 seconds.
-- Epoch 4
-- Epoch 2
Norm: 3.39, NNZs: 14, Bias: -1.946636, T: 15696, Avg. loss: 0.001837
Total training time: 0.64 seconds.
Norm: 3.20, NNZs: 8, Bias: -4.508229, T: 7848, Avg. loss: 0.003466
Total training time: 0.31 seconds.
-- Epoch 3
-- Epoch 2
Norm: 5.93, NNZs: 56, Bias: -2.438098, T: 15696, Avg. loss: 0.003270
Total training time: 0.70 seconds.
Norm: 8.85, NNZs: 226, Bias: -1.109514, T: 31392, Avg. loss: 0.004536
Total training time: 1.38 seconds.
-- Epoch 5
-- Epoch 3
Norm: 3.10, NNZs: 20, Bias: -2.313031, T: 23544, Avg. loss: 0.000850
Total training time: 1.00 seconds.
Norm: 2.93, NNZs: 9, Bias: -2.797031,

[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   20.3s


Norm: 7.33, NNZs: 120, Bias: -1.031278, T: 23544, Avg. loss: 0.003810
Total training time: 1.09 seconds.
-- Epoch 4
Norm: 3.78, NNZs: 35, Bias: -1.456587, T: 31392, Avg. loss: 0.001098
Total training time: 1.52 seconds.
Norm: 5.06, NNZs: 52, Bias: -1.055925, T: 23544, Avg. loss: 0.000630
Total training time: 0.90 seconds.
-- Epoch 5
Norm: 6.19, NNZs: 33, Bias: -2.232920, T: 7848, Avg. loss: 0.007714
Total training time: 0.23 seconds.
-- Epoch 4
-- Epoch 2
Norm: 6.99, NNZs: 140, Bias: -1.389168, T: 31392, Avg. loss: 0.002784
Total training time: 1.45 seconds.
Norm: 4.30, NNZs: 56, Bias: -1.067004, T: 31392, Avg. loss: 0.000286
Total training time: 1.15 seconds.
-- Epoch 5
Norm: 3.79, NNZs: 41, Bias: -1.427040, T: 39240, Avg. loss: 0.000714
Total training time: 1.88 seconds.
-- Epoch 5
Norm: 5.22, NNZs: 44, Bias: -1.982798, T: 15696, Avg. loss: 0.002728
Total training time: 0.53 seconds.
-- Epoch 6
-- Epoch 3
Norm: 6.63, NNZs: 154, Bias: -1.381265, T: 39240, Avg. loss: 0.002221
Total tra

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   26.1s


Norm: 2.37, NNZs: 7, Bias: -1.618583, T: 47088, Avg. loss: 0.000213
Total training time: 2.24 seconds.
Convergence after 6 epochs took 2.28 seconds
Norm: 5.26, NNZs: 66, Bias: -1.073529, T: 31392, Avg. loss: 0.001085
Total training time: 1.65 seconds.
-- Epoch 5
Norm: 8.90, NNZs: 66, Bias: -1.567431, T: 7848, Avg. loss: 0.013054
Total training time: 0.33 seconds.
-- Epoch 1
-- Epoch 2
Norm: 5.40, NNZs: 106, Bias: -1.569909, T: 39240, Avg. loss: 0.001910
Total training time: 1.94 seconds.
-- Epoch 6
Norm: 4.98, NNZs: 72, Bias: -1.103849, T: 39240, Avg. loss: 0.000966
Total training time: 2.06 seconds.
Norm: 3.92, NNZs: 13, Bias: -2.233944, T: 7848, Avg. loss: 0.004590
Total training time: 0.32 seconds.
Norm: 5.31, NNZs: 114, Bias: -1.313498, T: 47088, Avg. loss: 0.001872
Total training time: 2.18 seconds.
Norm: 8.08, NNZs: 94, Bias: -1.680368, T: 15696, Avg. loss: 0.004039
Total training time: 0.68 seconds.
Convergence after 6 epochs took 2.25 seconds
-- Epoch 2
-- Epoch 6
-- Epoch 3
--

[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:   33.7s


Norm: 2.53, NNZs: 8, Bias: -3.910178, T: 47088, Avg. loss: 0.000725
Total training time: 2.10 seconds.
-- Epoch 1
-- Epoch 1
Convergence after 6 epochs took 2.14 seconds
-- Epoch 1
Norm: 8.28, NNZs: 274, Bias: -1.331572, T: 47088, Avg. loss: 0.003148
Total training time: 2.09 seconds.
Convergence after 6 epochs took 2.15 seconds
Norm: 10.72, NNZs: 98, Bias: -2.125466, T: 7848, Avg. loss: 0.018170
Total training time: 0.26 seconds.
Norm: 3.20, NNZs: 5, Bias: -4.778865, T: 7848, Avg. loss: 0.002265
Total training time: 0.32 seconds.
-- Epoch 2
-- Epoch 2
-- Epoch 1
Norm: 4.80, NNZs: 15, Bias: -1.059648, T: 7848, Avg. loss: 0.003285
Total training time: 0.36 seconds.
-- Epoch 2
Norm: 9.20, NNZs: 133, Bias: -1.729302, T: 15696, Avg. loss: 0.006061
Total training time: 0.62 seconds.
Norm: 3.05, NNZs: 5, Bias: -3.517562, T: 15696, Avg. loss: 0.000830
Total training time: 0.67 seconds.
Norm: 6.39, NNZs: 35, Bias: -1.559402, T: 7848, Avg. loss: 0.007203
Total training time: 0.26 seconds.
-- Ep

[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:   40.1s


Norm: 2.51, NNZs: 15, Bias: -1.769140, T: 31392, Avg. loss: 0.000623
Total training time: 1.41 seconds.
-- Epoch 1
-- Epoch 6
-- Epoch 5
Norm: 7.09, NNZs: 200, Bias: -1.218024, T: 78480, Avg. loss: 0.001715
Total training time: 3.66 seconds.
-- Epoch 1
Norm: 5.98, NNZs: 15, Bias: -2.961388, T: 7848, Avg. loss: 0.003511
Total training time: 0.32 seconds.
Norm: 2.18, NNZs: 14, Bias: -2.031837, T: 47088, Avg. loss: 0.000712
Total training time: 1.98 seconds.
Norm: 2.41, NNZs: 19, Bias: -2.048468, T: 39240, Avg. loss: 0.000510
Total training time: 1.81 seconds.
-- Epoch 2
Convergence after 6 epochs took 2.07 seconds
-- Epoch 6
-- Epoch 1
Norm: 4.48, NNZs: 19, Bias: -1.588482, T: 15696, Avg. loss: 0.001013
Total training time: 0.65 seconds.
Norm: 3.20, NNZs: 8, Bias: -2.220993, T: 7848, Avg. loss: 0.003062
Total training time: 0.38 seconds.
-- Epoch 3
Norm: 2.39, NNZs: 21, Bias: -1.779830, T: 47088, Avg. loss: 0.000528
Total training time: 2.15 seconds.
-- Epoch 2
Convergence after 6 epochs

[Parallel(n_jobs=-1)]: Done  73 out of  73 | elapsed:   44.8s finished


In [61]:
y_pred_sgd = sgd.predict(X_test)

In [62]:
accuracy_score(y_test, y_pred_sgd)

0.9599388379204893

In [63]:
f1_score(y_test, y_pred_sgd, average='weighted', zero_division=0)

0.9553616197656198

In [64]:
print(classification_report(y_pred=y_pred_sgd, y_true=y_test, zero_division=0))

                       precision    recall  f1-score   support

           B-DATETIME       1.00      0.83      0.91         6
      B-DATETIME-DATE       1.00      0.50      0.67         6
 B-DATETIME-DATERANGE       1.00      1.00      1.00         2
  B-DATETIME-DURATION       1.00      0.25      0.40         4
 B-DATETIME-TIMERANGE       1.00      0.33      0.50         3
              B-EVENT       1.00      1.00      1.00         3
     B-EVENT-GAMESHOW       1.00      0.50      0.67         2
      B-EVENT-NATURAL       1.00      0.50      0.67         2
        B-EVENT-SPORT       1.00      1.00      1.00         2
           B-LOCATION       1.00      0.62      0.76        21
       B-LOCATION-GPE       1.00      0.78      0.88        27
     B-LOCATION-STRUC       1.00      1.00      1.00         1
      B-MISCELLANEOUS       1.00      1.00      1.00         2
       B-ORGANIZATION       1.00      0.75      0.86        44
   B-ORGANIZATION-MED       1.00      0.50      0.67  

In [65]:
f1_score(y_test, y_pred_sgd, average='weighted', labels=new_classes, zero_division=0)

0.8258385783618561

In [66]:
print(classification_report(y_pred=y_pred_sgd, y_true=y_test, labels=new_classes, zero_division=0))

                       precision    recall  f1-score   support

           B-DATETIME       1.00      0.83      0.91         6
      B-DATETIME-DATE       1.00      0.50      0.67         6
 B-DATETIME-DATERANGE       1.00      1.00      1.00         2
  B-DATETIME-DURATION       1.00      0.25      0.40         4
      B-DATETIME-TIME       0.00      0.00      0.00         0
 B-DATETIME-TIMERANGE       1.00      0.33      0.50         3
              B-EVENT       1.00      1.00      1.00         3
          B-EVENT-CUL       0.00      0.00      0.00         0
     B-EVENT-GAMESHOW       1.00      0.50      0.67         2
      B-EVENT-NATURAL       1.00      0.50      0.67         2
        B-EVENT-SPORT       1.00      1.00      1.00         2
           B-LOCATION       1.00      0.62      0.76        21
       B-LOCATION-GEO       0.00      0.00      0.00         0
       B-LOCATION-GPE       1.00      0.78      0.88        27
     B-LOCATION-STRUC       1.00      1.00      1.00  

# `Save model`

In [67]:
import pickle

with open(f'/kaggle/working/sgd_model.pkl', 'wb') as file:
    pickle.dump(sgd, file)