In [2]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore') 

import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import GridSearchCV

from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import classification_report, f1_score, accuracy_score, make_scorer

In [3]:
class config:
    """Notebook-wide settings, read as class attributes (e.g. ``config.DATA``)."""
    # Path of the processed CSV that the notebook loads as its dataset.
    DATA = '/kaggle/input/vner-vlsp-2021/processed_data.csv'

# `Load processed dataset`

In [4]:
def reduce_mem_usage(df):
    """Downcast the columns of ``df`` to shrink its memory footprint.

    Conversion rules, applied column by column:

    * ``object`` and pandas string columns become ``category``.
    * Signed / unsigned NumPy integer columns are cast to the narrowest
      integer type of the same signedness whose range contains the column's
      minimum and maximum (bounds inclusive).
    * NumPy float columns are cast to ``float16`` or ``float32`` when their
      minimum and maximum fit, otherwise to ``float64``.
    * Every other dtype (bool, datetime, timedelta, existing categoricals,
      nullable extension types, ...) is left untouched, which also makes the
      function safe to call again on its own output.

    A before/after memory report is written with ``print``.

    Args:
        df: DataFrame to optimise. Its columns are reassigned in place.

    Returns:
        The same DataFrame object that was passed in.

    Note:
        Float narrowing is decided on the value *range* only, so a column
        converted to ``float16``/``float32`` may lose precision.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Candidate types, narrowest first, per numeric family.
    signed_ints = (np.int8, np.int16, np.int32, np.int64)
    unsigned_ints = (np.uint8, np.uint16, np.uint32, np.uint64)
    floats = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object or isinstance(col_type, pd.StringDtype):
            df[col] = df[col].astype('category')
            continue

        # Extension dtypes (category, nullable ints, tz-aware datetimes, ...)
        # have no meaningful iinfo/finfo range; leave them alone.
        if not isinstance(col_type, np.dtype):
            continue

        if np.issubdtype(col_type, np.signedinteger):
            candidates, limits, fallback = signed_ints, np.iinfo, None
        elif np.issubdtype(col_type, np.unsignedinteger):
            candidates, limits, fallback = unsigned_ints, np.iinfo, None
        elif np.issubdtype(col_type, np.floating):
            # Floats that fit neither narrow type end up as float64.
            candidates, limits, fallback = floats, np.finfo, np.float64
        else:
            # bool, datetime64, timedelta64, complex, ...: not downcast.
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        new_type = fallback
        for candidate in candidates:
            bounds = limits(candidate)
            # NaN min/max (empty or all-NaN column) fails both comparisons,
            # so such columns fall through to the fallback.
            if bounds.min <= c_min and c_max <= bounds.max:
                new_type = candidate
                break

        if new_type is not None:
            df[col] = df[col].astype(new_type)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Skip the percentage for a zero-size frame instead of printing "nan%".
    if start_mem > 0:
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df


def import_data(file):
    """Read a CSV file into a DataFrame and shrink its memory footprint.

    Args:
        file: Path or file-like object accepted by ``pandas.read_csv``.

    Returns:
        The DataFrame returned by ``reduce_mem_usage`` for the loaded data.
    """
    # ``keep_date_col`` was dropped: it only has an effect when ``parse_dates``
    # combines several columns (not the case here) and it is deprecated in
    # recent pandas releases.
    df = pd.read_csv(file, parse_dates=True)
    return reduce_mem_usage(df)

In [5]:
# Load the processed CSV; the memory report printed below comes from reduce_mem_usage.
dataset = import_data(config.DATA)

Memory usage of dataframe is 22.47 MB
Memory usage after optimization is: 9.10 MB
Decreased by 59.5%


In [7]:
# Summarise column dtypes, non-null counts and memory usage of the loaded frame.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 981556 entries, 120957 to 774467
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   Word        981556 non-null   category
 1   Tag         981556 non-null   category
 2   Sentence #  981556 non-null   category
dtypes: category(3)
memory usage: 9.1 MB


In [8]:
# Keep only the rows whose tag occurs at least twice in the dataset.
tag_counts = dataset.Tag.value_counts()
tags_to_keep = tag_counts.loc[lambda counts: counts >= 2].index
dataset = dataset.loc[dataset.Tag.isin(tags_to_keep)]

In [9]:
# Feature frame: every column except the target tag.
X = dataset.drop(columns='Tag')

In [10]:
# One-hot encode the feature columns (each row becomes a {column: value} dict).
# The result is kept sparse: a dense one-hot matrix needs
# n_rows x n_distinct_values cells, which does not scale to the full corpus,
# while MultinomialNB and GridSearchCV accept SciPy sparse input directly.
v = DictVectorizer(sparse=True)
# Build the records from `dataset` rather than from `X` so that re-running this
# cell does not try to call .to_dict() on an already-vectorised matrix.
X = v.fit_transform(dataset.drop('Tag', axis=1).to_dict('records'))
y = dataset.Tag.values

In [11]:
# Sorted list of the distinct tags present in y.
classes = np.unique(y).tolist()
# classes

In [12]:
# Entity classes only: drop the 'O' tag by name rather than by position, so the
# list stays correct even when 'O' is absent or is not the last sorted label.
new_classes = [label for label in classes if label != 'O']
# new_classes

'O'

In [13]:
# Count how many rows carry each tag and show the mapping.
unique, counts = np.unique(y, return_counts=True)
class_distribution = {label: count for label, count in zip(unique, counts)}
print("Class distribution:", class_distribution)

Class distribution: {'B-DATETIME': 29, 'B-DATETIME-DATE': 17, 'B-DATETIME-DATERANGE': 4, 'B-DATETIME-DURATION': 24, 'B-DATETIME-TIME': 2, 'B-DATETIME-TIMERANGE': 7, 'B-EVENT': 7, 'B-EVENT-CUL': 3, 'B-EVENT-GAMESHOW': 5, 'B-EVENT-NATURAL': 3, 'B-EVENT-SPORT': 4, 'B-LOCATION': 74, 'B-LOCATION-GEO': 3, 'B-LOCATION-GPE': 84, 'B-LOCATION-STRUC': 5, 'B-MISCELLANEOUS': 6, 'B-ORGANIZATION': 107, 'B-ORGANIZATION-MED': 3, 'B-ORGANIZATION-SPORTS': 19, 'B-PERSON': 155, 'B-PERSONTYPE': 41, 'B-PHONENUMBER': 3, 'B-PRODUCT': 32, 'B-PRODUCT-COM': 10, 'B-PRODUCT-LEGAL': 5, 'B-QUANTITY': 40, 'B-QUANTITY-AGE': 5, 'B-QUANTITY-CUR': 14, 'B-QUANTITY-DIM': 4, 'B-QUANTITY-NUM': 52, 'B-QUANTITY-ORD': 9, 'B-QUANTITY-PER': 14, 'B-URL': 3, 'I-ADDRESS': 8, 'I-DATETIME': 25, 'I-DATETIME-DATE': 9, 'I-DATETIME-DATERANGE': 13, 'I-DATETIME-DURATION': 28, 'I-DATETIME-SET': 3, 'I-DATETIME-TIME': 7, 'I-DATETIME-TIMERANGE': 19, 'I-EVENT': 21, 'I-EVENT-CUL': 2, 'I-EVENT-GAMESHOW': 17, 'I-EVENT-SPORT': 12, 'I-LOCATION': 22, '

In [16]:
def shuffle(matrix, target, test_proportion, random_state=None):
    """Split features and labels into train and test parts.

    The first ``int(n_rows / test_proportion)`` rows become the test set and
    the remaining rows the training set, so ``test_proportion=3`` holds out
    one third of the data.

    Despite the name, nothing is reordered by default: the split follows the
    existing row order. Pass ``random_state`` to permute the rows (features
    and labels together) before splitting.

    Args:
        matrix: 2-D feature container exposing ``shape`` and ``[rows, :]``
            slicing.
        target: Labels aligned with the rows of ``matrix``.
        test_proportion: Divisor applied to the row count to size the test
            set (not a fraction).
        random_state: Optional seed (or anything accepted by
            ``numpy.random.default_rng``). ``None`` keeps the original order;
            otherwise ``matrix`` and ``target`` must support indexing with an
            integer array.

    Returns:
        Tuple ``(X_train, X_test, Y_train, Y_test)``.
    """
    n_rows = matrix.shape[0]

    if random_state is not None:
        # One shared permutation keeps every row paired with its label.
        order = np.random.default_rng(random_state).permutation(n_rows)
        matrix = matrix[order]
        target = target[order]

    ratio = int(n_rows / test_proportion)
    X_train = matrix[ratio:, :]
    X_test = matrix[:ratio, :]
    Y_train = target[ratio:]
    Y_test = target[:ratio]
    return X_train, X_test, Y_train, Y_test

In [17]:
# Hold out the first third of the rows for testing (test_proportion=3); the rest is training data.
X_train, X_test, y_train, y_test = shuffle(X, y, 3)

# `Naive Bayes classifier for multinomial models`

In [21]:
# Search space: 6 smoothing strengths x 2 prior settings = 12 candidates.
param_grid = dict(
    alpha=[0.0001, 0.001, 0.01, 0.1, 1.0, 10.0],
    fit_prior=[True, False],
)

# Base estimator whose hyper-parameters are tuned.
nb = MultinomialNB()

# 5-fold cross-validated grid search, ranked by weighted F1.
grid_search = GridSearchCV(
    nb,
    param_grid,
    scoring=make_scorer(f1_score, average='weighted'),
    cv=5,
    return_train_score=True,
    verbose=2,
)

In [22]:
# Run the cross-validated search on the training split (12 candidates x 5 folds = 60 fits).
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] END .......................alpha=0.0001, fit_prior=True; total time=   1.3s
[CV] END .......................alpha=0.0001, fit_prior=True; total time=   0.6s
[CV] END .......................alpha=0.0001, fit_prior=True; total time=   0.6s
[CV] END .......................alpha=0.0001, fit_prior=True; total time=   0.6s
[CV] END .......................alpha=0.0001, fit_prior=True; total time=   0.7s
[CV] END ......................alpha=0.0001, fit_prior=False; total time=   0.6s
[CV] END ......................alpha=0.0001, fit_prior=False; total time=   0.6s
[CV] END ......................alpha=0.0001, fit_prior=False; total time=   0.6s
[CV] END ......................alpha=0.0001, fit_prior=False; total time=   0.7s
[CV] END ......................alpha=0.0001, fit_prior=False; total time=   0.6s
[CV] END ........................alpha=0.001, fit_prior=True; total time=   0.6s
[CV] END ........................alpha=0.001, fi

In [23]:
# Condensed view of the search results, one row per candidate.
result_columns = [
    'mean_test_score',
    'std_test_score',
    'params',
    'rank_test_score',
    'mean_fit_time',
]
grid_search_results = pd.DataFrame(grid_search.cv_results_).loc[:, result_columns]
grid_search_results

Unnamed: 0,mean_test_score,std_test_score,params,rank_test_score,mean_fit_time
0,0.64393,0.013282,"{'alpha': 0.0001, 'fit_prior': True}",8,0.644791
1,0.239453,0.009409,"{'alpha': 0.0001, 'fit_prior': False}",11,0.529224
2,0.664687,0.016139,"{'alpha': 0.001, 'fit_prior': True}",6,0.539866
3,0.21743,0.016008,"{'alpha': 0.001, 'fit_prior': False}",12,0.524201
4,0.736023,0.005585,"{'alpha': 0.01, 'fit_prior': True}",4,0.523306
5,0.240382,0.009441,"{'alpha': 0.01, 'fit_prior': False}",10,0.517127
6,0.746244,0.004753,"{'alpha': 0.1, 'fit_prior': True}",3,0.569379
7,0.488103,0.018732,"{'alpha': 0.1, 'fit_prior': False}",9,0.524463
8,0.780681,0.000507,"{'alpha': 1.0, 'fit_prior': True}",1,0.527946
9,0.6453,0.005898,"{'alpha': 1.0, 'fit_prior': False}",7,0.515


In [24]:
# Keep the best estimator found by the search as the working model.
nb = grid_search.best_estimator_

# Report the winning configuration and its cross-validation score.
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

Best parameters: {'alpha': 1.0, 'fit_prior': True}
Best cross-validation score: 0.780680808063704


In [25]:
# nb = MultinomialNB(alpha=0.01)
# nb.fit(X_train, y_train)
# # nb.partial_fit(X_train, y_train, classes)

In [26]:
# Predict a tag for every row of the held-out split.
y_pred_nb = nb.predict(X_test)

In [27]:
# Share of held-out rows whose predicted tag equals the true tag.
accuracy_score(y_test, y_pred_nb)

0.854434250764526

In [28]:
# Weighted-average F1 over all tags; zero_division=0 scores undefined per-tag values as 0.
f1_score(y_test, y_pred_nb, average='weighted', zero_division=0)

0.7873645437454108

In [29]:
# Per-tag precision / recall / F1 / support on the held-out split.
print(classification_report(y_pred=y_pred_nb, y_true=y_test, zero_division=0))

                       precision    recall  f1-score   support

                    B       0.00      0.00      0.00         5
            B-ADDRESS       0.00      0.00      0.00        32
           B-DATETIME       0.36      0.23      0.28      1181
      B-DATETIME-DATE       0.36      0.19      0.25       660
 B-DATETIME-DATERANGE       0.33      0.01      0.01       154
  B-DATETIME-DURATION       0.09      0.45      0.15       460
       B-DATETIME-SET       0.00      0.00      0.00        13
      B-DATETIME-TIME       0.09      0.16      0.11       115
 B-DATETIME-TIMERANGE       0.00      0.00      0.00       138
              B-EMAIL       0.00      0.00      0.00        15
              B-EVENT       0.24      0.12      0.16       147
          B-EVENT-CUL       0.00      0.00      0.00        91
     B-EVENT-GAMESHOW       0.39      0.12      0.19        88
      B-EVENT-NATURAL       0.00      0.00      0.00        35
        B-EVENT-SPORT       0.72      0.49      0.59  

In [30]:
# Same weighted F1, restricted to the labels in new_classes (i.e. with the 'O' tag excluded).
f1_score(y_test, y_pred_nb, average='weighted', labels=new_classes, zero_division=0)

0.7034370764526

In [31]:
# Per-tag report restricted to the labels in new_classes (i.e. with the 'O' tag excluded).
print(classification_report(y_pred=y_pred_nb, y_true=y_test, labels = new_classes, zero_division=0))

                       precision    recall  f1-score   support

                    B       0.00      0.00      0.00         5
            B-ADDRESS       0.00      0.00      0.00        32
           B-DATETIME       0.36      0.23      0.28      1181
      B-DATETIME-DATE       0.36      0.19      0.25       660
 B-DATETIME-DATERANGE       0.33      0.01      0.01       154
  B-DATETIME-DURATION       0.09      0.45      0.15       460
       B-DATETIME-SET       0.00      0.00      0.00        13
      B-DATETIME-TIME       0.09      0.16      0.11       115
 B-DATETIME-TIMERANGE       0.00      0.00      0.00       138
              B-EMAIL       0.00      0.00      0.00        15
              B-EVENT       0.24      0.12      0.16       147
          B-EVENT-CUL       0.00      0.00      0.00        91
     B-EVENT-GAMESHOW       0.39      0.12      0.19        88
      B-EVENT-NATURAL       0.00      0.00      0.00        35
        B-EVENT-SPORT       0.72      0.49      0.59  

# `Save model`

In [47]:
import pickle

# Persist the tuned classifier (plain string: the path has no placeholders).
with open('/kaggle/working/nb_model.pkl', 'wb') as file:
    pickle.dump(nb, file)

# Persist the fitted DictVectorizer as well: new input has to be encoded with
# the exact feature mapping learned above before the saved model can score it.
with open('/kaggle/working/nb_vectorizer.pkl', 'wb') as file:
    pickle.dump(v, file)