In [2]:
from util.ner_machine_learning import extract_embeddings_as_features_and_gold, \
    extract_features_and_labels, create_classifier
from gensim.models import KeyedVectors
from sklearn.metrics import classification_report, make_scorer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import ComplementNB, GaussianNB, MultinomialNB, BernoulliNB, CategoricalNB
from sklearn.svm import SVC
from util.feature_extract import extract_left_token, extract_right_token
import pandas as pd
import numpy as np
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV

# Paths to the CoNLL-2003 train and dev files, relative to this notebook's
# working directory.
train_path = '../data/conll2003.train.conll'
dev_path = '../data/conll2003.dev.conll'

In [3]:
def _read_conll(path):
    """Read a tab-separated CoNLL file into a DataFrame with named columns.

    The file is read without a header row and with '|' as the quote character
    (presumably so that '"' tokens are kept literally -- TODO confirm).
    Columns 0-3 are renamed to 'token', 'pos', 'chunk_tag' and 'target'.
    """
    frame = pd.read_csv(path, delimiter='\t', quotechar='|', header=None)
    return frame.rename({0: 'token', 1: 'pos', 2: 'chunk_tag', 3: 'target'}, axis=1)


# Load the training data
data = _read_conll(train_path)
# .copy() detaches the feature frame from `data`, so adding feature columns to
# it later does not raise pandas' SettingWithCopyWarning.
X_train = data[['token', 'pos', 'chunk_tag']].copy()
Y_train = data['target']

# Load the development data (used for evaluation below)
data = _read_conll(dev_path)
X_dev = data[['token', 'pos', 'chunk_tag']].copy()
Y_dev = data['target']

In [4]:
# Add the neighbouring tokens (left and right of each token) as extra features.
# `.assign` returns a new frame instead of writing columns into a slice of the
# loaded data, which avoids SettingWithCopyWarning and keeps the cell safe to
# re-run. Column order (token_left, then token_right) is unchanged.
X_train = X_train.assign(
    token_left=extract_left_token(X_train['token']),
    token_right=extract_right_token(X_train['token']),
)
X_dev = X_dev.assign(
    token_left=extract_left_token(X_dev['token']),
    token_right=extract_right_token(X_dev['token']),
)

In [5]:
# handle_unknown='ignore': when an unknown category is encountered during
# transform, the resulting one-hot encoded columns for this feature will be
# all zeros.
enc = OneHotEncoder(handle_unknown='ignore')

# Fit on the training features and keep the encoded matrix from the same call,
# rather than discarding it and transforming the training set a second time.
X_train_ohe = enc.fit_transform(X_train)
# The dev set is only transformed, using the categories learned from training.
X_dev_ohe = enc.transform(X_dev)

### Logistic Regression

In [6]:
# Train a logistic-regression tagger on the one-hot features; the iteration
# cap is set very high so the solver is allowed to run to convergence.
logreg = LogisticRegression(max_iter=100_000)
model = logreg.fit(X_train_ohe, Y_train)

# Score on the dev split and show per-class precision / recall / F1.
Y_pred = model.predict(X_dev_ohe)
logreg_rpt = classification_report(
    y_true=Y_dev,
    y_pred=Y_pred,
    digits=3,
)
print(logreg_rpt)

              precision    recall  f1-score   support

       B-LOC      0.873     0.834     0.853      1837
      B-MISC      0.916     0.736     0.817       922
       B-ORG      0.861     0.723     0.786      1341
       B-PER      0.848     0.870     0.859      1842
       I-LOC      0.902     0.720     0.801       257
      I-MISC      0.904     0.624     0.738       346
       I-ORG      0.817     0.690     0.748       751
       I-PER      0.834     0.937     0.883      1307
           O      0.983     0.995     0.989     42759

    accuracy                          0.964     51362
   macro avg      0.882     0.792     0.830     51362
weighted avg      0.963     0.964     0.962     51362



### Naive Bayes

- https://scikit-learn.org/stable/modules/naive_bayes.html
- Gaussian Naive Bayes
- Multinomial Naive Bayes
- Complement Naive Bayes
- Bernoulli Naive Bayes
- Categorical Naive Bayes

In [9]:

# Naive Bayes variants that accept the sparse one-hot matrix directly:
# Complement (index 0), Multinomial (index 1) and Bernoulli (index 2).
models = [ComplementNB(), MultinomialNB(), BernoulliNB()]
for ind, nb_model in enumerate(models):
    print(f'for the {ind} Naive Bayes model:')
    model = nb_model.fit(X_train_ohe, Y_train)
    Y_pred = model.predict(X_dev_ohe)
    # Some models never predict certain classes, so their precision is
    # undefined. zero_division=0 reports those scores as 0 (the same value the
    # default yields) without emitting an UndefinedMetricWarning per report.
    cnb_rpt = classification_report(Y_dev, Y_pred, digits=3, zero_division=0)
    print(cnb_rpt)
    print('------------------')
    print()


for the 0 Naive Bayes model:
              precision    recall  f1-score   support

       B-LOC      0.679     0.898     0.773      1837
      B-MISC      0.691     0.785     0.735       922
       B-ORG      0.754     0.701     0.727      1341
       B-PER      0.852     0.789     0.819      1842
       I-LOC      0.717     0.650     0.682       257
      I-MISC      0.719     0.555     0.626       346
       I-ORG      0.662     0.589     0.623       751
       I-PER      0.761     0.881     0.817      1307
           O      0.989     0.978     0.984     42759

    accuracy                          0.945     51362
   macro avg      0.758     0.758     0.754     51362
weighted avg      0.948     0.945     0.946     51362

------------------

for the 1 Naive Bayes model:
              precision    recall  f1-score   support

       B-LOC      0.750     0.857     0.800      1837
      B-MISC      0.898     0.600     0.719       922
       B-ORG      0.837     0.601     0.700      1341


  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

       B-LOC      0.813     0.658     0.727      1837
      B-MISC      0.000     0.000     0.000       922
       B-ORG      0.849     0.248     0.383      1341
       B-PER      0.928     0.419     0.577      1842
       I-LOC      0.000     0.000     0.000       257
      I-MISC      0.000     0.000     0.000       346
       I-ORG      1.000     0.013     0.026       751
       I-PER      0.992     0.277     0.433      1307
           O      0.885     1.000     0.939     42759

    accuracy                          0.885     51362
   macro avg      0.608     0.290     0.343     51362
weighted avg      0.862     0.885     0.850     51362

------------------



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [8]:
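# NOTE(review): disabled experiment. This variant densifies the one-hot matrix
# with .todense(); the recorded output stops after the first header, so the run
# presumably did not finish (memory?) -- TODO confirm before re-enabling.
# models = [GaussianNB() ,CategoricalNB()]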
# for ind, nb_model in enumerate(models):
#     print(f'for the {ind} Naive Bayes model:')
#     model = nb_model.fit(X_train_ohe.todense(), Y_train)
#     Y_pred = model.predict(X_dev_ohe.todense())
#     cnb_rpt = classification_report(Y_dev, Y_pred)
#     print(cnb_rpt)
#     print('------------------')
#     print()

for the 0 Naive Bayes model:




: 

: 

### SVM

In [8]:

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Rule of thumb noted by the author: if the number of features is much smaller
# than the number of training examples, use a Gaussian (RBF) kernel SVM.
svc_clf = SVC(max_iter=10_000, kernel='rbf')

# Fit on the training split, then predict the dev split.
model = svc_clf.fit(X_train_ohe, Y_train)
Y_pred = model.predict(X_dev_ohe)

# Per-class precision / recall / F1 on the dev split.
svc_rpt = classification_report(
    y_true=Y_dev,
    y_pred=Y_pred,
    digits=3,
)
print(svc_rpt)




              precision    recall  f1-score   support

       B-LOC      0.835     0.812     0.824      1837
      B-MISC      0.930     0.682     0.787       922
       B-ORG      0.836     0.698     0.761      1341
       B-PER      0.791     0.834     0.812      1842
       I-LOC      0.914     0.700     0.793       257
      I-MISC      0.955     0.607     0.742       346
       I-ORG      0.798     0.674     0.731       751
       I-PER      0.706     0.947     0.809      1307
           O      0.986     0.992     0.989     42759

    accuracy                          0.957     51362
   macro avg      0.861     0.772     0.805     51362
weighted avg      0.958     0.957     0.956     51362



In [10]:
from sklearn.model_selection import cross_validate

# Fresh, unfitted RBF-kernel SVM with the same iteration cap as above.
model = SVC(max_iter=10_000, kernel='rbf')

# 5-fold cross-validation scored with macro-averaged F1.
# NOTE(review): this cross-validates on the dev split (X_dev_ohe / Y_dev), not
# on the training split -- presumably to keep the runtime manageable; confirm.
cv_results = cross_validate(
    estimator=model,
    X=X_dev_ohe,
    y=Y_dev,
    scoring='f1_macro',
    cv=5,
)

# One row per fold: fit time, score time and test score.
pd.DataFrame(cv_results)

Unnamed: 0,fit_time,score_time,test_score
0,42.97135,4.697416,0.697998
1,44.976553,4.825301,0.613402
2,47.433181,4.913506,0.714026
3,48.115172,4.713499,0.679762
4,45.271179,4.66395,0.560522


In [15]:
# from sklearn.model_selection import GridSearchCV
# # Hyperparameter tuning
# # https://www.vebuso.com/2020/03/svm-hyperparameter-tuning-using-gridsearchcv/

# #Instantiate model
# model = SVC()

# #Hyperparameter Grid
# paras = {'C':[0.1, 1, 10, 100], 'kernel':['linear','poly','rbf','sigmoid','precomputed'],'gamma' :[1, 0.1, 0.01, 0.001]}

# search = GridSearchCV(model, paras, scoring='f1_weighted', verbose = 2 , cv = 5)
