In [32]:
from util.ner_machine_learning import extract_embeddings_as_features_and_gold, \
    extract_features_and_labels, create_classifier
from util.basic_evaluation import obtain_counts, calculate_precision_recall_fscore, provide_output_tables
from gensim.models import KeyedVectors
from sklearn.metrics import classification_report
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import SVC, LinearSVC
from util.feature_extract import extract_left_token, extract_right_token
import pandas as pd
from sklearn.model_selection import HalvingGridSearchCV


#conll filepath
train_path = '../data/conll2003.train.conll'
dev_path = '../data/conll2003.dev.conll'

In [33]:
# Load the training data
data = pd.read_csv(train_path, delimiter='\t', quotechar= '|', header = None)
data = data.rename({0:'token', 1:'pos', 2:'chunk_tag',3:'target'}, axis = 1)
# [row, column]
X_train = data[['token','pos','chunk_tag']]
Y_train = data['target']
#print(Y_train)

# Load the dev data
data = pd.read_csv(dev_path, delimiter='\t', quotechar='|', header=None)
data = data.rename({0: 'token', 1: 'pos', 2: 'chunk_tag', 3: 'target'}, axis=1)
X_dev = data[['token', 'pos', 'chunk_tag']]
Y_dev = data['target']

In [34]:
# add token in the left to features
X_train['token_left'] = extract_left_token(X_train['token'])
X_dev['token_left'] = extract_left_token(X_dev['token'])

X_train['token_right'] = extract_right_token(X_train['token'])
X_dev['token_right'] = extract_right_token(X_dev['token'])

In [35]:
enc = OneHotEncoder(handle_unknown='ignore')
# When an unknown category is encountered during transform,
# the resulting one-hot encoded columns for this feature will be all zeros.

enc.fit_transform(X_train)
X_train_ohe = enc.transform(X_train)
X_dev_ohe = enc.transform(X_dev)

In [36]:
# from sklearn.model_selection import GridSearchCV
# # Hyperparameter tuning
# # https://www.vebuso.com/2020/03/svm-hyperparameter-tuning-using-gridsearchcv/

# #Instanciate model
# model = SVC()

# #Hyperparameter Grid
# paras = {'C':[0.1, 1, 10, 100], 'kernel':['linear','poly','rbf','sigmoid','precomputed'],'gamma' :[1, 0.1, 0.01, 0.001]}

# search = GridSearchCV(model, paras, scoring='f1_weighted', verbose = 2 , cv = 5)


In [37]:
model = LinearSVC()

#Hyperparameter Grid
paras = {'C':[0.1, 1, 10, 100]}
search = HalvingGridSearchCV(model, paras, scoring='f1_macro', verbose = 0 , cv = 5).fit(X_train_ohe,Y_train)
print(search.cv_results_)



{'iter': array([0, 0, 0, 0, 1, 1]), 'n_resources': array([ 67873,  67873,  67873,  67873, 203619, 203619]), 'mean_fit_time': array([0.22393556, 0.4952621 , 1.9126832 , 2.21649556, 6.55272303,
       2.17724266]), 'std_fit_time': array([0.01841008, 0.05970046, 0.05964691, 0.06807241, 0.55005732,
       0.20887665]), 'mean_score_time': array([0.05976114, 0.05584283, 0.05445576, 0.05398335, 0.1801002 ,
       0.18398299]), 'std_score_time': array([0.00626723, 0.0009396 , 0.00106347, 0.00035617, 0.0021726 ,
       0.00407399]), 'param_C': masked_array(data=[0.1, 1, 10, 100, 10, 1],
             mask=[False, False, False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'C': 0.1}, {'C': 1}, {'C': 10}, {'C': 100}, {'C': 10}, {'C': 1}], 'split0_test_score': array([0.73886558, 0.77522824, 0.76341073, 0.74368169, 0.81962612,
       0.82837191]), 'split1_test_score': array([0.76305976, 0.80916526, 0.78791209, 0.75507905, 0.84738695,
       0.85782625]), 'split2

In [38]:
search.best_estimator_

LinearSVC(C=1)

In [39]:
best_model = LinearSVC(C=1)

In [40]:
best_model = LinearSVC(C=1)
best_model.fit(X_train_ohe, Y_train)
Y_pred = best_model.predict(X_dev_ohe)

#evaluation_counts = obtain_counts(Y_dev, Y_pred)
#result = calculate_precision_recall_fscore(evaluation_counts)
#provide_output_tables(result)
#pd.DataFrame(Y_pred).to_csv('output/assignment2SVC_hyt.csv')
print(classification_report(Y_dev, Y_pred, digits = 3))

              precision    recall  f1-score   support

       B-LOC      0.901     0.858     0.879      1837
      B-MISC      0.919     0.804     0.858       922
       B-ORG      0.877     0.789     0.830      1341
       B-PER      0.877     0.901     0.889      1842
       I-LOC      0.894     0.821     0.856       257
      I-MISC      0.821     0.662     0.733       346
       I-ORG      0.845     0.747     0.793       751
       I-PER      0.865     0.951     0.906      1307
           O      0.988     0.995     0.991     42759

    accuracy                          0.970     51362
   macro avg      0.887     0.836     0.859     51362
weighted avg      0.970     0.970     0.970     51362

