In [1]:
import gensim
import numpy as np
import pandas
from sklearn import svm
from sklearn.metrics import classification_report as report
from typing import List, Tuple, Dict
from collections import Counter
from sklearn.feature_extraction import DictVectorizer
from scipy.sparse import hstack
import csv
from sklearn.preprocessing import StandardScaler
from scipy.sparse import csr_matrix
import nltk

In [2]:
# Upper bound on the number of training tokens that are loaded.
# A value of 0 (or anything non-positive) disables the cap, so all training rows are used.
NUM_TRAINING_EXAMPLES = 0

In [3]:
# Load pretrained word2vec vectors (binary format) into a gensim KeyedVectors object.
# NOTE(review): this is an absolute, machine-specific path; point it at your own copy
# of the vectors file before running the notebook.
word_embedding_model = gensim.models.KeyedVectors.load_word2vec_format(r"C:\Users\gramp\Downloads\GoogleNews-vectors-negative300.bin.gz", binary = True)

In [4]:
# CoNLL-formatted training data, and the CSV file that conll_to_csv() writes from it.
conll_path = 'eng.train'
csv_training_path = 'eng.train.csv'
# Tab-separated test set (it is read with pandas.read_table further down).
testpath = '../datasets/NER-test.tsv'


def conll_to_csv(conll_path: str, csv_path: str) -> None:
    """Convert a whitespace-separated CoNLL file into a four-column CSV file.

    The output starts with the header row ``Word, POS, Chunk, Tag``. Blank
    lines, lines starting with ``-DOCSTART-`` and lines that do not split into
    exactly four fields are dropped. ``PER`` tags are renamed to ``PERSON`` and
    ``MISC`` tags to ``WORK_OF_ART`` (the B-/I- prefix is kept) so that the
    training tags match the tag names used in the test data.

    Args:
        conll_path: Path of the CoNLL input file (read as UTF-8).
        csv_path: Path of the CSV file to write (UTF-8, overwritten if present).
    """
    # Training tags that are rewritten to the names used by the test data.
    tag_renames = {
        'B-PER': 'B-PERSON',
        'I-PER': 'I-PERSON',
        'B-MISC': 'B-WORK_OF_ART',
        'I-MISC': 'I-WORK_OF_ART',
    }

    with open(conll_path, 'r', encoding='utf-8') as source:
        with open(csv_path, 'w', newline='', encoding='utf-8') as target:
            csv_writer = csv.writer(target)
            csv_writer.writerow(["Word", "POS", "Chunk", "Tag"])

            for raw_line in source:
                stripped = raw_line.strip()
                if stripped == '' or stripped.startswith('-DOCSTART-'):
                    continue

                fields = stripped.split()
                if len(fields) == 4:
                    token, pos_tag, chunk_tag, tag = fields
                    csv_writer.writerow([token, pos_tag, chunk_tag, tag_renames.get(tag, tag)])
            
# Write the CSV version of the training data (csv_training_path is overwritten if it exists).
conll_to_csv(conll_path, csv_training_path)


In [5]:
# Parse each file once and reuse the frames below; re-reading the training CSV
# for every printed row is needlessly slow on a file of this size.
training_df = pandas.read_csv(csv_training_path)
testing_df = pandas.read_table(testpath)

all_tags_training = set(training_df['Tag'])
all_tags_testing = set(testing_df['BIO_NER_tag'])

# Sanity check: the classifier can only predict tags it saw during training,
# so both files must use exactly the same tag set.
assert all_tags_training == all_tags_testing, (
    f"Tag sets differ between training and test data: {all_tags_training ^ all_tags_testing}"
)

# Counter keeps first-seen order, so rows are printed in the order in which
# the tags first appear in the training file.
counter_training = Counter(training_df['Tag'])
num_training_rows = len(training_df)

print("Training set tag distribution:")
print(f'{"Tag":20} | {"Count":10} | {"Proportion (percent)":10}')
print('-' * 20, '|', '-' * 10, '|', '-' * 10)
for tag, count in counter_training.items():
    print(f'{tag:20} | {count:10} | {100*(count / num_training_rows):.2f}%')

Training set tag distribution:
Tag                  | Count      | Proportion (percent)
-------------------- | ---------- | ----------
B-ORG                |       6321 | 3.10%
O                    |     169578 | 83.28%
B-WORK_OF_ART        |       3438 | 1.69%
B-PERSON             |       6600 | 3.24%
I-PERSON             |       4528 | 2.22%
B-LOC                |       7140 | 3.51%
I-ORG                |       3704 | 1.82%
I-WORK_OF_ART        |       1155 | 0.57%
I-LOC                |       1157 | 0.57%


In [6]:
def extract_valid_tokens_from_df(
         df: pandas.DataFrame,
         test = False
      ) -> List[Tuple[str, str]]:
    
    '''Extracts (token, NER tag) pairs from a DataFrame, skipping DOCSTART rows.
    
    Args:
        df: A pandas DataFrame with a token column and an NER tag column.
            Training data uses the columns 'Word' and 'Tag'; test data uses
            'token' and 'BIO_NER_tag'.
        test: A boolean indicating whether the data is test data. It selects
            the column names and disables the NUM_TRAINING_EXAMPLES cap.
        
    Returns:
        A list of (token, NER tag) tuples in row order. For training data the
        list stops growing once it holds NUM_TRAINING_EXAMPLES pairs, provided
        that constant is positive.
    '''
    if test:
        token_column, tag_column = 'token', 'BIO_NER_tag'
    else:
        token_column, tag_column = 'Word', 'Tag'

    valid_tokens = []

    # Walk the two columns directly: DataFrame.iterrows() builds a Series for
    # every row, which is very slow on a few hundred thousand rows.
    for token, ne_label in zip(df[token_column], df[tag_column]):
        if token != 'DOCSTART':
            valid_tokens.append((token, ne_label))

        # The cap applies to training data only; a non-positive cap means "no limit".
        if not test and 0 < NUM_TRAINING_EXAMPLES <= len(valid_tokens):
            break

    return valid_tokens

def embeddings_from_valid_tokens(
            valid_tokens: List[Tuple[str, str]], 
            model: gensim.models.KeyedVectors,
        ) -> Tuple[np.ndarray, np.ndarray]:
    ''' Looks up an embedding for every token and collects the matching labels.
    
    Args:
        valid_tokens: A list of (token, NER tag) tuples.
        model: A gensim word embedding model; it is queried with
            ``token in model`` and ``model[token]``.
        
    Returns:
        A tuple (embeddings, labels). ``embeddings`` has one row per token and
        as many columns as the model's vectors; tokens that are not in the
        model keep an all-zero row. ``labels`` is an object array holding the
        NER tag of each token.
    '''

    num_tokens = len(valid_tokens)

    # Take the width from the model instead of hard-coding it; 300 is kept as
    # the fallback for model objects that do not expose vector_size.
    embedding_dim = getattr(model, 'vector_size', 300)

    # Rows start as zeros, so out-of-vocabulary tokens end up as zero vectors.
    input_vectors = np.zeros((num_tokens, embedding_dim))
    labels = np.empty(num_tokens, dtype=object)

    for i, (token, ne_label) in enumerate(valid_tokens):
        if token in model:
            input_vectors[i] = model[token]

        labels[i] = ne_label

    return input_vectors, labels
 
def token2features(embs: np.ndarray, i: int) -> np.ndarray:
    '''Stacks the embeddings of a token and of its two direct neighbours.
    
    Args:
        embs: A 2-D numpy array with one embedding per row.
        i: The row index of the token of interest.
        
    Returns:
        A numpy array of shape (3, embedding width): row 0 is the previous
        token, row 1 the token itself and row 2 the next token. A neighbour
        that does not exist (before the first or after the last row) is
        represented by a zero vector.
    '''
    # Use the actual embedding width rather than assuming 300 columns.
    embedding_dim = embs.shape[1]

    # Start from zeros so missing neighbours need no special handling.
    features = np.zeros((3, embedding_dim))

    if i > 0:
        features[0] = embs[i - 1]
    features[1] = embs[i]
    if i < len(embs) - 1:
        features[2] = embs[i + 1]

    return features
 
def build_feature_dict(pos_tags: List[str], tokens: List[str], i: int) -> Dict[str, bool]:
    """Build the discrete feature dictionary for the token at position ``i``.

    Args:
        pos_tags: One POS tag per token, aligned with ``tokens``.
        tokens: The token strings.
        i: Index of the token to describe.

    Returns:
        A dictionary with the boolean features ``is_number`` and ``is_upper``
        plus four one-hot style keys set to True: the word shape
        (``shape=...``, where digits map to ``9``, upper-case letters to ``X``,
        lower-case letters to ``x`` and anything else is kept as is) and the
        POS tags of the previous, current and next token (``prev_tag=``,
        ``curr_tag=``, ``next_tag=``). A missing neighbour contributes an
        empty tag.
    """
    word = tokens[i]
    last_index = len(pos_tags) - 1

    left_tag = pos_tags[i - 1] if i > 0 else ''
    own_tag = pos_tags[i]
    right_tag = pos_tags[i + 1] if i < last_index else ''

    is_number = word.isnumeric()
    is_upper = word.isupper()

    def shape_symbol(char: str) -> str:
        # Collapse a character into its class symbol; punctuation etc. is kept verbatim.
        if char.isdigit():
            return '9'
        if char.isupper():
            return 'X'
        if char.islower():
            return 'x'
        return char

    word_shape = ''.join(shape_symbol(char) for char in word)

    return {
        'is_number': is_number,
        'is_upper': is_upper,
        'shape=' + word_shape: True,
        'prev_tag=' + left_tag: True,
        'curr_tag=' + own_tag: True,
        'next_tag=' + right_tag: True,
    }


def extract_from_csv(path: str, vectorizer: DictVectorizer, scaler: StandardScaler, test: bool = False, mode: str = 'csv', encoding: str = 'utf-8'):
    '''Extracts features and labels from a CSV or TSV file.
    
    Every valid token becomes one row consisting of the standardized
    embeddings of the previous, current and next token, followed by the
    vectorized discrete features from build_feature_dict.
    
    Args:
        path: A string representing the path to the data file.
        vectorizer: A DictVectorizer object. It is fitted when ``test`` is
            False and only applied when ``test`` is True.
        scaler: A StandardScaler object. It is fitted when ``test`` is False
            and only applied when ``test`` is True.
        test: A boolean indicating whether the data is test data.
        mode: A string indicating the file format: 'csv' reads with
            pandas.read_csv, any other value reads with pandas.read_table.
        encoding: Text encoding of the file. Defaults to UTF-8, which is the
            encoding conll_to_csv writes; decoding that file as ISO-8859-1
            would garble non-ASCII tokens.
        
    Returns:
        A tuple containing a sparse matrix of features and a numpy array of labels.
    '''
    
    # raw is a pandas dataframe
    if mode == 'csv':
        raw = pandas.read_csv(path, on_bad_lines='warn', encoding=encoding)
    else:
        raw = pandas.read_table(path, on_bad_lines='warn', encoding=encoding)
    
    # get valid tokens (i.e. tokens that are not DOCSTART, up to NUM_TRAINING_EXAMPLES)
    valid_tokens = extract_valid_tokens_from_df(raw, test)
    
    # One string per valid token. Nothing is filtered out here: the discrete
    # rows must line up one-to-one with the embedding rows and the labels,
    # which are both built from the full valid_tokens list.
    tokens = [str(token) for token, _ in valid_tokens]
    
    # Each token is tagged on its own (a one-element list), so the tagger sees
    # no sentence context. Tokens the tagger cannot handle get an empty tag;
    # only ordinary errors are caught, so an interrupt still stops the loop.
    pos_tags = []
    for token in tokens:
        try:
            pos_tags.append(nltk.pos_tag([token])[0][1])
        except Exception:
            pos_tags.append('')
    
    discrete_features = [build_feature_dict(pos_tags, tokens, i) for i in range(len(tokens))]
    if test:
        X_discrete = vectorizer.transform(discrete_features)
    else:
        X_discrete = vectorizer.fit_transform(discrete_features)
    
    # get embeddings and embedding features: previous, current and next
    # embedding flattened into a single row per token
    vectors, labels = embeddings_from_valid_tokens(valid_tokens, word_embedding_model)
    embedding_features = np.zeros((len(vectors), 3 * vectors.shape[1]))
    for i in range(len(vectors)):
        embedding_features[i] = token2features(vectors, i).reshape(-1)
    
    # standardize every embedding column (zero mean, unit variance), using the
    # statistics learned from the training data
    if test:
        embedding_features_scaled = scaler.transform(embedding_features)
    else:
        embedding_features_scaled = scaler.fit_transform(embedding_features)
        
    # convert to a sparse matrix so it can be stacked next to the sparse discrete features
    embedding_features_scaled = csr_matrix(embedding_features_scaled)
    full_features = hstack([embedding_features_scaled, X_discrete])
    
    return full_features, labels


In [7]:
# Fresh feature transformers. extract_from_csv fits both of them on the
# training data here, because test defaults to False.
vectorizer = DictVectorizer()
scaler = StandardScaler()
train_features, train_labels = extract_from_csv(csv_training_path, vectorizer, scaler)

In [8]:
# (number of training tokens, number of feature columns)
print(train_features.shape)

(203621, 1796)


In [9]:
# Linear SVM over the stacked embedding + discrete features.
# class_weight='balanced' re-weights the classes inversely to their frequency,
# which matters here because the 'O' tag dominates the training data.
# random_state is fixed so that repeated runs of the notebook give the same model.
svm_classifier = svm.LinearSVC(class_weight='balanced', C=0.1, max_iter=1000, random_state=0)
svm_classifier.fit(train_features, train_labels)



LinearSVC(C=0.1, class_weight='balanced')

In [10]:
# Reuse the vectorizer and scaler from the training step; with test=True they
# are only applied, not refitted. mode='tsv' makes the file be read as tab-separated.
test_features, test_labels = extract_from_csv(testpath, vectorizer, scaler, test=True, mode='tsv')

In [11]:
# Sanity check: the test matrix should have as many columns as the training
# matrix, and there should be one label per row.
print(test_features.shape)
print(test_labels.shape)

(237, 1796)
(237,)


In [41]:
# Predicted tag for every test token, in the same order as test_labels.
predictions = svm_classifier.predict(test_features)

In [42]:
# Per-tag precision / recall / F1 on the test set (report is sklearn's classification_report).
eval_report = report(test_labels, predictions)
print(eval_report)

               precision    recall  f1-score   support

        B-LOC       0.83      0.71      0.77         7
        B-ORG       0.33      1.00      0.50         3
     B-PERSON       0.67      0.73      0.70        11
B-WORK_OF_ART       0.25      0.11      0.15         9
        I-LOC       0.25      1.00      0.40         1
        I-ORG       0.40      1.00      0.57         2
     I-PERSON       0.88      0.88      0.88         8
I-WORK_OF_ART       0.00      0.00      0.00        10
            O       0.96      0.97      0.97       186

     accuracy                           0.88       237
    macro avg       0.51      0.71      0.55       237
 weighted avg       0.85      0.88      0.86       237



  _warn_prf(average, modifier, msg_start, len(result))


In [43]:
import matplotlib.pyplot as plt

def save_report_as_table(y_true, y_pred, output_path="classification_report.png"):
    """Render the classification report for the given labels as a table image.

    Args:
        y_true: The gold labels.
        y_pred: The predicted labels, aligned with ``y_true``.
        output_path: File the figure is saved to.
    """
    metric_columns = ['precision', 'recall', 'f1-score', 'support']

    # Report as a dict -> one row per class/average, values rounded for display.
    metrics = report(y_true, y_pred, output_dict=True)
    table_df = pandas.DataFrame(metrics).T.round(2)[metric_columns]

    # Figure height grows with the number of table rows; the axes stay hidden.
    fig_height = len(table_df) * 0.5 + 1.5
    fig, ax = plt.subplots(figsize=(10, fig_height))
    ax.axis('off')

    rendered = ax.table(
        cellText=table_df.values,
        colLabels=table_df.columns,
        rowLabels=table_df.index,
        loc='center',
        cellLoc='center',
        rowLoc='center',
    )
    rendered.auto_set_font_size(False)
    rendered.set_fontsize(10)
    rendered.scale(1.1, 1.4)

    # A figure-level title sits closer to the table than an axes title would.
    fig.suptitle("Classification Report", fontsize=14, fontweight='bold', y=0.95)

    fig.tight_layout()
    fig.savefig(output_path, dpi=300, bbox_inches='tight')
    plt.close(fig)


# Save the full per-tag report as an image next to the notebook.
save_report_as_table(test_labels, predictions, output_path="classification_report.png")

In [44]:
# Evaluate without WORK_OF_ART: keep only the tokens where neither the gold tag
# nor the predicted tag is a (B- or I-) WORK_OF_ART tag.
wwa_pairs = [
    (gold, pred)
    for gold, pred in zip(test_labels, predictions)
    if not (gold[2:] == 'WORK_OF_ART' or pred[2:] == 'WORK_OF_ART')
]
wwa_test_labels = [gold for gold, _ in wwa_pairs]
wwa_predictions = [pred for _, pred in wwa_pairs]


wwa_eval_report = report(wwa_test_labels, wwa_predictions)
print(wwa_eval_report)

              precision    recall  f1-score   support

       B-LOC       0.83      0.71      0.77         7
       B-ORG       0.60      1.00      0.75         3
    B-PERSON       0.89      0.80      0.84        10
       I-LOC       0.33      1.00      0.50         1
       I-ORG       0.50      1.00      0.67         2
    I-PERSON       1.00      0.88      0.93         8
           O       0.99      0.98      0.99       185

    accuracy                           0.96       216
   macro avg       0.74      0.91      0.78       216
weighted avg       0.97      0.96      0.96       216



In [45]:
# Entity detection only: collapse every tag other than 'O' into the single
# class "NE", for both the gold labels and the predictions.
NER_test_labels = [gold if gold == 'O' else "NE" for gold in test_labels]
NER_predictions = [pred if pred == 'O' else "NE" for pred in predictions]


NER_eval_report = report(NER_test_labels, NER_predictions)
print(NER_eval_report)

save_report_as_table(NER_test_labels, NER_predictions, output_path="NER_only.png")

              precision    recall  f1-score   support

          NE       0.90      0.84      0.87        51
           O       0.96      0.97      0.97       186

    accuracy                           0.95       237
   macro avg       0.93      0.91      0.92       237
weighted avg       0.94      0.95      0.94       237



In [50]:
# Re-read the test tokens so every misclassified token can be shown with its context.
tokens = extract_valid_tokens_from_df(pandas.read_table(testpath, on_bad_lines='warn', encoding = "ISO-8859-1"), test=True)
# Keep every token: test_labels and predictions have one entry per valid token,
# so dropping any token here would shift the indices used below.
test_tokens = [token[0] for token in tokens]
for i in range(len(test_labels)):
    if test_labels[i] != predictions[i]:
        # Clamp the window start at 0. A negative start would count from the
        # end of the list and give an empty or wrong context for the first two tokens.
        context = test_tokens[max(0, i - 2):i + 3]
        print(f"True: {test_labels[i]}, Pred: {predictions[i]}, Context: {context}")

True: B-LOC, Pred: B-ORG, Context: ['moved', 'to', 'Barcelona', 'last', 'summer']
True: B-LOC, Pred: I-ORG, Context: ['at', 'Wembley', 'Stadium', 'was', 'absolutely']
True: O, Pred: B-ORG, Context: ['is', 'the', 'GOAT', '.', 'I']
True: B-WORK_OF_ART, Pred: O, Context: ['finished', 'reading', 'The', 'Catcher', 'in']
True: I-WORK_OF_ART, Pred: I-LOC, Context: ['reading', 'The', 'Catcher', 'in', 'the']
True: I-WORK_OF_ART, Pred: O, Context: ['The', 'Catcher', 'in', 'the', 'Rye']
True: I-WORK_OF_ART, Pred: O, Context: ['Catcher', 'in', 'the', 'Rye', 'and']
True: I-WORK_OF_ART, Pred: B-ORG, Context: ['in', 'the', 'Rye', 'and', "couldn't"]
True: B-PERSON, Pred: B-WORK_OF_ART, Context: ['spotted', 'a', 'Banksy', 'mural', 'whilst']
True: B-WORK_OF_ART, Pred: B-PERSON, Context: ['!', 'The', 'Harry', 'Potter', 'series']
True: I-WORK_OF_ART, Pred: I-PERSON, Context: ['The', 'Harry', 'Potter', 'series', 'will']
True: O, Pred: I-LOC, Context: ['be', 'my', 'go-to', 'comfort', 'read']
True: O, Pred: 