<h1><center> Natual Language Processing </center></h1> 
<h3><center> CRF sequence tagging for Movie Querries </center></h3> 
<h5><center> Khawaja Usman Tahir</center></h5> 

In [None]:
# You may need to run this first- uncomment if so
# !pip install python-crfsuite

In [284]:
import os
import sys

import pandas as pd

from copy import deepcopy
from collections import Counter
from nltk.tag import CRFTagger

from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import classification_report

from matplotlib import pyplot as plt
import numpy as np
from IPython.display import display_html 


import re
import unicodedata

In [285]:
def get_raw_data_from_bio_file(fpath):
   
    f = open(fpath)
    data = []  # the data, a list of lists of (word, tag) tuples
    current_sent = []  # data for current sentence/example
    for line in f:
        if line == "\n":  # each instance has a blank line separating it from next one
            # solution
            data.append(current_sent)
            current_sent = []
            continue
        line_data = line.strip("\n").split("\t")
        current_sent.append((line_data[1], line_data[0]))
    f.close()
    return data

In [286]:
raw_training_data = get_raw_data_from_bio_file("trivia10k13train.bio.txt") 

In [287]:
# have a look at the first example
print(raw_training_data[0])

[('steve', 'B-Actor'), ('mcqueen', 'I-Actor'), ('provided', 'O'), ('a', 'O'), ('thrilling', 'B-Plot'), ('motorcycle', 'I-Plot'), ('chase', 'I-Plot'), ('in', 'I-Plot'), ('this', 'I-Plot'), ('greatest', 'B-Opinion'), ('of', 'I-Opinion'), ('all', 'I-Opinion'), ('ww', 'B-Plot'), ('2', 'I-Plot'), ('prison', 'I-Plot'), ('escape', 'I-Plot'), ('movies', 'I-Plot')]


In [288]:
print(len(raw_training_data), "instances")
print(sum([len(sent) for sent in raw_training_data]), "words")

7816 instances
158823 words


In [289]:
def preProcess(example):

    preprocessed_example = example  # trivial- no preprocessing
    return preprocessed_example

In [290]:
training_data = [preProcess(example) for example in raw_training_data]

In [291]:
# check the effect of pre-processing
print(training_data[0])

[('steve', 'B-Actor'), ('mcqueen', 'I-Actor'), ('provided', 'O'), ('a', 'O'), ('thrilling', 'B-Plot'), ('motorcycle', 'I-Plot'), ('chase', 'I-Plot'), ('in', 'I-Plot'), ('this', 'I-Plot'), ('greatest', 'B-Opinion'), ('of', 'I-Opinion'), ('all', 'I-Opinion'), ('ww', 'B-Plot'), ('2', 'I-Plot'), ('prison', 'I-Plot'), ('escape', 'I-Plot'), ('movies', 'I-Plot')]


In [293]:
_pattern = re.compile(r"\d")  # to recognize numbers/digits


# This is the 'out-of-the-box' get_features function from the nltk CRF tagger
def get_features(tokens, idx):

    token = tokens[idx]
    feature_list = []

    if not token:
        return feature_list
    
    # Capitalization
    if token[0].isupper():
        feature_list.append("CAPITALIZATION")

    # Number
    if re.search(_pattern, token) is not None:
        feature_list.append("HAS_NUM")

    # Punctuation
    punc_cat = set(["Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po"])
    if all(unicodedata.category(x) in punc_cat for x in token):
        feature_list.append("PUNCTUATION")

    # Suffix up to length 3
    if len(token) > 1:
        feature_list.append("SUF_" + token[-1:])
    if len(token) > 2:
        feature_list.append("SUF_" + token[-2:])
    if len(token) > 3:
        feature_list.append("SUF_" + token[-3:])
        
    feature_list.append("WORD_" + token)
    #print(feature_list)
    return feature_list

Training the tagger

In [294]:
# Train the CRF BIO-tag tagger
TAGGER_PATH = "crf_nlu.tagger"  # path to the tagger- it will save/access the model from here
ct = CRFTagger(feature_func=get_features)  # initialize tagger with get_features function

print("training tagger...")
ct.train(training_data, TAGGER_PATH)
print("done")

training tagger...
done


# 1. Split the training data into 80% training, 20% development set (5 marks)

Splitting the training data

In [297]:
spl = int(0.8 * len(training_data))
train_data = training_data[:spl]
test_data = training_data[spl:]

In [298]:
print("Total Samples: ", len(training_data))
print("Train Samples: ", len(train_data))
print("Test Samples: ", len(test_data))



Total Samples:  7816
Train Samples:  6252
Test Samples:  1564


# 2. Error analysis 1: False positives (5 marks)

Printing sentences

In [299]:
print("training tagger...")

false_pos_preds = []
false_pos_y_test = []

for sent in test_data:
    
    false_pos_sent_preds = [x[1] for x in ct.tag([s[0] for s in sent])]
    false_pos_sent_true = [s[1] for s in sent]
    
    # looping 
    for i in range (len(false_pos_sent_preds)):
        if false_pos_sent_preds[i] == false_pos_sent_true[i]:
            continue
        print('{1}:\nsentence: {0} \n'.format(sent, false_pos_sent_preds[i]))

        false_pos_preds.extend(false_pos_sent_preds)
        false_pos_y_test.extend(false_pos_sent_true)


print("done")

training tagger...
I-Plot:
sentence: [('omar', 'B-Actor'), ('sharif', 'I-Actor'), ('is', 'O'), ('a', 'B-Plot'), ('married', 'I-Plot'), ('man', 'I-Plot'), ('who', 'I-Plot'), ('falls', 'I-Plot'), ('in', 'I-Plot'), ('love', 'I-Plot'), ('with', 'I-Plot'), ('julie', 'B-Actor'), ('christie', 'I-Actor'), ('during', 'B-Plot'), ('the', 'I-Plot'), ('bolshevik', 'I-Plot'), ('revolution', 'I-Plot'), ('in', 'O'), ('this', 'O'), ('1965', 'B-Year'), ('oscar', 'B-Award'), ('winner', 'O')] 

I-Plot:
sentence: [('omar', 'B-Actor'), ('sharif', 'I-Actor'), ('is', 'O'), ('a', 'B-Plot'), ('married', 'I-Plot'), ('man', 'I-Plot'), ('who', 'I-Plot'), ('falls', 'I-Plot'), ('in', 'I-Plot'), ('love', 'I-Plot'), ('with', 'I-Plot'), ('julie', 'B-Actor'), ('christie', 'I-Actor'), ('during', 'B-Plot'), ('the', 'I-Plot'), ('bolshevik', 'I-Plot'), ('revolution', 'I-Plot'), ('in', 'O'), ('this', 'O'), ('1965', 'B-Year'), ('oscar', 'B-Award'), ('winner', 'O')] 

I-Plot:
sentence: [('omar', 'B-Actor'), ('sharif', 'I-Actor


I-Plot:
sentence: [('what', 'O'), ('1967', 'B-Year'), ('musical', 'B-Genre'), ('comedy', 'I-Genre'), ('follows', 'O'), ('the', 'B-Plot'), ('adventures', 'I-Plot'), ('of', 'I-Plot'), ('the', 'I-Plot'), ('beatles', 'I-Plot'), ('on', 'I-Plot'), ('a', 'I-Plot'), ('special', 'I-Plot'), ('charter', 'I-Plot'), ('bus', 'I-Plot')] 

I-Plot:
sentence: [('what', 'O'), ('1967', 'B-Year'), ('romantic', 'B-Genre'), ('comedy', 'I-Genre'), ('depicts', 'O'), ('robert', 'B-Actor'), ('redford', 'I-Actor'), ('as', 'O'), ('a', 'B-Plot'), ('young', 'I-Plot'), ('lawyer', 'I-Plot'), ('and', 'O'), ('jane', 'B-Actor'), ('fonda', 'I-Actor'), ('as', 'O'), ('his', 'B-Plot'), ('vivacious', 'I-Plot'), ('wife', 'I-Plot')] 

I-Plot:
sentence: [('what', 'O'), ('1967', 'B-Year'), ('romantic', 'B-Genre'), ('comedy', 'I-Genre'), ('depicts', 'O'), ('robert', 'B-Actor'), ('redford', 'I-Actor'), ('as', 'O'), ('a', 'B-Plot'), ('young', 'I-Plot'), ('lawyer', 'I-Plot'), ('and', 'O'), ('jane', 'B-Actor'), ('fonda', 'I-Actor'), 


O:
sentence: [('what', 'O'), ('is', 'O'), ('the', 'O'), ('movie', 'O'), ('about', 'O'), ('1800', 'B-Plot'), ('s', 'I-Plot'), ('magic', 'I-Plot'), ('entertainment', 'I-Plot'), ('shows', 'I-Plot'), ('that', 'O'), ('is', 'O'), ('not', 'O'), ('the', 'O'), ('prestige', 'O'), ('with', 'O'), ('edward', 'B-Actor'), ('norton', 'I-Actor')] 

O:
sentence: [('what', 'O'), ('is', 'O'), ('the', 'O'), ('movie', 'O'), ('about', 'O'), ('1800', 'B-Plot'), ('s', 'I-Plot'), ('magic', 'I-Plot'), ('entertainment', 'I-Plot'), ('shows', 'I-Plot'), ('that', 'O'), ('is', 'O'), ('not', 'O'), ('the', 'O'), ('prestige', 'O'), ('with', 'O'), ('edward', 'B-Actor'), ('norton', 'I-Actor')] 

O:
sentence: [('what', 'O'), ('is', 'O'), ('the', 'O'), ('movie', 'O'), ('about', 'O'), ('1800', 'B-Plot'), ('s', 'I-Plot'), ('magic', 'I-Plot'), ('entertainment', 'I-Plot'), ('shows', 'I-Plot'), ('that', 'O'), ('is', 'O'), ('not', 'O'), ('the', 'O'), ('prestige', 'O'), ('with', 'O'), ('edward', 'B-Actor'), ('norton', 'I-Actor')]

Finding lowest 5 precision classes

In [300]:
class_report = classification_report(false_pos_y_test, false_pos_preds, output_dict = True)
print(classification_report(false_pos_y_test, false_pos_preds))
print(precision_recall_fscore_support(false_pos_y_test, false_pos_preds, average = "macro",))

# creating a dataframe
df = pd.DataFrame(class_report).transpose()
false_positives = df.sort_values(by="precision", ascending=True).head(5)["precision"]
print(false_positives)

                  precision    recall  f1-score   support

         B-Actor       0.80      0.83      0.82      2648
         B-Award       0.62      0.45      0.52       212
B-Character_Name       0.79      0.10      0.18      1027
      B-Director       0.86      0.69      0.76      1211
         B-Genre       0.81      0.72      0.76      2050
       B-Opinion       0.43      0.21      0.28       747
        B-Origin       0.37      0.24      0.29       607
          B-Plot       0.22      0.20      0.21      3256
         B-Quote       0.46      0.09      0.15       141
  B-Relationship       0.79      0.38      0.52       530
    B-Soundtrack       0.44      0.07      0.12        99
          B-Year       0.96      0.99      0.98      1622
         I-Actor       0.80      0.83      0.82      3231
         I-Award       0.82      0.75      0.78       559
I-Character_Name       0.80      0.09      0.16       858
      I-Director       0.85      0.66      0.74      1201
         I-Ge

# 3. Error analysis 2: False negatives (5 marks)


In [301]:
print("training tagger...")

false_negatives = df.sort_values(by="recall", ascending=True).head(5)["recall"]

categories = false_negatives.index.tolist()

false_neg_preds = []
false_neg_y_test = []

for sent in test_data:
    
    false_neg_sent_preds = [x[1] for x in ct.tag([s[0] for s in sent])]
    false_neg_sent_true = [s[1] for s in sent]
    
    # looping 
    for i in range (len(false_neg_sent_preds)):
        if false_neg_sent_preds[i] not in false_neg_sent_true[i]:
            continue
        if false_neg_sent_preds[i] == false_neg_sent_true[i]:
            continue
        print('{1}:\nsentence: {0} \n'.format(sent, false_neg_sent_preds[i]))
        
    false_neg_preds.extend(false_neg_sent_preds)
    false_neg_y_test.extend(false_neg_sent_true)



print("done")

training tagger...
O:
sentence: [('one', 'O'), ('of', 'O'), ('the', 'O'), ('most', 'B-Opinion'), ('memorable', 'I-Opinion'), ('scenes', 'I-Opinion'), ('from', 'O'), ('this', 'O'), ('film', 'O'), ('involve', 'O'), ('a', 'B-Plot'), ('boy', 'I-Plot'), ('riding', 'I-Plot'), ('his', 'I-Plot'), ('bike', 'I-Plot'), ('through', 'I-Plot'), ('the', 'I-Plot'), ('sky', 'I-Plot')] 

O:
sentence: [('one', 'O'), ('of', 'O'), ('the', 'O'), ('most', 'B-Opinion'), ('memorable', 'I-Opinion'), ('scenes', 'I-Opinion'), ('from', 'O'), ('this', 'O'), ('film', 'O'), ('involve', 'O'), ('a', 'B-Plot'), ('boy', 'I-Plot'), ('riding', 'I-Plot'), ('his', 'I-Plot'), ('bike', 'I-Plot'), ('through', 'I-Plot'), ('the', 'I-Plot'), ('sky', 'I-Plot')] 

O:
sentence: [('one', 'O'), ('of', 'O'), ('the', 'O'), ('most', 'B-Opinion'), ('memorable', 'I-Opinion'), ('scenes', 'I-Opinion'), ('from', 'O'), ('this', 'O'), ('film', 'O'), ('involve', 'O'), ('a', 'B-Plot'), ('boy', 'I-Plot'), ('riding', 'I-Plot'), ('his', 'I-Plot'), ('


O:
sentence: [('which', 'O'), ('film', 'O'), ('was', 'O'), ('quirky', 'B-Opinion'), ('director', 'O'), ('wes', 'B-Director'), ('anderson', 'I-Director'), ('s', 'O'), ('foray', 'O'), ('into', 'O'), ('the', 'O'), ('world', 'O'), ('of', 'O'), ('author', 'B-Origin'), ('roald', 'I-Origin'), ('dahl', 'I-Origin'), ('and', 'O'), ('hand', 'O'), ('crafted', 'O'), ('stop', 'O'), ('motion', 'O'), ('filmmaking', 'O')] 

O:
sentence: [('which', 'O'), ('iconic', 'B-Opinion'), ('detective', 'B-Genre'), ('movie', 'O'), ('has', 'O'), ('a', 'O'), ('young', 'O'), ('clint', 'B-Actor'), ('eastwood', 'I-Actor'), ('uttering', 'O'), ('the', 'O'), ('words', 'O'), ('make', 'B-Quote'), ('my', 'I-Quote'), ('day', 'I-Quote')] 

O:
sentence: [('which', 'O'), ('intelligent', 'B-Opinion'), ('crime', 'B-Genre'), ('film', 'I-Genre'), ('earned', 'O'), ('kevin', 'B-Actor'), ('spacey', 'I-Actor'), ('high', 'O'), ('praise', 'O'), ('and', 'O'), ('an', 'O'), ('academy', 'B-Award'), ('award', 'I-Award'), ('for', 'I-Award'), (

Finding classes with 5 lowest recall values

In [302]:
print(false_negatives)

I-Soundtrack        0.069705
B-Soundtrack        0.070707
I-Character_Name    0.090909
B-Quote             0.092199
I-Opinion           0.092843
Name: recall, dtype: float64


# 4. Incorporating POS tags as features (15 marks)

Modifying the Preprocess function

In [303]:
def updated_preprocess(example):
    temp_words = []
    
    for word, label in example:
        temp_words.append(word)
        
    # a postagger for use in exercises
    posttagger = CRFTagger()
    posttagger.set_model_file("crf_pos.tagger")
    pos_tag_list = posttagger = posttagger.tag(temp_words)
    
    # Concatenate word and POS tage with @
    
    concat = []
    for word, label in pos_tag_list:
        concat.append(word + "@" + label)
    
    # Concatenate the POS tag
    
    end_concat = []
    for word, label in example:
        end_concat.append(label)
        
    poss = list(zip(concat, end_concat))
    
    preprocessed_example = poss
    
    return preprocessed_example
        
    
    

Rerunning the training and testing and splitting again

In [304]:
updated_training_data = [updated_preprocess(example) for example in raw_training_data]

updated_spl = int(0.8 * len(updated_training_data))
updated_train_data = updated_training_data[:updated_spl]
updated_test_data = updated_training_data[updated_spl:]

copied and modified get_features

In [305]:
_pattern = re.compile(r"\d")  # to recognize numbers/digits


# This is the 'out-of-the-box' get_features function from the nltk CRF tagger
def updated_get_features(tokens, idx):

    token = tokens[idx]
    spl_tok = token.split("@")
    if len(spl_tok) == 1:
        print(spl_tok)
    
    feature_list = []

    if not token:
        return feature_list
    
    # Capitalization
    if spl_tok[0].isupper():
        feature_list.append("CAPITALIZATION")

    # Number
    if re.search(_pattern, spl_tok[0]) is not None:
        feature_list.append("HAS_NUM")

    # Punctuation
    punc_cat = set(["Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po"])
    if all(unicodedata.category(x) in punc_cat for x in token):
        feature_list.append("PUNCTUATION")

    # Suffix up to length 3
    if len(spl_tok[0]) > 1:
        feature_list.append("SUF_" + spl_tok[0][-1:])
    if len(spl_tok[0]) > 2:
        feature_list.append("SUF_" + spl_tok[0][-2:])
    if len(spl_tok[0]) > 3:
        feature_list.append("SUF_" + spl_tok[0][-3:])
        
    feature_list.append("WORD_" + spl_tok[0])
    feature_list.append("POSTAG_" + spl_tok[1])
    
    #print(feature_list)
    return feature_list




Train new tagger with POS tags

In [306]:
# Train the CRF BIO-tag tagger
TAGGER_PATH = "crf_nlu.tagger"  # path to the tagger- it will save/access the model from here
updated_ct = CRFTagger(feature_func=updated_get_features)  # initialize tagger with get_features function
# print(updated_ct)

print("training tagger...")
updated_ct.train(updated_training_data, TAGGER_PATH)
print("done")

training tagger...
done


test the new tagger

In [307]:
print("testing tagger...")
updated_preds = []
updated_y_test = []
for sent in updated_test_data:
    updated_sent_preds = [x[1] for x in updated_ct.tag([s[0] for s in sent])]
    updated_sent_true = [s[1] for s in sent]
    
    for i in range(len(updated_sent_preds)):
        if updated_sent_preds[i] == updated_sent_true[i]:
            continue
    
    updated_preds.extend(updated_sent_preds)
    updated_y_test.extend(updated_sent_true)
print("done")

testing tagger...
done


Classification report and comparison

In [308]:

updated_class_report = classification_report(updated_y_test, updated_preds, output_dict=True, zero_division=1)
print(classification_report(updated_y_test, updated_preds, zero_division=1))
print(precision_recall_fscore_support(updated_y_test, updated_preds, average = "macro", zero_division=1))

                  precision    recall  f1-score   support

         B-Actor       0.91      0.94      0.93      1231
         B-Award       0.80      0.68      0.73        75
B-Character_Name       0.86      0.24      0.37       212
      B-Director       0.92      0.84      0.88       472
         B-Genre       0.91      0.86      0.88       805
       B-Opinion       0.64      0.38      0.47       160
        B-Origin       0.68      0.53      0.60       159
          B-Plot       0.57      0.52      0.54      1221
         B-Quote       0.83      0.42      0.56        24
  B-Relationship       0.87      0.58      0.70       125
    B-Soundtrack       0.50      0.15      0.24        13
          B-Year       0.98      1.00      0.99       703
         I-Actor       0.92      0.95      0.94      1731
         I-Award       0.85      0.86      0.86       202
I-Character_Name       0.89      0.25      0.39       165
      I-Director       0.92      0.84      0.88       456
         I-Ge

New dataframe to store the report

In [309]:
updated_df = pd.DataFrame(updated_class_report).transpose()
print(updated_df.sort_values(by="precision", ascending = True).head(5)["precision"])
print(updated_df.sort_values(by="recall", ascending = True).head(5)["recall"])

I-Opinion       0.265823
B-Soundtrack    0.500000
B-Plot          0.566105
B-Opinion       0.638298
B-Origin        0.680000
Name: precision, dtype: float64
B-Soundtrack        0.153846
I-Soundtrack        0.195122
B-Character_Name    0.235849
I-Character_Name    0.248485
I-Opinion           0.250000
Name: recall, dtype: float64


<div class = "alert alert-block alert-warning">
    <b>Warning</b> The code below for comparison is taken from <b><a> https://www.thiscodeworks.com/display-two-dataframes-side-by-side-html-pandas-python-visualization/619285b06f88d700157a60d7></a></b> just for comparison
    
</div>

In [310]:
df1_styler = df.style.set_table_attributes("style='display:inline'").set_caption('No POS Tags')
df2_styler = updated_df.style.set_table_attributes("style='display:inline'").set_caption('POS Tags')
 
display_html(df1_styler._repr_html_()+df2_styler._repr_html_(), raw=True)

Unnamed: 0,precision,recall,f1-score,support
B-Actor,0.80418,0.828172,0.816,2648.0
B-Award,0.615385,0.45283,0.521739,212.0
B-Character_Name,0.785185,0.103213,0.182444,1027.0
B-Director,0.857438,0.685384,0.761817,1211.0
B-Genre,0.813671,0.72,0.763975,2050.0
B-Opinion,0.432584,0.206158,0.279238,747.0
B-Origin,0.365239,0.23888,0.288845,607.0
B-Plot,0.215287,0.198096,0.206334,3256.0
B-Quote,0.464286,0.092199,0.153846,141.0
B-Relationship,0.790698,0.384906,0.517766,530.0

Unnamed: 0,precision,recall,f1-score,support
B-Actor,0.914354,0.936637,0.925361,1231.0
B-Award,0.796875,0.68,0.733813,75.0
B-Character_Name,0.862069,0.235849,0.37037,212.0
B-Director,0.920746,0.836864,0.876804,472.0
B-Genre,0.906579,0.855901,0.880511,805.0
B-Opinion,0.638298,0.375,0.472441,160.0
B-Origin,0.68,0.534591,0.598592,159.0
B-Plot,0.566105,0.522523,0.543441,1221.0
B-Quote,0.833333,0.416667,0.555556,24.0
B-Relationship,0.869048,0.584,0.698565,125.0


The comparison demonstrates that POS Tags clearly enhance the precision, recall and f1-score of across the data.

# 5. Feature experimentation and other optimization for optimal macro average (30 marks).

Creating new tags for features as recommended in hints

In [311]:
prev_pos_tag_list = ["PREV_WORD_", "PREV_POSTAG_", "TWICE_PREV_WORD_", "TWICE_PREV_POSTAG_"]
next_pos_tag_list = ["NEXT_WORD_", "NEXT_POSTAG_", "TWICE_NEXT_WORD_", "TWICE_NEXT_POSTAG_"]

Copied the updated_get_features and modify it for enhancements

In [312]:
_pattern = re.compile(r"\d")  # to recognize numbers/digits


# This is the 'out-of-the-box' get_features function from the nltk CRF tagger
def last_get_features(tokens, idx):

    token = tokens[idx]
    spl_tok = token.split("@")
    if len(spl_tok) == 1:
        print(spl_tok)
    
    feature_list = []

    if not token:
        return feature_list
    
    # Capitalization
    if spl_tok[0].isupper():
        feature_list.append("CAPITALIZATION")

    # Number
    if re.search(_pattern, spl_tok[0]) is not None:
        feature_list.append("HAS_NUM")

    # Punctuation
    punc_cat = set(["Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po"])
    if all(unicodedata.category(x) in punc_cat for x in token):
        feature_list.append("PUNCTUATION")
        
        
    # ADDITION OF PREFIX
    for i in range (0,2):
        # Suffix and prefix
        if len(spl_tok[0]) > i:
            feature_list.append("SUF_" + spl_tok[0][-i:])
        if len(spl_tok[0]) > i:
            feature_list.append("PRE_" + spl_tok[0][:i])

    # Other features
    
    for i in range (0,2):
        if i == 0 & idx > i:
            prev_spl_tkn_new = tokens[idx - 1].split("@")
            feature_list.append(prev_pos_tag_list[0] + prev_spl_tkn_new[0])
            feature_list.append(prev_pos_tag_list[1] + prev_spl_tkn_new[1])
            
            next_spl_tkn_new = tokens[idx + 1].split("@")
            feature_list.append(next_pos_tag_list[0] + next_spl_tkn_new[0])
            feature_list.append(next_pos_tag_list[1] + next_spl_tkn_new[1])
            
        if i == 1 & idx > i:
            prev_spl_tkn_new = tokens[idx - 2].split("@")
            feature_list.append(prev_pos_tag_list[2] + prev_spl_tkn_new[0])
            feature_list.append(prev_pos_tag_list[3] + prev_spl_tkn_new[1])
            
            next_spl_tkn_new = tokens[idx + 2].split("@")
            feature_list.append(next_pos_tag_list[2] + next_spl_tkn_new[0])
            feature_list.append(next_pos_tag_list[3] + next_spl_tkn_new[1])
      
        
    feature_list.append("WORD_" + spl_tok[0])
    feature_list.append("POSTAG_" + spl_tok[1])
    
    #print(feature_list)
    return feature_list


Load raw data, preprocess it and train tagger, split data

In [313]:
# Final results on original test data, using all training data, with best settings from dev set:

# prepare the test data:
raw_test_data = get_raw_data_from_bio_file("trivia10k13test.bio.txt") 
test_data = [updated_preprocess(example) for example in raw_test_data]


# Train the CRF BIO-tag tagger on all training data
TAGGER_PATH = "crf_last.tagger"  # path to the tagger- it will save/access the model from here
last_ct = CRFTagger(feature_func=last_get_features, training_opt={"feature.minfreq":2, "c2": 0.1})  # initialize tagger with get_features function

print("training tagger...")
last_ct.train(updated_training_data, TAGGER_PATH)
print("done")

# load tagger from saved file
# last_ct = CRFTagger(feature_func=last_get_features, training_opt={"feature.minfreq":3, "c2": 0.1})  # initialize tagger
last_ct.set_model_file(TAGGER_PATH)  # load model from file


spl = int(0.8 * len(training_data))
last_train_data = training_data[:spl]
last_test_data = training_data[spl:]


training tagger...
done


In [314]:
print("testing tagger on test data...")
last_preds = []
last_y_test = []
for sent in updated_test_data:
    last_sent_preds = [x[1] for x in last_ct.tag([s[0] for s in sent])]
    last_sent_true = [s[1] for s in sent]
    
    for i in range(len(last_sent_preds)):
        continue
    last_preds.extend(last_sent_preds)
    last_y_test.extend(last_sent_true)
print("done")

testing tagger on test data...
done


Last report

In [315]:
last_class_report = classification_report(last_y_test, last_preds, output_dict= True)
print(classification_report(last_y_test, last_preds))
print(precision_recall_fscore_support(last_y_test, last_preds, average='macro'))

                  precision    recall  f1-score   support

         B-Actor       0.95      0.96      0.96      1231
         B-Award       0.79      0.71      0.75        75
B-Character_Name       0.89      0.51      0.65       212
      B-Director       0.92      0.90      0.91       472
         B-Genre       0.92      0.89      0.91       805
       B-Opinion       0.68      0.50      0.58       160
        B-Origin       0.72      0.63      0.67       159
          B-Plot       0.61      0.56      0.58      1221
         B-Quote       0.78      0.58      0.67        24
  B-Relationship       0.84      0.61      0.70       125
    B-Soundtrack       0.40      0.31      0.35        13
          B-Year       0.98      1.00      0.99       703
         I-Actor       0.96      0.97      0.96      1731
         I-Award       0.89      0.83      0.86       202
I-Character_Name       0.91      0.53      0.67       165
      I-Director       0.92      0.90      0.91       456
         I-Ge

creating last dataframe from last report

In [316]:
last_df = pd.DataFrame(last_class_report).transpose()

print(last_df.sort_values(by="precision", ascending =True).head(5)["precision"])
print(last_df.sort_values(by="recall", ascending =True).head(5)["recall"])

B-Soundtrack    0.400000
I-Opinion       0.440476
I-Soundtrack    0.520000
B-Plot          0.605659
B-Opinion       0.683761
Name: precision, dtype: float64
B-Soundtrack        0.307692
I-Soundtrack        0.317073
I-Opinion           0.440476
B-Opinion           0.500000
B-Character_Name    0.514151
Name: recall, dtype: float64


<div class = "alert alert-block alert-warning">
    <b>Warning</b> The code below for comparison is taken from <b><a> https://www.thiscodeworks.com/display-two-dataframes-side-by-side-html-pandas-python-visualization/619285b06f88d700157a60d7></a></b> just for comparison
    
</div>

In [317]:
df1_styler = updated_df.style.set_table_attributes("style='display:inline'").set_caption('Normal Features')
df2_styler = last_df.style.set_table_attributes("style='display:inline'").set_caption('Additional Features')
 
display_html(df1_styler._repr_html_()+df2_styler._repr_html_(), raw=True)

Unnamed: 0,precision,recall,f1-score,support
B-Actor,0.914354,0.936637,0.925361,1231.0
B-Award,0.796875,0.68,0.733813,75.0
B-Character_Name,0.862069,0.235849,0.37037,212.0
B-Director,0.920746,0.836864,0.876804,472.0
B-Genre,0.906579,0.855901,0.880511,805.0
B-Opinion,0.638298,0.375,0.472441,160.0
B-Origin,0.68,0.534591,0.598592,159.0
B-Plot,0.566105,0.522523,0.543441,1221.0
B-Quote,0.833333,0.416667,0.555556,24.0
B-Relationship,0.869048,0.584,0.698565,125.0

Unnamed: 0,precision,recall,f1-score,support
B-Actor,0.952458,0.960195,0.956311,1231.0
B-Award,0.791045,0.706667,0.746479,75.0
B-Character_Name,0.893443,0.514151,0.652695,212.0
B-Director,0.923747,0.898305,0.910849,472.0
B-Genre,0.921594,0.890683,0.905875,805.0
B-Opinion,0.683761,0.5,0.577617,160.0
B-Origin,0.719424,0.628931,0.671141,159.0
B-Plot,0.605659,0.561016,0.582483,1221.0
B-Quote,0.777778,0.583333,0.666667,24.0
B-Relationship,0.835165,0.608,0.703704,125.0


End of File