In [1]:
import itertools
import pandas as pd
import csv
from sklearn import metrics
from sklearn.metrics import f1_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

# Raw CoNLL-2003 train/dev files (not referenced by the cells shown here).
trainfile = '../data/conll2003.train.conll'
testfile = '../data/conll2003.dev.conll'

# Preprocessed comma-separated files; these are the paths the experiment loop
# passes to extract_features_and_gold_labels.
train_path = '../data/train.csv'
test_path = '../data/dev.csv'

# Column index of every available feature in the preprocessed CSV rows.
# Column 4 is deliberately absent: extract_features_and_gold_labels reads the
# gold label from it. Column 0 is not used as a feature.
feature_to_index = dict(
    token=1,
    pos=2,
    chunk_tag=3,
    token_right=5,
    token_left=6,
    cap_type=7,
)


#functions for feature extraction and training a classifier

## For documentation on how to create input representations of features in scikit-learn:
# https://scikit-learn.org/stable/modules/feature_extraction.html


def extract_features_and_gold_labels(conllfile, selected_features):
    '''Function that extracts the selected features and the gold labels from a preprocessed, comma-separated file.

    The first line of the file is treated as a header and skipped. Only rows with
    exactly 8 columns are used as instances; all other rows are ignored.

    :param conllfile: path to the preprocessed comma-separated file
    :param selected_features: names of the features to extract; every name must be a key of feature_to_index
    :type conllfile: string
    :type selected_features: iterable of strings


    :return features: a list of dictionaries, one per instance, mapping each selected feature name to its (string) value in that row
    :return labels: a list of gold labels of individual instances (taken from column 4)
    :raises ValueError: if a selected feature name is not a key of feature_to_index
    '''

    #number of columns an instance row must have, and the column that holds the gold label
    expected_columns = 8
    label_column = 4

    #materialise once so that a one-shot iterable is not exhausted after the first row
    selected_features = list(selected_features)
    #fail early with a readable message instead of a TypeError on row[None] further down
    unknown = [name for name in selected_features if name not in feature_to_index]
    if unknown:
        raise ValueError('Unknown feature name(s): ' + ', '.join(map(str, unknown)))
    #resolve the column of every feature once instead of once per row
    columns = [(name, feature_to_index[name]) for name in selected_features]

    features = []
    labels = []
    #the with-statement guarantees the file is closed (this function is called once per
    #feature combination and per split); newline='' is what the csv module recommends
    with open(conllfile, 'r', newline='') as conllinput:
        #the file is comma separated
        #quotechar has as default value '"'; a token can itself be '"', which would mess up the format,
        #so quotechar is set to a character that is assumed not to occur in the file
        #NOTE(review): if the CSV was written with standard '"' quoting, rows whose token contains a comma
        #will not split into 8 columns here and are silently dropped - confirm against the preprocessing step
        csvreader = csv.reader(conllinput, delimiter=',', quotechar='|')
        #skip the header line
        next(csvreader, None)
        for row in csvreader:
            if len(row) != expected_columns:
                continue
            #structuring feature value pairs as key-value pairs in a dictionary
            features.append({name: row[column] for name, column in columns})
            labels.append(row[label_column])
    return features, labels


def get_predicted_and_gold_labels(testfile, vectorizer, classifier,
                                  selected_features):
    '''
    Function that runs a trained classifier on a test file and returns its predictions together with the gold labels

    :param testfile: path to the (preprocessed) test file
    :param vectorizer: already fitted vectorizer holding the mapping between feature values and dimensions
    :param classifier: the trained classifier
    :param selected_features: names of the features to extract from the test file
    :type testfile: string
    :type vectorizer: DictVectorizer
    :type classifier: LogisticRegression()
    :type selected_features: list of strings

    :return predictions: output labels provided by the classifier on the test file
    :return goldlabels: list of gold labels as included in the test file
    '''

    #reuse the training-time extraction so feature names and value formats match
    instance_dicts, gold = extract_features_and_gold_labels(
        testfile, selected_features)
    #transform only (no fit): the dimensions must be the ones learned on the training data
    predicted = classifier.predict(vectorizer.transform(instance_dicts))

    return predicted, gold


def extract_features_token_only_and_labels(conllfile):
    '''Function that extracts the token feature and the gold label from a tab-separated conll file.

    Only rows with exactly 4 columns are used as instances; all other rows
    (e.g. empty lines between sentences) are ignored.

    :param conllfile: path to the (preprocessed) conll file
    :type conllfile: string


    :return features: a list of dictionaries, each of the form {'Token': <first column>} for an individual instance
    :return labels: a list of gold labels of individual instances (the last column of each row)
    '''

    features = []
    labels = []
    #the with-statement guarantees the file handle is closed, also when reading fails;
    #newline='' is what the csv module recommends for files handed to csv.reader
    with open(conllfile, 'r', newline='') as conllinput:
        #delimiter indicates we are working with a tab separated value (default is comma)
        #quotechar has as default value '"'; a token can itself be '"', which would mess up the format,
        #so quotechar is set to a character that is assumed not to occur in the file
        csvreader = csv.reader(conllinput, delimiter='\t', quotechar='|')
        for row in csvreader:
            #instance rows contain 4 values, the others are skipped
            if len(row) == 4:
                #the first column in the conll file represents tokens
                #note: the key is 'Token' (capitalised), unlike the lower-case 'token' used in feature_to_index
                features.append({'Token': row[0]})
                #the last column provides the gold label (= the correct answer)
                labels.append(row[-1])

    return features, labels


def create_vectorizer_and_classifier(features, labels):
    '''
    Function that fits a DictVectorizer on the feature-value pairs and trains a logistic regression classifier on the result

    :param features: feature-value pairs
    :param labels: gold labels
    :type features: a list of dictionaries
    :type labels: a list of strings

    :return lr_classifier: a trained LogisticRegression classifier
    :return vec: the DictVectorizer that was fitted on the feature values
    '''

    vectorizer = DictVectorizer()
    #fit learns which dimension each observed feature value gets, transform encodes the instances accordingly
    design_matrix = vectorizer.fit_transform(features)
    #only the solver is set explicitly; every other setting is left at the library default
    model = LogisticRegression(solver='saga')
    model.fit(design_matrix, labels)

    return model, vectorizer




def print_precision_recall_fscore(predictions, goldlabels):
    '''
    Function that prints out macro-averaged precision, recall and f-score on one line

    :param predictions: predicted output by classifier
    :param goldlabels: original gold labels
    :type predictions, goldlabels: list of strings
    '''

    #same call for all three measures: gold labels as truth, macro average over the classes
    scorers = (metrics.precision_score, metrics.recall_score, metrics.f1_score)
    precision, recall, fscore = [
        scorer(y_true=goldlabels, y_pred=predictions, average='macro')
        for scorer in scorers
    ]

    print('P:', precision, 'R:', recall, 'F1:', fscore)

def print_confusion_matrix(predictions, goldlabels):
    '''
    Function that prints out a confusion matrix (rows: gold labels, columns: predicted labels)

    :param predictions: predicted labels
    :param goldlabels: gold standard labels
    :type predictions, goldlabels: list of strings
    '''

    #based on example from https://datatofish.com/confusion-matrix-python/
    #pair every gold label with the prediction for the same instance
    paired = pd.DataFrame(
        {'Gold': goldlabels, 'Predicted': predictions},
        columns=['Gold', 'Predicted'],
    )
    #count how often each (gold, predicted) combination occurs
    counts = pd.crosstab(
        paired['Gold'],
        paired['Predicted'],
        rownames=['Gold'],
        colnames=['Predicted'],
    )
    print(counts)

In [2]:
# Every non-empty subset of the available feature names, ordered by subset
# size (single features first, the complete feature set last). Each subset is
# stored as a list of names in the key order of feature_to_index.
feature_combinations = [
    list(subset)
    for size in range(1, len(feature_to_index) + 1)
    for subset in itertools.combinations(feature_to_index, size)
]

In [3]:
# Train and evaluate one logistic regression model per feature combination.
# result maps the string form of each feature list to the macro-averaged F1
# obtained on the file at test_path.
# NOTE(review): both CSV files are re-read from disk on every iteration.
result = {}
for feature_c in feature_combinations:
    selected_features = list(feature_c)
    # Training instances restricted to the currently selected features.
    feature_values, labels = extract_features_and_gold_labels(
        train_path, selected_features)
    # A fresh vectorizer and classifier are fitted for every combination.
    lr_classifier, vectorizer = create_vectorizer_and_classifier(
        feature_values, labels)
    # Evaluation instances are encoded with the vectorizer fitted above.
    predictions, goldlabels = get_predicted_and_gold_labels(
        test_path, vectorizer, lr_classifier, selected_features)
    f1 = f1_score(goldlabels, predictions, average = 'macro')
    # Lists are not hashable, so the string form of the list is used as key.
    result[str(feature_c)] = f1
    print(f'{feature_c}:{f1}')
    # Optional per-combination diagnostics, currently disabled:
    #print_confusion_matrix(predictions, goldlabels)
    #print_precision_recall_fscore(predictions, goldlabels)

['token']:0.6304389022623346




['pos']:0.1524705943692439




['chunk_tag']:0.1010403220293317
['token_right']:0.29283649315413807
['token_left']:0.3988159293006016




['cap_type']:0.1010403220293317




['token', 'pos']:0.6865216348284444




['token', 'chunk_tag']:0.6663961190397
['token', 'token_right']:0.7061000418999844
['token', 'token_left']:0.7553477188155355




['token', 'cap_type']:0.6892502909621512




['pos', 'chunk_tag']:0.21545855980450385




['pos', 'token_right']:0.4502580243784031




['pos', 'token_left']:0.5778741861383021




['pos', 'cap_type']:0.21474613835399492




['chunk_tag', 'token_right']:0.33461913681957856




['chunk_tag', 'token_left']:0.44926883761563197




['chunk_tag', 'cap_type']:0.15178917834248967
['token_right', 'token_left']:0.5461060482235068




['token_right', 'cap_type']:0.4594347871119166




['token_left', 'cap_type']:0.5915792313068553




['token', 'pos', 'chunk_tag']:0.7245114308569229




['token', 'pos', 'token_right']:0.74521958218065




['token', 'pos', 'token_left']:0.793045657952427




['token', 'pos', 'cap_type']:0.6949804569101775




['token', 'chunk_tag', 'token_right']:0.7319999052014984




['token', 'chunk_tag', 'token_left']:0.7707006274706293




['token', 'chunk_tag', 'cap_type']:0.7325121385037207
['token', 'token_right', 'token_left']:0.7948708360487059




['token', 'token_right', 'cap_type']:0.7514205407824479




['token', 'token_left', 'cap_type']:0.8035107779624737




['pos', 'chunk_tag', 'token_right']:0.5255604858464089




['pos', 'chunk_tag', 'token_left']:0.5788235086069191




['pos', 'chunk_tag', 'cap_type']:0.2801726295012383




['pos', 'token_right', 'token_left']:0.7027946043151473




['pos', 'token_right', 'cap_type']:0.5032192468107788




['pos', 'token_left', 'cap_type']:0.6457328799986862




['chunk_tag', 'token_right', 'token_left']:0.5801935822953305




['chunk_tag', 'token_right', 'cap_type']:0.5308836239080277




['chunk_tag', 'token_left', 'cap_type']:0.5990955307377868




['token_right', 'token_left', 'cap_type']:0.7088010977085981




['token', 'pos', 'chunk_tag', 'token_right']:0.7786676336687897




['token', 'pos', 'chunk_tag', 'token_left']:0.7953019809858402




['token', 'pos', 'chunk_tag', 'cap_type']:0.7387305037179581




['token', 'pos', 'token_right', 'token_left']:0.8312802263617312




['token', 'pos', 'token_right', 'cap_type']:0.7559952984265581




['token', 'pos', 'token_left', 'cap_type']:0.8065665232374933




['token', 'chunk_tag', 'token_right', 'token_left']:0.7992817886543178




['token', 'chunk_tag', 'token_right', 'cap_type']:0.7846781156864142




['token', 'chunk_tag', 'token_left', 'cap_type']:0.806482816626907




['token', 'token_right', 'token_left', 'cap_type']:0.8391818430575481




['pos', 'chunk_tag', 'token_right', 'token_left']:0.7065921298931437




['pos', 'chunk_tag', 'token_right', 'cap_type']:0.5683840935973925




['pos', 'chunk_tag', 'token_left', 'cap_type']:0.6493546527623658




['pos', 'token_right', 'token_left', 'cap_type']:0.7347573748466493




['chunk_tag', 'token_right', 'token_left', 'cap_type']:0.7119674851055976




['token', 'pos', 'chunk_tag', 'token_right', 'token_left']:0.8311344046827912




['token', 'pos', 'chunk_tag', 'token_right', 'cap_type']:0.7853267071641257




['token', 'pos', 'chunk_tag', 'token_left', 'cap_type']:0.8081598685108258




['token', 'pos', 'token_right', 'token_left', 'cap_type']:0.839356203265919




['token', 'chunk_tag', 'token_right', 'token_left', 'cap_type']:0.8361258062842417




['pos', 'chunk_tag', 'token_right', 'token_left', 'cap_type']:0.7397425619366605




['token', 'pos', 'chunk_tag', 'token_right', 'token_left', 'cap_type']:0.8397141262252669


In [5]:
# Sanity check: 6 features give 2**6 - 1 = 63 non-empty combinations, one result each.
len(result)

63