# Jupyter Notebook to accompany Part 2 of Coursework 1

## Peter Kennedy - 2092220

## Section 1 - Setup

In [1]:
### Code Block 1 - Importing and configuring libraries

import os
import math
import random
import nltk
import operator
import numpy as np
import sklearn
import spacy
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import StandardScaler
from itertools import combinations
from datetime import datetime, timedelta

stopwords=set(nltk.corpus.stopwords.words('english'))
stopwords.add(".")
stopwords.add(",")
stopwords.add("--")
stopwords.add("-")
stopwords.add("``")
stopwords.add("''")
stopwords.add("(")
stopwords.add(")")
stopwords.add("%")
stopwords.add(":")
stopwords.add("'")
stopwords.add("'s")

spacy_nlp = spacy.load('en_core_web_sm')

## Helper functions

The following functions are used within the 'main' functions

In [2]:
### Code Block 2 - Defenition of helper functions

# Input: name of folder in which articles are searched for
# Output: a list of labelled articles. The label given to each article is given by the name of its parent
#   folder.
# Remarks: Code modified from https://stackoverflow.com/questions/19587118/iterating-through-directories-with-python
def get_article_data(folder):
    rootdir = './datasets_coursework1/' + folder
    article_data = []

    for _, dirs, _ in os.walk(rootdir):
        for category in dirs:
            for subdir, _, articles in os.walk(os.path.join(rootdir, category)):
                for article_path in articles:
                    artile_text = open(os.path.join(subdir, article_path)).read()
                    article_data.append((artile_text, category))
                    
    return article_data  

# Input: article data which has been constructed in the format given in CB2.
# Returns: a list of categories from list of labelled articles.
def get_categories(article_data):
    categories = []
    for article in article_data:
        if article[1] not in categories:
            categories.append(article[1])
    return categories


# Input: a string
# Returns a list of lemmitsed, lower case tokens from a string
def get_lemmatized_tokens(string):
    lemmatizer = nltk.stem.WordNetLemmatizer()
    lemmatized_tokens_list = []
    tokens=nltk.tokenize.word_tokenize(string)
    for token in tokens:
        lemmatized_tokens_list.append(lemmatizer.lemmatize(token).lower())
    return lemmatized_tokens_list


# Input: a vocabulary of words and a string
# Returns: vector which corresponds to the number of occurences of each word in the vocabulary
def get_vector_text(vocab, string):
    vector_text = np.zeros(len(vocab))
    list_tokens_string= get_lemmatized_tokens(string)
    for i, word in enumerate(vocab):
        if word in list_tokens_string:
            vector_text[i]=list_tokens_string.count(word)
    return vector_text


## Main functions

The following functions are the main functions which go to construct vocabularies, and vectorise strings in order to use them to train a classifier.

In [3]:
### Code Block 3 - Main functions

# Input: a subset of article data which has been constructed in the format given in CB2.
# Output: a list of article lengths
def get_article_lengths(article_data, categories):
    X = []
    Y = []
    for article in article_data:
        X.append(len(article[0]))
        Y.append(np.where(np.array(categories) == article[1])[0][0]) # Assigns a numerical label to the category
    
    return np.array(X).reshape(-1,1), np.array(Y)
        

# Input: a subset of article data which has been constructed in the format given in CB2 and a vocabulary length.
# Output: a list of the most commonly occuring words in the article data of length defined in the input
def get_simple_vocabulary(article_data, vocab_length):

    dict_word_frequency={}
    for article in article_data:
        tokens = get_lemmatized_tokens(article[0])
        for token in tokens:
            if token in stopwords: continue
            if token not in dict_word_frequency: dict_word_frequency[token] = 1
            else: dict_word_frequency[token] += 1

    sorted_list = sorted(dict_word_frequency.items(), key=operator.itemgetter(1), reverse=True)[:vocab_length]

    vocabulary=[]
    for word,frequency in sorted_list:
        vocabulary.append(word)
        
    return vocabulary


# Input: a subset of article data which has been constructed in the format given in CB2, an array of POS
#   tags and a categorical vocabulary length.
# Output: a list of the most commonly occuring words which have an allowed POS tag associated with it,
#   taken from the article data. The length of the vocabulary is the categorical vocabulary length (taken
#   as an input) multiplied by the number of categories.
# Remarks: This vocabulary is constructed in a slightly different way than the simple vocabulary. To ensure that
#   each category contributes to the vocabulary proportionally, a vocabulary for each category is first
#   constructed (which has a length equal to the desired final vocab length, divided by the number of categories).
#   Once this is done, these vocabularies are combined into an overall vocabulary. If duplicate words are found
#   to exist in this, then they are 'squashed' out (duplicates removed). The process is then repeated to ensure
#   that the final vocabulary is of the desired length.
def get_categorical_pos_vocabulary(article_data, pos_array, cat_vocab_length):

    category_list = get_categories(article_data)
    vocab_length = cat_vocab_length * len(category_list)
    vocab_to_add = cat_vocab_length * len(category_list)
    vocabulary = []
    
    # Continues loop till vocabulary reaches desired length after duplicates have been squashed
    while vocab_length > len(vocabulary):

        categorical_vocabularies = []

        for category in category_list:

            # Count occurences of words which aren't in the stopwords list and sort in frequency order
            dict_word_frequency={}
            for article in article_data:
                if article[1] == category:
                    doc = spacy_nlp(f'{article[0]}')
                    for token in doc:
                        if token.pos_ in pos_array:
                            if token.text in stopwords: continue
                            if token.text in vocabulary: continue
                            if token.text not in dict_word_frequency: dict_word_frequency[token.text] = 1
                            else: dict_word_frequency[token.text] += 1

            sorted_list = sorted(dict_word_frequency.items(), key=operator.itemgetter(1), reverse=True)[:math.ceil(vocab_to_add/len(category_list))]

            # Create a vocabulary for a given category
            categorical_vocabulary=[]
            for word,frequency in sorted_list:
                categorical_vocabulary.append(word)

            categorical_vocabularies.append(categorical_vocabulary)

        # Add to vocabulary
        for cat_vocab in categorical_vocabularies:
            for word in cat_vocab:
                vocabulary.append(word)

        # Squashes duplicate entries in vocabulary
        vocab_set = set(vocabulary)
        vocabulary = list(vocab_set)
        
        vocab_to_add = vocab_length - len(vocabulary)
        
        # Used to keep track of function progress
        print("Vocab length: " + str(len(vocabulary)))
        print("Vocab to add: " + str(vocab_to_add))
        
    if len(vocabulary) > vocab_length:
        vocabulary = vocabulary[:vocab_length]
        
    return vocabulary
    

# Input: an ordered list of categories, article data, and a vocabulary of words
# Output: a pair of numpy arrays. The first (X) is a vectorised form of the articles each of which have 
#   dimensionality of the length of the vocabulary. the second (Y) is a 1-dimensional array which simply
#   assigns a numerical value based on the position of the label in the categories array.
def xy_vector_split(categories, article_data, vocabulary):
    
    X = []
    Y = []

    for article in article_data:
        vector_article = get_vector_text(vocabulary, article[0])
        X.append(vector_article)
        Y.append(np.where(np.array(categories) == article[1])[0][0]) # Assigns a numerical label to the category
            
    return np.array(X), np.array(Y)

# Section 2 - Experimentation

## Parameters for experimentation with cross validation

Set the parameters in the CB4 and run CB5 to experiment with different parameters using k-fold cross validation.

In [40]:
### Code Block 4 - Parameters for experimentation with cross validation

################################# Setting up cross validation #################################

folds = 10

article_data = get_article_data('bbc/')

categories = get_categories(article_data)

################################# Feature extraction #################################

# To set the features you would like to extract, use 'Length', 'Simple' or 'POS' to extract the article length,
#  simple vocabulary or POS vocabulatry respectively.
features = 'POS'

# The vocabulary length is the total length of the vocabulary which is to be used for when the simple and POS
#  vocabularies are being constructed as features.
vocab_length = 3000


### The following code (currently commented out) was used to construct all possible combinations of POS.
### These combinations were then used to define the 'pos_combos' array, which CB will iterate through
### by default, so that a comparison between different combinations of POS may be made.
# pos = ['ADJ', 'ADV', 'INTJ', 'NOUN', 'PROPN', 'VERB']
# pos_combos = []
# for r in range(1, len(pos) + 1):
#     pos_combos += list(combinations(pos, r))

if features == 'POS':
    pos_combos = [['NOUN', 'PROPN']] # Use this to set the parts of speech to be added to the vocabulary
else:                                #  if features is set to 'POS'
    pos_combos = [[]]

################################# Feature Engineering #################################

# Set to True if you would like to extract the best k features using the chi^2 comparison
set_k_best = True
# k_best must be less than or equal to the vocab_length
k_best = 1500

# Set to True if you would like to normalise the article vectors
set_normalise = False

# Set to True if you would like to apply the TF-IDF transformation to the article vectors
set_tfidf = True

svm_kernel = 'linear'

# time format defenition used to keep track of timings during experimentation
time_format = "%H:%M:%S %d/%m/%Y"

error = False

## Running experimentation with cross validation

Results will be printed below the code block as it runs, and once it is finished.

In [41]:
### Code Block 5 - Running experimentation with cross validation

# Fold parameters
kf = KFold(n_splits = folds)
random.shuffle(article_data)
kf.get_n_splits(article_data)

for pos_array in pos_combos:

    fold = 1
    accuracy_total = 0
    precision_total = 0
    recall_total = 0
    f1_total = 0

    if features == 'Length':
        print("Training using article length")
    elif features == 'Simple':
        print("Training using simple vocabulary")
    elif features == 'POS':
        print("Training using POS vocabulary with " + str(pos_array) + " tags")
    else:
        print("Error: Please enter a valid feature to extract in the parameters section above (line 15 of CB4)")
        break
        
    if set_k_best and features != 'Length':
        if k_best > vocab_length:
            print("Error: the number of features to be extracted (line 40 of CB4) must be less than the vocabulary length (line 19 of CB4)")
            break

    # Start the timer to keep track of how long K-fold validation of the experiment takes
    start = datetime.now()
    start_string = start.strftime(time_format)
    print("Start time: " + str(start_string) + "\n")
    
    # Begin K-fold validation
    for train_index, test_index in kf.split(article_data[:300]):
        
        # Construct a training and test set for the current fold
        train_set_fold = []
        test_set_fold = []
        
        print("Folding fold " + str(fold) + "/" + str(folds) + "...")
        for i,instance in enumerate(article_data):
            if i in train_index:
                train_set_fold.append(instance)
            else:
                test_set_fold.append(instance)
        
        if features == 'Length':
            
            # Check the length of the article
            print("Checking lengths of articles...")
            X_train_fold, Y_train_fold = get_article_lengths(train_set_fold, categories)
            X_test_fold, Y_test_fold = get_article_lengths(test_set_fold, categories)
        
        elif features == 'Simple' or features == 'POS':
        
            # Construct a simple or POS vocabulary depending on which parameters are set
            if features == 'Simple':
                print("Generating simple vocabulary...")
                vocab_fold = get_simple_vocabulary(train_set_fold, vocab_length)
            elif features == 'POS':
                print("Generating POS vocabulary...")
                vocab_fold = get_categorical_pos_vocabulary(train_set_fold, pos_array, math.floor(vocab_length/len(categories)))

            print("Generating XY vector split...")
            X_train_fold, Y_train_fold = xy_vector_split(categories, train_set_fold, vocab_fold)
            X_test_fold, Y_test_fold = xy_vector_split(categories, test_set_fold, vocab_fold)
            print("XY vector split for fold " + str(fold) + "/" + str(folds) + " complete")

            if set_k_best:
                print("Selecting best " + str(k_best) + " features...")
                features_fold = SelectKBest(chi2, k = k_best).fit(X_train_fold, Y_train_fold)
                X_train_fold = features_fold.transform(X_train_fold)
                X_test_fold = features_fold.transform(X_test_fold)

            if set_tfidf:
                print("Applying TF-IDF transformation...")
                tfidf = TfidfTransformer()
                X_train_fold = tfidf.fit_transform(X_train_fold)
                X_test_fold = tfidf.fit_transform(X_test_fold)
            
            # Since the TF-IDF transformer includes normalisation, the follwing code is only executed if 
            #  set_tfidf is False, and if set_normalise is True.
            elif set_normalise:
                print("Normalising vectors...")
                sc = StandardScaler()
                X_train_fold = sc.fit_transform(X_train_fold)
                X_test_fold = sc.fit_transform(X_test_fold)

        print("Fitting SVM with " + str(svm_kernel) + " kernel...")
        svm_clf_fold = sklearn.svm.SVC(kernel=svm_kernel, gamma='auto')
        svm_clf_fold.fit(X_train_fold, Y_train_fold)

        Y_test_predictions_fold = svm_clf_fold.predict(X_test_fold)

        accuracy_fold = accuracy_score(Y_test_fold, Y_test_predictions_fold)
        precision_fold = precision_score(Y_test_fold, Y_test_predictions_fold, average='macro')
        recall_fold = recall_score(Y_test_fold, Y_test_predictions_fold, average='macro')
        f1_fold = f1_score(Y_test_fold, Y_test_predictions_fold, average='macro')
        
        print("Accuracy of fold " + str(fold) + "/" + str(folds) + ": " + str(accuracy_fold))
        print("Precision of fold " + str(fold) + "/" + str(folds) + ": " + str(precision_fold))
        print("Recall of fold " + str(fold) + "/" + str(folds) + ": " + str(recall_fold))
        print("F1-Score of fold " + str(fold) + "/" + str(folds) + ": " + str(f1_fold) + "\n")

        accuracy_total += accuracy_fold
        precision_total += precision_fold
        recall_total += recall_fold
        f1_total += f1_fold
        fold += 1

    accuracy_average = accuracy_total/folds
    precision_average = precision_total/folds
    recall_average = recall_total/folds
    f1_average = f1_total/folds

    print("Average accuracy: " + str(accuracy_average))
    print("Average precision: " + str(precision_average))
    print("Average recall: " + str(recall_average))
    print("Average F1-score: " + str(f1_average) + "\n")

    end = datetime.now()
    end_string = end.strftime(time_format)
    print("End time: " + str(end_string))
    time_taken = end - start
    print("Time taken: " + str(time_taken))

    print("\n----------------------------------------\n----------------------------------------\n\n")
    
    

Training using POS vocabulary with ['NOUN', 'PROPN'] tags
Start time: 23:44:24 19/04/2021

Folding fold 1/10...
Generating POS vocabulary...
Vocab length: 2171
Vocab to add: 829
Vocab length: 2969
Vocab to add: 31
Vocab length: 3004
Vocab to add: -4
Generating XY vector split...
XY vector split for fold 1/10 complete
Selecting best 1500 features...
Applying TF-IDF transformation...
Fitting SVM with linear kernel...
Accuracy of fold 1/10: 0.9483375959079284
Precision of fold 1/10: 0.9507948715355304
Recall of fold 1/10: 0.9460189950505983
F1-Score of fold 1/10: 0.9479495711023436

Folding fold 2/10...
Generating POS vocabulary...
Vocab length: 2193
Vocab to add: 807
Vocab length: 2961
Vocab to add: 39
Vocab length: 3001
Vocab to add: -1
Generating XY vector split...
XY vector split for fold 2/10 complete
Selecting best 1500 features...
Applying TF-IDF transformation...
Fitting SVM with linear kernel...
Accuracy of fold 2/10: 0.9432225063938618
Precision of fold 2/10: 0.9464999568105789


## Parameters for full dataset training

Set the parameters in CB6 and run CB7 to train a classifier which is trained on the entire BBC news article dataset.

This classifier may then be used to classify previously unseen news articles.

In [24]:
### Code block 6 - Parameters for full dataset training

article_data = get_article_data('bbc/')

categories = get_categories(article_data)

################################# Feature extraction #################################

# To set the features you would like to extract, use 'Length', 'Simple' or 'POS' to extract the article length,
#  simple vocabulary or POS vocabulatry respectively.
features = 'POS'

# The vocabulary length is the total length of the vocabulary which is to be used for when the simple and POS
#  vocabularies are being constructed as features.
vocab_length = 3000

# Use this to set the parts of speech to be added to the vocabulary
pos_combos = ['NOUN', 'PROPN']

################################# Feature Engineering #################################
    
# Set to True if you would like to extract the best k features using the chi^2 comparison
set_k_best = True
# k_best must be less than or equal to the vocab_length
k_best = 1500

# Set to True if you would like to normalise the article vectors
set_normalise = False

# Set to True if you would like to apply the TF-IDF transformation to the article vectors
set_tfidf = True

svm_kernel = 'linear'

# time format defenition used to keep track of timings during experimentation
time_format = "%H:%M:%S %d/%m/%Y"

error = False

## Training using full entire dataset

This code block can be used to train a classifier on the entire BBC news article dataset, based on the best combinations of parameters that are found in the experimentation block above.

This model may then be used to classify completely unseen articles.

In [25]:
### Code block 7 - Training using entire dataset

if features == 'Length':
    print("Training using article length")
elif features == 'Simple':
    print("Training using simple vocabulary")
elif features == 'POS':
    print("Training using POS vocabulary with " + str(pos_array) + " tags")
else:
    print("Error: Please enter a valid feature to extract in the parameters section above (line # of CB #)")
    error = True

if set_k_best and features != 'Length':
    if k_best > vocab_length:
        print("Error: the number of features to be extracted must be less than the vocabulary length")
        error = True

if error == False:
        
    # Start the timer to keep track of how long K-fold validation of the experiment takes
    start = datetime.now()
    start_string = start.strftime(time_format)
    print("Start time: " + str(start_string) + "\n")

    if features == 'Length':

        # Check the length of the article
        print("Checking lengths of articles...")
        X_train, Y_train = get_article_lengths(article_data, categories)

    elif features == 'Simple' or features == 'POS':

        # Construct a simple or POS vocabulary depending on which parameters are set
        if features == 'Simple':
            print("Generating simple vocabulary...")
            vocab = get_simple_vocabulary(article_data, vocab_length)
        elif features == 'POS':
            print("Generating POS vocabulary...")
            vocab = get_categorical_pos_vocabulary(article_data, pos_array, math.floor(vocab_length/len(categories)))

        print("Generating XY vector split...")
        X_train, Y_train = xy_vector_split(categories, article_data, vocab)
        print("XY vector split complete")

        if set_k_best:
            print("Selecting best " + str(k_best) + " features...")
            features = SelectKBest(chi2, k = k_best).fit(X_train, Y_train)
            X_train = features.transform(X_train)

            
        if set_tfidf:
            print("Applying TF-IDF transformation...")
            tfidf = TfidfTransformer()
            X_train = tfidf.fit_transform(X_train)

        if set_normalise:
            print("Normalising vectors...")
            sc = StandardScaler()
            X_train = sc.fit_transform(X_train)

    print("Fitting SVM with " + str(svm_kernel) + " kernel...")
    svm_clf = sklearn.svm.SVC(kernel=svm_kernel, gamma='auto')
    svm_clf.fit(X_train, Y_train)

    end = datetime.now()
    end_string = end.strftime(time_format)
    print("End time: " + str(end_string))
    time_taken = end - start
    print("Time taken: " + str(time_taken))
    print("The model 'svm_clf' may now be used to make new predictions")

Training using simple vocabulary
Start time: 21:24:01 19/04/2021

Generating simple vocabulary...
Generating XY vector split...
XY vector split complete
Selecting best 2000 features...
Normalising vectors...
Fitting SVM with linear kernel...
End time: 21:24:53 19/04/2021
Time taken: 0:00:51.788853
The model 'svm_clf' may now be used to make new predictions


## Making predictions on unseen data

Using the SVM classifier trained in CB7, 'svm_clf' may be used to classify unseen data.

This data should ideally be structured in the same way as it was in the original BBC news article data i.e.

Although the classifier will still be able to run on any articles which are found within the sub-folders of the 'unseen', if the structure differs from that above, it will not be possible to measure performance metrics by simply running the CB8.

In the dummy data included in the GitHub repository, 3 additional articles from the BBC news website have been added (one each within business, entertainment and policy sub-folders). These files may simply be removed to check the performance of the trained model on unseen data.

In [26]:
### Code Block 8 - Making predictions on unseen data

unseen_data = get_article_data('unseen/')

X_unseen, Y_unseen = xy_vector_split(categories, unseen_data, vocab)


if set_k_best:
    X_unseen = features.transform(X_unseen)
    
if set_tfidf:
    X_unseen = tfidf.fit_transform(X_unseen)
    
if set_normalise:
    X_unseen = sc.fit_transform(X_unseen)

Y_unseen_predictions = svm_clf.predict(X_unseen)

accuracy_unseen = accuracy_score(Y_unseen, Y_unseen_predictions)
precision_unseen = precision_score(Y_unseen, Y_unseen_predictions, average='macro')
recall_unseen = recall_score(Y_unseen, Y_unseen_predictions, average='macro')
f1_unseen = f1_score(Y_unseen, Y_unseen_predictions, average='macro')

print("Average accuracy: " + str(accuracy_unseen))
print("Average precision: " + str(precision_unseen))
print("Average recall: " + str(recall_unseen))
print("Average F1-score: " + str(f1_unseen))

Average accuracy: 0.6666666666666666
Average precision: 0.5
Average recall: 0.6666666666666666
Average F1-score: 0.5555555555555555


  _warn_prf(average, modifier, msg_start, len(result))
