<a href="https://colab.research.google.com/github/Matthew-Dickson/COS801Assignment/blob/main/COS801Assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<h2>Some useful information:</h2>


1.   https://www.kaggle.com/code/hsankesara/news-classification-using-han/notebook
2.   https://medium.com/analytics-vidhya/hierarchical-attention-networks-d220318cf87e



In [34]:
import pandas as pd
import requests, zipfile, io
import numpy as np

<h2>Dataset section </h2>
Datasets can be found here: https://archive.ics.uci.edu/ml/datasets.php

In [35]:
# UCI archive that holds the Victorian author attribution dataset.
zip_file_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00454/dataset.zip'

# Download the archive and unpack it into ./content (relative to the working directory).
# The timeout stops the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors (e.g. 404/500) right here instead of
# letting an error page reach ZipFile and fail later as a confusing BadZipFile.
r = requests.get(zip_file_url, timeout=60)
r.raise_for_status()
# The context manager closes the in-memory archive once extraction is done.
with zipfile.ZipFile(io.BytesIO(r.content)) as z:
    z.extractall("./content")

In [36]:
# Load the training CSV (extracted by the download cell) into a pandas DataFrame.
# NOTE(review): latin-1 is the encoding the original author chose — presumably
# the file is not valid UTF-8; confirm.
train_csv_path = './content/dataset/Gungor_2018_VictorianAuthorAttribution_data-train.csv'
df = pd.read_csv(train_csv_path, encoding='latin-1')

In [37]:
# Preview the first rows to sanity-check the load (columns: text, author).
df.head()

Unnamed: 0,text,author
0,ou have time to listen i will give you the ent...,1
1,wish for solitude he was twenty years of age a...,1
2,and the skirt blew in perfect freedom about th...,1
3,of san and the rows of shops opposite impresse...,1
4,an hour s walk was as tiresome as three in a s...,1


In [38]:
from sklearn.model_selection import train_test_split

In [39]:
#Split data into train and test data (90% train data and 10% test data)
#As required by assignment.
#random_state pins the shuffle so the same rows land in each split on every
#Restart & Run All; without it each run trains and evaluates on different data
#and reported results cannot be reproduced.
train_df, test_df = train_test_split(df, test_size=0.1, random_state=42)
print(len(train_df))
print(len(test_df))

48310
5368


In [40]:
# Separate each split into its input column ("text") and its label column ("author").
X_train, y_train = train_df["text"], train_df["author"]
X_test, y_test = test_df["text"], test_df["author"]

<h2>Data processing</h2>

In [41]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer # Bag of Words: Term Frequencey and TFIDF

In [85]:
def initialise_term_frequency_vectorizer(data, ngram=1):
    '''Fit a CountVectorizer on ``data`` using word n-grams of one fixed size.

    ``ngram`` is used as both the lower and upper bound of ``ngram_range``,
    so ngram=1 tokenizes individual words and ngram=2 tokenizes every pair
    of consecutive words.

    Returns a tuple ``(document_term_matrix, vectorizer)``: the result of
    ``fit_transform(data)`` and the fitted CountVectorizer itself.

    Documentation can be found here:
    https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
    '''
    count_vectorizer = CountVectorizer(ngram_range=(ngram, ngram))
    document_term_matrix = count_vectorizer.fit_transform(data)
    return document_term_matrix, count_vectorizer

In [86]:
def create_id2word_dictionary(vocabulary):
    '''Invert a vocabulary mapping from word -> index into index -> word.

    ``vocabulary`` is a dict whose keys are tokens and whose values are their
    indices. If several tokens share an index, the one that appears last in
    the dict's iteration order wins.
    '''
    return {index: word for word, index in vocabulary.items()}

In [87]:
# Fit the count vectorizer on the training text to get a term-frequency
# representation. Passing 2 sets ngram_range=(2, 2), i.e. word bigrams.
X, vectorizer_tf = initialise_term_frequency_vectorizer(X_train,2)

In [88]:
# Total frequency of each token: sum the document-term matrix over its rows.
token_counts = X.sum(axis=0)
# The summed result is a single row; unpack that row as a plain Python list.
(list_token_counts,) = token_counts.tolist()
# Token indices ordered from most frequent to least frequent.
ascending_index = np.argsort(list_token_counts)
sorted_index = ascending_index[::-1]

[2047067 1454522 3046033 ... 1591393 1591395 3437941]


In [89]:
# Inspect the vocabulary (feature names) learned by the fitted CountVectorizer.
print(vectorizer_tf.get_feature_names_out())

['aa aa' 'aa ab' 'aa able' ... 'zest while' 'zest with' 'zest without']


In [90]:
# Map column indices of the document-term matrix back to their tokens.
id2word_dictionary = create_id2word_dictionary(vectorizer_tf.vocabulary_)

# Only report the most frequent tokens. sorted_index covers the entire
# vocabulary, so iterating over all of it prints one line per distinct n-gram
# and floods the notebook output.
TOP_N_TOKENS = 20
for index in sorted_index[:TOP_N_TOKENS]:
    print("Highest occuring word:%s - count:%s" % (id2word_dictionary[index], list_token_counts[index]))


<h2>Model section</h2>
<ol>
<li> https://www.tensorflow.org/tutorials/customization/custom_training_walkthrough 
</li>
</ol>

In [3]:
import tensorflow as tf

In [5]:
"""Model creation
NB: We need to find out how to create custome layers"""

# Placeholder network taken from the TensorFlow custom-training walkthrough
# (https://www.tensorflow.org/tutorials/customization/custom_training_walkthrough):
# two 10-unit ReLU hidden layers followed by a 3-unit output layer with no activation.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Dense(10, activation=tf.nn.relu, input_shape=(4,)))  # input shape required
model.add(tf.keras.layers.Dense(10, activation=tf.nn.relu))
model.add(tf.keras.layers.Dense(3))

In [None]:
def loss(model, x, y, training=True, loss_function=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)):
  """Run ``model`` on ``x`` and score its output against ``y``.

  ``training`` is forwarded to the model call; it only matters for layers
  that behave differently during training and inference (e.g. Dropout).
  ``loss_function`` is called with ``y_true=y`` and ``y_pred=<model output>``
  and its result is returned.
  """
  predictions = model(x, training=training)
  return loss_function(y_true=y, y_pred=predictions)


In [None]:
def calulate_loss_and_gradients(model, inputs, targets):
  """Return ``(loss_value, gradients)`` for one batch.

  The loss is computed inside a GradientTape with ``training=True`` so the
  tape can differentiate it with respect to ``model.trainable_variables``.
  """
  with tf.GradientTape() as tape:
    loss_value = loss(model, inputs, targets, training=True)
  gradients = tape.gradient(loss_value, model.trainable_variables)
  return loss_value, gradients

In [None]:
# Adam optimizer with a learning rate of 0.01; the training loop uses it to
# apply the computed gradients to the model's trainable variables.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)

In [None]:
## Note: Rerunning this cell uses the same model parameters
## (model and optimizer are created in earlier cells and are not re-created
## here, so training continues from the current weights).

# NOTE(review): ds_train_batch is not defined anywhere in the visible notebook;
# it has to be created before this cell can run on a fresh kernel.

# Keep results for plotting
train_loss_results = []
train_accuracy_results = []

num_epochs = 100

for epoch in range(num_epochs):

  # Metrics for evaluation; fresh objects each epoch so results are per-epoch.
  epoch_loss_avg = tf.keras.metrics.Mean()
  epoch_accuracy = tf.keras.metrics.SparseCategoricalAccuracy()

  # Training loop - using batches of 32
  # NOTE(review): the batch size is set wherever ds_train_batch is built — confirm it is 32.
  for x, y in ds_train_batch:
    # Optimize the model
    loss_value, grads = calulate_loss_and_gradients(model, x, y)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))

    # Track progress
    epoch_loss_avg.update_state(loss_value)  # Add current batch loss
    # Compare predicted label to actual label
    # training=True is needed only if there are layers with different
    # behavior during training versus inference (e.g. Dropout).
    # This is a second forward pass made after apply_gradients, so accuracy is
    # measured with the already-updated weights.
    epoch_accuracy.update_state(y, model(x, training=True))

  # End epoch
  train_loss_results.append(epoch_loss_avg.result())
  train_accuracy_results.append(epoch_accuracy.result())

  # Report progress every 50th epoch (epochs 0 and 50 when num_epochs is 100).
  if epoch % 50 == 0:
    print("Epoch {:03d}: Loss: {:.3f}, Accuracy: {:.3%}".format(epoch,
                                                                epoch_loss_avg.result(),
                                                                epoch_accuracy.result()))