In [5]:
# Packages needed to load and store the data

import lab_utils
import tensorflow as tf
import pandas as pd
from sklearn.model_selection import train_test_split

In [6]:
# Working folder for the experiment (old model)
BASE_DIR = './E1'

# Get the subdirectories that contain the experiment files.
# `set_experiment_dirs` is a lab_utils helper; its result is unpacked into three
# paths: the data, model and vocabulary directories used by the cells below.
data_dir, model_dir, vocab_dir = lab_utils.set_experiment_dirs(BASE_DIR)

# Print the resolved locations so they can be checked before anything is loaded
print(
    f'base directory: {BASE_DIR}\n\n'
    f'data: {data_dir}\n'
    f'model: {model_dir}\n'
    f'vocab: {vocab_dir}\n'
)

base directory: ./E1

data: ./E1/data
model: ./E1/model
vocab: ./E1/vocab



In [7]:
# Lift the column-width limit so long `title` values are displayed in full
pd.options.display.max_colwidth = None

# Read the training and test splits from the experiment's data directory
train_df = pd.read_csv(f'{data_dir}/train_data.csv')
test_df = pd.read_csv(f'{data_dir}/test_data.csv')

# Show the first 10 rows of the training split
train_df.head(10)

Unnamed: 0,title,link,domain,published_date,topic
0,"NSW remains on COVID-19 high alert | The Star | Newcastle, NSW",https://www.newcastlestar.com.au/story/6878465/nsw-remains-on-covid-19-high-alert/?cs=7,newcastlestar.com.au,2020-08-13 17:32:28,HEALTH
1,Missing people: How does someone just disappear?,https://www.bbc.co.uk/news/uk-england-cambridgeshire-53648599,bbc.co.uk,2020-08-04 15:56:20,NATION
2,"An American Pickle review: In a dual role, Seth Rogen’s drama wins over comedy",https://www.polygon.com/2020/8/6/21357120/american-pickle-review-movie-seth-rogen-simon-rich-hbo-max,polygon.com,2020-08-06 15:58:22,ENTERTAINMENT
3,"Katie Spotz to Run 130 Miles to Bring Clean Water to Tanzania, Shares Testimony of Change",http://www.christianitydaily.com/articles/9647/20200807/katie-spotz-run-130-miles-bring-clean-water-tanzania-shares.htm,christianitydaily.com,2020-08-07 19:45:00,NATION
4,CDC Director Says This Fall Could Be ‘the Worst’ We’ve Ever Had Thanks to COVID-19,https://www.self.com/story/worst-fall-ever-covid,self.com,2020-08-13 19:37:52,HEALTH
5,Free water and electricity may be counterproductive – Economist,https://www.ghanaweb.com/GhanaHomePage/NewsArchive/Free-water-and-electricity-may-be-counterproductive-Economist-1031098,ghanaweb.com,2020-08-11 13:25:37,NATION
6,State places Tazewell County on COVID-19 warning list,https://week.com/2020/08/07/state-places-tazewell-county-on-covid-19-warning-list/,week.com,2020-08-07 18:00:24,HEALTH
7,Over 100 people quarantined in Mississippi school district after several test positive for coronavirus,https://www.nbcnews.com/news/us-news/over-100-people-quarantined-mississippi-school-district-after-several-test-n1236012,nbcnews.com,2020-08-06 16:35:00,NATION
8,How global warming is wiping out Africa's oldest baobab trees,http://www.capetalk.co.za/articles/307750/how-global-warming-is-wiping-out-africa-s-oldest-baobab-trees,capetalk.co.za,2018-06-14 12:00:53,WORLD
9,"Delay routine dental checkups, WHO urges, until COVID risk is known","https://www.thepeninsulaqatar.com/article/11/08/2020/Delay-routine-dental-checkups,-WHO-urges,-until-COVID-risk-is-known",thepeninsulaqatar.com,2020-08-11 17:13:00,HEALTH


In [8]:
# Keep only the model input (`title`) and the label (`topic`) for display
train_df.loc[:, ['title', 'topic']]

Unnamed: 0,title,topic
0,"NSW remains on COVID-19 high alert | The Star | Newcastle, NSW",HEALTH
1,Missing people: How does someone just disappear?,NATION
2,"An American Pickle review: In a dual role, Seth Rogen’s drama wins over comedy",ENTERTAINMENT
3,"Katie Spotz to Run 130 Miles to Bring Clean Water to Tanzania, Shares Testimony of Change",NATION
4,CDC Director Says This Fall Could Be ‘the Worst’ We’ve Ever Had Thanks to COVID-19,HEALTH
...,...,...
84370,Apple fires back at Epic: ‘We won’t make an exception’,TECHNOLOGY
84371,Actress Jessica Marais mourns loss of mother Karen,ENTERTAINMENT
84372,Google delays Chrome web apps drop-dead date until mid-2022,TECHNOLOGY
84373,"Liverpool evening headlines as Firmino scouted by Barcelona legend, Fabinho hails special success",SPORTS


In [9]:
# Restore the saved model from the experiment's model directory
model = tf.keras.models.load_model(filepath=model_dir)

# Print a layer-by-layer overview of the architecture
model.summary()



Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 20, 24)            240000    
                                                                 
 dense_2 (Dense)             (None, 20, 24)            600       
                                                                 
 flatten_1 (Flatten)         (None, 480)               0         
                                                                 
 dense_3 (Dense)             (None, 8)                 3848      
                                                                 
Total params: 244448 (954.88 KB)
Trainable params: 244448 (954.88 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [11]:
# Inspect the settings the loaded model was compiled with (optimizer, loss, metrics)
model.get_compile_config()

{'optimizer': {'module': 'keras.optimizers',
  'class_name': 'Adam',
  'config': {'name': 'Adam',
   'weight_decay': None,
   'clipnorm': None,
   'global_clipnorm': None,
   'clipvalue': None,
   'use_ema': False,
   'ema_momentum': 0.99,
   'ema_overwrite_frequency': None,
   'jit_compile': False,
   'is_legacy_optimizer': False,
   'learning_rate': 0.0010000000474974513,
   'beta_1': 0.9,
   'beta_2': 0.999,
   'epsilon': 1e-07,
   'amsgrad': False},
  'registered_name': None},
 'loss': {'module': 'builtins',
  'class_name': 'function',
  'config': 'sparse_categorical_crossentropy',
  'registered_name': 'function'},
 'metrics': [[{'module': 'keras.metrics',
    'class_name': 'MeanMetricWrapper',
    'config': {'name': 'sparse_categorical_accuracy',
     'dtype': 'float32',
     'fn': {'module': 'builtins',
      'class_name': 'function',
      'config': 'sparse_categorical_accuracy',
      'registered_name': 'function'}},
    'registered_name': None}]],
 'loss_weights': None,
 'weig

In [12]:
# Build the label lookup layer from the saved labels file.
# num_oov_indices=0 reserves no out-of-vocabulary slot, so only the listed labels are mapped.
topic_lookup = tf.keras.layers.StringLookup(
    num_oov_indices=0,
    vocabulary=f'{vocab_dir}/labels.txt',
)

# Display the labels known to the lookup layer
topic_lookup.get_vocabulary()

['ENTERTAINMENT',
 'HEALTH',
 'TECHNOLOGY',
 'WORLD',
 'BUSINESS',
 'SPORTS',
 'NATION',
 'SCIENCE']

In [21]:
import tensorflow as tf
import pandas as pd


# Sequence length and vocabulary size chosen by the team for the prototype
MAX_LENGTH = 20
VOCAB_SIZE = 10000

# Text preprocessing layer: turns a title into a fixed-length integer sequence
title_preprocessor = tf.keras.layers.TextVectorization(
    output_sequence_length=MAX_LENGTH,
    max_tokens=VOCAB_SIZE,
)

# Read the saved vocabulary, one token per line, trimming surrounding whitespace
vocab_path = f'{vocab_dir}/vocabulary.txt'
with open(vocab_path, encoding='utf-8') as file:
    vocab = list(map(str.strip, file))

# Hand the loaded tokens to the TextVectorization layer
title_preprocessor.set_vocabulary(vocab)

# Report how many tokens the layer now holds
print(f'Vocabulary size: {title_preprocessor.vocabulary_size()}')

# Pick the title stored at row label 10 of the training set as an example
sample_title = train_df.loc[10, 'title']

# The example as raw text
print(f"Sample text: {sample_title}")

# The same example after preprocessing, as an integer sequence
print(f"Sample text (preprocessed): {title_preprocessor(sample_title).numpy()}")


Vocabulary size: 10000
Sample text: Dengue fever cases in Laos rise to 4256
Sample text (preprocessed): [4040 1979   30    2 9339  282    3    1    0    0    0    0    0    0
    0    0    0    0    0    0]


In [20]:
# Convert the test dataframe to a tf dataset.
# NOTE(review): `df_to_tfdata` lives in lab_utils and is not shown here; presumably it
# encodes topics with `topic_lookup` and titles with `title_preprocessor` — confirm.
test_ds = lab_utils.df_to_tfdata(test_df, topic_lookup, title_preprocessor)

# Get the metrics. The compile config shown earlier lists sparse_categorical_crossentropy
# as the loss and sparse_categorical_accuracy as the only metric, so the returned list
# reads as [loss, accuracy].
model.evaluate(test_ds)



[1.0201756954193115, 0.7778187394142151]