In [99]:
# Packages needed to load, split, and store the data

import lab_utils
import tensorflow as tf
import pandas as pd
from sklearn.model_selection import train_test_split

In [100]:
# Experiment 1 (the old model): root folder of the experiment
BASE_DIR = './E1'

# Get the data / model / vocab subdirectories that belong to this experiment
data_dir, model_dir, vocab_dir = lab_utils.set_experiment_dirs(BASE_DIR)

# Echo the locations in use so a wrong working folder is spotted immediately
dir_summary = (
    f'base directory: {BASE_DIR}\n\n'
    f'data: {data_dir}\n'
    f'model: {model_dir}\n'
    f'vocab: {vocab_dir}\n'
)
print(dir_summary)

base directory: ./E1

data: ./E1/data
model: ./E1/model
vocab: ./E1/vocab



In [101]:
# Disable column truncation so the full `title` text is visible in previews
pd.set_option('display.max_colwidth', None)

# Read the train and test splits stored in the experiment's data folder
train_df = pd.read_csv(f'{data_dir}/train_data.csv')
test_df = pd.read_csv(f'{data_dir}/test_data.csv')

# Peek at the first 10 training rows
train_df.head(10)

Unnamed: 0,title,link,domain,published_date,topic
0,"NSW remains on COVID-19 high alert | The Star | Newcastle, NSW",https://www.newcastlestar.com.au/story/6878465/nsw-remains-on-covid-19-high-alert/?cs=7,newcastlestar.com.au,2020-08-13 17:32:28,HEALTH
1,Missing people: How does someone just disappear?,https://www.bbc.co.uk/news/uk-england-cambridgeshire-53648599,bbc.co.uk,2020-08-04 15:56:20,NATION
2,"An American Pickle review: In a dual role, Seth Rogen’s drama wins over comedy",https://www.polygon.com/2020/8/6/21357120/american-pickle-review-movie-seth-rogen-simon-rich-hbo-max,polygon.com,2020-08-06 15:58:22,ENTERTAINMENT
3,"Katie Spotz to Run 130 Miles to Bring Clean Water to Tanzania, Shares Testimony of Change",http://www.christianitydaily.com/articles/9647/20200807/katie-spotz-run-130-miles-bring-clean-water-tanzania-shares.htm,christianitydaily.com,2020-08-07 19:45:00,NATION
4,CDC Director Says This Fall Could Be ‘the Worst’ We’ve Ever Had Thanks to COVID-19,https://www.self.com/story/worst-fall-ever-covid,self.com,2020-08-13 19:37:52,HEALTH
5,Free water and electricity may be counterproductive – Economist,https://www.ghanaweb.com/GhanaHomePage/NewsArchive/Free-water-and-electricity-may-be-counterproductive-Economist-1031098,ghanaweb.com,2020-08-11 13:25:37,NATION
6,State places Tazewell County on COVID-19 warning list,https://week.com/2020/08/07/state-places-tazewell-county-on-covid-19-warning-list/,week.com,2020-08-07 18:00:24,HEALTH
7,Over 100 people quarantined in Mississippi school district after several test positive for coronavirus,https://www.nbcnews.com/news/us-news/over-100-people-quarantined-mississippi-school-district-after-several-test-n1236012,nbcnews.com,2020-08-06 16:35:00,NATION
8,How global warming is wiping out Africa's oldest baobab trees,http://www.capetalk.co.za/articles/307750/how-global-warming-is-wiping-out-africa-s-oldest-baobab-trees,capetalk.co.za,2018-06-14 12:00:53,WORLD
9,"Delay routine dental checkups, WHO urges, until COVID risk is known","https://www.thepeninsulaqatar.com/article/11/08/2020/Delay-routine-dental-checkups,-WHO-urges,-until-COVID-risk-is-known",thepeninsulaqatar.com,2020-08-11 17:13:00,HEALTH


In [102]:
# Narrow the view to the two columns used later: the text (`title`) and the label (`topic`)
train_df.loc[:, ['title', 'topic']]

Unnamed: 0,title,topic
0,"NSW remains on COVID-19 high alert | The Star | Newcastle, NSW",HEALTH
1,Missing people: How does someone just disappear?,NATION
2,"An American Pickle review: In a dual role, Seth Rogen’s drama wins over comedy",ENTERTAINMENT
3,"Katie Spotz to Run 130 Miles to Bring Clean Water to Tanzania, Shares Testimony of Change",NATION
4,CDC Director Says This Fall Could Be ‘the Worst’ We’ve Ever Had Thanks to COVID-19,HEALTH
...,...,...
84370,Apple fires back at Epic: ‘We won’t make an exception’,TECHNOLOGY
84371,Actress Jessica Marais mourns loss of mother Karen,ENTERTAINMENT
84372,Google delays Chrome web apps drop-dead date until mid-2022,TECHNOLOGY
84373,"Liverpool evening headlines as Firmino scouted by Barcelona legend, Fabinho hails special success",SPORTS


In [103]:
# Load the previously saved Keras model from the experiment's model directory
model = tf.keras.models.load_model(model_dir)

# Show the model architecture (layers, output shapes, parameter counts)
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 20, 24)            240000    
                                                                 
 dense_2 (Dense)             (None, 20, 24)            600       
                                                                 
 flatten_1 (Flatten)         (None, 480)               0         
                                                                 
 dense_3 (Dense)             (None, 8)                 3848      
                                                                 
Total params: 244448 (954.88 KB)
Trainable params: 244448 (954.88 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [104]:
# Inspect how the loaded model was compiled (optimizer, loss, metrics)
model.get_compile_config()

{'optimizer': {'module': 'keras.optimizers',
  'class_name': 'Adam',
  'config': {'name': 'Adam',
   'weight_decay': None,
   'clipnorm': None,
   'global_clipnorm': None,
   'clipvalue': None,
   'use_ema': False,
   'ema_momentum': 0.99,
   'ema_overwrite_frequency': None,
   'jit_compile': False,
   'is_legacy_optimizer': False,
   'learning_rate': 0.0010000000474974513,
   'beta_1': 0.9,
   'beta_2': 0.999,
   'epsilon': 1e-07,
   'amsgrad': False},
  'registered_name': None},
 'loss': {'module': 'builtins',
  'class_name': 'function',
  'config': 'sparse_categorical_crossentropy',
  'registered_name': 'function'},
 'metrics': [[{'module': 'keras.metrics',
    'class_name': 'MeanMetricWrapper',
    'config': {'name': 'sparse_categorical_accuracy',
     'dtype': 'float32',
     'fn': {'module': 'builtins',
      'class_name': 'function',
      'config': 'sparse_categorical_accuracy',
      'registered_name': 'function'}},
    'registered_name': None}]],
 'loss_weights': None,
 'weig

In [105]:
# Build the label lookup from the saved labels file.
# num_oov_indices=0 reserves no out-of-vocabulary slot, so every index is a real topic.
topic_lookup = tf.keras.layers.StringLookup(
    vocabulary=f'{vocab_dir}/labels.txt',
    num_oov_indices=0,
)

# Display the labels in index order
topic_lookup.get_vocabulary()

['ENTERTAINMENT',
 'HEALTH',
 'TECHNOLOGY',
 'WORLD',
 'BUSINESS',
 'SPORTS',
 'NATION',
 'SCIENCE']

In [106]:
# Title length and vocabulary size used by the team for the prototype
MAX_LENGTH = 20
VOCAB_SIZE = 10000

# Instantiate a layer for text preprocessing. Each title becomes a sequence of
# exactly MAX_LENGTH integer token ids (shorter titles are zero-padded).
title_preprocessor = tf.keras.layers.TextVectorization(
    max_tokens=VOCAB_SIZE,
    output_sequence_length=MAX_LENGTH,
)

# Load the vocabulary file (one token per line)
vocab_path = f'{vocab_dir}/vocabulary.txt'
with open(vocab_path, 'r', encoding='utf-8') as file:
    vocab = [line.strip() for line in file]

# Set the vocabulary for the TextVectorization layer
title_preprocessor.set_vocabulary(vocab)

# Check the vocabulary size
print(f'Vocabulary size: {title_preprocessor.vocabulary_size()}')

# Get a sample title by position. `.iloc` keeps working when the dataframe
# index is not 0..n-1 (e.g. after a shuffled train/test split), whereas a
# label lookup such as `train_df['title'][10]` would raise a KeyError.
sample_title = train_df['title'].iloc[10]

# Sample title in string format
print(f"Sample text: {sample_title}")

# Sample title represented as an integer sequence
print(f"Sample text (preprocessed): {title_preprocessor(sample_title).numpy()}")


Vocabulary size: 10000
Sample text: Dengue fever cases in Laos rise to 4256
Sample text (preprocessed): [4040 1979   30    2 9339  282    3    1    0    0    0    0    0    0
    0    0    0    0    0    0]


In [107]:
# Convert the test dataframe to a tf dataset (labels encoded with topic_lookup,
# titles with title_preprocessor)
test_ds = lab_utils.df_to_tfdata(test_df, topic_lookup, title_preprocessor)

# Get the metrics: [loss, sparse_categorical_accuracy], per the compile config
model.evaluate(test_ds)



[1.0201756954193115, 0.7778187394142151]

In [108]:
# Share of each topic in the train set, as a percentage string with one decimal
(
    train_df['topic']
    .value_counts(normalize=True)
    .sort_index()
    .mul(100)
    .round(1)
    .astype(str)
    + '%'
)

topic
BUSINESS         13.2%
ENTERTAINMENT    14.1%
HEALTH           15.1%
NATION           13.2%
SCIENCE           0.7%
SPORTS           13.6%
TECHNOLOGY       15.3%
WORLD            14.8%
Name: proportion, dtype: object

In [109]:
# Share of each topic in the test set, as a percentage string with one decimal
(
    test_df['topic']
    .value_counts(normalize=True)
    .sort_index()
    .mul(100)
    .round(1)
    .astype(str)
    + '%'
)

topic
BUSINESS         15.9%
ENTERTAINMENT    12.7%
HEALTH            9.3%
NATION           15.9%
SCIENCE          13.2%
SPORTS           14.3%
TECHNOLOGY        8.6%
WORLD            10.1%
Name: proportion, dtype: object

In [110]:
# Fixed seed so the stratified splits (and every metric computed from them)
# are reproducible across kernel restarts
SPLIT_SEED = 42

# Load the train and test sets
train_df = pd.read_csv(f'{data_dir}/train_data.csv')
test_df = pd.read_csv(f'{data_dir}/test_data.csv')

# Combine the two datasets. ignore_index=True discards the two original
# indices and renumbers the rows 0..n-1, so row labels stay unique.
combined_df = pd.concat([train_df, test_df], ignore_index=True)

# Re-split with stratification on `topic` so every split has the same class
# proportions: 20% test first, then 25% of the remaining 80% as dev,
# i.e. 60% train / 20% dev / 20% test overall.
train_df, test_df = train_test_split(
    combined_df,
    test_size=0.2,
    stratify=combined_df['topic'],
    random_state=SPLIT_SEED,
)
train_df, dev_df = train_test_split(
    train_df,
    test_size=0.25,
    stratify=train_df['topic'],
    random_state=SPLIT_SEED,
)

In [111]:
# Topic percentages in the new train split
(
    train_df['topic']
    .value_counts(normalize=True)
    .sort_index()
    .mul(100)
    .round(1)
    .astype(str)
    + '%'
)

topic
BUSINESS         13.8%
ENTERTAINMENT    13.8%
HEALTH           13.8%
NATION           13.8%
SCIENCE           3.5%
SPORTS           13.8%
TECHNOLOGY       13.8%
WORLD            13.8%
Name: proportion, dtype: object

In [112]:
# Topic percentages in the new dev split
(
    dev_df['topic']
    .value_counts(normalize=True)
    .sort_index()
    .mul(100)
    .round(1)
    .astype(str)
    + '%'
)


topic
BUSINESS         13.8%
ENTERTAINMENT    13.8%
HEALTH           13.8%
NATION           13.8%
SCIENCE           3.5%
SPORTS           13.8%
TECHNOLOGY       13.8%
WORLD            13.8%
Name: proportion, dtype: object

In [113]:
# Topic percentages in the new test split
(
    test_df['topic']
    .value_counts(normalize=True)
    .sort_index()
    .mul(100)
    .round(1)
    .astype(str)
    + '%'
)

topic
BUSINESS         13.8%
ENTERTAINMENT    13.8%
HEALTH           13.8%
NATION           13.8%
SCIENCE           3.5%
SPORTS           13.8%
TECHNOLOGY       13.8%
WORLD            13.8%
Name: proportion, dtype: object

In [114]:
# Second experiment: from here on, files are written under ./E2
BASE_DIR = './E2'

# Get the data / model / vocab subdirectories for this experiment
data_dir, model_dir, vocab_dir = lab_utils.set_experiment_dirs(BASE_DIR)

# Write the three re-split datasets into the experiment's data folder
for split_df, file_name in (
    (train_df, 'train_data.csv'),
    (dev_df, 'dev_data.csv'),
    (test_df, 'test_data.csv'),
):
    lab_utils.save_data(split_df, data_dir, file_name)

# Store the label list alongside the experiment's vocabulary files
lab_utils.save_labels(topic_lookup, vocab_dir)

In [115]:
# Extract the titles from the new training set
train_inputs = train_df['title']

# Generate a new vocabulary from the training titles only, so the dev and
# test sets do not influence the vocabulary
title_preprocessor.adapt(train_inputs)

# Save the new vocabulary
lab_utils.save_vocab_manually(title_preprocessor, vocab_dir)

In [116]:
# Number of passes over the training set
NUM_EPOCHS = 5

# Convert the string datasets to Tensorflow datasets.
# Only the training set is shuffled.
train_ds = lab_utils.df_to_tfdata(train_df, topic_lookup, title_preprocessor, shuffle=True)
dev_ds = lab_utils.df_to_tfdata(dev_df, topic_lookup, title_preprocessor)
test_ds = lab_utils.df_to_tfdata(test_df, topic_lookup, title_preprocessor)

# Reset the model weights so training does not continue from the weights of
# the previous experiment (must happen before fit)
model = lab_utils.model_reset_weights(model)

# Train the model. Use the dev set to check if your model is overfitting.
model.fit(train_ds, epochs=NUM_EPOCHS, validation_data=dev_ds, verbose=1)

Epoch 1/5




Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x193156bca60>

In [117]:
# Compute the loss and metrics on the held-out test set
model.evaluate(test_ds)



[0.6568235754966736, 0.8100206851959229]

In [118]:
# Save the model to the current experiment's model directory
model.save(model_dir)

INFO:tensorflow:Assets written to: ./E2/model\assets


INFO:tensorflow:Assets written to: ./E2/model\assets


In [119]:
# Get the list of topics (in label-index order)
topics = topic_lookup.get_vocabulary()

# Evaluate the model's performance for each topic on the dev set
lab_utils.print_metric_per_topic(dev_df, topics, topic_lookup, title_preprocessor, model)

ACCURACY PER TOPIC:

ENTERTAINMENT: 82.30 
HEALTH: 81.07 
TECHNOLOGY: 86.20 
WORLD: 65.13 
BUSINESS: 100.00 
SPORTS: 89.13 
NATION: 63.77 
SCIENCE: 76.69 


In [120]:
# BUSINESS scored 100% accuracy on the dev set, which is suspicious:
# inspect its training rows
train_df.loc[train_df['topic'].eq('BUSINESS')]

Unnamed: 0,title,link,domain,published_date,topic
88250,Why iPhone 12 Will Be Another 'Defining Chapter' In Apple's Growth Story,https://www.prolificnorth.co.uk/news/social-media-agency-news/2020/08/co-founders-steven-bartlett-and-dominic-mcgregor-leave-social,prolificnorth.co.uk,2020-08-17 11:15:10,BUSINESS
42951,Why iPhone 12 Will Be Another 'Defining Chapter' In Apple's Growth Story,https://asia.nikkei.com/Spotlight/Huawei-crackdown/New-US-sanctions-put-pressure-on-Huawei-s-private-sector-deals,asia.nikkei.com,2020-08-13 18:22:00,BUSINESS
37493,Why iPhone 12 Will Be Another 'Defining Chapter' In Apple's Growth Story,https://www.bworldonline.com/bsp-sees-no-reason-for-further-easing/,bworldonline.com,2020-08-10 20:09:02,BUSINESS
64634,Why iPhone 12 Will Be Another 'Defining Chapter' In Apple's Growth Story,https://www.npr.org/sections/coronavirus-live-updates/2020/08/14/901663621/midsummer-shopping-okay-despite-covid-spikes-july-retail-sales-rise-1-2,npr.org,2020-08-14 12:53:01,BUSINESS
36091,Why iPhone 12 Will Be Another 'Defining Chapter' In Apple's Growth Story,https://www.usatoday.com/story/news/weather/2020/08/17/california-death-valley-record-high-heatwave-blackouts/5598370002/?utm_source=feedblitz&utm_medium=FeedBlitzRss&utm_campaign=usatoday-newstopstories,rssfeeds.usatoday.com,2020-08-17 17:39:49,BUSINESS
...,...,...,...,...,...
82653,Why iPhone 12 Will Be Another 'Defining Chapter' In Apple's Growth Story,https://thepoultrysite.com/news/2020/08/philippines-suspends-poultry-imports-from-brazil-on-covid-19-worries,thepoultrysite.com,2020-08-17 07:03:37,BUSINESS
44554,Why iPhone 12 Will Be Another 'Defining Chapter' In Apple's Growth Story,https://www.globenewswire.com/news-release/2020/08/04/2072373/0/en/Global-Vanilla-Bean-Market-Insights-2020-2025-Production-Consumption-Trade-and-Price-Trend-Analysis.html,globenewswire.com,2020-08-04 11:43:00,BUSINESS
92203,Why iPhone 12 Will Be Another 'Defining Chapter' In Apple's Growth Story,https://www.bloombergquint.com/business/citi-s-900-million-blunder-raises-stakes-in-revlon-showdown,bloombergquint.com,2020-08-15 01:52:11,BUSINESS
52461,Why iPhone 12 Will Be Another 'Defining Chapter' In Apple's Growth Story,https://www.moneycontrol.com/news/business/nmdc-hikes-iron-ore-rates-by-rs-300-a-tonne-5689531.html,moneycontrol.com,2020-08-12 08:39:00,BUSINESS


In [121]:
# Set the experiment folder
BASE_DIR = './E3'

# Set the subdirectories that will contain the experiment files
data_dir, model_dir, vocab_dir = lab_utils.set_experiment_dirs(BASE_DIR)

# Load the backup CSV
combined_df = pd.read_csv('./backup.csv')

# Fixed seed so the stratified splits (and every metric computed from them)
# are reproducible across kernel restarts. Defined in this cell so the cell
# does not depend on a constant set elsewhere.
SPLIT_SEED = 42

# Generate train, dev, and test sets as before: 20% test first, then 25% of
# the remaining 80% as dev (60/20/20 overall), stratified on `topic`.
train_df, test_df = train_test_split(
    combined_df,
    test_size=0.2,
    stratify=combined_df['topic'],
    random_state=SPLIT_SEED,
)
train_df, dev_df = train_test_split(
    train_df,
    test_size=0.25,
    stratify=train_df['topic'],
    random_state=SPLIT_SEED,
)

# Save the datasets under the E3 folder
lab_utils.save_data(train_df, data_dir, 'train_data.csv')
lab_utils.save_data(dev_df, data_dir, 'dev_data.csv')
lab_utils.save_data(test_df, data_dir, 'test_data.csv')

In [122]:
# Generate a new vocabulary based on the new training set (titles only)
train_inputs = train_df['title']
title_preprocessor.adapt(train_inputs)

# Save the new vocabulary and labels into the current experiment's vocab folder
lab_utils.save_vocab_manually(title_preprocessor, vocab_dir)
lab_utils.save_labels(topic_lookup, vocab_dir)



In [123]:
# Number of passes over the training set
NUM_EPOCHS = 5

# Convert the dataframes to numeric features. Remember to shuffle the training set.
train_ds = lab_utils.df_to_tfdata(train_df, topic_lookup, title_preprocessor, shuffle=True)
dev_ds = lab_utils.df_to_tfdata(dev_df, topic_lookup, title_preprocessor)
test_ds = lab_utils.df_to_tfdata(test_df, topic_lookup, title_preprocessor)

# Reset the model weights so training does not continue from the weights of
# the previous experiment (must happen before fit)
model = lab_utils.model_reset_weights(model)

# Train the model, validating on the dev set after each epoch
model.fit(train_ds, epochs=NUM_EPOCHS, validation_data=dev_ds, verbose=1)

Epoch 1/5
  38/2040 [..............................] - ETA: 8s - loss: 2.0632 - sparse_categorical_accuracy: 0.1423



Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x19316519730>

In [124]:
# Evaluate the model on the test set. Keep the returned [loss, accuracy] in a
# variable: a notebook only displays the value of a cell's last expression,
# and this cell goes on to save the model.
test_metrics = model.evaluate(test_ds)

# Save the model to model_dir
model.save(model_dir)

# Display the test metrics so they can be written on the experiment tracker
test_metrics

  1/680 [..............................] - ETA: 12s - loss: 1.0258 - sparse_categorical_accuracy: 0.7812

INFO:tensorflow:Assets written to: ./E3/model\assets


INFO:tensorflow:Assets written to: ./E3/model\assets


In [125]:
# Evaluate the retrained model's performance for each topic on the dev set.
# NOTE: `topics` is the label list created in the earlier per-topic evaluation cell.
lab_utils.print_metric_per_topic(dev_df, topics, topic_lookup, title_preprocessor, model)


ACCURACY PER TOPIC:

ENTERTAINMENT: 81.97 
HEALTH: 76.37 
TECHNOLOGY: 84.43 
WORLD: 61.70 
BUSINESS: 74.37 
SPORTS: 88.33 
NATION: 58.50 
SCIENCE: 76.82 


In [126]:
# List misclassified dev-set titles for the NATION topic. Judging by the
# printed output these are examples predicted as NATION whose true label
# differs (confirm against lab_utils.get_errors).
lab_utils.get_errors(model, dev_df, title_preprocessor, topic_lookup, 'NATION')

label: WORLD
prediction: NATION
title: Queensland car-fire murders: Hannah Clarke told police of ordeal before death

label: HEALTH
prediction: NATION
title: Infected family at centre of new cluster 'shell-shocked'

label: WORLD
prediction: NATION
title: ECNEC approves $6.8bn for Pakistan Railways' ML-1 Project

label: WORLD
prediction: NATION
title: Kerala Resident among Suicide Attackers Slain in Strike on Jail in Afghanistan’s Jalalabad

label: HEALTH
prediction: NATION
title: California coronavirus case count tops 600,000 | TheHill

label: HEALTH
prediction: NATION
title: Infertility: 621 medical experts advocate better management of endometriosis

label: SPORTS
prediction: NATION
title: Gloucester City AFC "heartbroken" after break in at Meadow Park

label: BUSINESS
prediction: NATION
title: Couple upset KiwiSaver funds can't be used to pay off debt

label: ENTERTAINMENT
prediction: NATION
title: Call centre busted, Gurugram’s second in three days

label: WORLD
prediction: NATION


In [127]:
# Re-compile with top-2 accuracy: a prediction counts as correct when the true
# label is among the two highest-scoring classes
top2_accuracy = tf.keras.metrics.SparseTopKCategoricalAccuracy(k=2)
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=[top2_accuracy],
)

# Check the top-2 accuracy on the dev set
model.evaluate(dev_ds)



[0.8428835272789001, 0.8810847997665405]

In [128]:
# Print the accuracy per topic on the dev set.
# NOTE(review): presumably the helper reports the model's compiled metric, so
# these figures would now be top-2 accuracy — confirm in lab_utils.
lab_utils.print_metric_per_topic(dev_df, topics, topic_lookup, title_preprocessor, model)

ACCURACY PER TOPIC:

ENTERTAINMENT: 90.87 
HEALTH: 88.67 
TECHNOLOGY: 92.27 
WORLD: 84.50 
BUSINESS: 85.70 
SPORTS: 92.83 
NATION: 82.77 
SCIENCE: 84.77 
