In [None]:
# Packages needed to load, split, and store the data

import lab_utils
import tensorflow as tf
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# Root folder of the first experiment (old model)
BASE_DIR = './E1'

# Resolve the subdirectories that hold this experiment's data, model, and vocabulary files
experiment_dirs = lab_utils.set_experiment_dirs(BASE_DIR)
data_dir, model_dir, vocab_dir = experiment_dirs

# Report where each kind of file lives
directory_report = (
    f'base directory: {BASE_DIR}\n\n'
    f'data: {data_dir}\n'
    f'model: {model_dir}\n'
    f'vocab: {vocab_dir}\n'
)
print(directory_report)

In [None]:
# Disable column-width truncation so the full `title` text is displayed
pd.set_option('display.max_colwidth', None)

# Locations of the train and test CSV files inside the data directory
train_csv_path = f'{data_dir}/train_data.csv'
test_csv_path = f'{data_dir}/test_data.csv'

# Read both datasets into dataframes
train_df = pd.read_csv(train_csv_path)
test_df = pd.read_csv(test_csv_path)

# Preview the first 10 rows of the training set
train_df.head(10)

In [None]:
# Show only the `title` and `topic` columns of the training set
train_df.loc[:, ['title', 'topic']]

In [None]:
# Restore the saved model from the experiment's model directory
model = tf.keras.models.load_model(filepath=model_dir)

# Print the layer-by-layer summary of the loaded model
model.summary()

In [None]:
# Display the compile configuration reported by the loaded model
model.get_compile_config()

In [15]:
# File that lists the topic labels for this experiment
labels_path = f'{vocab_dir}/labels.txt'

# Build a lookup layer over the labels, with no out-of-vocabulary indices
topic_lookup = tf.keras.layers.StringLookup(
    vocabulary=labels_path,
    num_oov_indices=0,
)

# Display the labels known to the lookup layer
topic_lookup.get_vocabulary()

['ENTERTAINMENT',
 'HEALTH',
 'TECHNOLOGY',
 'WORLD',
 'BUSINESS',
 'SPORTS',
 'NATION',
 'SCIENCE']

In [16]:
import tensorflow as tf
import pandas as pd


# Prototype settings chosen by the team: tokens per title and vocabulary size
MAX_LENGTH = 20
VOCAB_SIZE = 10000

# Text preprocessing layer that maps a title to an integer sequence of MAX_LENGTH entries
title_preprocessor = tf.keras.layers.TextVectorization(
    max_tokens=VOCAB_SIZE,
    output_sequence_length=MAX_LENGTH,
)

# Read the saved vocabulary: one token per line, surrounding whitespace removed
vocab_path = f'{vocab_dir}/vocabulary.txt'
with open(vocab_path, 'r', encoding='utf-8') as vocab_file:
    vocab = list(map(str.strip, vocab_file))

# Use the saved vocabulary in the layer
title_preprocessor.set_vocabulary(vocab)

# Confirm how many tokens the layer now holds
print(f'Vocabulary size: {title_preprocessor.vocabulary_size()}')

# Pick the title stored under row label 10 of the training set as a sample
sample_title = train_df.loc[10, 'title']

# The sample as raw text
print(f"Sample text: {sample_title}")

# The same sample after conversion to an integer sequence
print(f"Sample text (preprocessed): {title_preprocessor(sample_title).numpy()}")


Vocabulary size: 10000
Sample text: Dengue fever cases in Laos rise to 4256

Sample text (preprocessed): [4040 1979   30    2 9339  282    3    1    0    0    0    0    0    0
    0    0    0    0    0    0]


In [None]:
# Build a tf dataset from the test dataframe using the label lookup and the title preprocessor
test_ds = lab_utils.df_to_tfdata(test_df, topic_lookup, title_preprocessor)

# Evaluate the loaded model on the test dataset and show its metrics
model.evaluate(x=test_ds)

In [None]:
# Share of each topic in the train set, as a percentage string with one decimal
(
    train_df['topic']
    .value_counts(normalize=True)
    .sort_index()
    .mul(100)
    .round(1)
    .astype(str)
    + '%'
)

In [None]:
# Share of each topic in the test set, as a percentage string with one decimal
(
    test_df['topic']
    .value_counts(normalize=True)
    .sort_index()
    .mul(100)
    .round(1)
    .astype(str)
    + '%'
)

In [None]:
# Reload the train and test sets from the current data directory
train_df = pd.read_csv(f'{data_dir}/train_data.csv')
test_df = pd.read_csv(f'{data_dir}/test_data.csv')

# Combine the two datasets. ignore_index=True discards the per-file row labels
# so the combined frame gets a fresh 0..n-1 index with no duplicates.
combined_df = pd.concat([train_df, test_df], ignore_index=True)

# Fixed seed so that re-running this cell reproduces exactly the same splits
SPLIT_SEED = 42

# Hold out 20% of all rows as the test set, keeping the topic proportions
train_df, test_df = train_test_split(
    combined_df,
    test_size=0.2,
    stratify=combined_df['topic'],
    random_state=SPLIT_SEED,
)

# Take 25% of the remaining rows as the dev set (0.25 * 0.8 = 20% of all rows),
# which leaves 60% of all rows for training
train_df, dev_df = train_test_split(
    train_df,
    test_size=0.25,
    stratify=train_df['topic'],
    random_state=SPLIT_SEED,
)

# Sanity check: the three splits together account for every combined row
assert len(train_df) + len(dev_df) + len(test_df) == len(combined_df)

In [None]:
# Share of each topic in the re-split train set, as a percentage string with one decimal
(
    train_df['topic']
    .value_counts(normalize=True)
    .sort_index()
    .mul(100)
    .round(1)
    .astype(str)
    + '%'
)


In [None]:
# Share of each topic in the dev set, as a percentage string with one decimal
(
    dev_df['topic']
    .value_counts(normalize=True)
    .sort_index()
    .mul(100)
    .round(1)
    .astype(str)
    + '%'
)


In [None]:
# Share of each topic in the re-split test set, as a percentage string with one decimal
(
    test_df['topic']
    .value_counts(normalize=True)
    .sort_index()
    .mul(100)
    .round(1)
    .astype(str)
    + '%'
)

In [17]:
# Root folder of the second experiment
# NOTE: this rebinds BASE_DIR, data_dir, model_dir and vocab_dir, so any earlier
# cell re-run after this one will use the ./E2 locations instead of ./E1.
BASE_DIR = './E2'

# Resolve the subdirectories that will hold the second experiment's files
data_dir, model_dir, vocab_dir = lab_utils.set_experiment_dirs(BASE_DIR)

# Write each split to its own CSV file in the data directory
for split_df, filename in (
    (train_df, 'train_data.csv'),
    (dev_df, 'dev_data.csv'),
    (test_df, 'test_data.csv'),
):
    lab_utils.save_data(split_df, data_dir, filename)

# Store the topic labels in the vocabulary directory
lab_utils.save_labels(topic_lookup, vocab_dir)