In [None]:
import os
import shutil


def check_create_dir(dir):
    """Create an empty directory at ``dir``, discarding any previous contents.

    If ``dir`` already exists it is removed recursively first, so the caller
    always ends up with a fresh, empty directory.

    Args:
        dir: Path of the directory to (re)create. Missing parent directories
            are created as well.

    Note:
        The parameter name shadows the built-in ``dir``; it is kept unchanged
        so existing keyword callers keep working.
    """
    if os.path.exists(dir):
        shutil.rmtree(dir)
    # makedirs (rather than mkdir) so a nested target whose parent does not
    # exist yet is created instead of raising FileNotFoundError.
    os.makedirs(dir)


# Name of the dataset; also used as the working sub-directory name.
dataset = 'wikitext-2'
# Build <cwd>/<dataset>, recreate it empty, and make it the working directory.
# NOTE(review): this cell is not idempotent. It calls os.chdir(), so re-running
# it in the same kernel starts from the already-changed working directory and
# data_dir would then point at a nested <dataset>/<dataset> folder.
current_dir = os.getcwd()
data_dir = os.path.join(current_dir, dataset)
# check_create_dir deletes any existing directory at data_dir before creating it.
check_create_dir(data_dir)
os.chdir(data_dir)
print("Current directory: ", os.getcwd())

Current directory:  /home/ec2-user/SageMaker/wikitext-2


In [None]:
# Show the contents of the current working directory.
print("list dir: ", os.listdir(os.getcwd()))


list dir:  []


In [None]:
def is_document_start(line):
    """Tell whether ``line`` looks like a top-level document title.

    A line qualifies when it is at least four characters long, its first and
    last characters are ``=``, and its third character is not ``=`` (so a
    line such as ``= = Name = =`` is rejected).

    Args:
        line: The (already stripped) text line to inspect.

    Returns:
        bool: True if the line matches the title pattern, False otherwise.
    """
    # Guard first: the index checks below need at least four characters.
    if len(line) < 4:
        return False
    return line[0] == '=' and line[-1] == '=' and line[2] != '='

In [None]:
def _append_to_last_doc(docs, text):
    """Append ``text`` to the last document in ``docs``, or start one if empty."""
    if docs:
        docs[-1] = docs[-1] + ' ' + text
    else:
        # No document has been opened yet (text before any title line):
        # start one instead of failing with IndexError on docs[-1].
        docs.append(text)


def token_list_per_doc(input_dir, token_file):
    """Read a token file and return one concatenated string per document.

    Each line is stripped. A line for which ``is_document_start`` is true and
    whose preceding line was blank opens a new document. Every other
    non-blank line is appended (space-separated) to the current document.
    If a line that opened a document is immediately followed by a non-blank
    line, the opening is treated as a false start: the new document is
    dropped and its text is merged back into the previous document.

    Args:
        input_dir: Directory containing ``token_file``.
        token_file: File name of the UTF-8 text file to parse.

    Returns:
        list[str]: One string per parsed document. Empty if the file has no
        non-blank lines.
    """
    lines_list = []
    line_prev = ''
    prev_line_start_doc = False
    with open(os.path.join(input_dir, token_file), 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if prev_line_start_doc and line:
                # The previous "title" is directly followed by text, so it was
                # not a real title: undo the new document and fold the line
                # back into the preceding document.
                lines_list.pop()
                _append_to_last_doc(lines_list, line_prev)
            if line:
                if is_document_start(line) and not line_prev:
                    lines_list.append(line)
                    prev_line_start_doc = True
                else:
                    _append_to_last_doc(lines_list, line)
                    prev_line_start_doc = False
            else:
                prev_line_start_doc = False
            line_prev = line
    print("{} documents parsed!".format(len(lines_list)))
    return lines_list

In [None]:
# File names of the train, validation and test token files (looked up inside data_dir).
# NOTE(review): an earlier cell recreates data_dir as an empty directory, so these
# files must be downloaded/copied into data_dir before this cell can run; that
# step is not visible in this notebook.
train_file = 'wiki.train.tokens'
val_file = 'wiki.valid.tokens'
test_file = 'wiki.test.tokens'

# Parse each split into a list with one string per document.
train_doc_list = token_list_per_doc(data_dir, train_file)
val_doc_list = token_list_per_doc(data_dir, val_file)
test_doc_list = token_list_per_doc(data_dir, test_file)

600 documents parsed!
60 documents parsed!
60 documents parsed!


In [None]:
!pip install nltk



In [None]:
# Exercise: look up NLTK (the Natural Language Toolkit) and note what it provides.
# Here it is used for its WordNet-based lemmatizer.
import nltk

# Fetch the WordNet corpus required by WordNetLemmatizer.
nltk.download('wordnet')
print("done")
from nltk.stem import WordNetLemmatizer
import re

# Matches a run of word characters delimited by word boundaries.
token_pattern = re.compile(r"(?u)\b\w+\b")


class LemmaTokenier(object):
    """Callable tokenizer that splits on whitespace and lemmatizes each token.

    A token is kept only if it is at least two characters long, starts with a
    lowercase ASCII letter and matches ``token_pattern`` at its beginning.
    Instances can be passed wherever a ``tokenizer(doc) -> list[str]``
    callable is expected.
    """

    def __init__(self):
        # examples: https://www.nltk.org/api/nltk.stem.WordNetLemmatizer.html?highlight=wordnet
        # Instantiate the lemmatizer; storing the class itself would make
        # self.wnl.lemmatize(t) fail for lack of a bound instance.
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        """Return the lemmatized tokens of ``doc`` that pass the filters.

        Args:
            doc: The document text to tokenize.

        Returns:
            list: Lemmatized tokens in their original order.
        """
        return [
            self.wnl.lemmatize(t)
            for t in doc.split()
            if len(t) >= 2 and re.match("[a-z].*", t) and re.match(token_pattern, t)
        ]


# Correctly spelled alias; the original (misspelled) class name is kept so
# existing references continue to work.
LemmaTokenizer = LemmaTokenier

done


[nltk_data] Downloading package wordnet to /home/ec2-user/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
import time
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

print("Lemmatizing and counting, this may take a few minutes...")

start_time = time.time()
# https://scikit-learn.org/1.5/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
vectorizer = CountVectorizer(
    input='content',
    analyzer='word',
    stop_words='english',
    # Lemmatization is currently disabled. To enable it, pass a callable
    # tokenizer instance here (the keyword is `tokenizer`).
    max_df=0.9,  # drop terms that appear in more than 90% of the documents
    min_df=3,    # drop terms that appear in fewer than 3 documents
)

# Learn the vocabulary on the training split only, then reuse it for the
# validation and test splits so all three share the same feature columns.
train_vectors = vectorizer.fit_transform(train_doc_list)
val_vectors = vectorizer.transform(val_doc_list)
test_vectors = vectorizer.transform(test_doc_list)

vocab_list = vectorizer.get_feature_names_out()
vocab_size = len(vocab_list)
print('vocab size', vocab_size)
# Use fixed-point formatting: '{:.2}' means two significant digits and switches
# to scientific notation (e.g. '1.2e+01') once the elapsed time reaches 10 s.
print('Done. Time elapsed:: {:.2f}s'.format(time.time() - start_time))

Lemmatizing and counting, this may take a few minutes...
vocab size 20439
Done. Time elapsed:: 2.0s


In [None]:
import scipy.sparse as sparse

def shuffle_and_dtype(vectors, seed=None):
    """Shuffle the rows of ``vectors`` and return them as a float32 CSR matrix.

    The rows are reordered with a random permutation of the row indices, the
    result is converted to ``scipy.sparse.csr_matrix`` with dtype float32,
    and its type and dtype are printed.

    Args:
        vectors: 2-D array or sparse matrix that supports row indexing with
            an index array.
        seed: Optional seed for the random permutation. ``None`` (default)
            gives a different order on every call; pass an int for a
            reproducible shuffle.

    Returns:
        scipy.sparse.csr_matrix: The row-shuffled data as float32.
    """
    rng = np.random.default_rng(seed)
    shuffled_idx = rng.permutation(vectors.shape[0])
    shuffled = sparse.csr_matrix(vectors[shuffled_idx], dtype=np.float32)
    print(type(shuffled), shuffled.dtype)
    return shuffled


# Shuffle each split and convert it with shuffle_and_dtype.
# Note: the variables are overwritten in place, so re-running this cell
# reshuffles the already-shuffled matrices.
train_vectors = shuffle_and_dtype(train_vectors)
val_vectors = shuffle_and_dtype(val_vectors)
test_vectors = shuffle_and_dtype(test_vectors)

<class 'scipy.sparse._csr.csr_matrix'> float32
<class 'scipy.sparse._csr.csr_matrix'> float32
<class 'scipy.sparse._csr.csr_matrix'> float32


In [None]:
import io
import sagemaker.amazon.common as smac

def split_convert(sparray, prefix, fname_template="data_part{}.pbr", n_parts=2):
    """Split a sparse matrix by rows and write each part to its own file.

    The rows of ``sparray`` are divided into ``n_parts`` consecutive chunks of
    ``rows // n_parts`` rows each; the final chunk additionally receives any
    leftover rows. Each chunk is serialized into an in-memory buffer with
    ``smac.write_spmatrix_to_sparse_tensor`` (no labels) and the buffer is
    then written to ``prefix/fname_template.format(part_index)``. A
    confirmation line is printed for every file saved.

    Args:
        sparray: Sparse matrix to split; sliced by row range.
        prefix: Existing directory that receives the output files.
        fname_template: File-name template with one ``{}`` placeholder for
            the zero-based part index.
        n_parts: Number of files to produce.
    """
    total_rows = sparray.shape[0]
    rows_per_part = total_rows // n_parts
    final_part = n_parts - 1

    for part in range(n_parts):
        lo = part * rows_per_part
        # The last part runs to the end so remainder rows are not dropped.
        hi = total_rows if part == final_part else lo + rows_per_part

        payload = io.BytesIO()
        smac.write_spmatrix_to_sparse_tensor(array=sparray[lo:hi], file=payload, labels=None)

        out_path = os.path.join(prefix, fname_template.format(part))
        with open(out_path, 'wb') as out_file:
            out_file.write(payload.getvalue())
        print("Saved data to {}".format(out_path))


# Local output directories for the three splits.
# NOTE(review): directory casing is inconsistent ('train' vs 'Validation'/'Test').
train_data_dir = os.path.join(data_dir, 'train')
val_data_dir = os.path.join(data_dir, 'Validation')
test_data_dir = os.path.join(data_dir, 'Test')

# Recreate each directory empty (any previous contents are deleted).
check_create_dir(train_data_dir)
check_create_dir(val_data_dir)
check_create_dir(test_data_dir)

# Write the training data as 4 files and validation/test as 1 file each.
# NOTE(review): these templates use the extension '.pdr' while split_convert's
# default template uses '.pbr' -- confirm which spelling is intended.
split_convert(train_vectors, prefix=train_data_dir, fname_template= "train_part{}.pdr", n_parts=4)
split_convert(val_vectors, prefix=val_data_dir, fname_template= "val_part{}.pdr", n_parts=1)
split_convert(test_vectors, prefix=test_data_dir, fname_template= "test_part{}.pdr", n_parts=1)


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml
Saved data to /home/ec2-user/SageMaker/wikitext-2/train/train_part0.pdr
Saved data to /home/ec2-user/SageMaker/wikitext-2/train/train_part1.pdr
Saved data to /home/ec2-user/SageMaker/wikitext-2/train/train_part2.pdr
Saved data to /home/ec2-user/SageMaker/wikitext-2/train/train_part3.pdr
Saved data to /home/ec2-user/SageMaker/wikitext-2/Validation/val_part0.pdr
Saved data to /home/ec2-user/SageMaker/wikitext-2/Test/test_part0.pdr


In [None]:
# Save the vocabulary to <data_dir>/auxiliary/vocab.txt, one term per line.
aux_data_dir = os.path.join(data_dir, "auxiliary")
check_create_dir(aux_data_dir)
vocab_path = os.path.join(aux_data_dir, 'vocab.txt')
with open(vocab_path, "w", encoding="utf-8") as f:
    f.writelines(item + "\n" for item in vocab_list)

In [None]:
import os
import sagemaker

# IAM role of this notebook instance, used later by the training job.
role = sagemaker.get_execution_role()

# Default SageMaker bucket for the session, and the key prefix for this dataset.
bucket = sagemaker.Session().default_bucket()
prefix = "ntm/" + dataset

# S3 keys and URIs always use "/" as the separator, so they are built with
# plain string operations instead of os.path.join (which is OS-dependent and
# would insert backslashes on Windows).
train_prefix = prefix + "/train"
val_prefix = prefix + "/val"
aux_prefix = prefix + "/auxiliary"
test_prefix = prefix + "/test"
output_prefix = prefix + "/output"

s3_train_data = "s3://{}/{}".format(bucket, train_prefix)
s3_val_data = "s3://{}/{}".format(bucket, val_prefix)
s3_aux_data = "s3://{}/{}".format(bucket, aux_prefix)
s3_test_data = "s3://{}/{}".format(bucket, test_prefix)
output_path = "s3://{}/{}".format(bucket, output_prefix)

print('Training set locations', s3_train_data)
print('Validation set locations', s3_val_data)
print('Auxiliary set locations', s3_aux_data)
print('Test set locations', s3_test_data)
print('Trained set locations', output_path)


Training set locations s3://sagemaker-us-west-2-792421635322/ntm/wikitext-2/train
Validation set locations s3://sagemaker-us-west-2-792421635322/ntm/wikitext-2/val
Auxiliary set locations s3://sagemaker-us-west-2-792421635322/ntm/wikitext-2/auxiliary
Test set locations s3://sagemaker-us-west-2-792421635322/ntm/wikitext-2/test
Trained set locations s3://sagemaker-us-west-2-792421635322/ntm/wikitext-2/output


In [None]:
import subprocess

# Upload the local training files to S3 with the AWS CLI.
# The command is passed as an argument list (no shell), so paths containing
# spaces or shell metacharacters are handled correctly.
cmd_train = ["aws", "s3", "cp", train_data_dir, s3_train_data, "--recursive"]
p = subprocess.Popen(cmd_train, stdout=subprocess.PIPE)
upload_output = p.communicate()
# Fail here rather than letting a missing upload surface later as a training error.
if p.returncode != 0:
    raise RuntimeError("aws s3 cp failed with exit code {}".format(p.returncode))
upload_output

(b'Completed 256.0 KiB/2.3 MiB (1.8 MiB/s) with 4 file(s) remaining\rCompleted 512.0 KiB/2.3 MiB (3.5 MiB/s) with 4 file(s) remaining\rCompleted 768.0 KiB/2.3 MiB (5.2 MiB/s) with 4 file(s) remaining\rCompleted 1.0 MiB/2.3 MiB (6.8 MiB/s) with 4 file(s) remaining  \rCompleted 1.2 MiB/2.3 MiB (8.5 MiB/s) with 4 file(s) remaining  \rCompleted 1.5 MiB/2.3 MiB (10.1 MiB/s) with 4 file(s) remaining \rCompleted 1.6 MiB/2.3 MiB (7.0 MiB/s) with 4 file(s) remaining  \rupload: train/train_part3.pdr to s3://sagemaker-us-west-2-792421635322/ntm/wikitext-2/train/train_part3.pdr\nCompleted 1.6 MiB/2.3 MiB (7.0 MiB/s) with 3 file(s) remaining\rCompleted 1.8 MiB/2.3 MiB (7.9 MiB/s) with 3 file(s) remaining\rCompleted 2.1 MiB/2.3 MiB (8.9 MiB/s) with 3 file(s) remaining\rCompleted 2.1 MiB/2.3 MiB (9.3 MiB/s) with 3 file(s) remaining\rupload: train/train_part1.pdr to s3://sagemaker-us-west-2-792421635322/ntm/wikitext-2/train/train_part1.pdr\nCompleted 2.1 MiB/2.3 MiB (9.3 MiB/s) with 2 file(s) remainin

In [None]:
# Upload the local validation files to S3 with the AWS CLI.
# The command is passed as an argument list (no shell), so paths containing
# spaces or shell metacharacters are handled correctly.
cmd_val = ["aws", "s3", "cp", val_data_dir, s3_val_data, "--recursive"]
p = subprocess.Popen(cmd_val, stdout=subprocess.PIPE)
upload_output = p.communicate()
# Fail here rather than letting a missing upload surface later as a training error.
if p.returncode != 0:
    raise RuntimeError("aws s3 cp failed with exit code {}".format(p.returncode))
upload_output

(b'Completed 238.8 KiB/238.8 KiB (2.0 MiB/s) with 1 file(s) remaining\rupload: Validation/val_part0.pdr to s3://sagemaker-us-west-2-792421635322/ntm/wikitext-2/val/val_part0.pdr\n',
 None)

In [None]:
# Upload the local test files to S3 with the AWS CLI.
# The command is passed as an argument list (no shell), so paths containing
# spaces or shell metacharacters are handled correctly.
cmd_test = ["aws", "s3", "cp", test_data_dir, s3_test_data, "--recursive"]
p = subprocess.Popen(cmd_test, stdout=subprocess.PIPE)
upload_output = p.communicate()
# Fail here rather than letting a missing upload surface later as a training error.
if p.returncode != 0:
    raise RuntimeError("aws s3 cp failed with exit code {}".format(p.returncode))
upload_output

(b'Completed 247.9 KiB/247.9 KiB (1.5 MiB/s) with 1 file(s) remaining\rupload: Test/test_part0.pdr to s3://sagemaker-us-west-2-792421635322/ntm/wikitext-2/test/test_part0.pdr\n',
 None)

In [None]:
# Upload the auxiliary files (vocabulary) to S3 with the AWS CLI.
# The command is passed as an argument list (no shell), so paths containing
# spaces or shell metacharacters are handled correctly.
cmd_aux = ["aws", "s3", "cp", aux_data_dir, s3_aux_data, "--recursive"]
p = subprocess.Popen(cmd_aux, stdout=subprocess.PIPE)
upload_output = p.communicate()
# Fail here rather than letting a missing upload surface later as a training error.
if p.returncode != 0:
    raise RuntimeError("aws s3 cp failed with exit code {}".format(p.returncode))
upload_output

(b'Completed 164.1 KiB/164.1 KiB (1.2 MiB/s) with 1 file(s) remaining\rupload: auxiliary/vocab.txt to s3://sagemaker-us-west-2-792421635322/ntm/wikitext-2/auxiliary/vocab.txt\n',
 None)

In [None]:
import boto3
from sagemaker.image_uris import retrieve

# Look up the container image URI for the 'ntm' algorithm in the region of the
# current boto3 session.
container = retrieve('ntm', boto3.Session().region_name)

In [None]:
sess = sagemaker.Session()
# Create a SageMaker Estimator object for training a machine learning model
# specifying the Docker container, IAM role, instance type, instance count,
# output path for model artifacts, and the SageMaker session.
# `ntm` is required by the later set_hyperparameters / fit cells.
ntm = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    # TODO(review): instance count and type are not stated in this notebook;
    # the values below are placeholders -- confirm before launching a job.
    instance_count=2,
    instance_type="ml.c4.xlarge",
    output_path=output_path,
    sagemaker_session=sess,
)


In [None]:
num_topics = 20
# Set the hyperparameters for the Neural Topic Model (NTM) training job:
#   number of topics
#   input feature dimension (vocab_size)
#   mini-batch size = 60
#   number of training epochs = 50
#   sub-sampling ratio = 0.7
ntm.set_hyperparameters(
    num_topics=num_topics,
    feature_dim=vocab_size,
    mini_batch_size=60,
    epochs=50,
    sub_sample=0.7,
)


In [None]:
from sagemaker.inputs import TrainingInput

# Training data is sharded across instances by S3 key; the other channels are
# copied in full to every instance.
s3_train = TrainingInput(s3_train_data, distribution="ShardedByS3Key", content_type="application/x-recordio-protobuf")
s3_val = TrainingInput(s3_val_data, distribution="FullyReplicated", content_type="application/x-recordio-protobuf")
s3_test = TrainingInput(s3_test_data, distribution="FullyReplicated", content_type="application/x-recordio-protobuf")
# The auxiliary channel holds the plain-text vocabulary file (vocab.txt), not
# recordio-protobuf data, so it is declared as text/plain.
s3_aux = TrainingInput(s3_aux_data, distribution="FullyReplicated", content_type="text/plain")

In [None]:
# Start the training job, mapping each channel name to its S3 input.
# Requires `ntm` (the estimator) to have been created in an earlier cell.
ntm.fit({
    'train': s3_train,
    'validation': s3_val,
    'auxiliary': s3_aux,
    'test': s3_test,
})

INFO:sagemaker:Creating training-job with name: ntm-2024-11-15-03-31-32-200


2024-11-15 03:31:33 Starting - Starting the training job...
2024-11-15 03:31:47 Starting - Preparing the instances for training...
2024-11-15 03:32:34 Downloading - Downloading the training image.....................
2024-11-15 03:36:06 Training - Training image download completed. Training in progress...[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
  if num_device is 1 and 'dist' not in kvstore:[0m
[34m[11/15/2024 03:36:12 INFO 139684292245312] Reading default configuration from /opt/amazon/lib/python3.8/site-packages/algorithm/default-input.json: {'encoder_layers': 'auto', 'mini_batch_size': '256', 'epochs': '50', 'encoder_layers_activation': 'sigmoid', 'optimizer': 'adadelta', 'tolerance': '0.001', 'num_patience_epochs': '3', 'batch_norm': 'false', 'rescale_gradient': '1.0', 'clip_gradient': 'Inf', 'weight_decay': '0.0', 'learning_rate': '0.01', 'sub_sample': '1.0', '_tuning_objective_metric': '', '_data_format'

In [None]:
# Report the name of the most recent training job launched by the estimator.
print("Training job name: {}".format(ntm.latest_training_job.job_name))

Training job name: ntm-2024-11-15-03-31-32-200
