In [1]:
%%capture
!pip install nltk transformers
!pip install sentencepiece

In [2]:
import nltk
from nltk.corpus import brown
from transformers import TFDistilBertModel, DistilBertTokenizer, DistilBertConfig
import numpy as np
import csv
from nltk.tokenize import word_tokenize
import os
from tqdm import tqdm

import tensorflow as tf

In [3]:
from tensorflow.keras.mixed_precision import Policy, set_global_policy

# Set global policy
policy = Policy('mixed_float16')
set_global_policy(policy)


In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
nltk.download('brown')
nltk.download('punkt')

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
model_name = "distilbert-base-uncased"

original_config = DistilBertConfig.from_pretrained(model_name)

In [7]:
# Create a new configuration with reduced hidden size
reduced_hidden_size = 64  # Replace with your desired hidden size
reduced_num_hidden_layers = 2  # Choose a smaller number
reduced_num_attention_heads = 2  # Choose a smaller number
new_config = DistilBertConfig(
    vocab_size=original_config.vocab_size,
    hidden_size=reduced_hidden_size,
    num_hidden_layers=reduced_num_hidden_layers,
    num_attention_heads=reduced_num_attention_heads,
)

In [8]:
#model = TFDistilBertModel.from_pretrained(model_name, config=new_config, ignore_mismatched_sizes=True)
model = TFDistilBertModel.from_pretrained(model_name)
tokenizer = DistilBertTokenizer.from_pretrained(model_name)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertModel: ['vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight']
- This IS expected if you are initializing TFDistilBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFDistilBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


In [9]:
def embed_words(words):
    tokens = tokenizer.batch_encode_plus(words, padding='max_length', truncation=True, return_tensors='tf', max_length=8)
    outputs = model(tokens['input_ids'])
    embeddings = outputs.last_hidden_state
    averaged_embeddings = tf.reduce_mean(embeddings, axis=1)
    return averaged_embeddings

In [10]:
dataset_path = r"/content/drive/MyDrive/Reuters-21578/reuters/reuters/reuters/training"
dataset_dirs = os.listdir(dataset_path)

In [11]:
data = []
for i in tqdm(dataset_dirs):
    with open(f"{dataset_path}/{i}", 'r') as f:
        content = f.read()
        data.append(content)

100%|██████████| 7769/7769 [00:19<00:00, 406.76it/s]


In [12]:
words = []
for text in data:
  temp_words = word_tokenize(text)
  words.append(temp_words)

In [13]:
from itertools import chain

words_flattened = list(chain(*words))


In [14]:
len(words_flattened)

1135633

In [15]:
words_flattened = [str(element) for element in words_flattened]

In [16]:
%tensorflow_version 2.x
import tensorflow as tf
print("TensorFlow version:", tf.__version__)
print("GPU available:", tf.config.list_physical_devices('GPU'))

Colab only includes TensorFlow 2.x; %tensorflow_version has no effect.
TensorFlow version: 2.14.0
GPU available: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [17]:
config = tf.compat.v1.ConfigProto()
config.gpu_options.allow_growth = True
session = tf.compat.v1.Session(config=config)

In [28]:
with tf.device('/device:GPU:0'):
  embeddings = embed_words(words_flattened[:10000])

In [29]:
embeddings

<tf.Tensor: shape=(10000, 768), dtype=float16, numpy=
array([[ 0.3096 , -0.1549 ,  0.0838 , ...,  0.1364 ,  0.3267 , -0.1741 ],
       [ 0.398  , -0.0443 ,  0.04486, ...,  0.07684,  0.1287 , -0.03717],
       [ 0.3171 , -0.06287,  0.2588 , ...,  0.04208,  0.0778 , -0.1112 ],
       ...,
       [ 0.3506 , -0.0709 ,  0.273  , ...,  0.1061 ,  0.181  , -0.1644 ],
       [ 0.171  , -0.1613 ,  0.2006 , ...,  0.14   ,  0.0867 , -0.0689 ],
       [ 0.4185 ,  0.0651 ,  0.3293 , ...,  0.0606 ,  0.1231 , -0.08514]],
      dtype=float16)>

In [32]:
# Save embeddings and metadata in TSV format
tsv_file = 'word_embeddings.tsv'
metadata_file = 'metadata.tsv'

# Save embeddings
np.savetxt(tsv_file, embeddings, delimiter='\t')

# Save metadata
with open(metadata_file, 'w', encoding='utf-8', newline='') as file:
    writer = csv.writer(file, delimiter='\t')
    writer.writerows([[word] for word in words_flattened[:10000]])

In [31]:
len(embeddings)

10000