In [5]:
# import packages
import os
import glob
import json
import re
import pickle
import nltk
import torch
import numpy as np
import pandas as pd
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import word_tokenize, sent_tokenize
from dateutil import parser
import wandb

# Transformers
from transformers import (
    BertTokenizerFast,
    BertTokenizer,
    BertForMaskedLM,
    BertModel,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)
from torch.utils.data import Dataset

# nltk.download('punkt')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


#### Data Extraction & Preprocessing

In [4]:
def remove_emojis(text):
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # Emoticons
        "\U0001F300-\U0001F5FF"  # Symbols & Pictographs
        "\U0001F680-\U0001F6FF"  # Transport & Map Symbols
        "\U0001F700-\U0001F77F"  # Alchemical Symbols
        "\U0001F780-\U0001F7FF"  # Geometric Shapes
        "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
        "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
        "\U0001FA00-\U0001FA6F"  # Chess Symbols
        "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
        "\U00002702-\U000027B0"  # Dingbats
        "\U000024C2-\U0001F251"  # Enclosed Characters
        "]+",
        flags=re.UNICODE
    )
    return emoji_pattern.sub(r'', text)

def preprocess(group_name, sender, text):
    group_name = str(group_name).lower()
    sender = str(sender).lower()
    
    # Remove emojis
    text = remove_emojis(text)
    
    # Remove digits
    text = re.sub(r'\d+', '', text)
    # Remove non-alphanumeric (but keep spaces)
    text = re.sub(r'\W+', ' ', text.lower())

    # Tokenize
    tokens = word_tokenize(text)
    
    # Include group_name, sender as the first tokens
    return [group_name, sender] + tokens

In [6]:
def load_and_preprocess_jsons_in_folder(folder_path):

    all_messages = []
    
    # Gather all .json files
    json_files = glob.glob(os.path.join(folder_path, "*.json"))
    
    for file_path in json_files:
        try:
            with open(file_path, 'r', encoding="utf-8") as file:
                data = json.load(file)
                group_name = data.get('name', 'unknown')
                
                messages = data.get('messages', [])
                for message in messages:
                    sender = message.get('from', 'unknown')
                    text_content = message.get('text', '')
                    
                    # text might be a list
                    if isinstance(text_content, list):
                        text_content = " ".join(
                            part['text'] if isinstance(part, dict) else part
                            for part in text_content
                        )
                    
                    if text_content:
                        all_messages.append({
                            'name': group_name,
                            'from': sender,
                            'text': text_content
                        })
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON from {file_path}: {e}")
        except Exception as e:
            print(f"Unexpected error with {file_path}: {e}")

    # Build DataFrame
    if not all_messages:
        print(f"No usable messages in folder: {folder_path}")
        return None, []

    df = pd.DataFrame(all_messages)
    
    # Preprocess each row: add tokenized text in new column
    df['Tokenized_Text'] = df.apply(
        lambda row: preprocess(row['name'], row['from'], row['text']), axis=1
    )
    
    # Build a "corpus" (list of token lists)
    corpus = df['Tokenized_Text'].tolist()
    
    print(f"Folder: {folder_path} -> {len(df)} messages loaded.")
    return df, corpus

#### Dataset for BERT

In [9]:
class TextDataset(Dataset):
    def __init__(self, text, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.max_length = max_length
        
        # Use NLTK's sentence tokenizer
        self.sentences = sent_tokenize(text)

        # Tokenize all at once
        self.inputs = self.tokenizer(
            self.sentences,
            return_tensors="pt",
            max_length=self.max_length,
            truncation=True,
            padding=True,
            is_split_into_words=False
        )

    def __len__(self):
        return len(self.inputs.input_ids)

    def __getitem__(self, idx):
        input_ids = self.inputs.input_ids[idx]
        attention_mask = self.inputs.attention_mask[idx]

        # For masked LM, labels are the same as input_ids before random masking
        labels = input_ids.clone()
        
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels
        }

#### Incremental Training Function

In [12]:
import os
import pickle
import wandb
from transformers import (
    BertTokenizer,
    BertForMaskedLM,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)
import torch
import nltk
from nltk.tokenize import sent_tokenize

# If needed:
# nltk.download('punkt')

def incremental_train_bert(
    parent_directory_path,
    output_root_dir="./incremental_model_checkpoints",
    base_model_name='bert-base-uncased',
    num_train_epochs=3,
    batch_size=16,
    start_month_name=None,
    initial_checkpoint_path=None
):

    # 1. Gather and sort subfolders
    subfolders = [
        f for f in sorted(os.listdir(parent_directory_path))
        if os.path.isdir(os.path.join(parent_directory_path, f))
    ]
    if not subfolders:
        print("No subfolders found under:", parent_directory_path)
        return

    # 2. Device setup
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # 3. Determine which checkpoint to load at the start
    if initial_checkpoint_path and os.path.isdir(initial_checkpoint_path):
        # Load from an existing checkpoint (e.g. "checkpoint-June2023")
        print(f"Loading initial model from checkpoint: {initial_checkpoint_path}")
        tokenizer = BertTokenizer.from_pretrained(initial_checkpoint_path)
        model = BertForMaskedLM.from_pretrained(initial_checkpoint_path).to(device)
    else:
        # Load from base model
        print(f"Loading base model/tokenizer: {base_model_name}")
        tokenizer = BertTokenizer.from_pretrained(base_model_name)
        model = BertForMaskedLM.from_pretrained(base_model_name).to(device)

    # 4. Data collator for masked LM
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=True, mlm_probability=0.15
    )

    # 5. Disable W&B (if you do not want to log)
    wandb.init(mode="disabled")

    # 6. TextDataset for BERT
    class TextDataset(torch.utils.data.Dataset):
        def __init__(self, text, tokenizer, max_length=512):
            self.tokenizer = tokenizer
            self.max_length = max_length
            
            # Split text into sentences
            self.sentences = sent_tokenize(text)
            
            # Tokenize
            self.inputs = self.tokenizer(
                self.sentences,
                return_tensors="pt",
                max_length=self.max_length,
                truncation=True,
                padding=True,
                is_split_into_words=False
            )

        def __len__(self):
            return len(self.inputs.input_ids)

        def __getitem__(self, idx):
            input_ids = self.inputs.input_ids[idx]
            attention_mask = self.inputs.attention_mask[idx]
            labels = input_ids.clone()  # For masked LM
            return {
                'input_ids': input_ids,
                'attention_mask': attention_mask,
                'labels': labels
            }

    # 8. Actually loop over subfolders
    start_collecting = True if (start_month_name is None) else False

    for month_name in subfolders:
        if not start_collecting:
            # We haven't reached the start_month_name yet
            if month_name == start_month_name:
                start_collecting = True
            else:
                print(f"Skipping {month_name} until we reach {start_month_name}...")
                continue

        folder_path = os.path.join(parent_directory_path, month_name)
        print(f"\n=== Processing month: {month_name} ===")

        # Load & Preprocess
        df, month_corpus = load_and_preprocess_jsons_in_folder(folder_path)
        if df is None or not month_corpus:
            print(f"No data for {month_name}; skipping.")
            continue

        # Save tokenized corpus
        tokenized_outpath = os.path.join(folder_path, f"tokenized_corpus_{month_name}.pkl")
        with open(tokenized_outpath, "wb") as f:
            pickle.dump(month_corpus, f)
        print(f"Tokenized corpus saved to: {tokenized_outpath}")

        # Build a text for BERT
        all_text_corpus = ' '.join([' '.join(sent) for sent in month_corpus]).lower()
        dataset = TextDataset(all_text_corpus, tokenizer, max_length=512)
        if len(dataset) == 0:
            print(f"No valid sentences in {month_name}, skipping.")
            continue

        print(f"Training dataset size for {month_name}: {len(dataset)} sentences.")

        # Training args
        output_dir = os.path.join(output_root_dir, f"checkpoint-{month_name}")
        training_args = TrainingArguments(
            output_dir=output_dir,
            overwrite_output_dir=True,
            num_train_epochs=num_train_epochs,
            per_device_train_batch_size=batch_size,
            save_steps=100,
            save_total_limit=2,
            logging_dir=os.path.join(output_root_dir, "logs"),
            logging_steps=100,
            weight_decay=0.01,
        )

        trainer = Trainer(
            model=model,
            args=training_args,
            data_collator=data_collator,
            train_dataset=dataset
        )

        train_output = trainer.train()
        final_loss = train_output.training_loss

        print(f"Finished training for {month_name}. Final training loss: {final_loss:.4f}")

        # Save checkpoint
        trainer.save_model(output_dir)
        tokenizer.save_pretrained(output_dir)
        print(f"Checkpoint saved: {output_dir}")

        # Reload the newly trained model for next iteration
        model = BertForMaskedLM.from_pretrained(output_dir).to(device)
        print(f"===== Done with {month_name} =====")

    print("\nAll done with incremental training!")

#### Main Function (starting from a previous checkpoint)

In [17]:
incremental_train_bert(
    parent_directory_path="D:\Paper3\Model\JSON\JSON-Monthly", # edit your PATH
    output_root_dir="./incremental_checkpoints",
    base_model_name='bert-base-uncased',
    num_train_epochs=3,
    batch_size=16,
    start_month_name="2023-06",                          # skip subfolders before "2023-06"
    initial_checkpoint_path="./incremental_checkpoints/checkpoint-2023-05" # edit your PATH
)

  parent_directory_path="D:\Paper3\Model\JSON\JSON-Monthly", # change the path


Using device: cuda
Loading base model/tokenizer: bert-base-uncased


BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another archite

Skipping 2023-01 until we reach 2023-06...
Skipping 2023-02 until we reach 2023-06...
Skipping 2023-03 until we reach 2023-06...
Skipping 2023-04 until we reach 2023-06...
Skipping 2023-05 until we reach 2023-06...

=== Processing month: 2023-06 ===
Folder: D:\Paper3\Model\JSON\JSON-Monthly\2023-06 -> 352435 messages loaded.
Tokenized corpus saved to: D:\Paper3\Model\JSON\JSON-Monthly\2023-06\tokenized_corpus_2023-06.pkl
Training dataset size for 2023-06: 2664 sentences.




Step,Training Loss
100,2.9549
200,2.1752
300,1.8894
400,1.7517
500,1.7193


Finished training for 2023-06. Final training loss: 2.0976
Checkpoint saved: ./incremental_checkpoints\checkpoint-2023-06
===== Done with 2023-06 =====

=== Processing month: 2023-07 ===
Error decoding JSON from D:\Paper3\Model\JSON\JSON-Monthly\2023-07\result (27).json: Expecting value: line 5 column 15 (char 90)
Error decoding JSON from D:\Paper3\Model\JSON\JSON-Monthly\2023-07\result (7).json: Expecting value: line 5 column 15 (char 85)
Error decoding JSON from D:\Paper3\Model\JSON\JSON-Monthly\2023-07\result_203.json: Expecting value: line 5 column 15 (char 85)
Error decoding JSON from D:\Paper3\Model\JSON\JSON-Monthly\2023-07\result_203_1.json: Expecting value: line 5 column 15 (char 85)
Error decoding JSON from D:\Paper3\Model\JSON\JSON-Monthly\2023-07\result_333.json: Expecting value: line 5 column 15 (char 94)
Error decoding JSON from D:\Paper3\Model\JSON\JSON-Monthly\2023-07\result_333_1.json: Expecting value: line 5 column 15 (char 94)
Folder: D:\Paper3\Model\JSON\JSON-Monthl

Step,Training Loss
100,1.8115
200,1.5268
300,1.3314
400,1.2899
500,1.2289
600,1.1991


Finished training for 2023-07. Final training loss: 1.3739
Checkpoint saved: ./incremental_checkpoints\checkpoint-2023-07
===== Done with 2023-07 =====

=== Processing month: 2023-08 ===
Folder: D:\Paper3\Model\JSON\JSON-Monthly\2023-08 -> 376277 messages loaded.
Tokenized corpus saved to: D:\Paper3\Model\JSON\JSON-Monthly\2023-08\tokenized_corpus_2023-08.pkl
Training dataset size for 2023-08: 4250 sentences.


Step,Training Loss
100,1.8527
200,1.6356
300,1.5589
400,1.4199
500,1.4093
600,1.3022
700,1.2904


Finished training for 2023-08. Final training loss: 1.4712
Checkpoint saved: ./incremental_checkpoints\checkpoint-2023-08
===== Done with 2023-08 =====

=== Processing month: 2023-09 ===
Folder: D:\Paper3\Model\JSON\JSON-Monthly\2023-09 -> 174390 messages loaded.
Tokenized corpus saved to: D:\Paper3\Model\JSON\JSON-Monthly\2023-09\tokenized_corpus_2023-09.pkl
Training dataset size for 2023-09: 2269 sentences.


Step,Training Loss
100,1.2541
200,1.1921
300,1.0607
400,1.0125


Finished training for 2023-09. Final training loss: 1.1257
Checkpoint saved: ./incremental_checkpoints\checkpoint-2023-09
===== Done with 2023-09 =====

=== Processing month: 2023-10 ===
Error decoding JSON from D:\Paper3\Model\JSON\JSON-Monthly\2023-10\result (45).json: Expecting ',' delimiter: line 786 column 4 (char 18195)
Folder: D:\Paper3\Model\JSON\JSON-Monthly\2023-10 -> 247677 messages loaded.
Tokenized corpus saved to: D:\Paper3\Model\JSON\JSON-Monthly\2023-10\tokenized_corpus_2023-10.pkl
Training dataset size for 2023-10: 4026 sentences.


Step,Training Loss
100,1.4911
200,1.2432
300,1.1705
400,1.0822
500,1.0185
600,1.0121
700,0.9757


Finished training for 2023-10. Final training loss: 1.1273
Checkpoint saved: ./incremental_checkpoints\checkpoint-2023-10
===== Done with 2023-10 =====

=== Processing month: 2023-11 ===
Error decoding JSON from D:\Paper3\Model\JSON\JSON-Monthly\2023-11\result (34).json: Expecting value: line 5 column 15 (char 107)
Folder: D:\Paper3\Model\JSON\JSON-Monthly\2023-11 -> 242553 messages loaded.
Tokenized corpus saved to: D:\Paper3\Model\JSON\JSON-Monthly\2023-11\tokenized_corpus_2023-11.pkl
Training dataset size for 2023-11: 9405 sentences.


Step,Training Loss
100,1.4864
200,1.2595
300,1.112
400,1.064
500,1.0313
600,0.9617
700,0.8888
800,0.8986
900,0.8521
1000,0.8615


Finished training for 2023-11. Final training loss: 0.9100
Checkpoint saved: ./incremental_checkpoints\checkpoint-2023-11
===== Done with 2023-11 =====

=== Processing month: 2023-12 ===
Folder: D:\Paper3\Model\JSON\JSON-Monthly\2023-12 -> 173790 messages loaded.
Tokenized corpus saved to: D:\Paper3\Model\JSON\JSON-Monthly\2023-12\tokenized_corpus_2023-12.pkl
Training dataset size for 2023-12: 1709 sentences.


Step,Training Loss
100,1.1323
200,1.0461
300,0.9544


Finished training for 2023-12. Final training loss: 1.0439
Checkpoint saved: ./incremental_checkpoints\checkpoint-2023-12
===== Done with 2023-12 =====

All done with incremental training!


#### Generate Embeddings From Final Model

In [13]:
import torch, pickle, gzip, numpy as np
from collections import defaultdict
from pathlib import Path
from transformers import BertTokenizerFast, BertModel, logging as hf_logging
hf_logging.set_verbosity_error()


def build_embeddings_stream(
        model_dir: str,
        corpus_pkl: str,
        out_token_stream: str = "all_word_embeddings.pkl.gz",
        out_avg_embeds:   str = "word_to_avg_emb.pkl",
        log_every: int    = 1000
    ):
    if not torch.cuda.is_available():
        raise EnvironmentError("Need a GPU to run the encoder efficiently.")
    device = torch.device("cuda")

    tok  = BertTokenizerFast.from_pretrained(model_dir)
    mdl  = BertModel.from_pretrained(model_dir).to(device).eval()

    MAXLEN  = mdl.config.max_position_embeddings      # 512
    CHUNK   = MAXLEN - 2                              # room for CLS/SEP

    def chunk(lst, size=CHUNK):
        for i in range(0, len(lst), size):
            yield lst[i:i + size]

    # load tokenised corpus
    with open(corpus_pkl, "rb") as f:
        corpus = pickle.load(f)
    n_sent = len(corpus)
    print(f"Corpus: {n_sent:,} sentences")

    # open gzip stream for token-level matrices
    Path(out_token_stream).parent.mkdir(parents=True, exist_ok=True)
    stream = gzip.open(out_token_stream, "wb")

    running_sum  = defaultdict(lambda: np.zeros(768, dtype=np.float32))
    running_cnt  = defaultdict(int)

    for idx, tokens in enumerate(corpus, 1):
        sent_vecs = []

        # split long sentences into ≤510-token chunks
        for piece in chunk(tokens):
            enc = tok(piece, return_tensors="pt", is_split_into_words=True,
                      padding="max_length", truncation=True,
                      max_length=MAXLEN).to(device)

            with torch.no_grad():
                hidden = mdl(**enc).last_hidden_state[0]        # (512,768) # Extract from final hidden layer

            real = len(piece)
            vecs = hidden[1 : 1 + real].cpu().numpy().astype("float16") # 768-dimensional vectors

            sent_vecs.append(vecs)

            # update running sum / count
            for t, v in zip(piece, vecs):
                running_sum[t.lower()] += v.astype(np.float32)
                running_cnt[t.lower()] += 1

        # stream this sentence matrix and free memory
        pickle.dump(np.vstack(sent_vecs), stream)

        if log_every and idx % log_every == 0:
            print(f"  processed {idx:,}/{n_sent:,}")

    stream.close()
    print(f" token matrices streamed  →  {out_token_stream}")

    # write word-level averages (float16)
    word_to_avg = {w: (running_sum[w] / running_cnt[w]).astype("float16")
                   for w in running_sum}
    with open(out_avg_embeds, "wb") as f:
        pickle.dump(word_to_avg, f)
    print(f" {len(word_to_avg):,} word averages → {out_avg_embeds}")


# run per month 
build_embeddings_stream(
    model_dir        = "./incremental_checkpoints/checkpoint-2023-01",
    corpus_pkl       = "D:/Paper3/Model/JSON/JSON-Monthly/2023-01/tokenized_corpus_2023-01.pkl", # edit your PATH
    out_token_stream = "embeddings/2023-01/all_word_embeddings.pkl.gz",
    out_avg_embeds   = "embeddings/2023-01/word_to_avg_emb.pkl"
)

Corpus: 548,771 sentences
  processed 1,000/548,771
  processed 2,000/548,771
  processed 3,000/548,771
  processed 4,000/548,771
  processed 5,000/548,771
  processed 6,000/548,771
  processed 7,000/548,771
  processed 8,000/548,771
  processed 9,000/548,771
  processed 10,000/548,771
  processed 11,000/548,771
  processed 12,000/548,771
  processed 13,000/548,771
  processed 14,000/548,771
  processed 15,000/548,771
  processed 16,000/548,771
  processed 17,000/548,771
  processed 18,000/548,771
  processed 19,000/548,771
  processed 20,000/548,771
  processed 21,000/548,771
  processed 22,000/548,771
  processed 23,000/548,771
  processed 24,000/548,771
  processed 25,000/548,771
  processed 26,000/548,771
  processed 27,000/548,771
  processed 28,000/548,771
  processed 29,000/548,771
  processed 30,000/548,771
  processed 31,000/548,771
  processed 32,000/548,771
  processed 33,000/548,771
  processed 34,000/548,771
  processed 35,000/548,771
  processed 36,000/548,771
  processed

#### Example Terms for Nearest-Neighbor Lookup

In [27]:
# Utilities for nearest-neighbour lookup
import pickle, numpy as np
from pathlib import Path
from sklearn.metrics.pairwise import cosine_similarity

AVG_PKL = Path("word_to_avg_emb.pkl")      # adjust filename if needed
if not AVG_PKL.exists():
    raise FileNotFoundError(f"{AVG_PKL} not found – run Cell 1 first.")

with AVG_PKL.open("rb") as f:
    word_to_avg = pickle.load(f)

vocab  = list(word_to_avg.keys())
matrix = np.vstack([word_to_avg[w] for w in vocab])   # (V, dim)
print(f"Vocabulary loaded – {len(vocab):,} tokens")

def top_related(word: str, k: int = 10):
    # Print top-k neighbours of *word*
    w = word.lower()
    if w not in word_to_avg:
        print(f"'{word}' not in vocab"); return
    sims  = cosine_similarity(word_to_avg[w].reshape(1,-1), matrix)[0]
    order = sims.argsort()[::-1]
    print(f"\nTop {k} terms similar to '{word}':")
    n = 0
    for idx in order:
        cand = vocab[idx]
        if cand == w: continue
        print(f"  {cand:<20} {sims[idx]:.4f}")
        n += 1
        if n == k: break

# example
top_related("bank", k=20) # add your word and run to see top 20 related terms

Vocabulary loaded – 30,427 tokens

Top 20 terms similar to 'bank':
  credit               0.9645
  union                0.9576
  business             0.9539
  one                  0.9526
  card                 0.9525
  any                  0.9506
  first                0.9504
  with                 0.9498
  of                   0.9497
  and                  0.9481
  money                0.9476
  high                 0.9465
  wire                 0.9458
  all                  0.9421
  no                   0.9419
  good                 0.9394
  banks                0.9394
  check                0.9391
  pay                  0.9388
  dm                   0.9387


In [29]:
import pickle, numpy as np
from pathlib import Path
from sklearn.metrics.pairwise import cosine_similarity

AVG_PKL = Path("word_to_avg_emb.pkl")      # adjust filename if needed
if not AVG_PKL.exists():
    raise FileNotFoundError(f"{AVG_PKL} not found – run Cell 1 first.")

with AVG_PKL.open("rb") as f:
    word_to_avg = pickle.load(f)

vocab  = list(word_to_avg.keys())
matrix = np.vstack([word_to_avg[w] for w in vocab])   # (V, dim)
print(f"Vocabulary loaded – {len(vocab):,} tokens")

def top_related(word: str, k: int = 10):
    # Print top-k neighbours of *word*
    w = word.lower()
    if w not in word_to_avg:
        print(f"'{word}' not in vocab"); return
    sims  = cosine_similarity(word_to_avg[w].reshape(1,-1), matrix)[0]
    order = sims.argsort()[::-1]
    print(f"\nTop {k} terms similar to '{word}':")
    n = 0
    for idx in order:
        cand = vocab[idx]
        if cand == w: continue
        print(f"  {cand:<20} {sims[idx]:.4f}")
        n += 1
        if n == k: break

# example
top_related("checks", k=20) # add your word and run to see top 20 related terms

Vocabulary loaded – 30,427 tokens

Top 20 terms similar to 'checks':
  slips                0.9277
  cards                0.9065
  and                  0.9050
  with                 0.9037
  dumps                0.8981
  logs                 0.8971
  cashapp              0.8957
  to                   0.8931
  pin                  0.8915
  valid                0.8869
  or                   0.8857
  pay                  0.8857
  dm                   0.8856
  methods              0.8851
  card                 0.8841
  you                  0.8835
  apple                0.8819
  all                  0.8812
  clone                0.8811
  in                   0.8811


In [31]:
import pickle, numpy as np
from pathlib import Path
from sklearn.metrics.pairwise import cosine_similarity

AVG_PKL = Path("word_to_avg_emb.pkl")      # adjust filename if needed
if not AVG_PKL.exists():
    raise FileNotFoundError(f"{AVG_PKL} not found – run Cell 1 first.")

with AVG_PKL.open("rb") as f:
    word_to_avg = pickle.load(f)

vocab  = list(word_to_avg.keys())
matrix = np.vstack([word_to_avg[w] for w in vocab])   # (V, dim)
print(f"Vocabulary loaded – {len(vocab):,} tokens")

def top_related(word: str, k: int = 10):
    # Print top-k neighbours of *word*
    w = word.lower()
    if w not in word_to_avg:
        print(f"'{word}' not in vocab"); return
    sims  = cosine_similarity(word_to_avg[w].reshape(1,-1), matrix)[0]
    order = sims.argsort()[::-1]
    print(f"\nTop {k} terms similar to '{word}':")
    n = 0
    for idx in order:
        cand = vocab[idx]
        if cand == w: continue
        print(f"  {cand:<20} {sims[idx]:.4f}")
        n += 1
        if n == k: break

# example
top_related("grubs", k=20) # add your word and run to see top 20 related terms

Vocabulary loaded – 30,427 tokens

Top 20 terms similar to 'grubs':
  nights               0.8790
  swire                0.8644
  💯💯                   0.8538
  𝙿𝚊𝚜𝚜𝚠𝚘𝚛𝚍             0.8491
  🕷️                   0.8484
  neo                  0.8481
  🕷️success🕷️          0.8471
  gens                 0.8469
  shield               0.8458
  agecy                0.8445
  🥶big goat 🥶          0.8443
  wug💥🧤                0.8431
  checkspenfed         0.8415
  welly💚🥶😈             0.8401
  doncashbullet💰💎      0.8379
  erry                 0.8376
  qp                   0.8368
  vy                   0.8355
  darlington💎          0.8352
  quagenx              0.8347


In [33]:
import pickle, numpy as np
from pathlib import Path
from sklearn.metrics.pairwise import cosine_similarity

AVG_PKL = Path("word_to_avg_emb.pkl")      # adjust filename if needed
if not AVG_PKL.exists():
    raise FileNotFoundError(f"{AVG_PKL} not found – run Cell 1 first.")

with AVG_PKL.open("rb") as f:
    word_to_avg = pickle.load(f)

vocab  = list(word_to_avg.keys())
matrix = np.vstack([word_to_avg[w] for w in vocab])   # (V, dim)
print(f"Vocabulary loaded – {len(vocab):,} tokens")

def top_related(word: str, k: int = 10):
    # Print top-k neighbours of *word*
    w = word.lower()
    if w not in word_to_avg:
        print(f"'{word}' not in vocab"); return
    sims  = cosine_similarity(word_to_avg[w].reshape(1,-1), matrix)[0]
    order = sims.argsort()[::-1]
    print(f"\nTop {k} terms similar to '{word}':")
    n = 0
    for idx in order:
        cand = vocab[idx]
        if cand == w: continue
        print(f"  {cand:<20} {sims[idx]:.4f}")
        n += 1
        if n == k: break

# example
top_related("drops", k=20)

Vocabulary loaded – 30,427 tokens

Top 20 terms similar to 'drops':
  pnc                  0.9106
  instant              0.9105
  wells                0.9042
  federal              0.9029
  chase                0.9025
  boa                  0.9015
  navy                 0.8977
  cu                   0.8969
  aged                 0.8945
  usaa                 0.8918
  wire                 0.8918
  unions               0.8914
  week                 0.8912
  near                 0.8901
  pm                   0.8897
  asap                 0.8894
  old                  0.8862
  report               0.8859
  let                  0.8853
  banks                0.8853


In [37]:
import pickle, numpy as np
from pathlib import Path
from sklearn.metrics.pairwise import cosine_similarity

AVG_PKL = Path("word_to_avg_emb.pkl")      # adjust filename if needed
if not AVG_PKL.exists():
    raise FileNotFoundError(f"{AVG_PKL} not found – run Cell 1 first.")

with AVG_PKL.open("rb") as f:
    word_to_avg = pickle.load(f)

vocab  = list(word_to_avg.keys())
matrix = np.vstack([word_to_avg[w] for w in vocab])   # (V, dim)
print(f"Vocabulary loaded – {len(vocab):,} tokens")

def top_related(word: str, k: int = 10):
    # Print top-k neighbours of *word*
    w = word.lower()
    if w not in word_to_avg:
        print(f"'{word}' not in vocab"); return
    sims  = cosine_similarity(word_to_avg[w].reshape(1,-1), matrix)[0]
    order = sims.argsort()[::-1]
    print(f"\nTop {k} terms similar to '{word}':")
    n = 0
    for idx in order:
        cand = vocab[idx]
        if cand == w: continue
        print(f"  {cand:<20} {sims[idx]:.4f}")
        n += 1
        if n == k: break

# example
top_related("CC", k=20)

Vocabulary loaded – 30,427 tokens

Top 20 terms similar to 'CC':
  pay                  0.9476
  and                  0.9476
  apple                0.9414
  fullz                0.9408
  all                  0.9397
  cashapp              0.9341
  cards                0.9340
  with                 0.9327
  logs                 0.9303
  dm                   0.9295
  for                  0.9279
  to                   0.9278
  usa                  0.9264
  bank                 0.9211
  fresh                0.9208
  in                   0.9196
  available            0.9191
  uk                   0.9167
  are                  0.9160
  card                 0.9157


In [39]:
import pickle, numpy as np
from pathlib import Path
from sklearn.metrics.pairwise import cosine_similarity

AVG_PKL = Path("word_to_avg_emb.pkl")      # adjust filename if needed
if not AVG_PKL.exists():
    raise FileNotFoundError(f"{AVG_PKL} not found – run Cell 1 first.")

with AVG_PKL.open("rb") as f:
    word_to_avg = pickle.load(f)

vocab  = list(word_to_avg.keys())
matrix = np.vstack([word_to_avg[w] for w in vocab])   # (V, dim)
print(f"Vocabulary loaded – {len(vocab):,} tokens")

def top_related(word: str, k: int = 10):
    # Print top-k neighbours of *word*
    w = word.lower()
    if w not in word_to_avg:
        print(f"'{word}' not in vocab"); return
    sims  = cosine_similarity(word_to_avg[w].reshape(1,-1), matrix)[0]
    order = sims.argsort()[::-1]
    print(f"\nTop {k} terms similar to '{word}':")
    n = 0
    for idx in order:
        cand = vocab[idx]
        if cand == w: continue
        print(f"  {cand:<20} {sims[idx]:.4f}")
        n += 1
        if n == k: break

# example
top_related("fullz", k=20)

Vocabulary loaded – 30,427 tokens

Top 20 terms similar to 'fullz':
  cc                   0.9408
  bin                  0.9335
  uk                   0.9191
  logs                 0.9156
  bank                 0.9145
  card                 0.9130
  dl                   0.9128
  pay                  0.9127
  bins                 0.9119
  cards                0.9080
  type                 0.9035
  apple                0.9030
  pin                  0.9023
  and                  0.9019
  usa                  0.9016
  pros                 0.8983
  with                 0.8982
  payment              0.8977
  first                0.8973
  high                 0.8972
