In [65]:
#import packages
import os
import glob
import json
import re
import pickle
import nltk
import torch
import numpy as np
import pandas as pd
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import word_tokenize, sent_tokenize
from dateutil import parser
import wandb

# Transformers
from transformers import (
    BertTokenizerFast,
    BertTokenizer,
    BertForMaskedLM,
    BertModel,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)
from torch.utils.data import Dataset

nltk.download('punkt')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kaurm\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


#### Data Extraction & Preprocessing

In [68]:
def remove_emojis(text):
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # Emoticons
        "\U0001F300-\U0001F5FF"  # Symbols & Pictographs
        "\U0001F680-\U0001F6FF"  # Transport & Map Symbols
        "\U0001F700-\U0001F77F"  # Alchemical Symbols
        "\U0001F780-\U0001F7FF"  # Geometric Shapes
        "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
        "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
        "\U0001FA00-\U0001FA6F"  # Chess Symbols
        "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
        "\U00002702-\U000027B0"  # Dingbats
        "\U000024C2-\U0001F251"  # Enclosed Characters
        "]+",
        flags=re.UNICODE
    )
    return emoji_pattern.sub(r'', text)

def preprocess(group_name, sender, text):
    group_name = str(group_name).lower()
    sender = str(sender).lower()
    
    # Remove emojis
    text = remove_emojis(text)
    
    # Remove digits
    text = re.sub(r'\d+', '', text)
    # Remove non-alphanumeric (but keep spaces)
    text = re.sub(r'\W+', ' ', text.lower())

    # Tokenize
    tokens = word_tokenize(text)
    
    # Include group_name, sender as the first tokens
    return [group_name, sender] + tokens

In [70]:
def load_and_preprocess_jsons_in_folder(folder_path):

    all_messages = []
    
    # Gather all .json files
    json_files = glob.glob(os.path.join(folder_path, "*.json"))
    
    for file_path in json_files:
        try:
            with open(file_path, 'r', encoding="utf-8") as file:
                data = json.load(file)
                group_name = data.get('name', 'unknown')
                
                messages = data.get('messages', [])
                for message in messages:
                    sender = message.get('from', 'unknown')
                    text_content = message.get('text', '')
                    
                    # text might be a list
                    if isinstance(text_content, list):
                        text_content = " ".join(
                            part['text'] if isinstance(part, dict) else part
                            for part in text_content
                        )
                    
                    if text_content:
                        all_messages.append({
                            'name': group_name,
                            'from': sender,
                            'text': text_content
                        })
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON from {file_path}: {e}")
        except Exception as e:
            print(f"Unexpected error with {file_path}: {e}")

    # Build DataFrame
    if not all_messages:
        print(f"No usable messages in folder: {folder_path}")
        return None, []

    df = pd.DataFrame(all_messages)
    
    # Preprocess each row: add tokenized text in new column
    df['Tokenized_Text'] = df.apply(
        lambda row: preprocess(row['name'], row['from'], row['text']), axis=1
    )
    
    # Build a "corpus" (list of token lists)
    corpus = df['Tokenized_Text'].tolist()
    
    print(f"Folder: {folder_path} -> {len(df)} messages loaded.")
    return df, corpus

#### Dataset for BERT

In [73]:
class TextDataset(Dataset):
    def __init__(self, text, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.max_length = max_length
        
        # Use NLTK's sentence tokenizer
        self.sentences = sent_tokenize(text)

        # Tokenize all at once
        self.inputs = self.tokenizer(
            self.sentences,
            return_tensors="pt",
            max_length=self.max_length,
            truncation=True,
            padding=True,
            is_split_into_words=False
        )

    def __len__(self):
        return len(self.inputs.input_ids)

    def __getitem__(self, idx):
        input_ids = self.inputs.input_ids[idx]
        attention_mask = self.inputs.attention_mask[idx]

        # For masked LM, labels are the same as input_ids before random masking
        labels = input_ids.clone()
        
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels
        }

#### Incremental Training Function

In [17]:
def incremental_train_bert(
    parent_directory_path,
    output_root_dir="./incremental_model_checkpoints",
    base_model_name='bert-base-uncased',
    num_train_epochs=3,
    batch_size=16
):
    import os
    import pickle
    import wandb
    from transformers import (
        BertTokenizer,
        BertForMaskedLM,
        DataCollatorForLanguageModeling,
        Trainer,
        TrainingArguments,
    )
    from torch.utils.data import Dataset
    import torch
    from nltk.tokenize import sent_tokenize

    # nltk.download('punkt')

    # Gather subfolders (months)
    subfolders = [
        f for f in sorted(os.listdir(parent_directory_path))
        if os.path.isdir(os.path.join(parent_directory_path, f))
    ]
    if not subfolders:
        print("No subfolders found under:", parent_directory_path)
        return

    # Device setup
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Load the base model & tokenizer
    print(f"Loading base model/tokenizer: {base_model_name}")
    tokenizer = BertTokenizer.from_pretrained(base_model_name)
    model = BertForMaskedLM.from_pretrained(base_model_name).to(device)

    # Data collator for masked LM
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=True, mlm_probability=0.15
    )

    # Disable W&B (if you do not want to log)
    wandb.init(mode="disabled")

    # Define a simple TextDataset class
    class TextDataset(Dataset):
        def __init__(self, text, tokenizer, max_length=512):
            self.tokenizer = tokenizer
            self.max_length = max_length
            
            # Split text into sentences
            self.sentences = sent_tokenize(text)
            
            # Tokenize all sentences
            self.inputs = self.tokenizer(
                self.sentences,
                return_tensors="pt",
                max_length=self.max_length,
                truncation=True,
                padding=True,
                is_split_into_words=False
            )

        def __len__(self):
            return len(self.inputs.input_ids)

        def __getitem__(self, idx):
            input_ids = self.inputs.input_ids[idx]
            attention_mask = self.inputs.attention_mask[idx]

            # For masked LM, labels are the same as input_ids before masking
            labels = input_ids.clone()

            return {
                'input_ids': input_ids,
                'attention_mask': attention_mask,
                'labels': labels
            }

    # Loop over each month folder
    for month_name in subfolders:
        folder_path = os.path.join(parent_directory_path, month_name)
        print(f"\n=== Processing: {month_name} ===")

        # Load and Preprocess
        df, month_corpus = load_and_preprocess_jsons_in_folder(folder_path)
        if df is None or not month_corpus:
            print(f"No data in {month_name}, skipping.")
            continue

        # Save the tokenized corpus (pkl) for this month
        tokenized_outpath = os.path.join(folder_path, f"tokenized_corpus_{month_name}.pkl")
        with open(tokenized_outpath, "wb") as f:
            pickle.dump(month_corpus, f)
        print(f"Tokenized corpus saved to: {tokenized_outpath}")

        # Build one big text for BERT training
        all_text_corpus = ' '.join([' '.join(sentence) for sentence in month_corpus]).lower()

        # Create PyTorch Dataset
        dataset = TextDataset(all_text_corpus, tokenizer, max_length=512)
        if len(dataset) == 0:
            print(f"No valid sentences in {month_name}, skipping.")
            continue
        
        print(f"Training dataset size for {month_name}: {len(dataset)} sentences.")

        # Training arguments
        output_dir = os.path.join(output_root_dir, f"checkpoint-{month_name}")
        training_args = TrainingArguments(
            output_dir=output_dir,
            overwrite_output_dir=True,
            num_train_epochs=num_train_epochs,
            per_device_train_batch_size=batch_size,
            save_steps=100,
            save_total_limit=2,
            logging_dir=os.path.join(output_root_dir, "logs"),
            logging_steps=100,
            weight_decay=0.01,
        )

        # Initialize Trainer
        trainer = Trainer(
            model=model,
            args=training_args,
            data_collator=data_collator,
            train_dataset=dataset
        )

        # Train and captu
        train_output = trainer.train()
        final_loss = train_output.training_loss  # Average loss across training

        print(f"Finished training for {month_name}. Final training loss: {final_loss:.4f}")

        # SAVE CHECKPOINT
        trainer.save_model(output_dir)
        tokenizer.save_pretrained(output_dir)
        print(f"Checkpoint saved: {output_dir}")

        # Reload the newly trained model for the next month
        model = BertForMaskedLM.from_pretrained(output_dir).to(device)

    print("\nAll done with incremental training!")


#### Main Function (for all months in the folder)

In [19]:
# Set your parent directory path
parent_directory_path = "C:/Users/kaurm/OneDrive/Desktop/Research/Paper3/Model/JSON/JSON-Monthly"  

# Run incremental training
incremental_train_bert(
    parent_directory_path=parent_directory_path,
    output_root_dir="./incremental_checkpoints",
    base_model_name='bert-base-uncased',
    num_train_epochs=3,
    batch_size=16
)

Using device: cuda
Loading base model/tokenizer: bert-base-uncased


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).



=== Processing: 2023-01 ===
Error decoding JSON from C:/Users/kaurm/OneDrive/Desktop/Research/Paper3/Model/JSON/JSON-Monthly\2023-01\result - 2025-02-11T131848.154.json: Expecting value: line 5 column 15 (char 133)
Error decoding JSON from C:/Users/kaurm/OneDrive/Desktop/Research/Paper3/Model/JSON/JSON-Monthly\2023-01\result - 2025-02-11T131848.154_1.json: Expecting value: line 5 column 15 (char 133)
Folder: C:/Users/kaurm/OneDrive/Desktop/Research/Paper3/Model/JSON/JSON-Monthly\2023-01 -> 548771 messages loaded.
Tokenized corpus saved to: C:/Users/kaurm/OneDrive/Desktop/Research/Paper3/Model/JSON/JSON-Monthly\2023-01\tokenized_corpus_2023-01.pkl
Training dataset size for 2023-01: 5535 sentences.


Step,Training Loss
100,3.0179
200,2.3882
300,2.1366
400,1.9829
500,1.8486
600,1.7747
700,1.6971
800,1.6334
900,1.6097
1000,1.5511


Finished training for 2023-01. Final training loss: 1.9498
Checkpoint saved: ./incremental_checkpoints\checkpoint-2023-01

=== Processing: 2023-02 ===
Error decoding JSON from C:/Users/kaurm/OneDrive/Desktop/Research/Paper3/Model/JSON/JSON-Monthly\2023-02\result (46).json: Expecting value: line 5 column 15 (char 96)
Folder: C:/Users/kaurm/OneDrive/Desktop/Research/Paper3/Model/JSON/JSON-Monthly\2023-02 -> 552997 messages loaded.
Tokenized corpus saved to: C:/Users/kaurm/OneDrive/Desktop/Research/Paper3/Model/JSON/JSON-Monthly\2023-02\tokenized_corpus_2023-02.pkl
Training dataset size for 2023-02: 4039 sentences.


Step,Training Loss
100,1.8388
200,1.4942
300,1.3858
400,1.2934
500,1.2729
600,1.173
700,1.1685


Finished training for 2023-02. Final training loss: 1.3605
Checkpoint saved: ./incremental_checkpoints\checkpoint-2023-02

=== Processing: 2023-03 ===
Error decoding JSON from C:/Users/kaurm/OneDrive/Desktop/Research/Paper3/Model/JSON/JSON-Monthly\2023-03\result (69).json: Expecting value: line 5 column 15 (char 96)
Error decoding JSON from C:/Users/kaurm/OneDrive/Desktop/Research/Paper3/Model/JSON/JSON-Monthly\2023-03\result (83).json: Expecting value: line 5 column 15 (char 89)
Error decoding JSON from C:/Users/kaurm/OneDrive/Desktop/Research/Paper3/Model/JSON/JSON-Monthly\2023-03\result_200.json: Expecting value: line 5 column 15 (char 133)
Error decoding JSON from C:/Users/kaurm/OneDrive/Desktop/Research/Paper3/Model/JSON/JSON-Monthly\2023-03\result_200_1.json: Expecting value: line 5 column 15 (char 133)
Folder: C:/Users/kaurm/OneDrive/Desktop/Research/Paper3/Model/JSON/JSON-Monthly\2023-03 -> 760816 messages loaded.
Tokenized corpus saved to: C:/Users/kaurm/OneDrive/Desktop/Resea

Step,Training Loss
100,2.1145
200,1.915
300,1.7588
400,1.7072
500,1.6498
600,1.6308


Finished training for 2023-03. Final training loss: 1.7782
Checkpoint saved: ./incremental_checkpoints\checkpoint-2023-03

=== Processing: 2023-04 ===
Error decoding JSON from C:/Users/kaurm/OneDrive/Desktop/Research/Paper3/Model/JSON/JSON-Monthly\2023-04\result_37.json: Expecting value: line 5 column 15 (char 94)
Error decoding JSON from C:/Users/kaurm/OneDrive/Desktop/Research/Paper3/Model/JSON/JSON-Monthly\2023-04\result_37_1.json: Expecting value: line 5 column 15 (char 94)
Folder: C:/Users/kaurm/OneDrive/Desktop/Research/Paper3/Model/JSON/JSON-Monthly\2023-04 -> 382092 messages loaded.
Tokenized corpus saved to: C:/Users/kaurm/OneDrive/Desktop/Research/Paper3/Model/JSON/JSON-Monthly\2023-04\tokenized_corpus_2023-04.pkl
Training dataset size for 2023-04: 4242 sentences.


Step,Training Loss
100,1.9711
200,1.7338
300,1.5999
400,1.5419
500,1.4442
600,1.3911
700,1.3732


Finished training for 2023-04. Final training loss: 1.5535
Checkpoint saved: ./incremental_checkpoints\checkpoint-2023-04

=== Processing: 2023-05 ===
Error decoding JSON from C:/Users/kaurm/OneDrive/Desktop/Research/Paper3/Model/JSON/JSON-Monthly\2023-05\result_101.json: Expecting value: line 5 column 15 (char 86)
Error decoding JSON from C:/Users/kaurm/OneDrive/Desktop/Research/Paper3/Model/JSON/JSON-Monthly\2023-05\result_101_1.json: Expecting value: line 5 column 15 (char 86)
Folder: C:/Users/kaurm/OneDrive/Desktop/Research/Paper3/Model/JSON/JSON-Monthly\2023-05 -> 373781 messages loaded.
Tokenized corpus saved to: C:/Users/kaurm/OneDrive/Desktop/Research/Paper3/Model/JSON/JSON-Monthly\2023-05\tokenized_corpus_2023-05.pkl
Training dataset size for 2023-05: 3301 sentences.


Step,Training Loss
100,1.8106
200,1.5874
300,1.4454
400,1.3358
500,1.2943
600,1.2821


Finished training for 2023-05. Final training loss: 1.4523
Checkpoint saved: ./incremental_checkpoints\checkpoint-2023-05

=== Processing: 2023-06 ===
Folder: C:/Users/kaurm/OneDrive/Desktop/Research/Paper3/Model/JSON/JSON-Monthly\2023-06 -> 352435 messages loaded.
Tokenized corpus saved to: C:/Users/kaurm/OneDrive/Desktop/Research/Paper3/Model/JSON/JSON-Monthly\2023-06\tokenized_corpus_2023-06.pkl
Training dataset size for 2023-06: 2664 sentences.


Step,Training Loss
100,1.6189
200,1.4265
300,1.3193
400,1.2573
500,1.2533


Finished training for 2023-06. Final training loss: 1.3723
Checkpoint saved: ./incremental_checkpoints\checkpoint-2023-06

=== Processing: 2023-07 ===
Error decoding JSON from C:/Users/kaurm/OneDrive/Desktop/Research/Paper3/Model/JSON/JSON-Monthly\2023-07\result (27).json: Expecting value: line 5 column 15 (char 90)
Error decoding JSON from C:/Users/kaurm/OneDrive/Desktop/Research/Paper3/Model/JSON/JSON-Monthly\2023-07\result (7).json: Expecting value: line 5 column 15 (char 85)
Error decoding JSON from C:/Users/kaurm/OneDrive/Desktop/Research/Paper3/Model/JSON/JSON-Monthly\2023-07\result_203.json: Expecting value: line 5 column 15 (char 85)
Error decoding JSON from C:/Users/kaurm/OneDrive/Desktop/Research/Paper3/Model/JSON/JSON-Monthly\2023-07\result_203_1.json: Expecting value: line 5 column 15 (char 85)
Error decoding JSON from C:/Users/kaurm/OneDrive/Desktop/Research/Paper3/Model/JSON/JSON-Monthly\2023-07\result_333.json: Expecting value: line 5 column 15 (char 94)
Error decoding J

Step,Training Loss
100,0.0
200,0.0
300,0.0
400,0.0
500,0.0
600,0.0


Finished training for 2023-07. Final training loss: 0.0000
Checkpoint saved: ./incremental_checkpoints\checkpoint-2023-07

=== Processing: 2023-08 ===
Folder: C:/Users/kaurm/OneDrive/Desktop/Research/Paper3/Model/JSON/JSON-Monthly\2023-08 -> 376277 messages loaded.
Tokenized corpus saved to: C:/Users/kaurm/OneDrive/Desktop/Research/Paper3/Model/JSON/JSON-Monthly\2023-08\tokenized_corpus_2023-08.pkl
Training dataset size for 2023-08: 4250 sentences.


Step,Training Loss
100,0.0
200,0.0
300,0.0
400,0.0
500,0.0
600,0.0
700,0.0


Finished training for 2023-08. Final training loss: 0.0000
Checkpoint saved: ./incremental_checkpoints\checkpoint-2023-08

=== Processing: 2023-09 ===
Folder: C:/Users/kaurm/OneDrive/Desktop/Research/Paper3/Model/JSON/JSON-Monthly\2023-09 -> 174390 messages loaded.
Tokenized corpus saved to: C:/Users/kaurm/OneDrive/Desktop/Research/Paper3/Model/JSON/JSON-Monthly\2023-09\tokenized_corpus_2023-09.pkl
Training dataset size for 2023-09: 2269 sentences.


Step,Training Loss
100,0.0
200,0.0
300,0.0
400,0.0


Finished training for 2023-09. Final training loss: 0.0000
Checkpoint saved: ./incremental_checkpoints\checkpoint-2023-09

=== Processing: 2023-10 ===
Error decoding JSON from C:/Users/kaurm/OneDrive/Desktop/Research/Paper3/Model/JSON/JSON-Monthly\2023-10\result (45).json: Expecting ',' delimiter: line 786 column 4 (char 18195)
Folder: C:/Users/kaurm/OneDrive/Desktop/Research/Paper3/Model/JSON/JSON-Monthly\2023-10 -> 247677 messages loaded.
Tokenized corpus saved to: C:/Users/kaurm/OneDrive/Desktop/Research/Paper3/Model/JSON/JSON-Monthly\2023-10\tokenized_corpus_2023-10.pkl
Training dataset size for 2023-10: 4026 sentences.


Step,Training Loss
100,0.0
200,0.0
300,0.0
400,0.0
500,0.0
600,0.0
700,0.0


Finished training for 2023-10. Final training loss: 0.0000
Checkpoint saved: ./incremental_checkpoints\checkpoint-2023-10

=== Processing: 2023-11 ===
Error decoding JSON from C:/Users/kaurm/OneDrive/Desktop/Research/Paper3/Model/JSON/JSON-Monthly\2023-11\result (34).json: Expecting value: line 5 column 15 (char 107)
Folder: C:/Users/kaurm/OneDrive/Desktop/Research/Paper3/Model/JSON/JSON-Monthly\2023-11 -> 242553 messages loaded.
Tokenized corpus saved to: C:/Users/kaurm/OneDrive/Desktop/Research/Paper3/Model/JSON/JSON-Monthly\2023-11\tokenized_corpus_2023-11.pkl
Training dataset size for 2023-11: 9405 sentences.


Step,Training Loss
100,0.0
200,0.0
300,0.0
400,0.0
500,0.0
600,0.0
700,0.0
800,0.0
900,0.0
1000,0.0


Finished training for 2023-11. Final training loss: 0.0000
Checkpoint saved: ./incremental_checkpoints\checkpoint-2023-11

=== Processing: 2023-12 ===
Folder: C:/Users/kaurm/OneDrive/Desktop/Research/Paper3/Model/JSON/JSON-Monthly\2023-12 -> 173790 messages loaded.
Tokenized corpus saved to: C:/Users/kaurm/OneDrive/Desktop/Research/Paper3/Model/JSON/JSON-Monthly\2023-12\tokenized_corpus_2023-12.pkl
Training dataset size for 2023-12: 1709 sentences.


Step,Training Loss
100,0.0
200,0.0
300,0.0


Finished training for 2023-12. Final training loss: 0.0000
Checkpoint saved: ./incremental_checkpoints\checkpoint-2023-12

All done with incremental training!


#### See Part 2 for Remaining Months and Generate Embeddings From Final Model