# Setup

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, TFBertModel
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, Input

from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

## Data Loading

In [2]:
!pwd

  pid, fd = os.forkpty()


/kaggle/working


In [3]:
data = pd.read_csv("/kaggle/input/tubes-nlp/seq2seq_data.csv")
data

Unnamed: 0,topic_category,original_text,base_word_text
0,9.0,what makes friendship click?,what make friendship click
1,2.0,why does zebras have stripes?,why zebra stripe
2,4.0,what did the itsy bitsy sipder climb up?,what itsy bitsy sipder climb up
3,4.0,what is the difference between a bachelors and...,what difference between bachelor and master de...
4,3.0,why do women get pms?,why woman get pm
...,...,...,...
174712,9.0,imperative: tell me what guys only guys must do!,tell me what guy only guy must
174713,9.0,tell me the story of any fantasy figure i'd ch...,tell me story of any fantasy figure i d choose
174714,8.0,imperative: reveal a secret about life.,reveal secret about life
174715,6.0,imperative: demande à domenech ce qu'il en est...,demande à domenech ce quil en est de son méti...


# Data Preparation

## Data Cleaning

In [4]:
data.dropna(inplace=True)
data.isna().sum()

topic_category    0
original_text     0
base_word_text    0
dtype: int64

## Data Preprocessing

In [5]:
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

# Ensure you have the necessary NLTK data files
nltk.download('punkt')
nltk.download('stopwords')

# Define stopwords and punctuation
stop_words = set(stopwords.words('english'))
stop_words.update(["imperative", "declarative"])
punctuation = string.punctuation

# Function to preprocess text
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', punctuation))
    # Tokenize text
    words = word_tokenize(text)
    # Remove stopwords
    words = [word for word in words if word not in stop_words]
    return ' '.join(words)

# Apply the preprocessing function to the 'original_text' column
data['processed_text'] = data['original_text'].apply(preprocess_text)
data[['original_text', 'processed_text']].head()

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,original_text,processed_text
0,what makes friendship click?,makes friendship click
1,why does zebras have stripes?,zebras stripes
2,what did the itsy bitsy sipder climb up?,itsy bitsy sipder climb
3,what is the difference between a bachelors and...,difference bachelors masters degree
4,why do women get pms?,women get pms


## Feature Extraction

##

In [7]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Tokenize and pad sequences
tokenizer = Tokenizer(num_words=5000, oov_token='<OOV>')
tokenizer.fit_on_texts(data['processed_text'])
X = tokenizer.texts_to_sequences(data['processed_text'])
X = pad_sequences(X, maxlen=100)

In [8]:
# Load pre-trained GloVe embeddings
embedding_index = {}
with open('/kaggle/input/tubes-nlp/glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coefs

In [9]:
# Create embedding matrix
word_index = tokenizer.word_index
embedding_matrix = np.zeros((5000, 100))
for word, i in word_index.items():
    if i < 5000:
        embedding_vector = embedding_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

### BERT Embedding

In [7]:
# Load BERT tokenizer and model
tokenizer_bert_embedding = BertTokenizer.from_pretrained('bert-base-uncased')
bert_embedding_model = TFBertModel.from_pretrained('bert-base-uncased')

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [8]:
# Tokenize input text

chunk_size = 30  # Adjust based on available memory
texts = list(data['processed_text'])
num_chunks = (len(texts)) // chunk_size + 1

all_embeddings = []
for chunk_idx in tqdm(range(num_chunks)):
    chunk_texts = texts[chunk_idx * chunk_size:(chunk_idx + 1) * chunk_size]
    inputs = tokenizer_bert_embedding(chunk_texts, return_tensors="tf", padding=True, truncation=True, max_length=512)
    outputs = bert_embedding_model(**inputs)
    all_embeddings.append(outputs.last_hidden_state)

100%|██████████| 5797/5797 [18:46<00:00,  5.15it/s]


### BERT

In [6]:
## BERT
from transformers import BertTokenizer, TFBertForSequenceClassification

bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


In [7]:
class TopicClassificationDataset(Dataset):
    """Custom PyTorch Dataset for topic classification"""
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        # Handle both integer and list indexing
        if isinstance(idx, list):
            texts = [self.texts[i] for i in idx]
            labels = [self.labels[i] for i in idx]
        else:
            texts = [self.texts[idx]]
            labels = [self.labels[idx]]
        
        # Batch encoding
        encodings = self.tokenizer(
            texts,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        
        return {
            'input_ids': encodings['input_ids'].squeeze(),
            'attention_mask': encodings['attention_mask'].squeeze(),
            'labels': torch.tensor(labels, dtype=torch.long).squeeze()
        }


## One Hot Encoding

In [10]:
from tensorflow.keras.utils import to_categorical
y = to_categorical([label - 1 for label in data['topic_category'].values], num_classes=10)

## Data Splitting

In [20]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Bert Embedding

In [11]:
y_bert_embedding = [y[chunk_idx * chunk_size:(chunk_idx + 1) * chunk_size] for chunk_idx in range (num_chunks)]

In [12]:
split_idx = int(num_chunks * 0.8) 

X_train_bert_embedding, X_test_bert_embedding = all_embeddings[:split_idx], all_embeddings[split_idx:-1]
y_train_bert_embedding, y_test_bert_embedding = y_bert_embedding[:split_idx], y_bert_embedding[split_idx:-1]

### BERT

In [8]:
sample_ratio = 0.05  
instances_per_class = int(len(data) * sample_ratio / 10)  # Calculate instances per class

# Sample data equally for each class
sample_data = data.groupby('topic_category').sample(n=instances_per_class, random_state=42)

# Ensure balanced test set
print(sample_data['topic_category'].value_counts())

topic_category
1.0     869
2.0     869
3.0     869
4.0     869
5.0     869
6.0     869
7.0     869
8.0     869
9.0     869
10.0    869
Name: count, dtype: int64


In [12]:
X_train_bert, X_test_bert, y_train_bert, y_test_bert = train_test_split(list(sample_data['processed_text']), [int(label - 1) for label in sample_data['topic_category'].values], test_size=0.2, random_state=42) 

In [13]:
# Prepare datasets
train_dataset = TopicClassificationDataset(X_train_bert, y_train_bert, bert_tokenizer)
val_dataset = TopicClassificationDataset(X_test_bert, y_test_bert, bert_tokenizer)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)


# Model Development

## Long Short-Term Memory (LSTM)

In [21]:
lstm_model = Sequential()
lstm_model.add(Embedding(input_dim=5000, output_dim=100, weights=[embedding_matrix], input_length=100, trainable=False))
lstm_model.add(Bidirectional(tf.keras.layers.LSTM(200)))
lstm_model.add(Dense(100, activation='relu'))
lstm_model.add(Dense(10, activation='softmax'))

lstm_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])



In [22]:
lstm_model.fit(X_train, y_train, epochs=10, batch_size=64, validation_data=(X_test, y_test), verbose=2)

Epoch 1/10
2174/2174 - 39s - 18ms/step - accuracy: 0.5550 - loss: 1.3731 - val_accuracy: 0.5772 - val_loss: 1.3020
Epoch 2/10
2174/2174 - 37s - 17ms/step - accuracy: 0.5917 - loss: 1.2566 - val_accuracy: 0.5907 - val_loss: 1.2625
Epoch 3/10
2174/2174 - 37s - 17ms/step - accuracy: 0.6076 - loss: 1.2044 - val_accuracy: 0.5987 - val_loss: 1.2410
Epoch 4/10
2174/2174 - 37s - 17ms/step - accuracy: 0.6252 - loss: 1.1439 - val_accuracy: 0.6004 - val_loss: 1.2390
Epoch 5/10
2174/2174 - 37s - 17ms/step - accuracy: 0.6409 - loss: 1.0922 - val_accuracy: 0.6041 - val_loss: 1.2291
Epoch 6/10
2174/2174 - 37s - 17ms/step - accuracy: 0.6566 - loss: 1.0356 - val_accuracy: 0.6051 - val_loss: 1.2451
Epoch 7/10
2174/2174 - 37s - 17ms/step - accuracy: 0.6757 - loss: 0.9729 - val_accuracy: 0.6039 - val_loss: 1.2592
Epoch 8/10
2174/2174 - 37s - 17ms/step - accuracy: 0.6944 - loss: 0.9114 - val_accuracy: 0.5998 - val_loss: 1.2902
Epoch 9/10
2174/2174 - 37s - 17ms/step - accuracy: 0.7145 - loss: 0.8467 - val_a

<keras.src.callbacks.history.History at 0x7f79ac51eec0>

### LSTM using BERT Embedding

In [13]:
lstm_model_bert = Sequential()
lstm_model_bert.add(Input(shape=(None, 768))) # 768 is the hidden size of BERT-base
lstm_model_bert.add(Bidirectional(tf.keras.layers.LSTM(200)))
lstm_model_bert.add(Dense(100, activation='relu'))
lstm_model_bert.add(Dense(10, activation='softmax'))

lstm_model_bert.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [14]:
lstm_model_bert.fit(X_train_bert_embedding, y_train_bert_embedding, epochs=2, batch_size=chunk_size, validation_data=(X_test_bert_embedding, y_test_bert_embedding), verbose=2)

Epoch 1/2
1/1 - 34s - 34s/step - accuracy: 0.0333 - loss: 2.3634 - val_accuracy: 0.0667 - val_loss: 2.3802
Epoch 2/2
1/1 - 18s - 18s/step - accuracy: 0.6667 - loss: 1.8155 - val_accuracy: 0.1000 - val_loss: 2.4250


<keras.src.callbacks.history.History at 0x7cfb96046a10>

## Convolutional Neural Networks (CNNs)

In [16]:
from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D, Dense, Dropout

cnn_model = Sequential()
cnn_model.add(Embedding(input_dim=5000, output_dim=100, input_length=100))
cnn_model.add(Conv1D(filters=128, kernel_size=5, activation='relu'))
cnn_model.add(GlobalMaxPooling1D())
cnn_model.add(Dense(50, activation='relu'))
cnn_model.add(Dropout(0.5))
cnn_model.add(Dense(10, activation='softmax'))

# Compile model
cnn_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [17]:
# Train model
cnn_model.fit(X_train, y_train, epochs=10, batch_size=64, validation_data=(X_test, y_test), verbose=2)

Epoch 1/10
2174/2174 - 11s - 5ms/step - accuracy: 0.5028 - loss: 1.5549 - val_accuracy: 0.5926 - val_loss: 1.2849
Epoch 2/10
2174/2174 - 6s - 3ms/step - accuracy: 0.6032 - loss: 1.2847 - val_accuracy: 0.6032 - val_loss: 1.2405
Epoch 3/10
2174/2174 - 6s - 3ms/step - accuracy: 0.6289 - loss: 1.1874 - val_accuracy: 0.6057 - val_loss: 1.2377
Epoch 4/10
2174/2174 - 6s - 3ms/step - accuracy: 0.6536 - loss: 1.0931 - val_accuracy: 0.6076 - val_loss: 1.2426
Epoch 5/10
2174/2174 - 6s - 3ms/step - accuracy: 0.6798 - loss: 1.0028 - val_accuracy: 0.6060 - val_loss: 1.2719
Epoch 6/10
2174/2174 - 6s - 3ms/step - accuracy: 0.7055 - loss: 0.9130 - val_accuracy: 0.6043 - val_loss: 1.3517
Epoch 7/10
2174/2174 - 6s - 3ms/step - accuracy: 0.7288 - loss: 0.8282 - val_accuracy: 0.6004 - val_loss: 1.4379
Epoch 8/10
2174/2174 - 6s - 3ms/step - accuracy: 0.7505 - loss: 0.7557 - val_accuracy: 0.5974 - val_loss: 1.5291
Epoch 9/10
2174/2174 - 6s - 3ms/step - accuracy: 0.7678 - loss: 0.6999 - val_accuracy: 0.5939 -

<keras.src.callbacks.history.History at 0x7f79c02f0040>

## BERT

In [16]:
bert_model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=10)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
bert_model.config.problem_type = "single_label_classification"  # For multi-class classification

In [18]:
# Optimizer
optimizer = AdamW(bert_model.parameters(), lr=2e-5)
epochs = 3



In [19]:
len(train_loader)

435

In [20]:
# Training loop
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
bert_model.to(device)

for epoch in range(epochs):
    bert_model.train()
    total_train_loss = 0

    for i, batch in enumerate(train_loader):
        optimizer.zero_grad()
        
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        outputs = bert_model(
            input_ids, 
            attention_mask=attention_mask, 
            labels=labels
        )
        
        loss = outputs.loss
        total_train_loss += loss.item()
        
        loss.backward()
        optimizer.step()
        # print(f"training batch-{i+1}, epoch-{epoch+1}")
    # Validation
    bert_model.eval()
    total_val_loss = 0
    predictions, true_labels = [], []
    
    with torch.no_grad():
        for j,batch in enumerate(val_loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            outputs = bert_model(
                input_ids, 
                attention_mask=attention_mask, 
                labels=labels
            )
            
            total_val_loss += outputs.loss.item()
            
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)
            
            predictions.extend(preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())
            # print(f"validation batch-{j+1}, epoch-{epoch+1}")
    
    # Print epoch summary
    avg_train_loss = total_train_loss / len(train_loader)
    avg_val_loss = total_val_loss / len(val_loader)
    print(f'Epoch {epoch+1}/{epochs}')
    print(f'Average training loss: {avg_train_loss:.4f}')
    print(f'Average validation loss: {avg_val_loss:.4f}')

Epoch 1/3
Average training loss: 1.5310
Average validation loss: 1.2267
Epoch 2/3
Average training loss: 1.0109
Average validation loss: 1.2376
Epoch 3/3
Average training loss: 0.7431
Average validation loss: 1.2922


# Evaluation

## LSTM

In [None]:
lstm_predictions = lstm_model.predict(X_test)

In [24]:
lstm_predictions_processed = np.zeros_like(lstm_predictions)
lstm_predictions_processed[np.arange(len(lstm_predictions)), np.argmax(lstm_predictions, axis=1)] = 1
lstm_predictions_processed

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 1., ..., 0., 0., 0.]], dtype=float32)

In [25]:
print("Classification Report:\n", classification_report(y_test, lstm_predictions_processed))

Classification Report:
               precision    recall  f1-score   support

           0       0.52      0.45      0.48      3408
           1       0.52      0.66      0.58      3585
           2       0.63      0.64      0.64      3403
           3       0.46      0.43      0.44      3523
           4       0.76      0.76      0.76      3517
           5       0.78      0.72      0.75      3530
           6       0.56      0.40      0.47      3469
           7       0.55      0.59      0.57      3538
           8       0.57      0.64      0.61      3358
           9       0.62      0.67      0.64      3451

   micro avg       0.60      0.60      0.60     34782
   macro avg       0.60      0.60      0.59     34782
weighted avg       0.60      0.60      0.59     34782
 samples avg       0.60      0.60      0.60     34782



### LSTM using BERT Embedding

In [51]:
lstm_bert_predictions = None
for i in tqdm(range (len(X_test_bert_embedding))):
    lstm_bert_prediction = lstm_model_bert.predict(X_test_bert_embedding[i], verbose=0)
    if i == 0:
        lstm_bert_predictions = lstm_bert_prediction
    else:
        lstm_bert_predictions = np.vstack((lstm_bert_predictions, lstm_bert_prediction))

100%|██████████| 1159/1159 [01:10<00:00, 16.39it/s]


In [52]:
lstm_bert_predictions_processed = np.zeros_like(lstm_bert_predictions)
lstm_bert_predictions_processed[np.arange(len(lstm_bert_predictions)), np.argmax(lstm_bert_predictions, axis=1)] = 1

In [53]:
y_test_bert_embedding_unbatch = None
for i in range(len(y_test_bert_embedding)):
    if i ==0:
        y_test_bert_embedding_unbatch = y_test_bert_embedding[0]
    else:
        y_test_bert_embedding_unbatch = np.vstack((y_test_bert_embedding_unbatch, y_test_bert_embedding[i]))

In [54]:
print("Classification Report:\n", classification_report(y_test_bert_embedding_unbatch, lstm_bert_predictions_processed))

Classification Report:
               precision    recall  f1-score   support

           0       0.31      0.00      0.00      3848
           1       0.12      0.50      0.20      3478
           2       0.17      0.51      0.25      3558
           3       0.14      0.08      0.10      3409
           4       0.62      0.04      0.07      3443
           5       0.36      0.01      0.02      3832
           6       0.12      0.17      0.15      1974
           7       0.17      0.04      0.06      4016
           8       0.26      0.25      0.26      3439
           9       0.28      0.02      0.04      3773

   micro avg       0.16      0.16      0.16     34770
   macro avg       0.26      0.16      0.12     34770
weighted avg       0.26      0.16      0.11     34770
 samples avg       0.16      0.16      0.16     34770



## CNN

In [18]:
cnn_predictions = cnn_model.predict(X_test)


[1m1087/1087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step


In [19]:
cnn_predictions_processed = np.zeros_like(cnn_predictions)
cnn_predictions_processed[np.arange(len(cnn_predictions)), np.argmax(cnn_predictions, axis=1)] = 1
cnn_predictions_processed

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       ...,
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 1., ..., 0., 0., 0.]], dtype=float32)

In [20]:
print("Classification Report:\n", classification_report(y_test, cnn_predictions_processed))

Classification Report:
               precision    recall  f1-score   support

           0       0.47      0.49      0.48      3408
           1       0.52      0.64      0.57      3585
           2       0.61      0.66      0.63      3403
           3       0.50      0.40      0.45      3523
           4       0.76      0.75      0.76      3517
           5       0.80      0.71      0.75      3530
           6       0.52      0.42      0.46      3469
           7       0.58      0.57      0.57      3538
           8       0.55      0.65      0.59      3358
           9       0.64      0.65      0.65      3451

   micro avg       0.59      0.59      0.59     34782
   macro avg       0.60      0.59      0.59     34782
weighted avg       0.60      0.59      0.59     34782
 samples avg       0.59      0.59      0.59     34782



## BERT

In [21]:
def predict_topic(text, model, tokenizer, max_length=128):
    """
    Predict topic for a single text input
    
    Args:
        text (str): Input text to classify
        model (BertForSequenceClassification): Trained BERT model
        tokenizer (BertTokenizer): Tokenizer used during training
        max_length (int): Maximum sequence length
    
    Returns:
        Predicted topic label (integer)
    """
    model.eval()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    
    # Encode the text
    encoding = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )
    
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)
    
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        prediction = torch.argmax(logits, dim=1)
    
    return prediction.cpu().item()

In [22]:
bert_predictions = []
for test_text in tqdm(X_test_bert):
        predicted_topic = predict_topic(
            test_text, bert_model, bert_tokenizer
        )
        bert_predictions.append(predicted_topic)

100%|██████████| 1738/1738 [00:23<00:00, 74.90it/s]


In [23]:
print("Classification Report:\n", classification_report(y_test_bert, bert_predictions))

Classification Report:
               precision    recall  f1-score   support

           0       0.59      0.49      0.54       184
           1       0.61      0.66      0.64       193
           2       0.68      0.77      0.72       160
           3       0.50      0.31      0.38       179
           4       0.70      0.75      0.73       158
           5       0.80      0.77      0.79       179
           6       0.55      0.45      0.49       167
           7       0.63      0.62      0.63       186
           8       0.51      0.78      0.62       167
           9       0.69      0.67      0.68       165

    accuracy                           0.63      1738
   macro avg       0.63      0.63      0.62      1738
weighted avg       0.63      0.63      0.62      1738



In [24]:
save_directory = './fine_tuned_bert_model_v2'

# Save the fine-tuned model and tokenizer
bert_model.save_pretrained(save_directory)
bert_tokenizer.save_pretrained(save_directory)

('./fine_tuned_bert_model_v2/tokenizer_config.json',
 './fine_tuned_bert_model_v2/special_tokens_map.json',
 './fine_tuned_bert_model_v2/vocab.txt',
 './fine_tuned_bert_model_v2/added_tokens.json')