In [3]:
!pip install datasets



In [4]:
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification, AutoConfig
import os
import pandas as pd
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split

# Single checkpoint name shared by the config, tokenizer and classifier below.
model_name = "distilbert-base-uncased"

# NOTE(review): the classification head is created with 3 output classes, while the
# inference cell's label mapping only names ids 0 and 1 — confirm the intended count.
config = AutoConfig.from_pretrained(model_name, num_labels=3)
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = TFAutoModelForSequenceClassification.from_pretrained(
    model_name,
    config=config,
)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

In [5]:
import os
import pandas as pd

# Location of the labelled comments CSV inside the Kaggle input mount.
DATA_DIR = "/kaggle/input/balanced-labeled-comments-int2-csv"
DATA_FILE = "balanced_labeled_comments_dataset_int.csv"
LABEL_COLUMN = "Classification"
SPLIT_SEED = 42

# Show what is available in the input directory before reading from it.
print(os.listdir(DATA_DIR))

dframe = pd.read_csv(os.path.join(DATA_DIR, DATA_FILE))

# The label column is used as-is; presumably it is already integer-encoded in this
# CSV (an earlier variant mapped "Other" -> 0 and "Question" -> 1) — TODO confirm.

# Split the data 90 / 5 / 5. Stratifying on the label keeps the class proportions
# of the full frame in every split instead of leaving them to chance.
train_df, temp_df = train_test_split(
    dframe,
    test_size=0.1,
    random_state=SPLIT_SEED,
    stratify=dframe[LABEL_COLUMN],
)
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    random_state=SPLIT_SEED,
    stratify=temp_df[LABEL_COLUMN],
)

# Convert to Hugging Face Datasets; the index is reset so the shuffled pandas index
# is not carried over as an extra column.
train_dataset = Dataset.from_pandas(train_df.reset_index(drop=True))
val_dataset = Dataset.from_pandas(val_df.reset_index(drop=True))
test_dataset = Dataset.from_pandas(test_df.reset_index(drop=True))

# Create DatasetDict
categories = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset,
    'test': test_dataset
})

categories

['balanced_labeled_comments_dataset_int.csv']


DatasetDict({
    train: Dataset({
        features: ['Comments', 'Classification'],
        num_rows: 9000
    })
    validation: Dataset({
        features: ['Comments', 'Classification'],
        num_rows: 500
    })
    test: Dataset({
        features: ['Comments', 'Classification'],
        num_rows: 500
    })
})

In [6]:
# Tokenize the datasets
def tokenize(batch):
    """Tokenize the ``Comments`` field of a batch with the module-level tokenizer.

    Sequences are truncated to 512 tokens and padded to the longest sequence
    in the batch.
    """
    return tokenizer(batch["Comments"], padding=True, truncation=True, max_length=512)

# batch_size=None hands each split to `tokenize` as one batch, so every example in a
# split is padded to the same length — required by from_tensor_slices below.
categories_encoded = categories.map(tokenize, batched=True, batch_size=None)
BATCH_SIZE = 16

def prepare_tf_dataset(dataset, shuffle=True):
    """Build a batched, prefetching ``tf.data.Dataset`` from an encoded split.

    Args:
        dataset: Encoded split providing ``input_ids``, ``attention_mask`` and
            ``Classification`` columns.
        shuffle: Whether to shuffle examples (buffer of 1000) before batching.
            Defaults to True for backward compatibility; pass False for
            evaluation splits, where ordering has no benefit.

    Returns:
        A ``tf.data.Dataset`` yielding ``(features_dict, labels)`` batches of
        size ``BATCH_SIZE``.
    """
    # Convert to tensorflow dataset
    features = {
        'input_ids': dataset['input_ids'],
        'attention_mask': dataset['attention_mask']
    }
    tf_dataset = tf.data.Dataset.from_tensor_slices((features, dataset['Classification']))

    if shuffle:
        tf_dataset = tf_dataset.shuffle(1000)

    return tf_dataset.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

# Only the training split needs shuffling; validation and test are read in order.
train_dataset = prepare_tf_dataset(categories_encoded['train'])
val_dataset = prepare_tf_dataset(categories_encoded['validation'], shuffle=False)
test_dataset = prepare_tf_dataset(categories_encoded['test'], shuffle=False)

Map:   0%|          | 0/9000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [7]:
# Compile the model. The model outputs raw logits, hence from_logits=True on the loss.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.00001),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy']
)

# Train the model. The validation split is passed in so that held-out loss and
# accuracy are reported after every epoch and recorded in `history`.
history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=5
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [8]:
# Evaluate the model on the held-out test split
test_loss, test_accuracy = model.evaluate(test_dataset)
print(f"Test accuracy: {test_accuracy:.4f}")

# Save the model in TensorFlow SavedModel format.
# The directory that is created is the same one the model is exported to, and
# exist_ok=True keeps the cell re-runnable.
saved_model_dir = "/kaggle/working/bert_model_savedquestionmodel"
os.makedirs(saved_model_dir, exist_ok=True)
tf.saved_model.save(model, saved_model_dir)

Test accuracy: 0.9640


In [9]:
import tensorflow as tf
from transformers import AutoTokenizer
import numpy as np

# Load tokenizer and the exported SavedModel
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
loaded_model = tf.saved_model.load("/kaggle/working/bert_model_savedquestionmodel")

# Model inference function (the SavedModel's default serving signature)
infer = loaded_model.signatures["serving_default"]

# Comments for prediction
comments_list = [
    'misleading title click bait',
    'kindly post video about claude 35 sonnet finetune and api full course video',
    'superb excellent video keep it up',
    'i liked the font you use in the thumbnail is it okay to share its name?',
    'are these numbers even legit or just an exaggerated estimation?'
]

# Tokenize input comments
inputs = tokenizer(comments_list, padding=True, truncation=True, max_length=512, return_tensors="tf")

# Run model inference
outputs = infer(**inputs)
logits = outputs['logits'].numpy()

# Define label mapping (0: Other, 1: Question)
label_mapping = {0: "Other", 1: "Question"}

# Convert logits to class ids (one row of logits per comment)
predictions = np.argmax(logits, axis=1)

# Display results. The model was configured with 3 output classes but only two are
# named here, so an unmapped id is reported instead of raising KeyError.
for comment, pred in zip(comments_list, predictions):
    class_id = int(pred)
    label = label_mapping.get(class_id, f"Unknown class {class_id}")
    print(comment, ":", label)

misleading title click bait : Other
kindly post video about claude 35 sonnet finetune and api full course video : Other
superb excellent video keep it up : Other
i liked the font you use in the thumbnail is it okay to share its name? : Question
are these numbers even legit or just an exaggerated estimation? : Question


In [14]:
import os
import shutil

from IPython.display import FileLink

# The trained model is exported as a SavedModel *directory*, which cannot be linked
# directly. Zip it next to the directory and link to the resulting archive.
model_dir = "/kaggle/working/bert_model_savedquestionmodel"
archive_path = shutil.make_archive(model_dir, "zip", root_dir=model_dir)

# FileLink is given a path relative to the current working directory.
# NOTE(review): assumes the notebook runs from /kaggle/working — confirm.
FileLink(os.path.relpath(archive_path))