In [None]:
!pip install datasets

In [None]:
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification, AutoConfig
import os
import pandas as pd
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split


# Checkpoint to fine-tune: the uncased DistilBERT base model
# (BERT = Bidirectional Encoder Representations from Transformers).
model_name = "distilbert-base-uncased"

# The configuration requests a 3-way sequence-classification head.
config = AutoConfig.from_pretrained(model_name, num_labels=3)

# Tokenizer and model weights are loaded from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = TFAutoModelForSequenceClassification.from_pretrained(
    model_name,
    config=config,
)

In [None]:
# Load the labelled comments.
# NOTE(review): the file name suggests 'Classification' is already integer-coded
# (presumably Other=0, Question=1, Suggestion=2, matching the label order used
# in the inference cell) — confirm against the CSV before training.
DATA_CSV = "/kaggle/input/balanced-labeled-comments-dataset-with-int64-qso/balanced_labeled_comments_dataset_with_int64.csv"
dframe = pd.read_csv(DATA_CSV)

# 90 / 5 / 5 train / validation / test split. Both splits are stratified on the
# label column so each subset keeps the class proportions of the full dataset.
train_df, temp_df = train_test_split(
    dframe,
    test_size=0.1,
    random_state=42,
    stratify=dframe["Classification"],
)
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    random_state=42,
    stratify=temp_df["Classification"],
)

# Every row must land in exactly one split.
assert len(train_df) + len(val_df) + len(test_df) == len(dframe)

# Wrap the three frames as Hugging Face Datasets inside one DatasetDict.
# The per-split datasets are deliberately not bound to train_dataset /
# val_dataset / test_dataset: those names are used for the tf.data pipelines
# later, and re-running this cell must not overwrite them with another type.
categories = DatasetDict({
    split: Dataset.from_pandas(frame.reset_index(drop=True))
    for split, frame in (
        ("train", train_df),
        ("validation", val_df),
        ("test", test_df),
    )
})

categories

In [None]:
# Tokenize the datasets
def tokenize(batch):
    """Run the tokenizer over the ``Comments`` column of ``batch``.

    Sequences are truncated to at most 512 tokens and padded to the longest
    sequence in the batch. Returns the tokenizer's output for the batch.
    """
    texts = batch["Comments"]
    return tokenizer(texts, max_length=512, truncation=True, padding=True)

# batch_size=None hands each split to `tokenize` as one single batch, so
# padding=True pads every example of a split to that split's longest sequence.
categories_encoded = categories.map(
    tokenize,
    batched=True,
    batch_size=None,
)

# Prepare the datasets for training
BATCH_SIZE = 16

def prepare_tf_dataset(dataset, shuffle=True):
    """Build a batched, prefetched ``tf.data.Dataset`` from a tokenized split.

    Args:
        dataset: a tokenized split exposing the 'input_ids', 'attention_mask'
            and 'Classification' columns.
        shuffle: when True (the default), shuffle examples with a
            1000-element buffer before batching. Pass False for
            validation/test data so batches keep the row order of the split.

    Returns:
        A ``tf.data.Dataset`` yielding ``(features, labels)`` batches of
        ``BATCH_SIZE`` examples, where ``features`` is a dict with
        'input_ids' and 'attention_mask'.
    """
    # Materialise the columns as plain Python lists so TensorFlow can convert
    # them no matter what container type the column accessor returns.
    features = {
        'input_ids': list(dataset['input_ids']),
        'attention_mask': list(dataset['attention_mask']),
    }
    labels = list(dataset['Classification'])

    tf_dataset = tf.data.Dataset.from_tensor_slices((features, labels))
    if shuffle:
        tf_dataset = tf_dataset.shuffle(1000)

    return tf_dataset.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

# Only the training split is shuffled; evaluation splits keep their row order.
train_dataset = prepare_tf_dataset(categories_encoded['train'])
val_dataset = prepare_tf_dataset(categories_encoded['validation'], shuffle=False)
test_dataset = prepare_tf_dataset(categories_encoded['test'], shuffle=False)


In [None]:
# Compile the model.
# The loss is built with from_logits=True, i.e. the model's outputs are
# treated as raw logits and the labels as integer class ids.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy']
)

# Train the model. The validation split is evaluated after every epoch so
# `history` records val_loss / val_accuracy and over-fitting is visible
# during training instead of only at the final test evaluation.
history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=5
)

In [None]:
# Evaluate the model on the held-out test split.
# The model was compiled with a single 'accuracy' metric, so evaluate()
# returns the pair (loss, accuracy).
test_loss, test_accuracy = model.evaluate(test_dataset)
print(f"Test accuracy: {test_accuracy:.4f}")

# Save the model in TensorFlow SavedModel format.
# os.makedirs(..., exist_ok=True) replaces the `!mkdir` shell escape: it does
# not depend on the shell and does not fail when the cell is re-run and the
# directory already exists.
saved_model_dir = "/kaggle/working/bert_model_savedmodel"
os.makedirs(saved_model_dir, exist_ok=True)
tf.saved_model.save(model, saved_model_dir)

In [None]:
# How to use the fine-tuned model
import tensorflow as tf
from transformers import AutoTokenizer

# Restore the tokenizer and the exported SavedModel for standalone inference.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
loaded_model = tf.saved_model.load("/kaggle/input/distrilbertqso_v3-keras-default-v1/keras/default/1")

# A loaded SavedModel is called through one of its named signatures.
infer = loaded_model.signatures["serving_default"]

comments_list = [
    'misleading title click bait',
    'kindly post video about claude 35 sonnet finetune and api full course video',
    'superb excellent vidoe keep it up',
    'i liked the font you use in the thumbnail is it okay to share its name?',
    'are these number even legit or just an exaggerated estimation?',
]

inputs = tokenizer(
    comments_list,
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors="tf",
)
outputs = infer(**inputs)
logits = outputs['logits'].numpy()

# For each comment, pair every logit with its class name and sort the pairs in
# descending order, then split them back into [scores, labels].
logits_with_labels = []
for row in logits:
    ranked = sorted(zip(row, ["Other", "Question", "Suggestion"]), reverse=True)
    logits_with_labels.append(list(zip(*ranked)))

# "score" holds the sorted raw logits; "labels" holds the matching class names.
results = [{"labels": pair[1], "score": pair[0]} for pair in logits_with_labels]

# Print each comment next to its top-ranked label.
for comment, result in zip(comments_list, results):
    print(comment, ":", result['labels'][0])
