# Using distilbert model 

## Import data

In [1]:
import pandas as pd
import numpy as np

In [None]:
df = pd.read_csv('../raw_data/raw_train_data.csv', nrows=180000)

In [None]:
import string
import re

def clean_text(text):
    #No whitespaces in beginning or end
    text = text.strip()
    #lowercase
    text= text.lower()
    #remove numbers
    text = re.sub(r'\b\d+\b', '', text)

    text = re.sub(rf"[{re.escape(string.punctuation)}]", '', text)

    return text

(3600000, 2)

In [None]:
df['clean_text'] = df['text'].apply(clean_text)

In [5]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("prajjwal1/bert-tiny", padding_side = "right")
tokenizer("My tokenizers and model must match")

  from .autonotebook import tqdm as notebook_tqdm


{'input_ids': [101, 2026, 19204, 17629, 2015, 1998, 2944, 2442, 2674, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
df["text"]

first_sentence_tokenized = tokenizer(df["text"][0])

first_sentence_tokenized

{'input_ids': [101, 2026, 19204, 17629, 2015, 1998, 2944, 2442, 2674, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
from transformers import TFAutoModel
model = TFAutoModel.from_pretrained("prajjwal1/bert-tiny", from_pt = True)

In [None]:
tokenized_tensors = tokenizer(df["clean_text"].tolist(), max_length=400, padding = "max_length", truncation = True, return_tensors="tf")

Unnamed: 0,label,text,clean_text
1581982,1,those looking for entertainment need not bothe...,those looking for entertainment need not bothe...
1568876,2,double trouble: this was a funny book. this is...,double trouble this was a funny book this is a...
3118450,1,Cracktastic Product: I replaced my old filter ...,cracktastic product i replaced my old filter h...
3131087,1,Not My Cup of Tea: I'd not realized that this ...,not my cup of tea id not realized that this wa...
1820244,1,I was a bit disapointed.: After reading so man...,i was a bit disapointed after reading so many ...


In [None]:
tokenized_tensors["input_ids"].shape

In [None]:
from transformers import TFAutoModelForSequenceClassification

tuning_model = TFAutoModelForSequenceClassification.from_pretrained("prajjwal1/bert-tiny", from_pt = True)

2025-06-11 16:18:14.464171: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
df['label'] = df['label'] -1

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertForSequenceClassification: ['bert.embeddings.position_ids']
- This IS expected if you are initializing TFBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
X_train = tokenized_tensors["input_ids"]
y_train = df["label"]

<tf.Tensor: shape=(3600, 400), dtype=int32, numpy=
array([[  101,  2216,  2559, ...,     0,     0,     0],
       [  101,  3313,  4390, ...,     0,     0,     0],
       [  101,  8579, 10230, ...,     0,     0,     0],
       ...,
       [  101,  6429, 12124, ...,     0,     0,     0],
       [  101,  4299,  2002, ...,     0,     0,     0],
       [  101,  9364,  1045, ...,     0,     0,     0]], dtype=int32)>

In [None]:
tuning_model.compile(optimizer= "adam", metrics= "accuracy")
tuning_model.fit(X_train, y_train, validation_split=0.2, batch_size=32, epochs=5)

## Saving the model and tokenizer

In [None]:
#This way the model can be imported with the .from_pretrained again, this time using the local path
tuning_model.save_pretrained("../model/bert_tiny_180k")
tokenizer.save_pretrained("../model/tokenizer_bert_tiny_180k")

## Load the model with tensorflow

In [1]:
import tensorflow as tf

2025-06-13 08:54:31.947294: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
from transformers import TFAutoModelForSequenceClassification

#Load the saved model from your local path
model = TFAutoModelForSequenceClassification.from_pretrained("/Users/johannesb/code/Jojo2813/SentiScope/model/bert_tiny_180k (1)")

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at /Users/johannesb/code/Jojo2813/SentiScope/model/bert_tiny_180k (1).
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


In [None]:
#Load the saved tokenizer from local path
tokenizer = AutoTokenizer.from_pretrained("/Users/johannesb/code/Jojo2813/SentiScope/model/tokenizer_bert_tiny_180k")

In [None]:
#Loading 50k test observations
X_test = pd.read_csv("/Users/johannesb/code/Jojo2813/SentiScope/raw_data/test_df_ml_clean.csv", nrows= 50000)

In [None]:
#Tokenize with tokenizer
X_test_tokens = tokenizer(X_test['clean_text'].to_list(), max_length=400, padding = "max_length", truncation = True, return_tensors="tf")

In [None]:
#Let model predict
prediction = model.predict(X_test_tokens)



In [None]:
#Get the actual label out of the predictions
logits = prediction.logits
predicted_classes = tf.argmax(logits, axis=1).numpy()

y_pred = list(predicted_classes)

In [46]:
from sklearn.metrics import accuracy_score

In [None]:
#Model gets around 90% accuracy on the 50k test rows
accuracy_score(X_test['label'], y_pred)

0.89364