### Importing Required Libraries

In [31]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from transformers import DistilBertTokenizer
from transformers import TFDistilBertForSequenceClassification
import tensorflow as tf
import pandas as pd
from sklearn.model_selection import train_test_split
import json
import gc

### Loading the data

In [2]:
df = pd.read_csv('bbc_data.csv')

In [3]:
df.head(2)

Unnamed: 0,data,labels
0,Musicians to tackle US red tape Musicians gro...,entertainment
1,"U2s desire to be number one U2, who have won ...",entertainment


In [35]:
# shape of the data
df.shape

(2225, 3)

In [4]:
# Target CLasses
df['labels'].unique()

array(['entertainment', 'business', 'sport', 'politics', 'tech'],
      dtype=object)

In [5]:
# ENcoding the Target Variable
df['encoded_labels'] = df['labels'].astype('category').cat.codes
df.head()

Unnamed: 0,data,labels,encoded_labels
0,Musicians to tackle US red tape Musicians gro...,entertainment,1
1,"U2s desire to be number one U2, who have won ...",entertainment,1
2,Rocker Doherty in on-stage fight Rock singer ...,entertainment,1
3,Snicket tops US box office chart The film ada...,entertainment,1
4,"Oceans Twelve raids box office Oceans Twelve,...",entertainment,1


In [6]:
# Value_counts of each class 
df.groupby(['labels', 'encoded_labels']).size()

labels         encoded_labels
business       0                 510
entertainment  1                 386
politics       2                 417
sport          3                 511
tech           4                 401
dtype: int64

In [7]:
# Converting the text column to lists
data_texts = df["data"].to_list()
# data_texts = df["data"]

# Converting the labels column to lists
data_labels = df["encoded_labels"].to_list()

In [8]:
#Train and Validation data split
train_texts, val_texts, train_labels, val_labels = train_test_split(data_texts, data_labels, test_size=0.2, random_state=0)

#Train and Test data split
train_texts, test_texts, train_labels, test_labels = train_test_split(train_texts, train_labels, test_size=0.01, random_state=0)

In [9]:
# first 2 rows as list
train_texts[:2]

['Halo fans hope for sequel  Xbox video game Halo 2 has been released in the US on 9 November, with a UK release two days later. Why is the game among the most anticipated of all time?  Halo is considered by many video game pundits to be one of the finest examples of interactive entertainment ever produced and more than 1.5 million people worldwide have pre-ordered the sequel. A science fiction epic, Halo centred the action on a human cyborg, controlled by the player, who had to save his crew from an alien horde after a crash landing on a strange and exotic world contained on the interior surface of a giant ring in space. Remembrance of Things Past it was not - but as a slice of schlock science fiction inspired by works such as Larry Nivens Ringworld and the film Starship Troopers, it fit the bill perfectly. Halo stood out from a crowd of similar titles - it was graphically impressive, had tremendous audio, using Dolby Digital, a decent storyline, instant playability and impressive phy

In [10]:
# encoded labels of first 2 rows
train_labels[:2]

[4, 0]

In [11]:
# Initializing the tokenizer from pretrained distilBert 
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# fit-transform the train data to the tokeninzer with truncation and padding
train_encodings = tokenizer(train_texts, truncation=True, padding=True)

# fit-transform the validation data to the tokeninzer with truncation and padding
val_encodings = tokenizer(val_texts, truncation=True, padding=True)

In [12]:
# Encoded ids of the first 2 rows
train_encodings['input_ids'][:2]

[[101,
  17201,
  4599,
  3246,
  2005,
  8297,
  12202,
  2678,
  2208,
  17201,
  1016,
  2038,
  2042,
  2207,
  1999,
  1996,
  2149,
  2006,
  1023,
  2281,
  1010,
  2007,
  1037,
  2866,
  2713,
  2048,
  2420,
  2101,
  1012,
  2339,
  2003,
  1996,
  2208,
  2426,
  1996,
  2087,
  11436,
  1997,
  2035,
  2051,
  1029,
  17201,
  2003,
  2641,
  2011,
  2116,
  2678,
  2208,
  26136,
  23194,
  2015,
  2000,
  2022,
  2028,
  1997,
  1996,
  10418,
  4973,
  1997,
  9123,
  4024,
  2412,
  2550,
  1998,
  2062,
  2084,
  1015,
  1012,
  1019,
  2454,
  2111,
  4969,
  2031,
  3653,
  1011,
  3641,
  1996,
  8297,
  1012,
  1037,
  2671,
  4349,
  8680,
  1010,
  17201,
  16441,
  1996,
  2895,
  2006,
  1037,
  2529,
  22330,
  11755,
  1010,
  4758,
  2011,
  1996,
  2447,
  1010,
  2040,
  2018,
  2000,
  3828,
  2010,
  3626,
  2013,
  2019,
  7344,
  21038,
  2044,
  1037,
  5823,
  4899,
  2006,
  1037,
  4326,
  1998,
  12564,
  2088,
  4838,
  2006,
  1996,
  4592,
  3

In [13]:
# len of encoding is equal to the max input size of the model
len(train_encodings['input_ids'][0])

512

In [14]:
# len of encoding is equal to the max input size of the model
len(val_encodings['input_ids'][0])

512

In [15]:
# Converting the train encodings into TF dataset
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    train_labels
))

# Converting the validation encodings into TF dataset
val_dataset = tf.data.Dataset.from_tensor_slices((
    dict(val_encodings),
    val_labels
))

In [16]:
train_dataset

<_TensorSliceDataset element_spec=({'input_ids': TensorSpec(shape=(512,), dtype=tf.int32, name=None), 'attention_mask': TensorSpec(shape=(512,), dtype=tf.int32, name=None)}, TensorSpec(shape=(), dtype=tf.int32, name=None))>

In [17]:
val_dataset

<_TensorSliceDataset element_spec=({'input_ids': TensorSpec(shape=(512,), dtype=tf.int32, name=None), 'attention_mask': TensorSpec(shape=(512,), dtype=tf.int32, name=None)}, TensorSpec(shape=(), dtype=tf.int32, name=None))>

### Fine Tuning without any regularization or dropout

In [18]:
# Instantiating the model
model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=5)

# setting up the optimizer
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)

# setting up the loss_function
loss_function = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

# compiling the model
model.compile(optimizer=optimizer, loss=loss_function, metrics=['accuracy'])

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_transform.weight']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

In [19]:
# fine tuning the model with a batch size of 16 and buffer size of 1000
model.fit(train_dataset.shuffle(1000).batch(16), epochs=3,
          validation_data=val_dataset.shuffle(1000).batch(16))

Epoch 1/3


Cause: for/else statement not yet supported


Cause: for/else statement not yet supported
Epoch 2/3
Epoch 3/3


<tf_keras.src.callbacks.History at 0x7824a5cf7b80>

In [20]:
# Model Evaluation on Test set
test_encodings = tokenizer(test_texts, truncation=True, padding=True)
test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings), test_labels))

from sklearn.metrics import accuracy_score

# Making predictions on the test dataset
predictions = model.predict(test_dataset.batch(16))

# Converting logits to probabilities and then to predicted labels
predicted_labels = tf.argmax(predictions.logits, axis=1)

# Calculating accuracy 
accuracy = accuracy_score(test_labels, predicted_labels.numpy())

print(f"Accuracy on test set: {accuracy:.4f}")

Accuracy on test set: 1.0000


Model is overfitting

### Fine Tuning with dropout and less number of epochs

In [32]:
model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=5)

#Adding a Dropout layer (20% dropout rate)
dropout_rate = 0.2
model.distilbert.transformer.dropout = tf.keras.layers.Dropout(dropout_rate)

# setting up the optimizer
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)

# setting up the loss_function
loss_function = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

#compiling the model
model.compile(optimizer=optimizer, loss=loss_function, metrics=['accuracy'])

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_transform.weight']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

In [33]:
# Fine Tuning the model
model.fit(train_dataset.shuffle(1000).batch(16), epochs=1,
          validation_data=val_dataset.shuffle(1000).batch(16))



<tf_keras.src.callbacks.History at 0x782475edc280>

In [34]:
#Evaluating on the Test set
test_encodings = tokenizer(test_texts, truncation=True, padding=True)
test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings), test_labels))

from sklearn.metrics import accuracy_score

# Making predictions on the test dataset
predictions = model.predict(test_dataset.batch(16))
# predictions = model.predict(test_dataset)

# Converting logits to probabilities and then to predicted labels
predicted_labels = tf.argmax(predictions.logits, axis=1)

# Calculating accuracy
accuracy = accuracy_score(test_labels, predicted_labels.numpy())

print(f"Accuracy on test set: {accuracy:.4f}")

Accuracy on test set: 0.9444


Model has generalized well on the trainset

In [24]:
# Saving the model and tokenizer
save_directory = "/saved_models" 

model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

('/saved_models/tokenizer_config.json',
 '/saved_models/special_tokens_map.json',
 '/saved_models/vocab.txt',
 '/saved_models/added_tokens.json')

In [25]:
# Loading the model and tokenizer
loaded_tokenizer = DistilBertTokenizer.from_pretrained(save_directory)
loaded_model = TFDistilBertForSequenceClassification.from_pretrained(save_directory)

Some layers from the model checkpoint at /saved_models were not used when initializing TFDistilBertForSequenceClassification: ['dropout_39']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at /saved_models and are newly initialized: ['dropout_60']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [26]:
test_text = test_texts[13]
test_text

'Hodgson relishes European clashes  Former Blackburn boss Roy Hodgson says the Premiership should follow the rest of Europe and have a winter break - but insists that a gruelling domestic schedule will not damage the English elites bid for Champions League glory.  Hodgson - now in charge at Viking Stavanger - was at Liverpools clash with Bayer Leverkusen at Anfield on Tuesday as a member of Uefas technical committee. Hodgson is a fierce advocate of the winter break employed throughout Europe, although not in England - where the Champions League contenders have ploughed through a heavy fixture list. But Hodgson told BBC Sport that while he believes the Premiership should embrace the idea, he does not expect it to cost the English representatives in the last 16 of the Champions League. "I just feel it is very difficult to say with certainty that teams who have had the break will have a definite edge. "I am a fervent supporter of the break. It gives players the chance to recharge their ba

In [27]:
# Testing on a single text
predict_input = loaded_tokenizer.encode(test_text,
                                 truncation=True,
                                 padding=True,
                                 return_tensors="tf")

output = loaded_model(predict_input)[0]

prediction_value = tf.argmax(output, axis=1).numpy()[0]
prediction_value

3

In [28]:
df.groupby(['labels', 'encoded_labels']).size()

labels         encoded_labels
business       0                 510
entertainment  1                 386
politics       2                 417
sport          3                 511
tech           4                 401
dtype: int64