In [1]:
!pip install transformers datasets



In [2]:
import os
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from datasets import load_dataset, Dataset, DatasetDict
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification, DataCollatorWithPadding
from tensorflow.keras.optimizers.schedules import PolynomialDecay
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import save_model
import json

In [3]:
data = pd.read_csv('/content/SPAM text message 20170820 - Data.csv', usecols=['Message', 'Category'])

In [4]:
data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
train_data, test_data = train_test_split(data)

In [6]:
checkpoint = 'bert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint)
collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors='tf')

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
le = LabelEncoder()
train_data['Category'] = le.fit_transform(train_data['Category'])
test_data['Category'] = le.fit_transform(test_data['Category'])

In [8]:
train_data.head()

Unnamed: 0,Category,Message
1901,0,And miss vday the parachute and double coins??...
1230,0,Jus ans me lar. U'll noe later.
4274,0,Kind of. Just missed train cos of asthma attac...
3069,0,Boy you best get yo ass out here quick
1216,0,that would be good … I'll phone you tomo lunch...


In [9]:
train_dataset = Dataset.from_pandas(train_data)
test_dataset = Dataset.from_pandas(test_data)
DATASET = DatasetDict()
DATASET['train'] = train_dataset
DATASET['test'] = test_dataset

In [10]:
DATASET

DatasetDict({
    train: Dataset({
        features: ['Category', 'Message', '__index_level_0__'],
        num_rows: 4179
    })
    test: Dataset({
        features: ['Category', 'Message', '__index_level_0__'],
        num_rows: 1393
    })
})

In [11]:
def tokenize_function(data):
    return tokenizer(data['Message'], truncation=True)

In [12]:
tokenized_dataset = DATASET.map(tokenize_function, batched=True)

tf_train_dataset = tokenized_dataset['train'].to_tf_dataset(
columns=['input_ids', 'token_type_ids', 'attention_mask'],
label_cols=['Category'],
collate_fn=collator,
batch_size=8,
shuffle=True
)

tf_test_dataset = tokenized_dataset['test'].to_tf_dataset(
columns=['input_ids', 'token_type_ids', 'attention_mask'],
label_cols=['Category'],
collate_fn=collator,
batch_size=8,
shuffle=False
)

  0%|          | 0/5 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [13]:
num_epochs = 3
num_train_steps = len(tf_train_dataset) * num_epochs
lr_scheduler = PolynomialDecay(initial_learning_rate=5e-5, end_learning_rate=0.0, decay_steps=num_train_steps)
opt = Adam(learning_rate=lr_scheduler)
loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

In [14]:
model.compile(optimizer=opt, loss=loss, metrics=['accuracy'])

In [15]:
model.fit(tf_train_dataset, validation_data=tf_test_dataset, epochs=num_epochs)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7ff110318b50>

In [17]:
preds = model.predict(tf_test_dataset)['logits']
class_preds = np.argmax(preds, axis=1)
report = classification_report(test_data['Category'], class_preds)
print(report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1228
           1       0.98      0.96      0.97       165

    accuracy                           0.99      1393
   macro avg       0.99      0.98      0.98      1393
weighted avg       0.99      0.99      0.99      1393

