In [1]:
import google.colab
google.colab.drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
df = pd.read_csv('/content/drive/My Drive/classify_emails/emails.csv',sep=';' )
df

Unnamed: 0,Spam,Message
0,0,Please call me at 8
1,1,Free money is available for you
2,0,I study he studies they are students I studied...
3,1,I am working at office now to 9 evening
4,0,U dun say so early hor... U c already then say...
...,...,...
5222,0,"It‘s reassuring, in this crazy world."
5223,0,Oh... Okie lor...We go on sat...
5224,1,You are awarded a SiPix Digital Camera! call 0...
5225,0,"Hey chief, can you give me a bell when you get..."


In [3]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string
import re

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [4]:
def preprocess_text(text):

   tokens = word_tokenize(text.lower())
   tokens = [token for token in tokens if token not in string.punctuation]

   stop_words = stopwords.words('english')
   tokens = [token for token in tokens if token not in stop_words]

   stemmer = PorterStemmer()
   tokens = [stemmer.stem(token) for token in tokens]

   preprocessed_text = ' '.join(tokens)
   preprocessed_text = re.sub(r'\d+', '', preprocessed_text)
   preprocessed_text = re.sub(r'http\s+|www\s+', '', preprocessed_text)

   return preprocessed_text

In [5]:
df['processed_Message'] = df['Message'].apply(preprocess_text)
df

Unnamed: 0,Spam,Message,processed_Message
0,0,Please call me at 8,pleas call
1,1,Free money is available for you,free money avail
2,0,I study he studies they are students I studied...,studi studi student studi yesterday
3,1,I am working at office now to 9 evening,work offic even
4,0,U dun say so early hor... U c already then say...,u dun say earli hor ... u c alreadi say ...
...,...,...,...
5222,0,"It‘s reassuring, in this crazy world.",‘ reassur crazi world
5223,0,Oh... Okie lor...We go on sat...,oh ... oki lor ... go sat ...
5224,1,You are awarded a SiPix Digital Camera! call 0...,award sipix digit camera call landlin deliver...
5225,0,"Hey chief, can you give me a bell when you get...",hey chief give bell get need talk royal visit ...


In [6]:
df[df['Spam']==1].count()

Unnamed: 0,0
Spam,675
Message,675
processed_Message,675


In [7]:
df[df['Spam']==0].count()

Unnamed: 0,0
Spam,4552
Message,4552
processed_Message,4552


In [8]:
spam_df = df[df['Spam']==1]
not_spam_df = df[df['Spam']==0]

max_count = len(not_spam_df)

resampled_spam_df = spam_df.sample(n=max_count, random_state=42, replace=True)

balanced_df = pd.concat([not_spam_df, resampled_spam_df])

balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)

In [9]:
messages = balanced_df['processed_Message']
labels = balanced_df['Spam']

len(messages), len(labels)

(9104, 9104)

In [10]:
model_name = 'bert-base-uncased'

In [11]:
from transformers import BertTokenizer
bert_tokenizer = BertTokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [12]:
text = "Don't like it!"
tokens = bert_tokenizer.tokenize(text)
print(tokens)

['don', "'", 't', 'like', 'it', '!']


In [13]:
max_len = 10

bert_inputs= bert_tokenizer.encode_plus(text, add_special_tokens=True,
                                         max_length=max_len, pad_to_max_length=True,
                                         return_attention_mask=True,
                                         truncation=True)

input_ids = bert_inputs['input_ids']
token_type_ids = bert_inputs['token_type_ids']
attention_mask = bert_inputs['attention_mask']

tokens = bert_tokenizer.convert_ids_to_tokens(input_ids)

print('input_ids: ', input_ids)
print('token_type_ids: ', token_type_ids)
print('attention_mask: ', attention_mask)
print('tokens: ', tokens)

input_ids:  [101, 2123, 1005, 1056, 2066, 2009, 999, 102, 0, 0]
token_type_ids:  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
attention_mask:  [1, 1, 1, 1, 1, 1, 1, 1, 0, 0]
tokens:  ['[CLS]', 'don', "'", 't', 'like', 'it', '!', '[SEP]', '[PAD]', '[PAD]']




In [14]:
from keras.preprocessing.sequence import pad_sequences

input_ids = pad_sequences([input_ids], maxlen=max_len, padding="post")
token_type_ids = pad_sequences([token_type_ids], maxlen=max_len, padding="post")
attention_mask = pad_sequences([attention_mask], maxlen=max_len, padding="post")

In [15]:
print("Tokens: ", bert_tokenizer.decode(input_ids[0]))
print("Tokens IDs: ", input_ids[0])
print("Tokens Types IDs: ", token_type_ids)
print("Attention Mask: ", attention_mask)

Tokens:  [CLS] don't like it! [SEP] [PAD] [PAD]
Tokens IDs:  [ 101 2123 1005 1056 2066 2009  999  102    0    0]
Tokens Types IDs:  [[0 0 0 0 0 0 0 0 0 0]]
Attention Mask:  [[1 1 1 1 1 1 1 1 0 0]]


In [16]:
max_len = 32

input_ids = []
attention_mask = []

for text in messages:
    bert_inputs = bert_tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=max_len,
        truncation=True,
        return_attention_mask=True
    )

    # Append inside the loop
    input_ids.append(bert_inputs['input_ids'])
    attention_mask.append(bert_inputs['attention_mask'])

In [17]:
messages[11]

'how pain dear r u smile'

In [18]:
input_ids[11]

[101, 2129, 3255, 6203, 1054, 1057, 2868, 102]

In [19]:
attention_mask[11]

[1, 1, 1, 1, 1, 1, 1, 1]

In [20]:
from keras.preprocessing.sequence import pad_sequences

input_ids = pad_sequences(input_ids, maxlen=max_len, padding="post")
attention_mask = pad_sequences(attention_mask, maxlen=max_len, padding="post")

In [21]:
input_ids[11]

array([ 101, 2129, 3255, 6203, 1054, 1057, 2868,  102,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0],
      dtype=int32)

In [22]:
input_ids = np.array(input_ids)
attention_mask = np.array(attention_mask)
labels = np.array(labels)

In [23]:
len(input_ids), len(attention_mask), len(labels)

(9104, 9104, 9104)

In [24]:
from transformers import TFBertForSequenceClassification
num_classes = 2

# model_name is identified before (model_name = 'bert-base-uncased')
bert_model = TFBertForSequenceClassification.from_pretrained(
    model_name, num_labels=num_classes)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [25]:
bert_model.summary()

Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  109482240 
                                                                 
 dropout_37 (Dropout)        multiple                  0 (unused)
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
Total params: 109483778 (417.65 MB)
Trainable params: 109483778 (417.65 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [26]:
import tensorflow as tf
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5, epsilon=1e-08)

bert_model.compile(loss=loss, optimizer=optimizer, metrics=[metric])
bert_model.summary()

Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  109482240 
                                                                 
 dropout_37 (Dropout)        multiple                  0 (unused)
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
Total params: 109483778 (417.65 MB)
Trainable params: 109483778 (417.65 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [27]:
from sklearn.model_selection import train_test_split
train_input, val_input, train_mask, val_mask, train_labels, val_labels = train_test_split(
    input_ids, attention_mask, labels, test_size=0.2)

In [28]:
epochs = 4
batch_size = 32
history = bert_model.fit(
    [train_input, train_mask],
    train_labels,
    validation_data=([val_input, val_mask], val_labels),
    epochs=epochs,
    batch_size=batch_size
)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [29]:
train_accuracy = history.history['accuracy']
val_accuracy = history.history['val_accuracy']
print("Train Accuracy:", round(100*train_accuracy[-1],2))
print("Validation Accuracy:", round(100*val_accuracy[-1],2))

Train Accuracy: 99.99
Validation Accuracy: 99.78


In [30]:
model_save_path='/content/drive/My Drive/classify_emails/bert_model.h5'
bert_model.save_weights(model_save_path)

In [31]:
model_save_path='/content/drive/My Drive/classify_emails/bert_model.keras'
bert_model.save_weights(model_save_path)

In [32]:
trained_model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased',num_labels=2)
trained_model.load_weights(model_save_path)

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [34]:
from sklearn.metrics import classification_report
target_names=['Not Spam','Spam']
preds = trained_model.predict([val_input,val_mask], batch_size=32)
pred_labels = preds.logits.argmax(axis=1)



In [36]:
print('Classification Report')
print(classification_report(val_labels, pred_labels, target_names=target_names))

Classification Report
              precision    recall  f1-score   support

    Not Spam       1.00      1.00      1.00       920
        Spam       1.00      1.00      1.00       901

    accuracy                           1.00      1821
   macro avg       1.00      1.00      1.00      1821
weighted avg       1.00      1.00      1.00      1821



In [40]:
text="free money"
input_ids_sent=[]
attention_masks_sent=[]

bert_inp=bert_tokenizer.encode_plus(text,add_special_tokens = True,
                                    max_length =max_len,
                                    truncation=True,
                                    return_attention_mask = True)
input_ids_sent.append(bert_inp['input_ids'])
attention_masks_sent.append(bert_inp['attention_mask'])
input_ids_sent = pad_sequences(input_ids_sent, maxlen=max_len, padding='post')
attention_masks_sent = pad_sequences(attention_masks_sent, maxlen=max_len, padding='post')
input_ids_sent=np.array(input_ids_sent)
attention_masks_sent=np.array(attention_masks_sent)

In [41]:
predictions =trained_model.predict([input_ids_sent,attention_masks_sent])
print(predictions)

TFSequenceClassifierOutput(loss=None, logits=array([[-3.2202559,  2.9789762]], dtype=float32), hidden_states=None, attentions=None)


In [42]:
c=np.argmax(predictions.logits[0])
if c==0:
    print("The text is predicted to be of class: Not Spam")
else:
    print("The text is predicted to be of class: Spam")

The text is predicted to be of class: Spam


In [50]:
text = "don't be late we have an important meeting tomorrow"

inputs_ids = []
attention_mask = []

bert_inp = bert_tokenizer.encode_plus(text, add_special_tokens=True,
                                      max_length=max_len,
                                      truncation=True,
                                      return_attention_mask=True)
inputs_ids.append(bert_inp['input_ids'])
attention_mask.append(bert_inp['attention_mask'])

inputs_ids = pad_sequences(inputs_ids, maxlen=max_len, padding='post')
attention_mask = pad_sequences(attention_mask, maxlen=max_len, padding='post')

inputs_ids = np.array(inputs_ids)
attention_mask = np.array(attention_mask)

predictions = trained_model.predict([inputs_ids, attention_mask])
print(predictions)

TFSequenceClassifierOutput(loss=None, logits=array([[ 3.7890096, -3.8205094]], dtype=float32), hidden_states=None, attentions=None)


In [51]:
predict = np.argmax(predictions.logits[0])
if predict==0:
    print("The text is predicted to be of class: Not Spam")
else:
    print("The text is predicted to be of class: Spam")


The text is predicted to be of class: Not Spam
