In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Import libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow import keras 
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from keras.models import Model

import transformers
from transformers import BertTokenizer, TFBertModel

from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the SMS spam collection CSV into a DataFrame, decoding the file as ISO-8859-1 (Latin-1).
df = pd.read_csv("../input/sms-spam-collection-dataset/spam.csv", encoding="ISO-8859-1")

In [None]:
df.head()

In [None]:
# Remove the three 'Unnamed: N' columns so only v1 (label) and v2 (message) remain.
# errors='ignore' keeps the cell idempotent: re-running it after the columns are
# already gone no longer raises KeyError.
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')

In [None]:
# Give the raw v1/v2 columns descriptive names: v1 -> Class (label), v2 -> Text (message).
column_names = {'v1': 'Class', 'v2': 'Text'}
df = df.rename(columns=column_names)

In [None]:
# Encode the string labels numerically: 'ham' -> 0, 'spam' -> 1.
# NOTE: Series.map yields NaN for values that are not keys of the dict, so running
# this cell a second time (when the column already holds 0/1) turns every label into NaN.
df['Class'] = df['Class'].map({'ham':0, 'spam':1})
df.head()

In [None]:
# Count the messages per class; the counts show how imbalanced the target is.
df['Class'].value_counts()

In [None]:
sns.set(style="darkgrid", font_scale=1.2)
# Pass the column with the x= keyword: recent seaborn versions interpret the first
# positional argument as `data`, so a bare Series is no longer accepted as the
# x variable there. The keyword form works on both old and new versions.
sns.countplot(x=df["Class"]).set_title("Number of ham and spam messages")
plt.show()

In [None]:
# Record the character count of each raw message in a new 'length' column.
df['length'] = df['Text'].map(len)
df.head()

Now let's look at the relationship between SMS length and the target class.

In [None]:
# Compare the message-length distributions of ham (Class 0) and spam (Class 1).
_, ax = plt.subplots(figsize=(10, 4))

ham_lengths = df.loc[df.Class == 0, "length"]
spam_lengths = df.loc[df.Class == 1, "length"]

# fill= replaces the deprecated shade= argument. The data is passed by keyword and
# the target axes explicitly, so the call does not depend on positional-argument
# order or on which axes happens to be current.
# Only the ham curve is clipped to (-50, 250), as in the original plot.
sns.kdeplot(x=ham_lengths, fill=True, label="Ham", clip=(-50, 250), ax=ax)
sns.kdeplot(x=spam_lengths, fill=True, label="Spam", ax=ax)

ax.set(
    xlabel="Length",
    ylabel="Density",
    title="Length of messages.",
)
ax.legend(loc="upper right")
plt.show()

Text preprocessing and cleaning 

In [None]:
# NLTK's English stop-word list; remove_stopwords() filters tokens against it.
stop_words = stopwords.words('english')
# Show every 10th stop word as a quick sample of the list.
print(stop_words[::10])

# Shared Porter stemmer instance used by stemmer().
porter = PorterStemmer()

Functions for data cleaning

In [None]:
def clean_text(words):
    """Keep only ASCII letters, lowercase them, and collapse whitespace.

    Every character that is not a-z or A-Z is replaced by a space; the result
    is lowercased and re-joined with single spaces between the remaining words.
    """
    letters_only = re.sub("[^a-zA-Z]"," ", words)
    return " ".join(letters_only.lower().split())

def remove_stopwords(text):
    """Lowercase each whitespace-separated word and drop those found in `stop_words`.

    Returns the surviving words joined by single spaces.
    """
    kept = []
    for token in text.split():
        lowered = token.lower()
        if lowered not in stop_words:
            kept.append(lowered)
    return " ".join(kept)

def stemmer(stem_text):
    """Stem every whitespace-separated word with the shared `porter` stemmer.

    Returns the stemmed words joined by single spaces.
    """
    return " ".join(map(porter.stem, stem_text.split()))

In [None]:
# Run the three cleaning passes in order: strip non-letters, drop stop words, stem.
df['Text'] = (
    df['Text']
    .apply(clean_text)
    .apply(remove_stopwords)
    .apply(stemmer)
)

In [None]:
df.head()

In [None]:
X = df['Text']
y = df['Class']

In [None]:
# Hold out 20% of the messages for testing; random_state=0 fixes the shuffle.
# NOTE: the Hugging Face datasets built further down come from their own
# train_test_split call on df, so y_test describes the rows of *this* split only.
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state = 0)

In [None]:
import pandas as pd
import datasets
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split

# Seed both splits so the train/validation/test partitions (and therefore every
# reported score) are reproducible across kernel restarts. Using the same seed and
# test_size as the earlier X/y split also means the held-out test rows coincide
# with X_test / y_test.
SPLIT_SEED = 0

# The Trainer expects the target column to be called "labels".
labelled = df.rename(columns={"Class": "labels"})

# 80/20 train/test, then carve 20% of the training part off as validation.
train, test = train_test_split(labelled, test_size=0.2, random_state=SPLIT_SEED)
train, val = train_test_split(train, test_size=0.2, random_state=SPLIT_SEED)

# Convert each pandas split into a Hugging Face Dataset.
trainh = Dataset.from_pandas(train)
testh = Dataset.from_pandas(test)
valh = Dataset.from_pandas(val)

# Bundle the splits under the keys used by the training cells below.
ds = DatasetDict()
ds['train'] = trainh
ds['test'] = testh
ds['validation'] = valh

print(ds)

## BERT Base

In [None]:
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# tokenizer

In [None]:
# bert_model = TFBertModel.from_pretrained('bert-base-uncased')

The function below encodes our dataset with the BERT tokenizer, using a maximum sentence length of 64 (`maxlen`).

In [None]:
# def encode(text, maxlen):
#     input_ids=[]
#     attention_masks=[]

#     for row in text:
#         encoded = tokenizer.encode_plus(
#             row,
#             add_special_tokens=True,
#             max_length=maxlen,
#             pad_to_max_length=True,
#             return_attention_mask=True,
#         )
#         input_ids.append(encoded['input_ids'])
#         attention_masks.append(encoded['attention_mask'])

#     return np.array(input_ids),np.array(attention_masks)

In [None]:
# X_train_input_ids, X_train_attention_masks = encode(X_train.values, maxlen=64)
# X_test_input_ids, X_test_attention_masks = encode(X_test.values, maxlen=64)

In [None]:
# def build_model(bert_model):
#     input_word_ids = tf.keras.Input(shape=(64,),dtype='int32')
#     attention_masks = tf.keras.Input(shape=(64,),dtype='int32')

#     sequence_output = bert_model([input_word_ids,attention_masks])
#     output = sequence_output[1]
#     output = tf.keras.layers.Dense(32,activation='relu')(output)
#     output = tf.keras.layers.Dropout(0.2)(output)
#     output = tf.keras.layers.Dense(1,activation='sigmoid')(output)

#     model = tf.keras.models.Model(inputs = [input_word_ids,attention_masks], outputs = output)
#     model.compile(Adam(lr=1e-5), loss='binary_crossentropy', metrics=['accuracy'])

#     return model

In [None]:
# model = build_model(bert_model)
# model.summary()

In [None]:
# class_weight = {0: 1, 1: 8}

In [None]:
# history = model.fit(
#     [X_train_input_ids, X_train_attention_masks],
#     y_train,
#     batch_size=32,
#     epochs=5,
#     validation_data=([X_test_input_ids, X_test_attention_masks], y_test),
#     class_weight=class_weight)

In [None]:
# def plot_graphs(history, string):
#     plt.plot(history.history[string])
#     plt.plot(history.history['val_'+string])
#     plt.xlabel("Epochs")
#     plt.ylabel(string)
#     plt.legend([string, 'val_'+string])
#     plt.show()

In [None]:
# loss, accuracy = model.evaluate([X_test_input_ids, X_test_attention_masks], y_test)
# print('Test accuracy :', accuracy)

In [None]:
from transformers import AutoTokenizer, DataCollatorWithPadding

# Checkpoint shared by the tokenizer and the model loaded in the cells below.
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(example):
    """Tokenize the "Text" field of a batch with truncation enabled.

    The module-level `tokenizer` is looked up when the function is called.
    """
    batch_text = example["Text"]
    return tokenizer(batch_text, truncation=True)


# Collator that pads the examples of each batch using this tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# Tokenize every split of `ds` in batches.
tokenized_datasets = ds.map(tokenize_function, batched=True)

In [None]:
from transformers import TrainingArguments
import os
os.environ["WANDB_DISABLED"] = "true"

training_args = TrainingArguments("test-trainer")

In [None]:
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

In [None]:
model

In [None]:
from transformers import Trainer

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [None]:
trainer.train()

In [None]:
predictions = trainer.predict(tokenized_datasets["test"])
print(predictions.predictions.shape, predictions.label_ids.shape)

In [None]:
import numpy as np
preds = np.argmax(predictions.predictions, axis=-1)

In [None]:
from sklearn.metrics import accuracy_score
# Score against the labels of the rows that were actually predicted.
# `predictions` was computed on tokenized_datasets["test"], whereas y_test comes
# from a separate train_test_split call on X/y, so comparing preds with y_test
# pairs predictions with labels of unrelated rows. predictions.label_ids holds
# the true labels in the same order as preds. Arguments are (y_true, y_pred).
x = accuracy_score(predictions.label_ids, preds)
print(x)

## BERT Large

In [None]:
from transformers import AutoTokenizer, DataCollatorWithPadding

# Checkpoint shared by the tokenizer and the model loaded in the cells below.
checkpoint = "bert-large-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(example):
    """Tokenize the "Text" field of a batch with truncation enabled.

    The module-level `tokenizer` is looked up when the function is called.
    """
    batch_text = example["Text"]
    return tokenizer(batch_text, truncation=True)


# Collator that pads the examples of each batch using this tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# Tokenize every split of `ds` in batches.
tokenized_datasets = ds.map(tokenize_function, batched=True)

In [None]:
from transformers import TrainingArguments
import os
os.environ["WANDB_DISABLED"] = "true"

training_args = TrainingArguments("test-trainer")

In [None]:
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

In [None]:
model

In [None]:
from transformers import Trainer

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [None]:
trainer.train()

In [None]:
predictions = trainer.predict(tokenized_datasets["test"])
print(predictions.predictions.shape, predictions.label_ids.shape)

In [None]:
import numpy as np
preds = np.argmax(predictions.predictions, axis=-1)

In [None]:
from sklearn.metrics import accuracy_score
# Score against the labels of the rows that were actually predicted.
# `predictions` was computed on tokenized_datasets["test"], whereas y_test comes
# from a separate train_test_split call on X/y, so comparing preds with y_test
# pairs predictions with labels of unrelated rows. predictions.label_ids holds
# the true labels in the same order as preds. Arguments are (y_true, y_pred).
x = accuracy_score(predictions.label_ids, preds)
print(x)

## BERT Medium

In [None]:
from transformers import AutoTokenizer, DataCollatorWithPadding

# This section fine-tunes prajjwal1/bert-medium (see the model-loading cell below),
# so the tokenizer must come from the same checkpoint. The variable previously still
# pointed at "bert-large-uncased", a copy-paste leftover from the previous section.
checkpoint = "prajjwal1/bert-medium"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(example):
    """Tokenize the "Text" field of a batch with truncation enabled.

    The module-level `tokenizer` is looked up when the function is called.
    """
    return tokenizer(example["Text"], truncation=True)


# Tokenize every split of `ds` in batches.
tokenized_datasets = ds.map(tokenize_function, batched=True)
# Collator that pads the examples of each batch using this tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
from transformers import TrainingArguments
import os
os.environ["WANDB_DISABLED"] = "true"

training_args = TrainingArguments("test-trainer")

In [None]:
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained("prajjwal1/bert-medium", num_labels=2)

In [None]:
model

In [None]:
from transformers import Trainer

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [None]:
trainer.train()

In [None]:
predictions = trainer.predict(tokenized_datasets["test"])
print(predictions.predictions.shape, predictions.label_ids.shape)

In [None]:
import numpy as np
preds = np.argmax(predictions.predictions, axis=-1)

In [None]:
from sklearn.metrics import accuracy_score
# Score against the labels of the rows that were actually predicted.
# `predictions` was computed on tokenized_datasets["test"], whereas y_test comes
# from a separate train_test_split call on X/y, so comparing preds with y_test
# pairs predictions with labels of unrelated rows. predictions.label_ids holds
# the true labels in the same order as preds. Arguments are (y_true, y_pred).
x = accuracy_score(predictions.label_ids, preds)
print(x)

## BERT Small

In [None]:
from transformers import AutoTokenizer, DataCollatorWithPadding

# Checkpoint shared by the tokenizer and the model loaded in the cells below.
checkpoint = "prajjwal1/bert-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(example):
    """Tokenize the "Text" field of a batch with truncation enabled.

    The module-level `tokenizer` is looked up when the function is called.
    """
    batch_text = example["Text"]
    return tokenizer(batch_text, truncation=True)


# Collator that pads the examples of each batch using this tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# Tokenize every split of `ds` in batches.
tokenized_datasets = ds.map(tokenize_function, batched=True)

In [None]:
from transformers import TrainingArguments
import os
os.environ["WANDB_DISABLED"] = "true"

training_args = TrainingArguments("test-trainer")

In [None]:
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

In [None]:
model

In [None]:
from transformers import Trainer

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [None]:
trainer.train()

In [None]:
predictions = trainer.predict(tokenized_datasets["test"])
print(predictions.predictions.shape, predictions.label_ids.shape)

In [None]:
import numpy as np
preds = np.argmax(predictions.predictions, axis=-1)

In [None]:
from sklearn.metrics import accuracy_score
# Score against the labels of the rows that were actually predicted.
# `predictions` was computed on tokenized_datasets["test"], whereas y_test comes
# from a separate train_test_split call on X/y, so comparing preds with y_test
# pairs predictions with labels of unrelated rows. predictions.label_ids holds
# the true labels in the same order as preds. Arguments are (y_true, y_pred).
x = accuracy_score(predictions.label_ids, preds)
print(x)

## BERT Tiny

In [None]:
from transformers import AutoTokenizer, DataCollatorWithPadding

# Checkpoint shared by the tokenizer and the model loaded in the cells below.
checkpoint = "prajjwal1/bert-tiny"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(example):
    """Tokenize the "Text" field of a batch with truncation enabled.

    The module-level `tokenizer` is looked up when the function is called.
    """
    batch_text = example["Text"]
    return tokenizer(batch_text, truncation=True)


# Collator that pads the examples of each batch using this tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# Tokenize every split of `ds` in batches.
tokenized_datasets = ds.map(tokenize_function, batched=True)

In [None]:
from transformers import TrainingArguments
import os
os.environ["WANDB_DISABLED"] = "true"

training_args = TrainingArguments("test-trainer")

In [None]:
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

In [None]:
model

In [None]:
from transformers import Trainer

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [None]:
trainer.train()

In [None]:
predictions = trainer.predict(tokenized_datasets["test"])
print(predictions.predictions.shape, predictions.label_ids.shape)

In [None]:
import numpy as np
preds = np.argmax(predictions.predictions, axis=-1)

In [None]:
from sklearn.metrics import accuracy_score
# Score against the labels of the rows that were actually predicted.
# `predictions` was computed on tokenized_datasets["test"], whereas y_test comes
# from a separate train_test_split call on X/y, so comparing preds with y_test
# pairs predictions with labels of unrelated rows. predictions.label_ids holds
# the true labels in the same order as preds. Arguments are (y_true, y_pred).
x = accuracy_score(predictions.label_ids, preds)
print(x)

## BERT Mini

In [None]:
from transformers import AutoTokenizer, DataCollatorWithPadding

# Checkpoint shared by the tokenizer and the model loaded in the cells below.
checkpoint = "prajjwal1/bert-mini"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(example):
    """Tokenize the "Text" field of a batch with truncation enabled.

    The module-level `tokenizer` is looked up when the function is called.
    """
    batch_text = example["Text"]
    return tokenizer(batch_text, truncation=True)


# Collator that pads the examples of each batch using this tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# Tokenize every split of `ds` in batches.
tokenized_datasets = ds.map(tokenize_function, batched=True)

In [None]:
from transformers import TrainingArguments
import os
os.environ["WANDB_DISABLED"] = "true"

training_args = TrainingArguments("test-trainer")

In [None]:
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

In [None]:
model

In [None]:
from transformers import Trainer

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [None]:
trainer.train()

In [None]:
predictions = trainer.predict(tokenized_datasets["test"])
print(predictions.predictions.shape, predictions.label_ids.shape)

In [None]:
import numpy as np
preds = np.argmax(predictions.predictions, axis=-1)

In [None]:
from sklearn.metrics import accuracy_score
# Score against the labels of the rows that were actually predicted.
# `predictions` was computed on tokenized_datasets["test"], whereas y_test comes
# from a separate train_test_split call on X/y, so comparing preds with y_test
# pairs predictions with labels of unrelated rows. predictions.label_ids holds
# the true labels in the same order as preds. Arguments are (y_true, y_pred).
x = accuracy_score(predictions.label_ids, preds)
print(x)