### https://huggingface.co/google-bert/bert-base-uncased

In [1]:
# pip install transformers

>

# **Step 1:** Download the Pretrained BERT Model

In [2]:
from transformers import BertModel, BertTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
model = BertModel.from_pretrained("bert-base-uncased")

In [4]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

>

# **Step 2:** To Encode **Sentences**

In [5]:
text = "Replace me by any text you'd like."

In [6]:
encoded_input = tokenizer.encode(text, return_tensors='pt')

In [7]:
encoded_input # 101 is SOS token & 102 is EOS token (starting & ending of the sentense)

tensor([[ 101, 5672, 2033, 2011, 2151, 3793, 2017, 1005, 1040, 2066, 1012,  102]])

In [8]:
import torch

with torch.no_grad():
    output = model(encoded_input)
emebedding = output.last_hidden_state.mean(dim=1) # 12 rows, 768 columns --> 768 columns

In [9]:
emebedding.shape

torch.Size([1, 768])

>

# **Step 3:** create Spam Classifier using BERT encoding

### import dataset

In [10]:
import pandas as pd
df = pd.read_csv("spam.csv", encoding="ISO-8859-1")
df = df.rename(columns={"v2": "email", "v1": "target"})
df = df[["email", "target"]]
df.head()

Unnamed: 0,email,target
0,"Go until jurong point, crazy.. Available only ...",ham
1,Ok lar... Joking wif u oni...,ham
2,Free entry in 2 a wkly comp to win FA Cup fina...,spam
3,U dun say so early hor... U c already then say...,ham
4,"Nah I don't think he goes to usf, he lives aro...",ham


### data cleaning

In [11]:
import re
def clean(str_):
  str_ = str_.lower()
  str_ = re.sub('[^a-z0-9]+', " ", str_)
  return str_

In [12]:
df["cleaned email"] = df["email"].apply(clean)
df.head()

Unnamed: 0,email,target,cleaned email
0,"Go until jurong point, crazy.. Available only ...",ham,go until jurong point crazy available only in ...
1,Ok lar... Joking wif u oni...,ham,ok lar joking wif u oni
2,Free entry in 2 a wkly comp to win FA Cup fina...,spam,free entry in 2 a wkly comp to win fa cup fina...
3,U dun say so early hor... U c already then say...,ham,u dun say so early hor u c already then say
4,"Nah I don't think he goes to usf, he lives aro...",ham,nah i don t think he goes to usf he lives arou...


In [13]:
sentences = df["cleaned email"].tolist()

In [None]:
import torch

def get_embendding(sentences):
  # encoded_input = tokenizer.encode(sentences, return_tensors='pt', padding=True, truncation=True)
  input = tokenizer(sentences, return_tensors='pt', padding=True, truncation=True, max_length=128)

  with torch.no_grad():
    # output = model(encoded_input)
    output = model(**input)

  embendding = output.last_hidden_state.mean(dim=1)
  return embendding

In [15]:
x = get_embendding(sentences[:500])

In [16]:
# this conversion will be required when using BERT embendding within classical ML approach
x1 = x.cpu().detach().numpy() # tensor to numpy formate

In [17]:
y = df["target"].iloc[:500]

In [18]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x1, y, test_size=0.2, random_state=42)

In [19]:
from sklearn.linear_model import LogisticRegression

spam_cls = LogisticRegression()

In [20]:
spam_cls.fit(x_train, y_train)

In [21]:
y_pred = spam_cls.predict(x_test)

In [22]:
from sklearn.metrics import accuracy_score

accuracy_score(y_test, y_pred)*100

98.0