## Import libraries

In [1]:
import string
import nltk
nltk.download('stopwords')
nltk.download('punkt')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


## Read data

In [3]:
# Relative path to the CSV file; it is resolved against the kernel's working directory.
DATASET_PATH = './data/2cls_spam_text_cls.csv'
# Load the CSV into a DataFrame. Later cells read its 'Message' and 'Category' columns.
df = pd.read_csv(DATASET_PATH)

In [4]:
# Show the loaded DataFrame using the notebook's rich display.
display(df)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


## Preprocessing

In [5]:
# Pull the two columns out as plain Python lists: the raw message texts and
# their category labels, aligned by row.
messages = df['Message'].values.tolist()
labels = df['Category'].values.tolist()

In [8]:
def lower_case(text):
  """Return a copy of ``text`` with all cased characters converted to lowercase."""
  lowered = text.lower()
  return lowered

def puntuation_removal(text):
  """Return ``text`` with every character of ``string.punctuation`` deleted.

  Only the ASCII punctuation characters listed in ``string.punctuation`` are
  removed; all other characters (including whitespace) are kept as they are.
  """
  # With two empty strings, the third argument of maketrans lists characters
  # that translate() maps to None, i.e. deletes.
  return text.translate(str.maketrans('', '', string.punctuation))

def tokenizer(text):
  """Split ``text`` into tokens by delegating to ``nltk.word_tokenize``."""
  tokens = nltk.word_tokenize(text)
  return tokens

def remove_stopwords(tokens):
  """Drop English stop words from a sequence of tokens.

  Args:
    tokens: Iterable of token strings. Matching is by exact string equality,
      so the caller decides on casing before calling this function.

  Returns:
    A new list containing, in their original order, the tokens that are not
    in NLTK's English stop-word list.
  """
  # The corpus reader returns a list; converting it to a frozenset makes each
  # membership test O(1) instead of a linear scan repeated for every token.
  stop_words = frozenset(nltk.corpus.stopwords.words('english'))

  return [token for token in tokens if token not in stop_words]

def stemming(tokens):
  """Return a list with the Porter stem of each token, in the same order."""
  # A single stemmer instance is created per call and its bound method reused.
  stem = nltk.PorterStemmer().stem
  return list(map(stem, tokens))

In [6]:
def preprocess_text(text):
  """Turn one raw message into a list of normalized tokens.

  The stages run in this fixed order: lowercase, strip punctuation, tokenize,
  remove stop words, stem.

  Args:
    text: Raw message string.

  Returns:
    The list of tokens produced by the final stemming stage.
  """
  # NOTE(review): punctuation is removed before stop-word filtering, so a
  # contraction such as "don't" reaches the filter as "dont" and may no longer
  # match the stop-word list - confirm this ordering is intended.
  text = lower_case(text)
  text = puntuation_removal(text)
  tokens = tokenizer(text)
  tokens = remove_stopwords(tokens)
  tokens = stemming(tokens)

  return tokens

In [9]:
# Tokenize every raw message. The input is read from `df` rather than from
# `messages` itself, so re-running this cell does not feed already-tokenized
# lists back into `preprocess_text` (which expects strings).
messages = [preprocess_text(message) for message in df['Message'].values.tolist()]

## Extract features

In [10]:
def create_dictionary(messages):
  """Collect the distinct tokens of all messages in first-seen order.

  Args:
    messages: Iterable of token sequences, one per message. Tokens must be
      hashable (they are strings in this notebook).

  Returns:
    A list in which every token appears exactly once, ordered by first
    occurrence. The position of a token is its feature index.
  """
  dictionary = []
  # Companion set for O(1) "already seen?" checks; testing membership on the
  # growing list itself would make the whole build quadratic.
  seen = set()
  for tokens in messages:
    for token in tokens:
      if token not in seen:
        seen.add(token)
        dictionary.append(token)

  return dictionary

In [11]:
# Build the vocabulary from the tokenized messages; the position of a token in
# this list is the column it occupies in the feature vectors.
dictionary = create_dictionary(messages)

In [12]:
def create_features(tokens, dictionary):
  """Build a bag-of-words count vector for one tokenized message.

  Args:
    tokens: Iterable of hashable tokens.
    dictionary: Sequence of vocabulary tokens. Entry ``i`` of the result counts
      occurrences of ``dictionary[i]``.

  Returns:
    A 1-D float array of length ``len(dictionary)``. Tokens that are not in
    ``dictionary`` are ignored.
  """
  features = np.zeros(len(dictionary))

  # Build token -> position once instead of scanning the list twice per token
  # (`in` + `.index`). Filling the map from the back means the first occurrence
  # is written last and wins, matching what `list.index` would return.
  index_of = dict(zip(reversed(dictionary), reversed(range(len(dictionary)))))

  for token in tokens:
    position = index_of.get(token)
    if position is not None:
      features[position] += 1

  return features

In [13]:
# Stack one count vector per message into a dense 2-D array with one row per
# message and one column per dictionary entry.
X = np.array([create_features(tokens, dictionary) for tokens in messages])

## Label encoder

In [14]:
# Encode the category labels as integer class ids. `le` is kept around because
# `predict` later uses it to map ids back to labels.
le = LabelEncoder()
y = le.fit_transform(labels)

## Split data

In [15]:
# Fraction of the FULL dataset held out for validation in the first split.
VAL_SIZE = 0.2
# Fraction of what REMAINS after the validation split (not of the full dataset)
# held out for testing: 0.125 * 0.8 = 0.1 of all samples.
TEST_SIZE = 0.125
# Shared seed so both shuffles are reproducible.
SEED = 0

# First split: carve the validation set out of the full data.
X_train, X_val, y_train, y_val = train_test_split(X, y,
                                                   test_size=VAL_SIZE,
                                                   shuffle=True,
                                                   random_state=SEED)

# Second split: carve the test set out of the remaining training portion,
# overwriting X_train / y_train with the final training set.
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train,
                                                    test_size=TEST_SIZE,
                                                    shuffle=True,
                                                    random_state=SEED)

## Train

In [16]:
# Fit a Gaussian Naive Bayes classifier on the training split only.
# NOTE(review): the features are word counts; whether a Gaussian likelihood is
# the best fit for them (vs. a multinomial variant) is worth confirming.
model = GaussianNB()
model.fit(X_train, y_train)

## Evaluate

In [17]:
# Predict class ids for the two held-out splits.
y_val_pred = model.predict(X_val)
y_test_pred = model.predict(X_test)

# Accuracy = share of samples whose predicted id equals the true id.
val_accuracy = accuracy_score(y_val, y_val_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f'Validation Accuracy: {val_accuracy}')
print(f'Test Accuracy: {test_accuracy}')

Validation Accuracy: 0.8816143497757848
Test Accuracy: 0.8602150537634409


## Predict

In [18]:
def predict(text, model, dictionary, label_encoder=None):
  """Classify one raw message and return its decoded class label.

  Args:
    text: Raw message string.
    model: Fitted classifier whose ``predict`` accepts a 2-D feature array.
    dictionary: Vocabulary list that was used to build the training features.
    label_encoder: Object whose ``inverse_transform`` maps predicted class ids
      back to labels. When omitted, the notebook-level ``le`` is used, which
      keeps existing three-argument calls working.

  Returns:
    The decoded label of the single prediction made for ``text``.
  """
  if label_encoder is None:
    # Backward-compatible fallback to the encoder fitted earlier in the notebook.
    label_encoder = le

  processed_text = preprocess_text(text)
  features = create_features(processed_text, dictionary)
  # The model works on batches, so present the single vector as one row.
  features = np.array(features).reshape(1, -1)
  prediction = model.predict(features)
  prediction_cls = label_encoder.inverse_transform(prediction)[0]

  return prediction_cls

In [19]:
# Try the trained classifier on a hand-written message and print the decoded label.
test_input = 'i am thinking the way to study AI which is the best for me'
prediction_cls = predict(test_input, model, dictionary)
print(f'Prediction: {prediction_cls}')

Prediction: ham


In [20]:
# Second hand-written message; `test_input` and `prediction_cls` are overwritten.
test_input = 'this is the voucher for you to buy ticket for tomorrow concert'
prediction_cls = predict(test_input, model, dictionary)
print(f'Prediction: {prediction_cls}')

Prediction: spam
