In [1]:
import string
import numpy as np
import gdown
import nltk
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt

In [2]:
# Fetch the NLTK resources this notebook relies on: the stop-word corpus
# (read in preprocessing_text) and the punkt / punkt_tab tokenizer data.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nguye\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nguye\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\nguye\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [3]:
# Human-readable class names.
# NOTE(review): not referenced by any visible cell — confirm it is still needed.
CATEGORY = ['HAM', 'SPAM']
# Local path the dataset CSV is read from.
DATASET_PATH = './text_message.csv'

# Fraction of the full dataset held out for validation (first split).
VAL_SIZE = 0.2
# Fraction of the rows that remain AFTER the validation split that is held out
# for testing (second split), i.e. 0.125 * 0.8 = 0.1 of the full dataset.
TEST_SIZE = 0.125
# Passed as random_state to both train_test_split calls.
SEED = 0

In [4]:
# Download the dataset from Google Drive. The destination is DATASET_PATH (the
# same constant pd.read_csv uses) so the download and load paths cannot drift.
gdown.download(id="1N7rk-kfnDFIGMeX0ROVTjKh71gcgx-7R", output=DATASET_PATH)

Downloading...
From: https://drive.google.com/uc?id=1N7rk-kfnDFIGMeX0ROVTjKh71gcgx-7R
To: C:\Users\nguye\OneDrive\Learning\AIO\AIO2024\Module 2\M02MC\070824 Wednesday_Text_Classification\text_message.csv
100%|███████████████████████████████████████████████████████████████████████████████| 486k/486k [00:00<00:00, 2.71MB/s]


'text_message.csv'

In [5]:
# Load the downloaded CSV into a DataFrame.
df = pd.read_csv(DATASET_PATH)

# Preview the first rows; the 'Category' and 'Message' columns are used below.
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
# Pull the class labels and the raw message texts out of the frame as plain
# Python lists (one entry per row, in row order).
labels = df['Category'].tolist()
messages = df['Message'].tolist()

In [7]:
# Translation table that deletes every ASCII punctuation character; built once
# instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Lazily-filled cache for the NLTK objects that are identical across calls.
# Filled on first use so that defining this cell never touches NLTK data.
_PREPROCESSING_CACHE = {}


def preprocessing_text(text):
    """Normalise one raw message into a list of stemmed tokens.

    Steps, in order: lowercase, strip ASCII punctuation, tokenise with
    ``nltk.word_tokenize``, drop English stop words, Porter-stem what is left.

    Args:
        text: The raw message as a ``str``.

    Returns:
        list[str]: Stemmed tokens in their original order (may be empty).

    Notes:
        The stop-word list and the stemmer are created on the first call and
        reused afterwards; stop words are held in a ``frozenset`` so the
        per-token membership test is a hash lookup rather than a list scan.
        The returned tokens are the same as when these objects were rebuilt
        on every call.
    """
    if not _PREPROCESSING_CACHE:
        _PREPROCESSING_CACHE['stop_words'] = frozenset(
            nltk.corpus.stopwords.words('english')
        )
        _PREPROCESSING_CACHE['stemmer'] = nltk.PorterStemmer()
    stop_words = _PREPROCESSING_CACHE['stop_words']
    stemmer = _PREPROCESSING_CACHE['stemmer']

    # NOTE(review): punctuation (including apostrophes) is removed before the
    # stop-word filter, so a contraction such as "don't" is compared as
    # "dont" — confirm the stop-word list still matches as intended.
    text = text.lower().translate(_PUNCTUATION_TABLE)
    tokens = nltk.word_tokenize(text)

    return [stemmer.stem(token) for token in tokens if token not in stop_words]

In [8]:
# Tokenise every raw message. The input is read from df rather than from
# `messages` itself so the cell is idempotent: re-running it no longer feeds
# already-tokenised lists back into preprocessing_text (which would fail on
# list.lower()).
messages = [preprocessing_text(message) for message in df['Message'].values.tolist()]

In [9]:
def create_dictionary(messages):
    """Build the vocabulary: every distinct token, in first-seen order.

    Args:
        messages: Iterable of token lists (one list of hashable tokens,
            e.g. ``str``, per message).

    Returns:
        list: Each distinct token exactly once, ordered by first appearance
        when scanning the messages front to back. Empty input gives ``[]``.
    """
    # dict keys are unique and keep insertion order, so dict.fromkeys
    # de-duplicates while preserving first-seen order. This replaces the
    # `token not in list` scan, which made the loop quadratic in vocabulary size.
    return list(dict.fromkeys(token for tokens in messages for token in tokens))

In [10]:
# Vocabulary of distinct tokens across all preprocessed messages; a token's
# position in this list is its column index in the feature vectors.
dictionary = create_dictionary(messages)

In [11]:
def create_features(tokens, dictionary):
    """Turn one token list into a bag-of-words count vector.

    Args:
        tokens: Iterable of hashable tokens (e.g. ``str``) for one message.
        dictionary: Sequence of vocabulary tokens; a token's position is its
            column index in the result.

    Returns:
        numpy.ndarray: 1-D float array of length ``len(dictionary)`` where
        entry ``i`` counts how often ``dictionary[i]`` occurs in ``tokens``.
        Tokens that are not in the vocabulary are ignored.
    """
    features = np.zeros(len(dictionary))

    # Build the token -> column lookup once per call so each token costs one
    # hash lookup instead of two linear scans (`in` followed by `.index`).
    # setdefault keeps the FIRST position of a repeated vocabulary entry,
    # matching list.index semantics.
    column_of = {}
    for column, word in enumerate(dictionary):
        column_of.setdefault(word, column)

    for token in tokens:
        column = column_of.get(token)
        if column is not None:
            features[column] += 1

    return features

In [12]:
# Stack one bag-of-words count vector per message into a dense
# (n_messages, vocabulary_size) feature matrix.
X = np.array([create_features(tokens, dictionary) for tokens in messages])

In [18]:
# Map the string labels to integer class ids. `le` is kept at notebook level
# because predict() uses it to turn ids back into label strings.
le = LabelEncoder()
y = le.fit_transform(labels)

In [19]:
# Show the class order learned by the encoder next to the encoded targets.
le.classes_, y

(array(['ham', 'spam'], dtype='<U4'),
 array([0, 0, 1, ..., 0, 0, 0], dtype=int64))

In [20]:
# Sanity check: features and targets must have the same number of rows.
X.shape, y.shape

((5572, 8166), (5572,))

In [21]:
# Two-stage split using identical shuffle/seed settings for both stages:
#   1) hold out VAL_SIZE of all rows for validation;
#   2) hold out TEST_SIZE of the remaining rows for testing.
split_options = {'shuffle': True, 'random_state': SEED}

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=VAL_SIZE, **split_options
)
X_train, X_test, y_train, y_test = train_test_split(
    X_train, y_train, test_size=TEST_SIZE, **split_options
)

In [22]:
# Fit a Gaussian Naive Bayes classifier on the training split; fit() returns
# the fitted estimator, so construction and training are chained.
model = GaussianNB().fit(X_train, y_train)

In [24]:
# Predict class ids for the held-out validation and test splits.
y_val_pred = model.predict(X_val)
y_test_pred = model.predict(X_test)

# Fraction of correctly classified messages in each split.
val_accuracy = accuracy_score(y_val, y_val_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

# Display both scores as the cell output.
val_accuracy, test_accuracy

(0.8816143497757848, 0.8602150537634409)

In [27]:
def predict(text, model, dictionary):
    """Classify one raw message and return its label string.

    The message goes through the same pipeline as the training data:
    ``preprocessing_text`` -> ``create_features`` -> ``model.predict``.

    Args:
        text: Raw message as a ``str``.
        model: Fitted classifier exposing ``predict``.
        dictionary: Vocabulary list used to build the training features.

    Returns:
        The predicted label, decoded by the notebook-level ``LabelEncoder``
        ``le`` (which must already be fitted).
    """
    processed_text = preprocessing_text(text)
    # Vectorise the preprocessed TOKENS, not the raw string. Iterating a str
    # yields single characters, so passing `text` here produced features that
    # had nothing to do with the words the model was trained on.
    features = create_features(processed_text, dictionary)
    # The model expects a 2-D array: one row per sample.
    features = np.array(features).reshape(1, -1)
    prediction = model.predict(features)
    prediction_cls = le.inverse_transform(prediction)[0]

    return prediction_cls

In [30]:
# Run the trained classifier on one hand-written example message and show
# the predicted label as the cell output.
test_input = 'I am accuracy thinking a way of doing something useful'
prediction_cls = predict(test_input, model, dictionary)
prediction_cls

'ham'