In [18]:
import pandas as pd

# Specify the file path of the TSV file
file_path = '/home/dotronghiep/Documents/Uni/Year3_Term2/NLP/smsspamcollection.tsv'

# Read the TSV file into a pandas DataFrame
data = pd.read_csv(file_path, delimiter='\t').dropna()

data.head()

Unnamed: 0,label,message,length,punct
0,ham,"Go until jurong point, crazy.. Available only ...",111,9
1,ham,Ok lar... Joking wif u oni...,29,6
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,6
3,ham,U dun say so early hor... U c already then say...,49,6
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,2


In [19]:
X_raw = data.iloc[:,1].values
y_raw = data.iloc[:,0].values
X_raw, y_raw

(array(['Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...',
        'Ok lar... Joking wif u oni...',
        "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
        ..., 'Pity, * was in mood for that. So...any other suggestions?',
        "The guy did some bitching but I acted like i'd be interested in buying something else next week and he gave it to us for free",
        'Rofl. Its true to its name'], dtype=object),
 array(['ham', 'ham', 'spam', ..., 'ham', 'ham', 'ham'], dtype=object))

In [96]:
import re 
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
nltk.download('wordnet')
from nltk.corpus import words
nltk.download('words')
english_words = set(words.words())
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/dotronghiep/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/dotronghiep/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     /home/dotronghiep/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [106]:
def preprocess(text):
    # Tokenize the text
    tokens = word_tokenize(text.lower())
    
    # Remove non-alphabetic characters and stopwords
    tokens = [token for token in tokens if token.isalpha() and token not in stopwords.words('english')]

    # Lemmatize the tokens
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token, pos='v') for token in tokens]    
    
    # Remove words not in English
    english_words = set(nltk.corpus.words.words())
    tokens = [token for token in tokens if token in english_words]
    
    return tokens

In [107]:
from tqdm import tqdm

X = [None] * len(X_raw)
for i in tqdm(range(len(X_raw))):
    X[i] = preprocess(X_raw[i])

X

  0%|          | 0/5572 [00:00<?, ?it/s]

100%|██████████| 5572/5572 [04:44<00:00, 19.56it/s]


[['go',
  'point',
  'crazy',
  'available',
  'n',
  'great',
  'world',
  'la',
  'e',
  'buffet',
  'cine',
  'get',
  'wat'],
 ['lar', 'joke', 'u'],
 ['free',
  'entry',
  'win',
  'fa',
  'cup',
  'final',
  'may',
  'text',
  'fa',
  'receive',
  'entry',
  'question',
  'rate',
  'c',
  'apply'],
 ['u', 'dun', 'say', 'early', 'u', 'c', 'already', 'say'],
 ['think', 'go', 'live', 'around', 'though'],
 ['hey', 'darling', 'week', 'word', 'back', 'like', 'fun', 'still', 'send'],
 ['even', 'brother', 'like', 'speak', 'treat', 'like', 'aid', 'patent'],
 ['per', 'request', 'set', 'press', 'copy'],
 ['winner',
  'value',
  'network',
  'customer',
  'select',
  'prize',
  'reward',
  'claim',
  'call',
  'claim',
  'code',
  'valid'],
 ['mobile',
  'u',
  'r',
  'entitle',
  'update',
  'latest',
  'colour',
  'camera',
  'free',
  'call',
  'mobile',
  'update',
  'free'],
 ['gon',
  'na',
  'home',
  'soon',
  'want',
  'talk',
  'stuff',
  'tonight',
  'k',
  'cry',
  'enough',
  'to

In [79]:
from sklearn.preprocessing import LabelEncoder

# Create an instance of LabelEncoder
label_encoder = LabelEncoder()

# Fit and transform the y_raw array
y = label_encoder.fit_transform(y_raw)

y_raw, y

(array(['ham', 'ham', 'spam', ..., 'ham', 'ham', 'ham'], dtype=object),
 array([0, 0, 1, ..., 0, 0, 0]))

In [108]:
from sklearn.model_selection import train_test_split

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Print the lengths of the training and testing sets
print("Training set length:", len(X_train))
print("Testing set length:", len(X_test))


Training set length: 4457
Testing set length: 1115


In [109]:
# Count the occurrences of each word in the positive and negative classes
positive_word_counts = {}
negative_word_counts = {}
vocab = []
positive_counts = 0
negative_counts = 0

for i in range(len(X_train)):
    for word in X_train[i]:
        if y_train[i] == 1:
            positive_word_counts[word] = positive_word_counts.get(word, 0) + 1
            positive_counts += 1
        else:
            negative_word_counts[word] = negative_word_counts.get(word, 0) + 1
            negative_counts += 1
        vocab.append(word)

vocab = set(vocab)

# Calculate the total count of words in the positive and negative classes
total_positive_words = sum(positive_word_counts.values())
total_negative_words = sum(negative_word_counts.values())

# Calculate the probabilities for each word
word_probabilities = {}

for word in vocab:
    positive_probability = (positive_word_counts.get(word, 0) + 1) / (total_positive_words + len(vocab))
    negative_probability = (negative_word_counts.get(word, 0) + 1) / (total_negative_words + len(vocab))
    word_probabilities[word] = [positive_probability, negative_probability]

p_positive = positive_counts / len(X_train)
p_negative = negative_counts / len(X_train)

word_probabilities


{'forever': [0.00010996261271167803, 0.00022149211857211415],
 'moment': [0.00010996261271167803, 0.0003322381778581712],
 'end': [0.002419177479656917, 0.00125512200524198],
 'easy': [0.0009896635144051023, 0.00062756100262099],
 'sol': [0.0007697382889817463, 3.6915353095352356e-05],
 'esplanade': [0.00010996261271167803, 0.00014766141238140942],
 'customer': [0.004288541895755443, 0.0002584074716674665],
 'bed': [0.00010996261271167803, 0.0009597991804791612],
 'side': [0.00010996261271167803, 0.0004798995902395806],
 'train': [0.00021992522542335605, 0.000516814943334933],
 'prestige': [0.00010996261271167803, 7.383070619070471e-05],
 'diesel': [0.00010996261271167803, 7.383070619070471e-05],
 'normal': [0.00021992522542335605, 0.0003322381778581712],
 'handset': [0.0005498130635583901, 3.6915353095352356e-05],
 'sooner': [0.00010996261271167803, 0.00014766141238140942],
 'mode': [0.00010996261271167803, 0.00029532282476281885],
 'hearted': [0.00010996261271167803, 7.38307061907047

In [110]:
def remove_unkown_word(text):
    for i in text:
        if i not in vocab:
            text.remove(i)
    return text

In [115]:
X_test_ok = [None] * len(X_test)

for i in range(len(X_test)):
    X_test_ok[i] = remove_unkown_word(X_test[i])

X_test_ok

[['hug',
  'u',
  'den',
  'hug',
  'back',
  'u',
  'get',
  'u',
  'r',
  'cute',
  'u',
  'r',
  'u',
  'r',
  'lucky',
  'none',
  'people',
  'hate',
  'u'],
 ['also',
  'blow',
  'couple',
  'time',
  'recently',
  'id',
  'rather',
  'text',
  'blue',
  'look',
  'weed'],
 ['thats', 'better', 'get', 'roast', 'b', 'better', 'drink', 'good'],
 ['dont', 'eat', 'anything', 'heavy'],
 ['ring', 'come', 'guy', 'costume', 'gift', 'future', 'hint', 'hint'],
 ['need', 'hurt', 'lot'],
 ['love',
  'decision',
  'feel',
  'could',
  'decide',
  'love',
  'life',
  'would',
  'much',
  'simpler',
  'less',
  'magical'],
 ['supervisor',
  'find',
  'one',
  'havent',
  'ask',
  'yet',
  'tell',
  'u',
  'aft',
  'ask'],
 ['dear', 'good', 'morning'],
 [],
 ['take', 'forever', 'like', 'away', 'ugh'],
 ['huh', 'proof', 'page', 'ugh', 'glad', 'really', 'watch', 'show', 'tool'],
 ['buy', 'lar'],
 ['er', 'hello', 'quite', 'go', 'plan', 'slowly', 'home', 'follow', 'exhaust'],
 ['free',
  'text',
  'f

In [112]:
def test(text):
    positive_probability = p_positive
    negative_probability = p_negative
    for word in text:
        positive_probability *= word_probabilities.get(word, [1 / (total_positive_words + len(vocab)), 1 / (total_negative_words + len(vocab))])[0]
        negative_probability *= word_probabilities.get(word, [1 / (total_positive_words + len(vocab)), 1 / (total_negative_words + len(vocab))])[1]
    return positive_probability > negative_probability

In [119]:
correct_predictions = 0
total_predictions = len(X_test_ok)

for i in range(total_predictions):
    if test(X_test_ok[i]) == bool(y_test[i]):
        correct_predictions += 1

accuracy = correct_predictions / total_predictions
accuracy


0.9650224215246637