In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
sample = pd.read_csv('/Users/minkhant/Documents/Projects/Natural Language Processing/data/raw/sample_spam_ham.csv')

In [3]:
sample.head()

Unnamed: 0,label,text
0,ham,"Hey, are we still meeting for lunch today?"
1,ham,I'll call you after the meeting finishes.
2,ham,Can you send me the report by tonight?
3,ham,Let's catch up this weekend at the park.
4,ham,Don't forget to bring your notebook tomorrow.


In [4]:
sample['target'] = sample['label'].map({
    'ham' : 0,
    'spam' : 1}
)

In [5]:
sample.head()

Unnamed: 0,label,text,target
0,ham,"Hey, are we still meeting for lunch today?",0
1,ham,I'll call you after the meeting finishes.,0
2,ham,Can you send me the report by tonight?,0
3,ham,Let's catch up this weekend at the park.,0
4,ham,Don't forget to bring your notebook tomorrow.,0


In [6]:
URL_RE   = r"(https?://\S+|www\.\S+)"
EMAIL_RE = r"\b[\w\.-]+@[\w\.-]+\.\w+\b"
PHONE_RE = r"\b(?:\+?\d{1,3})?[-.\s]?(?:\(?\d{2,4}\)?)[-.\s]?\d{3,4}[-.\s]?\d{3,4}\b"
CURR_RE  = r"[$£€฿]"
NUM_RE   = r"\b\d+(?:[\.,]\d+)?\b"
USER_RE  = r"@\w+"
HASH_RE  = r"#\w+"

def clean_text(s: str) -> str:
    s = str(s).lower()
    s = re.sub(URL_RE, " <url> ", s)
    s = re.sub(EMAIL_RE, " <email> ", s)
    s = re.sub(PHONE_RE, " <phone> ", s)
    s = re.sub(CURR_RE, " <currency> ", s)
    s = re.sub(USER_RE, " <user> ", s)
    s = re.sub(HASH_RE, lambda m: " " + m.group(0).replace("#","<hashtag_") + "> ", s)
    s = re.sub(NUM_RE, " <number> ", s)
    s = re.sub(r"[^a-z0-9<>\s'!?.,-]+", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

In [7]:
sample['clean_text'] = sample.text.apply(clean_text)

In [8]:
sample.head()

Unnamed: 0,label,text,target,clean_text
0,ham,"Hey, are we still meeting for lunch today?",0,"hey, are we still meeting for lunch today?"
1,ham,I'll call you after the meeting finishes.,0,i'll call you after the meeting finishes.
2,ham,Can you send me the report by tonight?,0,can you send me the report by tonight?
3,ham,Let's catch up this weekend at the park.,0,let's catch up this weekend at the park.
4,ham,Don't forget to bring your notebook tomorrow.,0,don't forget to bring your notebook tomorrow.


In [9]:
sample.tail()

Unnamed: 0,label,text,target,clean_text
5,spam,Congratulations! You've won a free iPhone. Cli...,1,congratulations! you've won a free iphone. cli...
6,spam,Get cheap loans approved instantly. Apply now!,1,get cheap loans approved instantly. apply now!
7,spam,Exclusive deal just for you. Claim your prize ...,1,exclusive deal just for you. claim your prize ...
8,spam,You have been selected for a cash reward. Visi...,1,you have been selected for a cash reward. visi...
9,spam,URGENT! Verify your account to avoid suspension.,1,urgent! verify your account to avoid suspension.


In [10]:
v = TfidfVectorizer(ngram_range=(1,2),
                    lowercase=True,
                    max_features=5000,
                    stop_words='english')

In [11]:
vectorized_text = v.fit_transform(sample['clean_text'])

In [12]:
v.vocabulary_

{'hey': 31,
 'meeting': 48,
 'lunch': 46,
 'today': 65,
 'hey meeting': 32,
 'meeting lunch': 50,
 'lunch today': 47,
 'll': 42,
 'finishes': 26,
 'll meeting': 43,
 'meeting finishes': 49,
 'send': 62,
 'report': 56,
 'tonight': 67,
 'send report': 63,
 'report tonight': 57,
 'let': 39,
 'catch': 11,
 'weekend': 76,
 'park': 53,
 'let catch': 40,
 'catch weekend': 12,
 'weekend park': 77,
 'don': 22,
 'forget': 27,
 'bring': 7,
 'notebook': 51,
 'tomorrow': 66,
 'don forget': 23,
 'forget bring': 28,
 'bring notebook': 8,
 'notebook tomorrow': 52,
 'congratulations': 18,
 've': 70,
 'won': 78,
 'free': 29,
 'iphone': 35,
 'click': 17,
 'congratulations ve': 19,
 've won': 71,
 'won free': 79,
 'free iphone': 30,
 'iphone click': 36,
 'cheap': 13,
 'loans': 44,
 'approved': 3,
 'instantly': 33,
 'apply': 2,
 'cheap loans': 14,
 'loans approved': 45,
 'approved instantly': 4,
 'instantly apply': 34,
 'exclusive': 24,
 'deal': 20,
 'just': 37,
 'claim': 15,
 'prize': 54,
 'exclusive deal

In [13]:
v.get_feature_names_out()

array(['account', 'account avoid', 'apply', 'approved',
       'approved instantly', 'avoid', 'avoid suspension', 'bring',
       'bring notebook', 'cash', 'cash reward', 'catch', 'catch weekend',
       'cheap', 'cheap loans', 'claim', 'claim prize', 'click',
       'congratulations', 'congratulations ve', 'deal', 'deal just',
       'don', 'don forget', 'exclusive', 'exclusive deal', 'finishes',
       'forget', 'forget bring', 'free', 'free iphone', 'hey',
       'hey meeting', 'instantly', 'instantly apply', 'iphone',
       'iphone click', 'just', 'just claim', 'let', 'let catch', 'link',
       'll', 'll meeting', 'loans', 'loans approved', 'lunch',
       'lunch today', 'meeting', 'meeting finishes', 'meeting lunch',
       'notebook', 'notebook tomorrow', 'park', 'prize', 'prize today',
       'report', 'report tonight', 'reward', 'reward visit', 'selected',
       'selected cash', 'send', 'send report', 'suspension', 'today',
       'tomorrow', 'tonight', 'urgent', 'urgent ver

In [14]:
from sklearn.linear_model import LogisticRegression

In [15]:
X = vectorized_text
y = sample['target']
lrc = LogisticRegression()

In [16]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2, stratify=y)

In [17]:
lrc.fit(X_train,y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [18]:
X_train.shape

(8, 80)

In [19]:
y_pred = lrc.predict(X_test)

In [20]:
from sklearn.metrics import accuracy_score, confusion_matrix
print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test,y_pred))

0.5
[[0 1]
 [0 1]]


In [21]:
spam_mail_test = ["Congratulations! You've won a $1,000 Walmart gift card. Click here to claim your prize.", "Hey, are we still meeting for lunch today?"]
spam_mail_test_vectorized = v.transform(spam_mail_test)

In [22]:
spam_mail_test_vectorized

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 16 stored elements and shape (2, 80)>

In [23]:
print("Logistic :", lrc.predict(spam_mail_test_vectorized))

Logistic : [1 0]
