In [1]:
import pandas as pd

In [6]:
# Load the SMS corpus. The file is tab-separated and is read without a header
# row, so the two column names are supplied explicitly.
df = pd.read_csv(
    'SMSSpamCollection',
    sep='\t',
    header=None,
    names=['label', 'message'],
)
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# Encode the class label numerically: ham -> 0, spam -> 1.
# The identity entries (0 -> 0, 1 -> 1) keep this cell idempotent: Series.map
# turns any value missing from the dict into NaN, so without them a second
# execution of this cell would wipe out the already-encoded labels.
# Run this cell after the cell that loads `df`; reloading `df` discards the
# encoding.
df['label'] = df['label'].map({'ham': 0, 'spam': 1, 0: 0, 1: 1})

In [7]:
# Report how many messages were loaded (one row per message).
print("No. of rows:", len(df))

No. of rows: 5572


In [13]:
# Importing required libraries
import nltk
import re
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [14]:
# Shared NLP helpers, built once here and reused by the text-cleaning function.
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
# Kept as a set so per-token stopword membership checks are cheap.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be
# available locally — confirm on a fresh environment.
stop_words = set(stopwords.words('english'))

In [15]:
def preprocess_text(msg):
    """Normalize a raw message into a space-joined string of lemmatized tokens.

    Pipeline: lowercase -> drop every character that is not ``a-z`` or
    whitespace -> tokenize -> remove stopwords -> lemmatize.

    Parameters
    ----------
    msg : str
        Raw message text.

    Returns
    -------
    str
        The surviving tokens, lemmatized and joined by single spaces. May be
        an empty string if nothing survives the filtering.
    """
    msg = msg.lower()
    # Raw string: '\s' inside a normal string literal is an invalid escape
    # sequence and triggers a warning on modern Python versions.
    msg = re.sub(r'[^a-z\s]', '', msg)
    # Tokenization
    tokens = word_tokenize(msg)
    # Remove stopwords
    tokens = [word for word in tokens if word not in stop_words]
    # Lemmatization. Only the lemmatized tokens are returned, so the stemming
    # pass that was previously computed and then discarded has been removed.
    lemmatized = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(lemmatized)

In [16]:
# Run every raw message through the cleaning pipeline and keep the result in
# its own column, leaving the original 'message' text untouched.
df['clean_message'] = df['message'].map(preprocess_text)

In [17]:
# Preview the first rows to compare each raw message with its cleaned form.
df.head()

Unnamed: 0,label,message,clean_message
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry wkly comp win fa cup final tkts st ...
3,ham,U dun say so early hor... U c already then say...,u dun say early hor u c already say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah dont think go usf life around though


In [19]:
# Bag of Words: learn the vocabulary from the cleaned messages and build the
# document-term matrix in one step (CountVectorizer with default settings).
cv = CountVectorizer()
X_bow = cv.fit_transform(df['clean_message'])

In [20]:
# Inspect the fitted vocabulary: a dict mapping each term to an integer index.
# NOTE(review): this displays the whole mapping (thousands of entries);
# consider showing only a small sample.
cv.vocabulary_

{'go': 2668,
 'jurong': 3510,
 'point': 5129,
 'crazy': 1455,
 'available': 460,
 'bugis': 887,
 'great': 2756,
 'world': 7698,
 'la': 3657,
 'buffet': 885,
 'cine': 1192,
 'got': 2720,
 'amore': 233,
 'wat': 7474,
 'ok': 4712,
 'lar': 3690,
 'joking': 3480,
 'wif': 7599,
 'oni': 4743,
 'free': 2467,
 'entry': 2077,
 'wkly': 7654,
 'comp': 1300,
 'win': 7612,
 'fa': 2207,
 'cup': 1510,
 'final': 2320,
 'tkts': 6939,
 'st': 6393,
 'may': 4097,
 'text': 6785,
 'receive': 5512,
 'questionstd': 5407,
 'txt': 7140,
 'ratetcs': 5459,
 'apply': 320,
 'over': 4835,
 'dun': 1936,
 'say': 5837,
 'early': 1956,
 'hor': 3072,
 'already': 205,
 'nah': 4422,
 'dont': 1851,
 'think': 6856,
 'usf': 7287,
 'life': 3781,
 'around': 367,
 'though': 6876,
 'freemsg': 2475,
 'hey': 2973,
 'darling': 1571,
 'week': 7522,
 'word': 7686,
 'back': 503,
 'id': 3183,
 'like': 3796,
 'fun': 2538,
 'still': 6457,
 'tb': 6719,
 'xxx': 7824,
 'std': 6433,
 'chgs': 1140,
 'send': 5923,
 'rcv': 5470,
 'even': 2123,
 '

In [23]:
# Convert the first message's bag-of-words row to a dense array for inspection.
X_bow[0].toarray()

array([[0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [21]:
# TF-IDF: fit a TfidfVectorizer (default settings) on the same cleaned
# messages and build the corresponding weighted document-term matrix.
tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(df['clean_message'])

In [22]:
# Inspect the vocabulary learned by the TF-IDF vectorizer (term -> integer index).
# NOTE(review): this displays the whole mapping; consider showing only a small
# sample.
tfidf.vocabulary_

{'go': 2668,
 'jurong': 3510,
 'point': 5129,
 'crazy': 1455,
 'available': 460,
 'bugis': 887,
 'great': 2756,
 'world': 7698,
 'la': 3657,
 'buffet': 885,
 'cine': 1192,
 'got': 2720,
 'amore': 233,
 'wat': 7474,
 'ok': 4712,
 'lar': 3690,
 'joking': 3480,
 'wif': 7599,
 'oni': 4743,
 'free': 2467,
 'entry': 2077,
 'wkly': 7654,
 'comp': 1300,
 'win': 7612,
 'fa': 2207,
 'cup': 1510,
 'final': 2320,
 'tkts': 6939,
 'st': 6393,
 'may': 4097,
 'text': 6785,
 'receive': 5512,
 'questionstd': 5407,
 'txt': 7140,
 'ratetcs': 5459,
 'apply': 320,
 'over': 4835,
 'dun': 1936,
 'say': 5837,
 'early': 1956,
 'hor': 3072,
 'already': 205,
 'nah': 4422,
 'dont': 1851,
 'think': 6856,
 'usf': 7287,
 'life': 3781,
 'around': 367,
 'though': 6876,
 'freemsg': 2475,
 'hey': 2973,
 'darling': 1571,
 'week': 7522,
 'word': 7686,
 'back': 503,
 'id': 3183,
 'like': 3796,
 'fun': 2538,
 'still': 6457,
 'tb': 6719,
 'xxx': 7824,
 'std': 6433,
 'chgs': 1140,
 'send': 5923,
 'rcv': 5470,
 'even': 2123,
 '

In [24]:
# Convert the first message's TF-IDF row to a dense array for inspection.
X_tfidf[0].toarray()

array([[0., 0., 0., ..., 0., 0., 0.]])