# 1. Data Extraction


In [84]:
#importing libraries
import pandas as pd




In [85]:
# Load the dataset from spam.csv.
# encoding='latin-1' is passed explicitly to avoid a UnicodeDecodeError while reading the file.
data = pd.read_csv('spam.csv', encoding='latin-1')

Now that the data is loaded, let's take a look at the first few rows to make sure everything is in order.

In [86]:
# Preview the first 5 rows to check that the file was parsed as expected
data.head(5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [87]:
# Check the size of the raw dataset as (number of rows, number of columns)
data.shape

(5572, 5)

# 2. Data Cleaning


In [88]:
# Summarize dtypes and non-null counts per column to spot missing values and stray columns
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [89]:
# Clean the raw frame in one chained, re-runnable step.
#
# Renaming happens BEFORE selecting so the cell can be executed more than once:
# rename() silently ignores keys that no longer exist, whereas selecting
# ['v1', 'v2'] a second time would raise a KeyError.
#
# Note: a bare `data.isnull().sum()` in the middle of a cell displays nothing
# (only the last expression of a cell is shown); data.info() already reports the
# non-null counts, so dropna() here is only a safeguard.
data = (
    data
    .rename(columns={'v1': 'Label', 'v2': 'Message'})  # v1 = label, v2 = message
    .loc[:, ['Label', 'Message']]                      # drop the mostly-empty 'Unnamed: *' columns
    .dropna()                                          # remove rows with a missing label or message
    .drop_duplicates()                                 # remove exact duplicate (Label, Message) rows
)


In [90]:
# Preview the cleaned frame: only the renamed Label and Message columns remain
data.head()

Unnamed: 0,Label,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [91]:
# Check the size again to see how many rows the cleaning step removed
data.shape

(5169, 2)

# 3. Data Analysis

In [92]:
# Count messages per label to see the ham/spam class balance
data['Label'].value_counts()



Unnamed: 0_level_0,count
Label,Unnamed: 1_level_1
ham,4516
spam,653


In [93]:
# Create new length-based features

# nltk is required for sentence tokenization below. It is imported in this cell
# because the notebook's other `import nltk` lives in a later cell; without it,
# running the notebook top to bottom on a fresh kernel fails here with a NameError.
import nltk

# 'num_characters': total number of characters in each message
data['num_characters'] = data['Message'].str.len()

# 'num_words': number of whitespace-separated tokens in each message
# (.str.split() with no arguments splits on whitespace, like str.split())
data['num_words'] = data['Message'].str.split().str.len()

# 'num_sentences': number of sentences returned by nltk.sent_tokenize()
# NOTE(review): sent_tokenize needs NLTK's Punkt tokenizer data; presumably it is
# already downloaded in this environment (e.g. nltk.download('punkt')) — confirm.
data['num_sentences'] = data['Message'].apply(lambda x: len(nltk.sent_tokenize(x)))

In [94]:
data.head()

Unnamed: 0,Label,Message,num_characters,num_words,num_sentences
0,ham,"Go until jurong point, crazy.. Available only ...",111,20,2
1,ham,Ok lar... Joking wif u oni...,29,6,2
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,28,2
3,ham,U dun say so early hor... U c already then say...,49,11,1
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,13,1


# 4. Text Preprocessing

In [95]:
import nltk
from nltk.corpus import stopwords # to remove common words
from nltk.stem.porter import PorterStemmer  # for stemming: reducing words to their base form
import string # for removing punctuation marks

# Initialize the stemmer once; the same instance is reused for every message
ps = PorterStemmer()

# Build the English stop-word set once, up front. Evaluating
# stopwords.words('english') inside the per-token filter rebuilds the list for
# every single token; a frozenset is created once and gives constant-time lookups.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def transform_text(text):
    """Normalize one message into a space-separated string of stemmed tokens.

    Steps, in order:
      1. lowercase the text
      2. split it into tokens with nltk.word_tokenize
      3. keep only purely alphanumeric tokens (this already discards
         punctuation tokens, so no separate punctuation check is needed)
      4. drop English stop words
      5. stem each remaining token with the Porter stemmer

    Args:
        text: the raw message as a str.

    Returns:
        A single str with the surviving stemmed tokens joined by one space
        (an empty string if no token survives the filters).
    """
    tokens = nltk.word_tokenize(text.lower())
    kept = [
        token for token in tokens
        if token.isalnum() and token not in _ENGLISH_STOPWORDS
    ]
    return " ".join(ps.stem(token) for token in kept)

# Run the preprocessing on every message and store the result in a new column
data['transformed_message'] = data['Message'].apply(transform_text)


In [96]:
data.head()


Unnamed: 0,Label,Message,num_characters,num_words,num_sentences,transformed_message
0,ham,"Go until jurong point, crazy.. Available only ...",111,20,2,go jurong point crazi avail bugi n great world...
1,ham,Ok lar... Joking wif u oni...,29,6,2,ok lar joke wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,28,2,free entri 2 wkli comp win fa cup final tkt 21...
3,ham,U dun say so early hor... U c already then say...,49,11,1,u dun say earli hor u c alreadi say
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,13,1,nah think goe usf live around though


In [97]:
# Import two text vectorization tools from scikit-learn
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Count Vectorizer: simple bag-of-words representation
# Each message becomes a vector of token counts over the learned vocabulary
cv = CountVectorizer()

# Apply CountVectorizer on the preprocessed text column
# fit_transform() does two things:
#   - builds a vocabulary from all preprocessed messages
#   - transforms each message into a row of token counts
# .toarray() then converts the result into a dense array
X_bow = cv.fit_transform(data['transformed_message']).toarray()


# TF-IDF Vectorizer: weights terms by how informative they are
# max_features=3000 caps the vocabulary at the 3000 most frequent terms in the corpus
tfidf = TfidfVectorizer(max_features=3000)

# Apply TF-IDF on the same preprocessed text
# Terms that are rare across messages get higher weight, very common ones lower weight
# NOTE(review): both vectorizers are fit on the full dataset here. If a train/test
# split follows, consider fitting on the training portion only — confirm against
# the modeling cells, which are not visible in this section.
X = tfidf.fit_transform(data['transformed_message']).toarray()
