In [33]:
# Import the pandas library for data manipulation and analysis, especially for working with DataFrames
import pandas as pd

In [34]:
# Load the spam dataset from the 'Datasets' folder into a pandas DataFrame.
# The file is decoded as latin1 to avoid a UnicodeDecodeError on special characters.
messages = pd.read_csv(
    'Datasets/spamclassification.csv',
    encoding='latin1',
)

# Preview the first 5 rows for a quick look at the data
messages.head(5)

Unnamed: 0,Label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


## Data Cleaning and Preprocessing

In [None]:
# Import the regular expression module for text cleaning
import re

# Import the Natural Language Toolkit (nltk) library for text processing
import nltk

# Import a list of common English stopwords (e.g., "the", "is", "and") from nltk
from nltk.corpus import stopwords

# Import the Porter Stemmer algorithm from nltk for stemming words (reducing words to their base/root form)
from nltk.stem.porter import PorterStemmer

# Create a single PorterStemmer instance; the preprocessing loop below calls
# ps.stem(word) on every token, so the same object is reused throughout
ps = PorterStemmer()

In [28]:
# Build the stopword lookup once, as a set. Evaluating stopwords.words('english')
# inside the comprehension would repeat the call for every token and make each
# membership test a linear scan over a list.
english_stopwords = set(stopwords.words('english'))

corpus = []  # One cleaned, stemmed, space-joined string per message, in row order

# Iterate over the column values directly instead of looking rows up by the
# label `i`, so the loop does not depend on the index being exactly 0..n-1
for message in messages['text']:
    # Replace every non-letter character with a space, lowercase the result,
    # and split on whitespace to obtain the individual words
    words = re.sub('[^a-zA-Z]', ' ', message).lower().split()

    # Drop stopwords, then stem what remains. The filter must be `not in`:
    # keeping only the words that ARE stopwords would throw away every
    # content word and leave nothing useful for the classifier
    review = [ps.stem(word) for word in words if word not in english_stopwords]

    # Join the processed words back into a single string and store it
    corpus.append(' '.join(review))


## Create Bag of Words Model

In [37]:
# Import the CountVectorizer class from scikit-learn's text feature extraction module
from sklearn.feature_extraction.text import CountVectorizer

# Initialize CountVectorizer with:
# - max_features=100: limit the vocabulary to the 100 most frequent words
#   (so the feature matrix built from this vectorizer has at most 100 columns)
# - binary=True: represent word presence as 1 (present) or 0 (absent), rather than word counts
cv = CountVectorizer(max_features=100, binary=True)


In [None]:
# Fit the vectorizer on the cleaned corpus and encode every message, then
# convert the result to a dense array
X = (
    cv
    .fit_transform(corpus)
    .toarray()
)

# Show the dimensions of the feature matrix: (number of messages, number of features)
X.shape

(5574, 100)