In [1]:
import os
print(os.getcwd())

C:\Users\irumj\NLP


In [2]:
# importing the Dataset
import pandas as pd
file_path = r'C:\Users\irumj\NLP\SMSSpamCollection.txt'
df = pd.read_csv(file_path, sep='\t', names=["label", "message"])

In [3]:
df

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [4]:
df.shape

(5572, 2)

In [5]:
#Data cleaning and preprocessing
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [6]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\irumj\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\irumj\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\irumj\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [7]:
from nltk.stem import PorterStemmer, WordNetLemmatizer

In [8]:
# Initialize stop words, stemmer, and lemmatizer
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [9]:
def preprocess_text(text):
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Remove non-alphabetic characters
    text = re.sub('[^a-zA-Z]', ' ', text)
    # Convert to lowercase
    text = text.lower()
    # Tokenize
    tokens = word_tokenize(text)
    # Remove stopwords
    tokens = [word for word in tokens if word not in stop_words]
    # Apply stemming and lemmatization
    tokens = [stemmer.stem(word) for word in tokens]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)

In [10]:
df['message'] = df['message'].apply(preprocess_text)

In [11]:
print(df.head())

  label                                            message
0   ham  go jurong point crazi avail bugi n great world...
1   ham                              ok lar joke wif u oni
2  spam  free entri wkli comp win fa cup final tkt st m...
3   ham                u dun say earli hor u c alreadi say
4   ham               nah think goe usf live around though


In [12]:
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['message'], df['label'], test_size=0.2, random_state=42)

In [13]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Initialize the vectorizers
count_vectorizer = CountVectorizer(ngram_range=(1, 2))  
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2)) 

# Fit and transform the training data
X_train_counts = count_vectorizer.fit_transform(X_train)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

# Transform the test data
X_test_counts = count_vectorizer.transform(X_test)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [15]:
y=pd.get_dummies(df['label'])
y=y.iloc[:,1].values

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

# Initialize models
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Naive Bayes': MultinomialNB()
}

In [17]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Function to train and evaluate a model
def train_evaluate_model(model, X_train, y_train, X_test, y_test, model_name, vectorizer_name):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, pos_label='spam')
    recall = recall_score(y_test, y_pred, pos_label='spam')
    f1 = f1_score(y_test, y_pred, pos_label='spam')
    roc_auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
    print(f"{model_name} with {vectorizer_name} - Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}, ROC-AUC: {roc_auc:.4f}")


In [18]:
# Train and evaluate models using CountVectorizer
for model_name, model in models.items():
    train_evaluate_model(model, X_train_counts, y_train, X_test_counts, y_test, model_name, 'CountVectorizer')

Logistic Regression with CountVectorizer - Accuracy: 0.9812, Precision: 1.0000, Recall: 0.8591, F1 Score: 0.9242, ROC-AUC: 0.9883
Naive Bayes with CountVectorizer - Accuracy: 0.9848, Precision: 0.9783, Recall: 0.9060, F1 Score: 0.9408, ROC-AUC: 0.9755


In [19]:
# Train and evaluate models using TfidfVectorizer
for model_name, model in models.items():
    train_evaluate_model(model, X_train_tfidf, y_train, X_test_tfidf, y_test, model_name, 'TfidfVectorizer')

Logistic Regression with TfidfVectorizer - Accuracy: 0.9632, Precision: 1.0000, Recall: 0.7248, F1 Score: 0.8405, ROC-AUC: 0.9890
Naive Bayes with TfidfVectorizer - Accuracy: 0.9632, Precision: 1.0000, Recall: 0.7248, F1 Score: 0.8405, ROC-AUC: 0.9725


Key Insights:
CountVectorizer:

Both Logistic Regression and Naive Bayes performed very well.
Naive Bayes showed slightly better overall performance with the highest accuracy (0.9848) and F1 Score (0.9408).
Logistic Regression had perfect precision but lower recall, indicating it was very precise but missed some spam instances.

TfidfVectorizer:

Both models showed lower performance compared to using CountVectorizer.
Logistic Regression and Naive Bayes had identical scores for accuracy, precision, recall, and F1 score.
Precision remained perfect, but recall was significantly lower, suggesting the models missed more spam instances.

Conclusion:
Naive Bayes with CountVectorizer provided the best overall performance, with the highest accuracy and a good balance between precision and recall.
Using CountVectorizer generally resulted in better performance metrics than TfidfVectorizer for this dataset and task.