In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive; the dataset
# path used by the next cell lives underneath this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd

# Location of the SMS dataset on the mounted Drive.
file_path = '/content/drive/MyDrive/spam.csv'

# Only the label column (v1) and the message-text column (v2) are loaded.
read_options = {'encoding': 'ISO-8859-1', 'usecols': ['v1', 'v2']}
data = pd.read_csv(file_path, **read_options)

# Quick look at the first rows.
preview = data.head()
print(preview)

     v1                                                 v2
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


Data Preprocessing:

In [None]:
pip install nltk



In [None]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [None]:
import nltk
# Tokenizer models used by word_tokenize. NLTK 3.8.2+ loads the pickle-free
# 'punkt_tab' resource instead of 'punkt'; both are fetched so the notebook
# runs from a fresh kernel on either side of that change.
nltk.download('punkt')
nltk.download('punkt_tab')
# WordNet backs WordNetLemmatizer; 'stopwords' provides the English stop list.
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Text preprocessing function
from functools import lru_cache


@lru_cache(maxsize=None)
def _english_stop_words():
    """Return the NLTK English stop-word list as a frozenset, built once."""
    return frozenset(stopwords.words('english'))


@lru_cache(maxsize=None)
def _shared_lemmatizer():
    """Return a single WordNetLemmatizer instance reused across calls."""
    return WordNetLemmatizer()


def preprocess_text(text):
    """Normalise one message into a space-joined string of lemmatized tokens.

    Steps, in order: lower-case the text, replace every character that is
    not an ASCII letter with a space, tokenize, drop English stop words,
    lemmatize each remaining token, and join the tokens with single spaces.

    The stop-word set and the lemmatizer are created lazily on the first
    call and cached, so applying this function row by row does not rebuild
    them for every message.

    Args:
        text: Raw message string.

    Returns:
        The cleaned message as a single string.
    """
    letters_only = re.sub(r'[^a-zA-Z]', ' ', text.lower())
    stop_words = _english_stop_words()
    lemmatizer = _shared_lemmatizer()
    return ' '.join(
        lemmatizer.lemmatize(token)
        for token in word_tokenize(letters_only)
        if token not in stop_words
    )

# Clean every message in 'v2' and store the result as a new column.
cleaned_messages = data['v2'].apply(preprocess_text)
data['processed_text'] = cleaned_messages

# Show the first rows of the raw text and of the cleaned text, in that order.
for heading, column in (("Original Text:", 'v2'), ("\nProcessed Text:", 'processed_text')):
    print(heading)
    print(data[column].head())


Original Text:
0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
Name: v2, dtype: object

Processed Text:
0    go jurong point crazy available bugis n great ...
1                              ok lar joking wif u oni
2    free entry wkly comp win fa cup final tkts st ...
3                  u dun say early hor u c already say
4                  nah think go usf life around though
Name: processed_text, dtype: object


In [None]:
# Display the DataFrame, which now also carries the 'processed_text' column.
data

Unnamed: 0,v1,v2,processed_text
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry wkly comp win fa cup final tkts st ...
3,ham,U dun say so early hor... U c already then say...,u dun say early hor u c already say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah think go usf life around though
...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,nd time tried contact u u pound prize claim ea...
5568,ham,Will Ì_ b going to esplanade fr home?,b going esplanade fr home
5569,ham,"Pity, * was in mood for that. So...any other s...",pity mood suggestion
5570,ham,The guy did some bitching but I acted like i'd...,guy bitching acted like interested buying some...


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


In [None]:

# Features are the cleaned message text; labels are the spam/ham tags in 'v1'.
X, y = data['processed_text'], data['v1']


In [None]:

# Split the dataset into training and testing sets (80% train, 20% test).
# random_state is fixed so the same split is produced on every run.
# NOTE(review): no `stratify=` argument is passed, so the spam/ham ratio of the
# two subsets is left to the random shuffle — confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


Model Building:

In [None]:

# Count Vectorization
# The vocabulary is learned from the training messages only; the test messages
# are then encoded with that same fitted vectorizer.
count_vectorizer = CountVectorizer()
X_train_count = count_vectorizer.fit_transform(X_train)
X_test_count = count_vectorizer.transform(X_test)


In [None]:
# TF-IDF features: fit on the training messages, then encode the test messages
# with the same fitted vectorizer.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Report the dimensions of all four feature matrices.
shape_report = (
    ("Count Vectorization - Training set shape:", X_train_count),
    ("Count Vectorization - Testing set shape:", X_test_count),
    ("\nTF-IDF Vectorization - Training set shape:", X_train_tfidf),
    ("TF-IDF Vectorization - Testing set shape:", X_test_tfidf),
)
for label, matrix in shape_report:
    print(label, matrix.shape)

Count Vectorization - Training set shape: (4457, 6262)
Count Vectorization - Testing set shape: (1115, 6262)

TF-IDF Vectorization - Training set shape: (4457, 6262)
TF-IDF Vectorization - Testing set shape: (1115, 6262)


In [None]:
# Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# Initialize Naive Bayes classifier
# (MultinomialNB constructed with no arguments, i.e. its default settings).
nb_classifier = MultinomialNB()

In [None]:
# Train the classifier using Count Vectorized features
# (the count matrix of the training messages and their labels).
nb_classifier.fit(X_train_count, y_train)


Model Evaluation:

In [None]:
# Predict on the testing set
# (labels for the count-vectorized held-out messages).
nb_pred_count = nb_classifier.predict(X_test_count)

In [None]:

# Score the held-out predictions: overall accuracy first, then the per-class
# precision / recall / F1 table.
accuracy_count = accuracy_score(y_test, nb_pred_count)
report_count = classification_report(y_test, nb_pred_count)

print("Naive Bayes - Count Vectorization Accuracy:", accuracy_count)
print("\nClassification Report:", report_count, sep="\n")


Naive Bayes - Count Vectorization Accuracy: 0.9820627802690582

Classification Report:
              precision    recall  f1-score   support

         ham       0.99      0.99      0.99       965
        spam       0.95      0.92      0.93       150

    accuracy                           0.98      1115
   macro avg       0.97      0.96      0.96      1115
weighted avg       0.98      0.98      0.98      1115



In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline

# Cross-validation with Naive Bayes using Count Vectorization.
# The vectorizer sits inside the pipeline so that each fold learns its
# vocabulary from that fold's training portion only. Scoring the pre-built
# X_train_count matrix instead would give every validation fold features from
# a vocabulary that was fitted on its own messages.
# cross_val_score fits clones of the pipeline, so the already-trained
# `nb_classifier` object is left untouched.
nb_count_pipeline = make_pipeline(CountVectorizer(), nb_classifier)
nb_cv_scores_count = cross_val_score(nb_count_pipeline, X_train, y_train, cv=5)
print("Naive Bayes - Count Vectorization Cross-Validation Scores:")
print(nb_cv_scores_count)
print("Mean Accuracy:", nb_cv_scores_count.mean())


Naive Bayes - Count Vectorization Cross-Validation Scores:
[0.9764574  0.97982063 0.97755331 0.97194164 0.96969697]
Mean Accuracy: 0.9750939892195498


In [None]:
from sklearn.metrics import confusion_matrix

# Tabulate true labels (rows) against predicted labels (columns) for the
# count-vectorized Naive Bayes predictions on the test set.
conf_matrix_nb_count = confusion_matrix(y_true=y_test, y_pred=nb_pred_count)
print(
    "\nConfusion Matrix - Naive Bayes with Count Vectorization:",
    conf_matrix_nb_count,
    sep="\n",
)



Confusion Matrix - Naive Bayes with Count Vectorization:
[[957   8]
 [ 12 138]]


In [None]:
# Retrain a Naive Bayes classifier on the entire dataset (train + test rows).
# NOTE(review): no hyper-parameter search appears earlier in this notebook, so
# alpha=1.0 is a fixed choice here rather than a tuned "best" value.
# NOTE(review): fit_transform below re-fits the shared `count_vectorizer`
# object, replacing the vocabulary it learned from X_train; the later cell that
# vectorizes new messages relies on this refitted state.
best_nb_classifier = MultinomialNB(alpha=1.0)  # fixed smoothing value
X_full_count = count_vectorizer.fit_transform(X)  # re-fit on every message
y_full = data['v1']  # same label column that `y` was taken from

In [None]:
# Fit the final model on the full-dataset count matrix and its labels.
best_nb_classifier.fit(X_full_count, y_full)

Prediction and Deployment:

In [None]:
# Now, let's predict on new SMS messages
# (two hand-written example strings that are not rows of the loaded dataset).
new_messages = [
    "Congratulations! You've won a prize. Click the link to claim.",
    "Hey, are you free this weekend? Let's catch up."
]

In [None]:
# Run the new messages through the same cleaning function used on the dataset.
preprocessed_new_messages = list(map(preprocess_text, new_messages))


In [None]:
# Vectorize the preprocessed new messages using the previously fitted CountVectorizer
# (transform only — the vocabulary is not re-learned from these messages).
X_new_count = count_vectorizer.transform(preprocessed_new_messages)

In [None]:
# Predict if the new messages are spam or ham
# using the classifier that was retrained on the full dataset.
predictions = best_nb_classifier.predict(X_new_count)

In [None]:
# Display predictions
# Each original (un-preprocessed) message is printed with its predicted label;
# any label other than 'spam' is shown as 'Ham'. A blank line separates entries.
for message, prediction in zip(new_messages, predictions):
    print(f"Message: {message}")
    print(f"Prediction: {'Spam' if prediction == 'spam' else 'Ham'}")
    print()

Message: Congratulations! You've won a prize. Click the link to claim.
Prediction: Spam

Message: Hey, are you free this weekend? Let's catch up.
Prediction: Ham

