In [None]:
# importing all the required libraries for analyzing text

import pandas as pd   #for importing data into ipynb file
import numpy as np     #for making matrix and used of confusion matrix
from sklearn.model_selection import train_test_split    #for splitting data into train and test
from sklearn.feature_extraction.text import TfidfVectorizer    #for seperating features from text
from sklearn.naive_bayes import MultinomialNB     #this is a classification model
from sklearn.linear_model import LogisticRegression     #this is a classification model
from sklearn.svm import SVC           #this is also a classification model
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix      #checking last accuracy of model and whole report


In [None]:
# Load the SMS spam dataset (CSV). It is read as latin-1 because the file
# contains non-ASCII bytes that fail to decode with the default UTF-8 codec.
raw_data = pd.read_csv('/content/spam.csv', encoding='latin-1')

# Keep only the two relevant columns and give them descriptive names.
# .copy() makes `data` an independent frame, so the label assignment below
# never writes into a slice of `raw_data` (avoids SettingWithCopyWarning).
data = raw_data[['v1', 'v2']].copy()
data.columns = ['label', 'message']

# Convert labels to binary: ham -> 0, spam -> 1.
# Series.map() yields NaN for any value that is not a key of the mapping, so
# an unexpected label (different casing, stray whitespace, missing value)
# would otherwise pass silently and only fail later inside model fitting.
# Fail here instead, with a message that names the offending values.
label_codes = data['label'].map({'ham': 0, 'spam': 1})
unmapped = label_codes.isna()
if unmapped.any():
    bad_labels = sorted(data.loc[unmapped, 'label'].astype(str).unique())
    raise ValueError(f"Unexpected label values in column 'v1': {bad_labels}")
data['label'] = label_codes

print(data.head())    # preview the first 5 rows


   label                                            message
0      0  Go until jurong point, crazy.. Available only ...
1      0                      Ok lar... Joking wif u oni...
2      1  Free entry in 2 a wkly comp to win FA Cup fina...
3      0  U dun say so early hor... U c already then say...
4      0  Nah I don't think he goes to usf, he lives aro...


In [None]:
# Features are the raw message texts; the target is the binary label column.
X, y = data['message'], data['label']

# Hold out 20% of the rows for testing (80% for training).
# The fixed random_state keeps the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)


In [None]:
# Convert SMS text into numerical TF-IDF features. English stop words are
# dropped, and terms occurring in more than 70% of the documents are ignored.
vectorizer = TfidfVectorizer(max_df=0.7, stop_words='english')

# Training split: fit_transform learns the vocabulary and IDF weights (fit)
# and converts the messages into TF-IDF vectors (transform) in one call.
X_train_tfidf = vectorizer.fit_transform(X_train)

# Test split: transform only. The vocabulary and IDF values were already
# learned from X_train, so the test messages are encoded with those.
X_test_tfidf = vectorizer.transform(X_test)


In [None]:
# Train a multinomial Naive Bayes classifier on the TF-IDF training features.
# fit() returns the estimator itself, so construction and training are chained.
nb_model = MultinomialNB().fit(X_train_tfidf, y_train)

# Predict labels for the held-out messages and report the accuracy.
nb_predictions = nb_model.predict(X_test_tfidf)
nb_accuracy = accuracy_score(y_test, nb_predictions)
print("Naive Bayes Accuracy:", nb_accuracy)

Naive Bayes Accuracy: 0.9668161434977578
