# Importing the liberaries 

In [44]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix




# Loding the dataset 

In [45]:
# Load data
data = pd.read_csv('spam.csv', encoding='latin-1')
print(data.head())
print(data.columns)

     v1                                                 v2 Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN  
4        NaN        NaN  
Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')


In [46]:
data = data[['v1', 'v2']]
data.columns = ['label', 'message']

In [47]:
# Convert labels to binary (0 for ham, 1 for spam)
data['label'] = data['label'].map({'ham': 0, 'spam': 1})

In [48]:
data.isnull().sum()

label      0
message    0
dtype: int64

# Text Preprocessing

### Convert text to lowercase. Remove punctuation and special characters.Tokenize the messages (split into words).

In [49]:
from nltk.corpus import stopwords
sw = stopwords.words('english')

def stopword(text) : 
    txt = [word.lower() for word in text.split() if word.lower() not in sw]
    return txt 

In [50]:
data['message']=data['message'].apply(stopword)
data.head()

Unnamed: 0,label,message
0,0,"[go, jurong, point,, crazy.., available, bugis..."
1,0,"[ok, lar..., joking, wif, u, oni...]"
2,1,"[free, entry, 2, wkly, comp, win, fa, cup, fin..."
3,0,"[u, dun, say, early, hor..., u, c, already, sa..."
4,0,"[nah, think, goes, usf,, lives, around, though]"


### Apply stemming

In [51]:
from nltk.stem.snowball import SnowballStemmer 

ss = SnowballStemmer("english")

def stemming(text) : 
    text = [ss.stem(word) for word in text if word.split()]
    return "".join(text)

In [52]:
data['message']=data['message'].apply(stemming)
data.head()

Unnamed: 0,label,message
0,0,"gojurongpoint,crazy..availbugingreatworldlaebu..."
1,0,oklar...jokewifuoni...
2,1,freeentri2wklicompwinfacupfinaltkts21stmay2005...
3,0,udunsayearlihor...ucalreadisay...
4,0,"nahthinkgoeusf,livearoundthough"


# Vectorization 

### TF-IDF

In [53]:
tfid_vect = TfidfVectorizer()

tfid_matrix = tfid_vect.fit_transform(data['message'])

print(f"Type :{type(tfid_matrix)} , Matrix at 0 : {tfid_matrix[0]} , Shape : {tfid_matrix.shape}")

Type :<class 'scipy.sparse._csr.csr_matrix'> , Matrix at 0 : <Compressed Sparse Row sparse matrix of dtype 'float64'
	with 4 stored elements and shape (1, 11889)>
  Coords	Values
  (0, 3730)	0.5056391989470028
  (0, 2228)	0.48268727087494234
  (0, 1057)	0.5056391989470028
  (0, 1882)	0.5056391989470028 , Shape : (5572, 11889)


# Split and train dataset

In [55]:
X = tfid_matrix  
y = data['label']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = MultinomialNB()
model.fit(X_train, y_train)


0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [58]:

X_train.shape,y_train.shape,X_test.shape,y_test.shape

((4457, 11889), (4457,), (1115, 11889), (1115,))

In [59]:
y_pred = model.predict(X_test)


In [60]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
# Classification Report (includes precision, recall, f1-score)
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.9210762331838565

Confusion Matrix:
 [[965   0]
 [ 88  62]]

Classification Report:
               precision    recall  f1-score   support

           0       0.92      1.00      0.96       965
           1       1.00      0.41      0.58       150

    accuracy                           0.92      1115
   macro avg       0.96      0.71      0.77      1115
weighted avg       0.93      0.92      0.91      1115



# Saving the Model

In [62]:
import pickle

# Save the trained model
pickle.dump(model, open("spam_classifier_model.pkl", "wb"))

# Save the TF-IDF vectorizer
pickle.dump(tfid_vect, open("tfidf_vectorizer.pkl", "wb"))

print("Model and vectorizer saved successfully!")


Model and vectorizer saved successfully!
