In [5]:

import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score
from sklearn.ensemble import VotingClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
import pickle


from sklearn.preprocessing import LabelEncoder

# Load the raw dataset, discard the three unnamed filler columns and give
# the two remaining columns descriptive names.
df = (
    pd.read_csv('spam.csv', encoding='latin-1')
    .drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'])
    .rename(columns={'v1': 'target', 'v2': 'text'})
)

# Replace the label column with integer class codes.
# NOTE(review): presumably the labels are 'ham'/'spam' (-> 0/1); confirm
# against the CSV, since precision_score below treats 1 as the positive class.
encoder = LabelEncoder()
df['target'] = encoder.fit_transform(df['target'])

# Keep only the first occurrence of each fully duplicated row.
df = df.drop_duplicates(keep='first')


# English function words removed before stemming. Built once at import time
# so transform_text does not rebuild the set on every call.
_STOP_WORDS = frozenset([
    'i','me','my','myself','we','our','ours','ourselves','you','your','yours',
    'yourself','yourselves','he','him','his','himself','she','her','hers',
    'herself','it','its','itself','they','them','their','theirs','themselves',
    'what','which','who','whom','this','that','these','those','am','is','are',
    'was','were','be','been','being','have','has','had','having','do','does',
    'did','doing','a','an','the','and','but','if','or','because','as','until',
    'while','of','at','by','for','with','about','against','between','into',
    'through','during','before','after','above','below','to','from','up','down',
    'in','out','on','off','over','under','again','further','then','once','here',
    'there','when','where','why','how','all','any','both','each','few','more',
    'most','other','some','such','no','nor','not','only','own','same','so',
    'than','too','very','s','t','can','will','just','don','should','now'
])

# Every character that is not a lowercase ASCII letter, a digit or whitespace.
_NON_ALNUM_RE = re.compile(r'[^a-z0-9\s]')
# The single trailing suffix stripped by the crude stemmer.
_SUFFIX_RE = re.compile(r'(ing|ly|ed|s)$')


def transform_text(text: str) -> str:
    """Normalise a raw message into a space-separated string of crude stems.

    Steps, in order:
      1. lowercase the text;
      2. delete (not replace with a space) every character that is not
         ``a-z``, ``0-9`` or whitespace, so ``"don't"`` becomes ``"dont"``;
      3. split on whitespace and drop stop words;
      4. strip at most one trailing ``ing``/``ly``/``ed``/``s`` from each
         remaining word;
      5. drop words that became empty in step 4 (e.g. a bare ``"ed"``), so
         the result never contains leading, trailing or doubled spaces.

    Args:
        text: The raw message.

    Returns:
        The surviving stems joined by single spaces; ``""`` if none survive.
    """
    cleaned = _NON_ALNUM_RE.sub('', text.lower())
    kept = (word for word in cleaned.split() if word not in _STOP_WORDS)
    stems = (_SUFFIX_RE.sub('', word) for word in kept)
    # A word that consisted solely of a suffix stems to '', which would
    # otherwise inject stray separators into the joined string.
    return " ".join(stem for stem in stems if stem)

# Add the normalised form of every message as a new column.
df = df.assign(transformed_text=df['text'].apply(transform_text))

# Turn the normalised messages into a dense TF-IDF matrix limited to 3000
# vocabulary terms, and pull the encoded labels out as an array.
# NOTE(review): the vectorizer is fitted on the full dataset before the
# train/test split below, so vocabulary and IDF weights also reflect the
# test rows. Consider fitting on the training portion only.
tfidf = TfidfVectorizer(max_features=3000)
tfidf_matrix = tfidf.fit_transform(df['transformed_text'])
X = tfidf_matrix.toarray()
y = df['target'].to_numpy()

# Hold out 20% of the rows for evaluation; the seed is fixed at 2.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
)


# Baseline: multinomial Naive Bayes on the TF-IDF features.
mnb = MultinomialNB().fit(X_train, y_train)
y_pred = mnb.predict(X_test)

nb_accuracy = accuracy_score(y_test, y_pred)
nb_precision = precision_score(y_test, y_pred)
print("Naive Bayes -> Accuracy:", nb_accuracy)
print("Naive Bayes -> Precision:", nb_precision)


# Soft-voting ensemble over a sigmoid-kernel SVM, Naive Bayes and extra trees.
# probability=True is set on the SVM because soft voting averages the
# members' predicted class probabilities.
svc = SVC(kernel='sigmoid', gamma=1.0, probability=True)
etc = ExtraTreesClassifier(n_estimators=50, random_state=2)
voting = VotingClassifier(
    estimators=[('svm', svc), ('nb', mnb), ('et', etc)],
    voting='soft',
).fit(X_train, y_train)
y_pred = voting.predict(X_test)

voting_accuracy = accuracy_score(y_test, y_pred)
voting_precision = precision_score(y_test, y_pred)
print("Voting Classifier -> Accuracy:", voting_accuracy)
print("Voting Classifier -> Precision:", voting_precision)


# Stacking ensemble: the same three base learners feed a random-forest
# meta-learner.
# NOTE(review): the RandomForestClassifier has no random_state, so these
# metrics may vary between runs.
estimators = [
    ('svm', svc),
    ('nb', mnb),
    ('et', etc),
]
stacking = StackingClassifier(
    estimators=estimators,
    final_estimator=RandomForestClassifier(),
).fit(X_train, y_train)
y_pred = stacking.predict(X_test)

stacking_accuracy = accuracy_score(y_test, y_pred)
stacking_precision = precision_score(y_test, y_pred)
print("Stacking Classifier -> Accuracy:", stacking_accuracy)
print("Stacking Classifier -> Precision:", stacking_precision)


# Persist the fitted TF-IDF vectorizer and the Naive Bayes model.
# The files are opened in `with` blocks so each handle is flushed and closed
# as soon as its dump finishes, even if pickling raises.
with open('vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(tfidf, vectorizer_file)
with open('model.pkl', 'wb') as model_file:
    pickle.dump(mnb, model_file)


Naive Bayes -> Accuracy: 0.9748549323017408
Naive Bayes -> Precision: 1.0
Voting Classifier -> Accuracy: 0.9806576402321083
Voting Classifier -> Precision: 0.9836065573770492
Stacking Classifier -> Accuracy: 0.9796905222437138
Stacking Classifier -> Precision: 0.9534883720930233
