In [41]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [42]:
df=pd.read_csv('spamdata.csv')

In [43]:
print(df.info())
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB
None
  Category                                            Message
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...


In [46]:
spam_keywords = [
    'click here', 'free', 'limited time', 'win', 'congratulations', 
    'exclusive offer', 'urgent', 'act now', '100% guarantee', 'buy now', 'ends soon', 'offer', 'deal ends soon'
]

In [52]:
def is_spam(text):
    if isinstance(text,str):
        for keyword in spam_keywords:
            if keyword in text.lower():
                return 1
    return 0

In [54]:
df['spam'] = df.apply(lambda x: is_spam(x['Message']) or is_spam(x['Category']), axis=1)

In [56]:
print(df['spam'].value_counts())
print(df[['Message','Category','spam']].head(10))

spam
0    5062
1     510
Name: count, dtype: int64
                                             Message Category  spam
0  Go until jurong point, crazy.. Available only ...      ham     0
1                      Ok lar... Joking wif u oni...      ham     0
2  Free entry in 2 a wkly comp to win FA Cup fina...     spam     1
3  U dun say so early hor... U c already then say...      ham     0
4  Nah I don't think he goes to usf, he lives aro...      ham     0
5  FreeMsg Hey there darling it's been 3 week's n...     spam     1
6  Even my brother is not like to speak with me. ...      ham     0
7  As per your request 'Melle Melle (Oru Minnamin...      ham     0
8  WINNER!! As a valued network customer you have...     spam     1
9  Had your mobile 11 months or more? U R entitle...     spam     1


In [73]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [75]:
def preprocess_text(text):
    if not isinstance(text,str):
        return ''
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'\W', ' ', text)
    text = text.lower()
    tokens= word_tokenize(text)
    tokens=[word for word in tokens if word not in stopwords.words('english')]
    return ''.join(tokens)
df['combined_text'] = df['Message'].fillna('') + " " + df['Category'].fillna('')   
df['cleaned_text'] = df['combined_text'].apply(preprocess_text)
print(df[['Message','Category','cleaned_text']].head(10))

                                             Message Category  \
0  Go until jurong point, crazy.. Available only ...      ham   
1                      Ok lar... Joking wif u oni...      ham   
2  Free entry in 2 a wkly comp to win FA Cup fina...     spam   
3  U dun say so early hor... U c already then say...      ham   
4  Nah I don't think he goes to usf, he lives aro...      ham   
5  FreeMsg Hey there darling it's been 3 week's n...     spam   
6  Even my brother is not like to speak with me. ...      ham   
7  As per your request 'Melle Melle (Oru Minnamin...      ham   
8  WINNER!! As a valued network customer you have...     spam   
9  Had your mobile 11 months or more? U R entitle...     spam   

                                        cleaned_text  
0  gojurongpointcrazyavailablebugisngreatworldlae...  
1                              oklarjokingwifuoniham  
2  freeentry2wklycompwinfacupfinaltkts21stmay2005...  
3                     udunsayearlyhorucalreadysayham  
4        

In [77]:
x = df['cleaned_text']
y = df['spam']


In [78]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [83]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)

In [84]:
print(f"Training set size:{len(x_train)}")
print(f"Test set size:{len(x_test)}")

Training set size:4457
Test set size:1115


In [85]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [86]:
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

In [87]:
x_train_tfidf = tfidf_vectorizer.fit_transform(x_train)
x_test_tfidf = tfidf_vectorizer.transform(x_test)

In [88]:
from sklearn.linear_model import LogisticRegression

In [89]:
lr= LogisticRegression()

In [90]:
lr.fit(x_train_tfidf,y_train)

In [93]:
print(f"Training Accuracy: {lr.score(x_train_tfidf, y_train)}")

Training Accuracy: 0.9084586044424501


In [94]:
from sklearn.metrics import classification_report, confusion_matrix

In [96]:
y_pred = lr.predict(x_test_tfidf)

In [97]:
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.91      1.00      0.95      1013
           1       0.00      0.00      0.00       102

    accuracy                           0.91      1115
   macro avg       0.45      0.50      0.48      1115
weighted avg       0.83      0.91      0.86      1115

[[1013    0]
 [ 102    0]]


In [99]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score,  confusion_matrix

In [101]:
rf_model = RandomForestClassifier()
rf_model.fit(x_train_tfidf, y_train)

In [102]:
y_pred_rf = rf_model.predict(x_test_tfidf)

In [107]:
print("Random Forest Classification Report:\n", classification_report(y_test, y_pred_rf))
print("Accuracy Score:", accuracy_score(y_test, y_pred_rf))
print(confusion_matrix(y_test, y_pred))

Random Forest Classification Report:
               precision    recall  f1-score   support

           0       0.93      1.00      0.96      1013
           1       1.00      0.24      0.38       102

    accuracy                           0.93      1115
   macro avg       0.96      0.62      0.67      1115
weighted avg       0.94      0.93      0.91      1115

Accuracy Score: 0.9300448430493273
[[1013    0]
 [ 102    0]]


In [108]:
import xgboost as xgb

In [109]:
xgb_model = xgb.XGBClassifier()
xgb_model.fit(x_train_tfidf, y_train)

In [106]:
y_pred_xgb = xgb_model.predict(x_test_tfidf)

In [110]:
print("XGBoost Classification Report:\n", classification_report(y_test, y_pred_xgb))
print("Accuracy Score:", accuracy_score(y_test, y_pred_xgb))
print(confusion_matrix(y_test, y_pred))

XGBoost Classification Report:
               precision    recall  f1-score   support

           0       0.91      1.00      0.95      1013
           1       0.00      0.00      0.00       102

    accuracy                           0.91      1115
   macro avg       0.45      0.50      0.48      1115
weighted avg       0.83      0.91      0.86      1115

Accuracy Score: 0.9085201793721973
[[1013    0]
 [ 102    0]]


In [111]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

In [112]:
clf = SVC(kernel='rbf', gamma='scale',random_state=42)

In [114]:
clf.fit(x_train_tfidf, y_train)

In [115]:
y_pred = clf.predict(x_test_tfidf)

In [116]:
accuracy = accuracy_score(y_test, y_pred)

In [117]:
print("Accuracy:",accuracy)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.9130044843049328
              precision    recall  f1-score   support

           0       0.91      1.00      0.95      1013
           1       1.00      0.05      0.09       102

    accuracy                           0.91      1115
   macro avg       0.96      0.52      0.52      1115
weighted avg       0.92      0.91      0.88      1115

[[1013    0]
 [  97    5]]


In [118]:
import joblib

In [119]:
joblib.dump(rf_model,'best_rf_model.pkl')

['best_rf_model.pkl']

In [120]:
loaded_model=joblib.load('best_rf_model.pkl')