# SPAM SMS DETECTION

Build an AI model that classifies SMS messages as spam or
legitimate (ham). Use techniques such as TF-IDF or word embeddings with
classifiers such as Naive Bayes, Logistic Regression, or Support Vector
Machines to identify spam messages.

In [11]:
## Importing libraries and models

import pandas as pd
import numpy as np 
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score,f1_score,recall_score,precision_score
from sklearn.model_selection import train_test_split 

In [12]:
# Load only the first two CSV columns and give them readable names.
# NOTE: because `names` is supplied without `header=0`, the file's own
# header line is read as an ordinary data row (it is removed in a later cell).
df = pd.read_csv(
    'spam.csv',
    names=['label', 'message'],
    usecols=[0, 1],
    encoding='ISO-8859-1',
)

In [13]:
# Preview the first rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,label,message
0,v1,v2
1,ham,"Go until jurong point, crazy.. Available only ..."
2,ham,Ok lar... Joking wif u oni...
3,spam,Free entry in 2 a wkly comp to win FA Cup fina...
4,ham,U dun say so early hor... U c already then say...


In [14]:
# Remove the CSV's original header line ('v1', 'v2'), which was loaded as a
# data row. Filtering on its content instead of dropping index label 0 in
# place keeps this cell safe to re-run: a second run is a no-op rather than a
# KeyError, and no real message is lost if the header row is absent.
df = df[df['label'] != 'v1']

In [15]:
# Preview the frame again to confirm the stray header row is gone.
df.head(n=5)

Unnamed: 0,label,message
1,ham,"Go until jurong point, crazy.. Available only ..."
2,ham,Ok lar... Joking wif u oni...
3,spam,Free entry in 2 a wkly comp to win FA Cup fina...
4,ham,U dun say so early hor... U c already then say...
5,ham,"Nah I don't think he goes to usf, he lives aro..."


In [16]:
# Features: the raw message text.
X = df['message']

In [17]:
## Target encoding: spam -> 0, ham -> 1
# Rows whose label contains 'spam' become 0; every other row becomes 1.
spam_mask = df['label'].str.contains('spam')
y = np.where(spam_mask, 0, 1)

In [18]:
## Splitting the data
# Hold out 30% of the messages for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=.3,
    random_state=42,
)

In [19]:
## Data encoding
# Learn the TF-IDF vocabulary (capped at 5000 terms, English stop words removed)
# from the training messages only, then apply the same fitted vectorizer to the
# test messages.
# NOTE: X_train / X_test are rebound from raw text to TF-IDF matrices here, so
# this cell must be run exactly once after the split cell.
tfidf = TfidfVectorizer(max_features=5000, stop_words='english')
X_train, X_test = tfidf.fit_transform(X_train), tfidf.transform(X_test)

In [20]:
## Model training and metric evaluation

# Candidate classifiers, keyed by the name printed in the report.
models = {
    "Logistic-Regression": LogisticRegression(),
    "Naive-Bayes": MultinomialNB(),
    "Support-Vector-Machine": SVC(kernel="linear"),
}


def _weighted_scores(y_true, y_pred):
    """Return accuracy and weighted F1 / precision / recall for one prediction set.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Labels predicted by the model.

    Returns
    -------
    dict
        Maps the printed metric name to its value, in report order.
        ``zero_division=0`` is passed to every metric that accepts it, so
        train and test sets are scored with identical settings.
    """
    return {
        'Accuracy': accuracy_score(y_true, y_pred),
        'F1 score': f1_score(y_true, y_pred, average='weighted', zero_division=0),
        'Precision': precision_score(y_true, y_pred, average='weighted', zero_division=0),
        'Recall': recall_score(y_true, y_pred, average='weighted', zero_division=0),
    }


def _print_scores(title, scores):
    """Print a section title followed by one '- <metric>: <value>' line per score."""
    print(title)
    for metric_name, value in scores.items():
        print('- {}: {:.4f}'.format(metric_name, value))


for name, model in models.items():
    model.fit(X_train, y_train)

    ## Make Prediction
    y_predict_train = model.predict(X_train)
    y_predict_test = model.predict(X_test)

    ## Score both splits with the same metric settings
    train_scores = _weighted_scores(y_train, y_predict_train)
    test_scores = _weighted_scores(y_test, y_predict_test)

    print(name)
    _print_scores('Model performance for Training set', train_scores)
    print('----------------------------------')
    _print_scores('Model performance for Test set', test_scores)

    print('='*35)
    print('\n')
    
    


Logistic-Regression
Model performance for Training set
- Accuracy: 0.9718
- F1 score: 0.9704
- Precision: 0.9725
- Recall: 0.9718
----------------------------------
Model performance for Test set
- Accuracy: 0.9551
- F1 score: 0.9515
- Precision: 0.9563
- Recall: 0.9551


Naive-Bayes
Model performance for Training set
- Accuracy: 0.9851
- F1 score: 0.9848
- Precision: 0.9854
- Recall: 0.9851
----------------------------------
Model performance for Test set
- Accuracy: 0.9713
- F1 score: 0.9698
- Precision: 0.9722
- Recall: 0.9713


Support-Vector-Machine
Model performance for Training set
- Accuracy: 0.9972
- F1 score: 0.9972
- Precision: 0.9972
- Recall: 0.9972
----------------------------------
Model performance for Test set
- Accuracy: 0.9797
- F1 score: 0.9791
- Precision: 0.9797
- Recall: 0.9797


