<a href="https://colab.research.google.com/github/LITHUVARSHNI/Codsoftintern_lithu/blob/main/Spam_SMS_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Build an AI model that can classify SMS messages as spam or legitimate. Use techniques like TF-IDF or word embeddings with classifiers like Naive Bayes, Logistic Regression, or Support Vector Machines to identify spam messages.**

IMPORTING THE NECESSARY PACKAGES

In [5]:
import numpy as np
import pandas as pd
import io

LOADING THE DATASET

In [6]:
# Load the raw SMS dataset from the working directory, decoding the file as latin-1.
df = pd.read_csv('spam.csv', encoding='latin-1')

In [7]:
df.sample(5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
2633,ham,That's what I love to hear :V see you sundayis...,,,
2000,ham,But i'll b going 2 sch on mon. My sis need 2 t...,,,
1123,ham,Good morning princess! How are you?,,,
1450,ham,Msg me when rajini comes.,,,
5099,ham,"Ah, well that confuses things, doesnt it? I th...",,,


DIMENSION-NUMBER OF ROWS AND COLUMNS

In [8]:
df.shape

(5572, 5)

REMOVING THE COLUMNS

In [9]:
# Discard the three 'Unnamed' columns; only v1 and v2 are kept.
df = df.drop(columns=['Unnamed: 2','Unnamed: 3','Unnamed: 4'])


RENAMING THE COLUMNS

In [10]:
# Give the two remaining columns descriptive names, then preview a random handful of rows.
df = df.rename(columns={'v1':'target','v2':'text'})
df.sample(5)

Unnamed: 0,target,text
986,ham,I'm in office now . I will call you &lt;#&gt;...
2237,ham,"Give her something to drink, if she takes it a..."
3545,ham,SO IS TH GOWER MATE WHICH IS WHERE I AM!?! HOW...
5114,ham,Argh why the fuck is nobody in town ;_;
2349,ham,Yar else i'll thk of all sorts of funny things.


In [11]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

PREPROCESSING THE DATA

In [12]:
# Replace the string labels with integers. LabelEncoder numbers classes in sorted order,
# so presumably ham -> 0 and spam -> 1 — verify with encoder.classes_.
df['target'] = encoder.fit_transform(df['target'])


In [13]:
# Drop exact duplicate rows, keeping the first occurrence of each
# (5572 -> 5169 rows in the recorded run).
df = df.drop_duplicates(keep='first')


In [14]:
# Class balance check: the recorded run shows 4516 rows labelled 0 and 653 labelled 1,
# so the two classes are heavily imbalanced.
df['target'].value_counts()


target
0    4516
1     653
Name: count, dtype: int64

In [15]:
import nltk


In [16]:
!pip install nltk




In [17]:
# Fetch the tokenizer models and the stopword list used further down.
# 'punkt' serves older NLTK releases; current releases load the tokenizer from 'punkt_tab',
# so both are requested to keep word_tokenize / sent_tokenize working on either.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

CALCULATION OF THE NUMBER OF CHARACTERS, WORDS AND SENTENCES

In [18]:
# Character count of each raw message.
df['num_characters'] = [len(message) for message in df['text']]


In [19]:
# Number of NLTK word tokens in each raw message.
df['num_words'] = [len(nltk.word_tokenize(message)) for message in df['text']]


In [20]:
# Number of NLTK sentence segments in each raw message.
df['num_sentences'] = [len(nltk.sent_tokenize(message)) for message in df['text']]


In [21]:
df[['num_characters','num_words','num_sentences']].describe()



Unnamed: 0,num_characters,num_words,num_sentences
count,5169.0,5169.0,5169.0
mean,78.977945,18.455794,1.965564
std,58.236293,13.324758,1.448541
min,2.0,1.0,1.0
25%,36.0,9.0,1.0
50%,60.0,15.0,1.0
75%,117.0,26.0,2.0
max,910.0,220.0,38.0


In [22]:
# Length statistics restricted to rows whose encoded target is 0.
length_columns = ['num_characters','num_words','num_sentences']
df.loc[df['target'] == 0, length_columns].describe()


Unnamed: 0,num_characters,num_words,num_sentences
count,4516.0,4516.0,4516.0
mean,70.459256,17.123782,1.820195
std,56.358207,13.49397,1.383657
min,2.0,1.0,1.0
25%,34.0,8.0,1.0
50%,52.0,13.0,1.0
75%,90.0,22.0,2.0
max,910.0,220.0,38.0


In [23]:
# Length statistics restricted to rows whose encoded target is 1.
length_columns = ['num_characters','num_words','num_sentences']
df.loc[df['target'] == 1, length_columns].describe()


Unnamed: 0,num_characters,num_words,num_sentences
count,653.0,653.0,653.0
mean,137.891271,27.667688,2.970904
std,30.137753,7.008418,1.488425
min,13.0,2.0,1.0
25%,132.0,25.0,2.0
50%,149.0,29.0,3.0
75%,157.0,32.0,4.0
max,224.0,46.0,9.0


In [24]:
import string
ps = nltk.stem.PorterStemmer()

TEXT CLEANING AND TF-IDF VECTORIZATION

In [25]:
def _english_stopwords():
    """Return NLTK's English stopword list as a frozenset, loading it only once."""
    cached = getattr(_english_stopwords, "_cache", None)
    if cached is None:
        # stopwords.words() re-reads the corpus file on every call, so keep the result around.
        cached = frozenset(nltk.corpus.stopwords.words('english'))
        _english_stopwords._cache = cached
    return cached


def transform_text(text):
    """Normalise one message before vectorization.

    Steps: lower-case the text, tokenize it with ``nltk.word_tokenize``, keep only
    tokens for which ``str.isalnum()`` is true, drop English stopwords, stem the
    remaining tokens with the module-level Porter stemmer ``ps`` and join the
    stems with single spaces.

    Args:
        text: The raw message string.

    Returns:
        A space-separated string of stemmed tokens (empty when no token survives).
    """
    stop_words = _english_stopwords()
    tokens = nltk.word_tokenize(text.lower())

    # isalnum() already rejects every punctuation-only token, so a separate
    # string.punctuation test would never filter anything further.
    kept = [tok for tok in tokens if tok.isalnum() and tok not in stop_words]

    return " ".join(ps.stem(tok) for tok in kept)

In [26]:
transform_text("I'm gonna be home soon and i don't want to talk about this stuff anymore tonight, k? I've cried enough today.")


'gon na home soon want talk stuff anymor tonight k cri enough today'

In [27]:
# Label-based lookup: the message stored under index label 10.
df.loc[10, 'text']

"I'm gonna be home soon and i don't want to talk about this stuff anymore tonight, k? I've cried enough today."

In [28]:
from nltk.stem.porter import PorterStemmer
# NOTE(review): `ps` was already bound to a PorterStemmer in an earlier cell; this rebinds it to a fresh instance.
ps = PorterStemmer()
# Quick sanity check of the stemmer on a single word.
ps.stem('loving')

'love'

In [29]:
df['transformed_text'] = df['text'].apply(transform_text)

In [30]:
# Flatten every transformed message with target == 1 into one list of tokens.
spam_messages = df.loc[df['target'] == 1, 'transformed_text']
spam_corpus = [word for msg in spam_messages for word in msg.split()]

In [31]:
# Flatten every transformed message with target == 0 into one list of tokens.
ham_messages = df.loc[df['target'] == 0, 'transformed_text']
ham_corpus = [word for msg in ham_messages for word in msg.split()]

In [32]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
# NOTE(review): `cv` is not used anywhere in the visible notebook — confirm before removing.
cv = CountVectorizer()
# Cap the TF-IDF vocabulary at the 3000 most frequent terms across the corpus.
tfidf = TfidfVectorizer(max_features=3000)

In [33]:
# NOTE(review): the vectorizer is fitted on every row before the train/test split further down,
# so the vocabulary and IDF weights also see the test messages — consider fitting on the training rows only.
# .toarray() converts the sparse TF-IDF matrix into a dense NumPy array.
X = tfidf.fit_transform(df['transformed_text']).toarray()

In [34]:
y = df['target'].values

In [35]:
from sklearn.model_selection import train_test_split

In [36]:
# Hold out 20% of the rows for testing; random_state pins the shuffle so the split is repeatable.
# NOTE(review): no `stratify=` is passed, so the class ratio may differ between the two splits.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=2)


In [37]:
from sklearn.naive_bayes import MultinomialNB

In [38]:
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score

In [39]:
mnb = MultinomialNB()

In [40]:
# Fit on the training split, then score the held-out test split.
mnb.fit(X_train,y_train)
y_pred2 = mnb.predict(X_test)
print(accuracy_score(y_test,y_pred2))
# Rows of the confusion matrix are true labels, columns are predicted labels.
print(confusion_matrix(y_test,y_pred2))
# Precision only penalises false positives; messages of class 1 that were predicted
# as 0 (second row, first column of the matrix) do not lower it.
print(precision_score(y_test,y_pred2))

0.9709864603481625
[[896   0]
 [ 30 108]]
1.0


In [41]:
import pickle

# Persist the fitted vectorizer and classifier. The context managers make sure
# each file is flushed and closed, which a bare open() passed to dump() never does.
with open('vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(tfidf, vectorizer_file)
with open('model.pkl', 'wb') as model_file:
    pickle.dump(mnb, model_file)