<a href="https://colab.research.google.com/github/Lakshitalearning/SpamFortress/blob/main/Code_SMS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **IMPORTING LIBRARIES AND DATASET**

In [None]:
#importing libraries
!pip install nltk
import nltk
nltk.download('punkt')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
print(np.__version__)
print(pd.__version__)
print(sns.__version__)

1.25.2
2.0.3
0.13.1


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Load the raw SMS dataset into `dataset`.
# 'spam.csv' is resolved relative to the current working directory, so the file has to be
# placed there before this cell runs; the recorded run of this cell failed with
# FileNotFoundError because it was missing. The file is decoded as latin-1.
dataset = pd.read_csv('spam.csv',encoding='latin-1')

FileNotFoundError: [Errno 2] No such file or directory: 'spam.csv'

In [None]:
dataset.head()

# **DATA CLEANING**

In [None]:
#To get the knowledge of null and not null values present in data
dataset.info()

In [None]:
# The last three columns contain many null values, so they are dropped.
# errors='ignore' keeps this cell safe to re-run: without it, a second execution raises
# KeyError because the columns are already gone after the first in-place drop.
dataset.drop(columns=['Unnamed: 2','Unnamed: 3','Unnamed: 4'],inplace=True,errors='ignore')

In [None]:
dataset.head()

In [None]:
#renaming column v1 and v2
dataset.rename(columns={'v1':'target','v2':'data'},inplace=True)

In [None]:
#checking for missing value if there in dependent and independent variable
dataset.isnull().sum()

In [None]:
#checking for duplicate values
dataset.duplicated().sum()

In [None]:
# Remove duplicated rows, keeping the first occurrence of each.
# inplace=True mutates `dataset` directly instead of returning a new frame.
dataset.drop_duplicates(keep='first',inplace=True)
dataset.head()

In [None]:
# Replace the string labels in 'target' with integers using scikit-learn's LabelEncoder.
# LabelEncoder numbers classes in sorted order, so - assuming the raw labels are
# 'ham' and 'spam' (TODO confirm against the CSV) - ham becomes 0 and spam becomes 1.
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
dataset['target']=le.fit_transform(dataset['target'])   # dependent variable

# **Exploratory data analysis**

In [None]:
dataset.info()

In [None]:
# Pie chart of the percentage of spam and ham messages in the data.
# value_counts() orders its result by frequency, not by label value, so the slice labels
# are derived from the counted index rather than hard-coded; a fixed ['spam','ham'] list
# mislabels the slices whenever ham (encoded 0) is the larger class.
import matplotlib.pyplot as plt
class_counts=dataset['target'].value_counts()
# `le` is the LabelEncoder fitted on 'target'; inverse_transform maps the encoded
# values in the index back to their original label strings.
plt.pie(class_counts,labels=le.inverse_transform(class_counts.index),autopct='%0.2f')
plt.show()

In [None]:
# Derive three size features from the raw message text:
# number of characters, number of sentences and number of word tokens.
messages=dataset['data']
dataset['total_characters']=messages.map(len)
dataset['total_sentences']=messages.map(lambda text:len(nltk.sent_tokenize(text)))
dataset['total_words']=messages.map(lambda text:len(nltk.word_tokenize(text)))

dataset.head()

In [None]:
#to describe 3 columns
dataset[['total_characters','total_words','total_sentences']].describe()

In [None]:
# describing about ham(not spam)
dataset[dataset['target']==0][['total_characters','total_words','total_sentences']].describe()

In [None]:
# describing about spam( spam)
dataset[dataset['target']==1][['total_characters','total_words','total_sentences']].describe()

In [None]:
# Overlay the character-count distributions of the two classes on one figure:
# target 0 (ham) in blue, target 1 (spam) in red.
import seaborn as sns
plt.figure(figsize=(12,6))
ham_lengths=dataset.loc[dataset['target']==0,'total_characters']
spam_lengths=dataset.loc[dataset['target']==1,'total_characters']
sns.histplot(ham_lengths,color='blue')
sns.histplot(spam_lengths,color='red')


In [None]:
sns.pairplot(dataset,hue='target')

# **DATA PRE-PROCESSING**

In [None]:
#how to use vectorisation over here

#lower case, tokenisation, removing special characters, removing stop words and punctuations , stemming
!pip install nltk
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import string

#defining the text-preprocessing function (each step handled separately)

_TRANSFORM_RESOURCES={}

def _transform_resources():
    """Return the (stop-word set, stemmer) pair, building it on first use only."""
    if not _TRANSFORM_RESOURCES:
        # A frozenset gives constant-time membership tests; the list returned by
        # stopwords.words() would otherwise be rebuilt and scanned for every token.
        _TRANSFORM_RESOURCES['stop_words']=frozenset(stopwords.words('english'))
        _TRANSFORM_RESOURCES['stemmer']=nltk.PorterStemmer()
    return _TRANSFORM_RESOURCES['stop_words'],_TRANSFORM_RESOURCES['stemmer']

def transform_data(data):
    """Normalise one message into a space-separated string of stemmed tokens.

    Steps: lower-case the text, tokenise it with nltk.word_tokenize, keep only
    purely alphanumeric tokens, drop English stop words, then Porter-stem the rest.

    Parameters
    ----------
    data : str
        Raw message text.

    Returns
    -------
    str
        The surviving stemmed tokens joined by single spaces ("" if none survive).
    """
    stop_words,stemmer=_transform_resources()
    tokens=nltk.word_tokenize(data.lower())
    # isalnum() already rejects every punctuation token, so no separate
    # string.punctuation check is needed after it.
    return " ".join(
        stemmer.stem(token)
        for token in tokens
        if token.isalnum() and token not in stop_words
    )

dataset['transformed_data']=dataset['data'].apply(transform_data)

dataset.head()

In [None]:
# Word cloud of the spam messages (target == 1), built from the pre-processed text.
from wordcloud import WordCloud
wc=WordCloud(width=500,height=500,min_font_size=10,background_color='white')
# All transformed spam messages are concatenated into one space-separated string for generate().
spam_wc=wc.generate(dataset[dataset['target']==1]['transformed_data'].str.cat(sep=" "))
plt.figure(figsize=(15,6))
plt.imshow(spam_wc)

In [None]:
# Word cloud for the ham messages (target == 0).
# WordCloud.generate() returns the instance it was called on, so calling it on the shared
# `wc` object would make ham_wc and spam_wc the same object and overwrite the spam cloud.
# A fresh WordCloud with the same settings keeps the two clouds independent.
ham_cloud_source=dataset[dataset['target']==0]['transformed_data'].str.cat(sep=" ")
ham_wc=WordCloud(width=500,height=500,min_font_size=10,background_color='white').generate(ham_cloud_source)
plt.figure(figsize=(15,6))
plt.imshow(ham_wc)


In [None]:
# Flatten every pre-processed spam message into a single list of individual words.
spam_messages=dataset[dataset['target']==1]['transformed_data'].tolist()
spam_corpus=[word for msg in spam_messages for word in msg.split()]


In [None]:
len(spam_corpus)

In [None]:
# Bar chart of the 30 most frequent words across the spam messages.
from collections import Counter
top_spam_words=Counter(spam_corpus).most_common(30)
spam_corpus_dataset=pd.DataFrame(top_spam_words,columns=['Word','Frequency'])
sns.barplot(data=spam_corpus_dataset,x='Word',y='Frequency')
plt.xticks(rotation='vertical')
plt.show()

In [None]:
# Same analysis for ham: flatten the pre-processed ham messages into a word list,
# then chart the 30 most frequent words.
ham_messages=dataset[dataset['target']==0]['transformed_data'].tolist()
ham_corpus=[word for msg in ham_messages for word in msg.split()]

from collections import Counter
top_ham_words=Counter(ham_corpus).most_common(30)
ham_corpus_dataset=pd.DataFrame(top_ham_words,columns=['Word','Frequency'])
sns.barplot(data=ham_corpus_dataset,x='Word',y='Frequency')
plt.xticks(rotation='vertical')
plt.show()

# **MODEL BUILD**

In [None]:
# ML models need numeric input, so the text is converted to vectors
# (candidate methods: bag of words, TF-IDF, word2vec). TF-IDF is the one used here.
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
cv=CountVectorizer()   # bag-of-words vectoriser; created but not used in the cells visible here
tfidf=TfidfVectorizer(max_features=3000)   # vocabulary limited to 3000 features
# NOTE(review): the vectoriser is fitted on the whole dataset before the train/test split,
# so its vocabulary and IDF weights also reflect the test messages - consider fitting on
# the training split only.
X=tfidf.fit_transform(dataset['transformed_data']).toarray()   # dense array of TF-IDF features

In [None]:
y=dataset['target'].values

In [None]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=2)

In [None]:
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score

In [None]:
gnb=GaussianNB()
mnb=MultinomialNB()
bnb=BernoulliNB()

In [None]:
# Gaussian Naive Bayes: fit on the training split, predict the test split and
# print accuracy, confusion matrix and precision, in that order.
gnb.fit(X_train,y_train)
y_pred1=gnb.predict(X_test)
for metric in (accuracy_score,confusion_matrix,precision_score):
  print(metric(y_test,y_pred1))

In [None]:
# Multinomial Naive Bayes - noted as giving the highest precision with TF-IDF features.
# Prints accuracy, confusion matrix and precision, in that order.
mnb.fit(X_train,y_train)
y_pred2=mnb.predict(X_test)
for metric in (accuracy_score,confusion_matrix,precision_score):
  print(metric(y_test,y_pred2))

In [None]:
# Bernoulli Naive Bayes: fit on the training split, predict the test split and
# print accuracy, confusion matrix and precision, in that order.
bnb.fit(X_train,y_train)
y_pred3=bnb.predict(X_test)
for metric in (accuracy_score,confusion_matrix,precision_score):
  print(metric(y_test,y_pred3))

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

In [None]:
# Instantiate one (unfitted) estimator per algorithm to be compared.
# The ensemble models share n_estimators=50 and random_state=2 so their runs are repeatable.
svc=SVC(kernel='sigmoid',gamma=1.0)
knc=KNeighborsClassifier()
# Rebinds `mnb` to a fresh MultinomialNB, replacing the instance fitted in the earlier Naive Bayes cell.
mnb=MultinomialNB()
dtc=DecisionTreeClassifier(max_depth=5)
lrc=LogisticRegression(solver='liblinear',penalty='l1')
rfc=RandomForestClassifier(n_estimators=50,random_state=2)
abc=AdaBoostClassifier(n_estimators=50,random_state=2)
bc=BaggingClassifier(n_estimators=50,random_state=2)
etc=ExtraTreesClassifier(n_estimators=50,random_state=2)
gbdt=GradientBoostingClassifier(n_estimators=50,random_state=2)
xgb=XGBClassifier(n_estimators=50,random_state=2)

In [None]:
# Registry of the classifiers to compare, keyed by a short display name.
# The insertion order of this dict is the order in which the models are trained and reported.
clfs={
    'SVC':svc,
    'KNN':knc,
    'MNB':mnb,
    'DT': dtc,
    'LR':lrc,
    'RFC':rfc,
    'ABC':abc,
    'BC':bc,
    'ETC':etc,
    'GBDT':gbdt,
    'XGB':xgb

}

In [None]:
def train_classifier(clf, X_train, y_train, X_test, y_test):
    """Fit `clf` on the training split and score it on the test split.

    Parameters
    ----------
    clf : estimator exposing fit(X, y) and predict(X)
    X_train, y_train : training features and labels
    X_test, y_test : held-out features and labels

    Returns
    -------
    tuple
        (accuracy, precision) of the predictions made for X_test.
    """
    clf.fit(X_train, y_train)
    predicted = clf.predict(X_test)
    return accuracy_score(y_test, predicted), precision_score(y_test, predicted)

In [None]:
# Train and score every classifier in `clfs`, printing each result and collecting the
# scores in the same order as the dictionary's keys.
accuracy_scores,precision_scores=[],[]

for name,clf in clfs.items():
  current_accuracy,current_precision=train_classifier(
      clf,X_train,y_train,X_test,y_test)
  for caption,value in (("For ",name),("Accuracy - ",current_accuracy),("Precision - ",current_precision)):
    print(caption,value)
  accuracy_scores.append(current_accuracy)
  precision_scores.append(current_precision)

In [None]:
# Tabulate the per-algorithm scores, sorted with the highest precision first.
performance_dataset=pd.DataFrame({
    'Algorithm':clfs.keys(),
    'Accuracy':accuracy_scores,
    'Precision':precision_scores,
})
performance_dataset=performance_dataset.sort_values('Precision',ascending=False)
performance_dataset

# **Model Improvement**

In [None]:
# changing vectorisation method
# changing their max features
# appending an extra feature column, such as the character count
#     X=np.hstack((X, dataset['total_characters'].values.reshape(-1,1)))
#     y=dataset['target'].values
# using voting classifier
# using stacking

In [None]:
# #voting classifier
# from sklearn.ensemble import VotingClassifier
# svc=SVC(kernel='sigmoid',gamma=1.0,probability=True)
# mnb=MultinomialNB()
# etc=ExtraTreesClassifier(n_estimators=50,random_state=2)
# voting=VotingClassifier(estimators=[('svm',svc),('nb',mnb),('et',etc)],voting='soft')
# voting.fit(X_train,y_train)
# y_pred=voting.predict(X_test)
# print("Accuracy",accuracy_score(y_test,y_pred))
# print("Precision",precision_score(y_test,y_pred))

In [None]:
# #applying stacking
# estimators=[('svm',svc),('nb',mnb),('et',etc)]
# final_estimator=RandomForestClassifier()
# from sklearn.ensemble import StackingClassifier
# clf=StackingClassifier(estimators=estimators,final_estimator=final_estimator)
# clf.fit(X_train,y_train)
# y_pred=clf.predict(X_test)
# print("Accuracy",accuracy_score(y_test,y_pred))
# print("Precision",precision_score(y_test,y_pred))

In [None]:
import pickle
# Persist the fitted TF-IDF vectoriser and the MultinomialNB model.
# Every file is opened in a `with` block so the handle is flushed and closed
# deterministically; passing a bare open(...) to pickle.dump leaves the file object unclosed.
with open('vectorizer.pkl','wb') as f:
    pickle.dump(tfidf,f)
# The same model is written to both 'model.pkl' and 'trained_model.pkl'.
for model_path in ('model.pkl','trained_model.pkl'):
    with open(model_path,'wb') as f:
        pickle.dump(mnb,f)