In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import string
from collections import Counter

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.ensemble import (
    AdaBoostClassifier,
    BaggingClassifier,
    ExtraTreesClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
)
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from wordcloud import WordCloud
from xgboost import XGBClassifier

In [None]:
# Load the dataset from its hard-coded absolute path, decoding the file as
# ISO-8859-1, and preview the first rows.
data = pd.read_csv(
    '/content/spam.csv',
    encoding='ISO-8859-1',
)
data.head()

In [None]:
# (rows, columns) of the frame as loaded.
data.shape

**Data Cleaning**

In [None]:
# Number of missing values in each column.
data.isnull().sum()

In [None]:
# NOTE(review): the result is only displayed -- it is not assigned and
# `inplace` is not set, so `data` itself is left unchanged by this cell.
data.dropna()

In [None]:
# Column dtypes and non-null counts.
data.info()

In [None]:
# Drop the three "Unnamed" columns. Reassigning the result (rather than
# mutating in place) together with errors="ignore" keeps this cell re-runnable:
# executing it a second time no longer raises KeyError for columns that are
# already gone. Only a preview is displayed instead of the whole frame.
data = data.drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], errors="ignore")
data.head()

In [None]:
# (rows, columns) after the column drop.
data.shape

In [None]:
# Rename v1 -> Target and v2 -> Message, then preview seven rows.
data.rename(
    columns=dict(v1="Target", v2="Message"),
    inplace=True,
)
data.head(7)

In [None]:
# Encode the string labels in `Target` as integers. The columns were already
# renamed in the preceding cell, so the rename is not repeated here.
# LabelEncoder numbers the classes in sorted order of their labels.
encoder = LabelEncoder()
data['Target'] = encoder.fit_transform(data['Target'])
# Preview only; the full frame is too large to be a useful cell output.
data.head()

Note: after encoding, `Target` is 0 for ham and 1 for spam.

In [None]:
# Number of missing values in each remaining column.
data.isnull().sum()

In [None]:
# Number of rows that are exact duplicates of an earlier row.
data.duplicated().sum()

In [None]:
# Keep the first occurrence of every duplicated row and discard the rest.
data = data.drop_duplicates(keep='first')

In [None]:
# Duplicates were already dropped in the previous cell, so running the same
# drop again would be a no-op. Check the invariant instead so a regression is
# caught here rather than silently skewing the statistics below.
assert not data.duplicated().any(), "duplicate rows remain after drop_duplicates"

In [None]:
# Count messages per class in a fixed order (0 = ham, 1 = spam) so the
# hard-coded labels, colours and explode offsets always line up with the right
# wedge. value_counts() on its own orders by frequency, which would silently
# swap the labels if spam ever outnumbered ham.
class_counts = data['Target'].value_counts().reindex([0, 1], fill_value=0)
explode = [0, 0.5]
mycolors = ["green", "hotpink"]
plt.pie(class_counts, labels=['ham', 'spam'], autopct="%0.2f", explode=explode, colors=mycolors)
plt.title("Share of ham vs. spam messages")
plt.show()


In [None]:
# Character count of each message.
data['num_characters'] = data['Message'].map(len)

In [None]:
 nltk.download('punkt')
data['num_words'] = data['Message'].apply(lambda x:len(nltk.word_tokenize(x)))

In [None]:
# Sentence count of each message, using NLTK's sentence tokenizer.
data['num_sentences'] = data['Message'].map(nltk.sent_tokenize).map(len)

In [None]:
# Summary statistics of the three length features over all messages.
data[['num_characters','num_words','num_sentences']].describe()

In [None]:
# Length statistics restricted to rows with Target == 0.
data.loc[data['Target'] == 0, ['num_characters', 'num_words', 'num_sentences']].describe()

In [None]:
# Length statistics restricted to rows with Target == 1.
data.loc[data['Target'] == 1, ['num_characters', 'num_words', 'num_sentences']].describe()

Data Preprocessing:

1. Lowercasing
2. Tokenization
3. Removing special characters
4. Removing stop words and punctuation
5. Stemming

In [None]:
def transform_text(Message):
    """Normalise one message into a space-separated string of stemmed tokens.

    Steps: lower-case the text, tokenize it with ``nltk.word_tokenize``, keep
    only alphanumeric tokens, drop English stop words, and stem what remains
    with the notebook-level ``ps`` stemmer.

    Parameters
    ----------
    Message : str
        Raw message text.

    Returns
    -------
    str
        The surviving stems joined by single spaces ("" when nothing survives).

    Notes
    -----
    Relies on the module-level ``ps`` object and on the NLTK tokenizer and
    ``stopwords`` resources being available before the first call.
    """
    # Build the stop-word lookup once per message. Previously the stop-word
    # list was rebuilt and scanned linearly for every single token.
    stop_words = set(stopwords.words('english'))

    tokens = nltk.word_tokenize(Message.lower())

    # isalnum() already rejects punctuation-only tokens, so no separate
    # string.punctuation test is needed after it.
    kept = [tok for tok in tokens if tok.isalnum() and tok not in stop_words]

    return " ".join(ps.stem(tok) for tok in kept)

In [None]:
# Stemmer instance that transform_text looks up by the global name `ps`.
ps = PorterStemmer()

In [None]:
import nltk

# Fetch the stop-word corpus that transform_text reads, then store the
# normalised form of every message in a new column.
nltk.download('stopwords')
data['transformed_text'] = data['Message'].map(transform_text)

In [None]:
# Word-cloud renderer configuration: 500x500 canvas, white background,
# minimum font size 10.
wc = WordCloud(width=500,height=500,min_font_size=10,background_color='white')

In [None]:
# Build the word cloud for Target == 1 messages and draw it straight away.
# `generate` returns the WordCloud it was called on, so `spam_wc` is the same
# object as `wc`; rendering it in this cell guarantees the spam cloud is shown
# before `wc` is reused for anything else.
spam_text = data[data['Target'] == 1]['transformed_text'].str.cat(sep=" ")
spam_wc = wc.generate(spam_text)
plt.figure(figsize=(6, 6))
plt.imshow(spam_wc.to_array())
plt.axis('off')
plt.title('Most frequent words in spam messages')
plt.show()

In [None]:
# `wc.generate` returns `wc` itself, so generating the ham cloud on the shared
# instance would make `ham_wc` and `spam_wc` the same object and overwrite the
# spam cloud. Use a separate WordCloud configured like `wc` instead.
ham_text = data[data['Target'] == 0]['transformed_text'].str.cat(sep=" ")
ham_wc = WordCloud(
    width=wc.width,
    height=wc.height,
    min_font_size=wc.min_font_size,
    background_color=wc.background_color,
).generate(ham_text)
plt.figure(figsize=(6, 6))
plt.imshow(ham_wc.to_array())
plt.axis('off')
plt.title('Most frequent words in ham messages')
plt.show()

In [None]:
# Flatten the transformed Target == 1 messages into one list of tokens.
spam_messages = data[data['Target'] == 1]['transformed_text'].tolist()
spam_corpus = [word for msg in spam_messages for word in msg.split()]

In [None]:
# Total number of tokens collected from the Target == 1 messages.
len(spam_corpus)

In [None]:
# Flatten the transformed Target == 0 messages into one list of tokens.
ham_messages = data[data['Target'] == 0]['transformed_text'].tolist()
ham_corpus = [word for msg in ham_messages for word in msg.split()]

In [None]:
# Total number of tokens collected from the Target == 0 messages.
len(ham_corpus)

Model Building

In [None]:
# Two text vectorizers: a plain bag-of-words counter and a TF-IDF vectorizer
# capped at 3000 features.
# NOTE(review): `cv` is not referenced by any later cell visible in this notebook.
cv = CountVectorizer()
tfidf = TfidfVectorizer(max_features=3000)

In [None]:
# Fit the TF-IDF vectorizer on every transformed message and convert the
# result to a dense array.
# NOTE(review): the vectorizer is fitted on the full dataset before the
# train/test split further down, so test messages influence the learned
# vocabulary and weights -- confirm this is acceptable for the reported scores.
X = tfidf.fit_transform(data['transformed_text']).toarray()

In [None]:
# Label vector as a NumPy array, aligned row-for-row with X.
y = data['Target'].to_numpy()

In [None]:
# Hold out 20% of the rows for testing; random_state=2 fixes the shuffle.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=2)

In [None]:
# Three Naive Bayes variants, each evaluated in one of the next three cells.
gnb = GaussianNB()
mnb = MultinomialNB()
bnb = BernoulliNB()

In [None]:
# Gaussian NB: fit on the training split, then print accuracy, confusion
# matrix and precision (in that order) for the held-out split.
y_pred1 = gnb.fit(X_train, y_train).predict(X_test)
for metric in (accuracy_score, confusion_matrix, precision_score):
    print(metric(y_test, y_pred1))

In [None]:
# Multinomial NB: fit on the training split, then print accuracy, confusion
# matrix and precision (in that order) for the held-out split.
y_pred2 = mnb.fit(X_train, y_train).predict(X_test)
for metric in (accuracy_score, confusion_matrix, precision_score):
    print(metric(y_test, y_pred2))

In [None]:
# Bernoulli NB: fit on the training split, then print accuracy, confusion
# matrix and precision (in that order) for the held-out split.
y_pred3 = bnb.fit(X_train, y_train).predict(X_test)
for metric in (accuracy_score, confusion_matrix, precision_score):
    print(metric(y_test, y_pred3))

In [None]:
# Candidate classifiers for the benchmark. Every estimator that accepts a
# seed here uses random_state=2, and all ensembles use 50 estimators.
svc = SVC(kernel='sigmoid', gamma=1.0)
knc = KNeighborsClassifier()
# Rebinds `mnb` to a new, unfitted MultinomialNB; the instance fitted in the
# earlier Naive Bayes cell is no longer reachable under this name.
mnb = MultinomialNB()
dtc = DecisionTreeClassifier(max_depth=5)
lrc = LogisticRegression(solver='liblinear', penalty='l1')
rfc = RandomForestClassifier(n_estimators=50, random_state=2)
abc = AdaBoostClassifier(n_estimators=50, random_state=2)
bc = BaggingClassifier(n_estimators=50, random_state=2)
etc = ExtraTreesClassifier(n_estimators=50, random_state=2)
gbdt = GradientBoostingClassifier(n_estimators=50,random_state=2)
xgb = XGBClassifier(n_estimators=50,random_state=2)

In [None]:
# Display name -> estimator. The benchmark loop iterates this mapping, and
# the results table uses its keys as the algorithm names.
clfs = {
    'SVC' : svc,
    'KN' : knc,
    'NB': mnb,
    'DT': dtc,
    'LR': lrc,
    'RF': rfc,
    'AdaBoost': abc,
    'BgC': bc,
    'ETC': etc,
    'GBDT':gbdt,
    'xgb':xgb
}

In [None]:
def train_classifier(clf,X_train,y_train,X_test,y_test):
    """Fit ``clf`` on the training split and score it on the test split.

    Calls ``clf.fit(X_train, y_train)``, predicts on ``X_test`` and returns an
    ``(accuracy, precision)`` tuple computed with ``accuracy_score`` and
    ``precision_score`` against ``y_test``.
    """
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)
    return (
        accuracy_score(y_test, predictions),
        precision_score(y_test, predictions),
    )

In [None]:
# Single run of the helper on the SVC model; displays (accuracy, precision).
train_classifier(svc,X_train,y_train,X_test,y_test)

In [None]:
# Train and score every classifier in `clfs`, collecting the metrics in the
# same order as the dictionary and echoing them as each model finishes.
accuracy_scores, precision_scores = [], []

for name, clf in clfs.items():
    scores = train_classifier(clf, X_train, y_train, X_test, y_test)
    current_accuracy, current_precision = scores

    accuracy_scores.append(current_accuracy)
    precision_scores.append(current_precision)

    print("For ",name)
    print("Accuracy - ",current_accuracy)
    print("Precision - ",current_precision)

In [None]:
# Summarise the benchmark as a table, best precision first, and display it.
# The name is spelled out in full (it was missing its leading "p").
performance_data = pd.DataFrame({
    'Algorithm': list(clfs.keys()),
    'Accuracy': accuracy_scores,
    'Precision': precision_scores,
}).sort_values('Precision', ascending=False)
performance_data

In [None]:
import pickle

# Persist the TF-IDF vectorizer and the MultinomialNB model (the `mnb`
# instance that was fitted as part of the `clfs` benchmark loop). The context
# managers close each file as soon as it is written, so the pickles are
# flushed to disk and no file handle is leaked.
with open('vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(tfidf, vectorizer_file)
with open('model.pkl', 'wb') as model_file:
    pickle.dump(mnb, model_file)