In [None]:
import numpy as np
import pandas as pd
!pip install chardet
import chardet
import matplotlib.pyplot as plt

In [None]:
# Read the CSV as raw bytes and let chardet guess its text encoding.
with open('/kaggle/input/sms-spam-collection-dataset/spam.csv', 'rb') as f:
    raw_bytes = f.read()
detection = chardet.detect(raw_bytes)
encoding = detection['encoding']

In [None]:
# Load the dataset, decoding it with the encoding detected by chardet.
df = pd.read_csv('/kaggle/input/sms-spam-collection-dataset/spam.csv', encoding=encoding)

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

### Data Cleaning

In [None]:
# Discard the three 'Unnamed' columns (modifies df in place).
df.drop(columns=['Unnamed: 2','Unnamed: 3','Unnamed: 4'],inplace=True)

In [None]:
# Give the two remaining columns descriptive names (modifies df in place).
df.rename(columns={'v1':'target','v2':'text'},inplace=True)

In [None]:
# Encode the label column as integers: ham -> 0, spam -> 1.
# Note: Series.map returns NaN for values missing from the dict, so running
# this cell a second time on the already-encoded column turns every label into NaN.
df['target']=df['target'].map({'ham':0,'spam':1})

In [None]:
# Count rows that are exact duplicates of an earlier row.
df.duplicated().sum()

In [None]:
# Remove duplicate rows in place, keeping the first occurrence of each.
df.drop_duplicates(keep='first',inplace=True)

In [None]:
# Preview the frame after dropping columns, renaming, encoding and de-duplicating.
df.head()

### EDA

In [None]:
# Class balance: target is 1 for spam, so its sum is the number of spam rows;
# the remaining rows are ham.
spam_total = df['target'].sum()
y = [spam_total, df.shape[0] - spam_total]
plt.pie(y, labels=['spam','ham'], autopct='%1.2f%%')
plt.show()

In [None]:
import nltk

In [None]:
# Download NLTK's 'punkt' resource.
# NOTE(review): presumably required by the word/sentence tokenizers called below — confirm.
nltk.download('punkt')

In [None]:
# Message length in characters.
df['num_char'] = df['text'].map(len)

In [None]:
# Number of tokens NLTK's word tokenizer finds in each message.
df['num_word'] = [len(nltk.word_tokenize(message)) for message in df['text']]

In [None]:
# Number of sentences NLTK's sentence tokenizer finds in each message.
df['num_sen'] = [len(nltk.sent_tokenize(message)) for message in df['text']]

In [None]:
# Summary statistics of the length features for ham messages (target == 0).
df.loc[df.target==0, ['num_char','num_word','num_sen']].describe()

In [None]:
# Summary statistics of the length features for spam messages (target == 1).
# Observation from the author: the means are higher for spam than for ham.
df.loc[df.target==1, ['num_char','num_word','num_sen']].describe()

In [None]:
import seaborn as sns

In [None]:
# Overlay the character-count distributions of ham (default colour) and spam (red).
plt.figure(figsize=(12,5))
for class_id, plot_kwargs in ((0, {'label':'ham'}), (1, {'color':'red','label':'spam'})):
    sns.histplot(df.loc[df.target==class_id, 'num_char'], **plot_kwargs)
plt.legend()
plt.show()

In [None]:
# Overlay the word-count distributions of ham (default colour) and spam (red).
plt.figure(figsize=(12,5))
for class_id, plot_kwargs in ((0, {'label':'ham'}), (1, {'color':'red','label':'spam'})):
    sns.histplot(df.loc[df.target==class_id, 'num_word'], **plot_kwargs)
plt.legend()
plt.show()

In [None]:
# Pairwise correlation between the integer-typed columns, drawn as an annotated heatmap.
int_columns = df.select_dtypes(include='int')
correlations = int_columns.corr()
sns.heatmap(correlations, annot=True)

Since `num_char` has the highest correlation with the target, we choose `num_char` for further use.

### Text Processing
-Lower Case <br>
-Tokenization <br>
-Removing Special Chars<br>
-Removing stop words and punctuation<br>
-Stemming<br>

In [None]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
nltk.download('stopwords')
import string

In [None]:
# Cache for the stop-word set so the NLTK corpus is read once, not once per token.
_STOP_WORD_CACHE = {}


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them on first use only."""
    if 'english' not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORD_CACHE['english']


def transform_text(text):
    """Normalise one message into a space-separated string of stemmed tokens.

    Steps, in order:
      1. lower-case the text;
      2. tokenize it with ``nltk.word_tokenize``;
      3. keep only purely alphanumeric tokens;
      4. drop English stop words;
      5. stem each surviving token with the Porter stemmer.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    str
        The processed tokens joined by single spaces ('' if nothing survives).
    """
    stop_words = _english_stop_words()
    stemmer = PorterStemmer()

    tokens = nltk.word_tokenize(text.lower())

    # A token that passes isalnum() contains no punctuation characters, so the
    # original extra `not in string.punctuation` test could never reject
    # anything and is omitted here.
    kept = [tok for tok in tokens if tok.isalnum() and tok not in stop_words]

    return " ".join(stemmer.stem(tok) for tok in kept)

In [None]:
# Run the text-cleaning pipeline on every message and store the result in a new column.
df['transformed_text']=df['text'].apply(transform_text)

In [None]:
# Preview the frame, now including the transformed_text column.
df.head()

In [None]:
from wordcloud import WordCloud
# Single WordCloud generator (500x500, white background) reused for both classes below.
wc=WordCloud(width=500,height=500,min_font_size=10,background_color='white')

In [None]:
# Word cloud built from all processed spam messages joined into one string.
spam_corpus = df.loc[df.target==1, 'transformed_text'].str.cat(sep=' ')
spam_wc = wc.generate(spam_corpus)
plt.figure(figsize=(12,6))
plt.imshow(spam_wc)

In [None]:
# Word cloud built from all processed ham messages joined into one string.
ham_corpus = df.loc[df.target==0, 'transformed_text'].str.cat(sep=' ')
ham_wc = wc.generate(ham_corpus)
plt.figure(figsize=(12,6))
plt.imshow(ham_wc)

In [None]:
# 30 most frequent processed tokens across spam messages (target == 1).
spam_tokens = [
    word
    for msg in df.loc[df.target==1, 'transformed_text'].tolist()
    for word in msg.split()
]

# Tally occurrences; dict insertion order follows first appearance in the corpus.
y = {}
for word in spam_tokens:
    y[word] = y.get(word, 0) + 1

# Stable descending sort by count, then unpack the top 30 into parallel lists.
sorted_y = sorted(y.items(), key=lambda pair: pair[1], reverse=True)
top_words, word_counts = [], []
for word, count in sorted_y[:30]:
    top_words.append(word)
    word_counts.append(count)

sns.barplot(x=top_words, y=word_counts)
plt.xticks(rotation='vertical')
plt.show()

In [None]:
from collections import Counter

# 30 most frequent processed tokens across HAM messages (target == 0).
# (The previous comment said "spam", but this cell filters on target == 0.)
y = Counter(
    word
    for msg in df[df.target==0].transformed_text.tolist()
    for word in msg.split()
)

# most_common() sorts by count, descending; ties keep first-seen order,
# matching a stable sorted(..., reverse=True) over the items.
sorted_y = y.most_common()
top_words = [item[0] for item in sorted_y[:30]]
word_counts = [item[1] for item in sorted_y[:30]]

sns.barplot(x=top_words,y=word_counts)
plt.title('Top 30 words in ham messages')
plt.xlabel('word')
plt.ylabel('count')
plt.xticks(rotation='vertical')
plt.show()

### Model Building

In [None]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
# Alternative bag-of-words vectorizer, left disabled:
#cv = CountVectorizer()  #bag of words
# TF-IDF vectorizer whose vocabulary is capped at 3000 features.
tfidf = TfidfVectorizer(max_features=3000)

In [None]:
# Fit the TF-IDF vectorizer on every processed message and convert the
# resulting sparse matrix to a dense array.
# Note: this fit uses the whole corpus, i.e. it happens before any train/test split.
X = tfidf.fit_transform(df['transformed_text']).toarray()

In [None]:
# Label vector: the integer-encoded target column (ham -> 0, spam -> 1).
y=df['target']

In [None]:
from sklearn.model_selection import train_test_split

# Split the processed TEXT (not a pre-computed feature matrix) so the
# vectorizer never sees held-out messages.
text_train,text_test,y_train,y_test=train_test_split(
    df['transformed_text'],y,test_size=0.2,random_state=2
)

# Fit TF-IDF on the training messages only, then apply that fitted vocabulary
# and IDF weighting to the test messages. Fitting on the full corpus before
# splitting leaks test-set statistics into the features and inflates the
# reported scores.
X_train=tfidf.fit_transform(text_train).toarray()
X_test=tfidf.transform(text_test).toarray()

In [None]:
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score

In [None]:
# One instance of each Naive Bayes variant, all with default settings.
gnb, mnb, bnb = GaussianNB(), MultinomialNB(), BernoulliNB()

In [None]:
# Gaussian NB: fit on the training split, then print accuracy, confusion
# matrix and precision on the test split (in that order).
gnb.fit(X_train, y_train)
y_pred1 = gnb.predict(X_test)
for metric in (accuracy_score, confusion_matrix, precision_score):
    print(metric(y_test, y_pred1))

In [None]:
# Multinomial NB: fit on the training split, then print accuracy, confusion
# matrix and precision on the test split (in that order).
mnb.fit(X_train, y_train)
y_pred2 = mnb.predict(X_test)
for metric in (accuracy_score, confusion_matrix, precision_score):
    print(metric(y_test, y_pred2))

In [None]:
# Bernoulli NB: fit on the training split, then print accuracy, confusion
# matrix and precision on the test split (in that order).
bnb.fit(X_train, y_train)
y_pred3 = bnb.predict(X_test)
for metric in (accuracy_score, confusion_matrix, precision_score):
    print(metric(y_test, y_pred3))

Since precision matters the most here (a false positive means a legitimate message is flagged as spam), we go with Multinomial Naive Bayes (`mnb`).

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

In [None]:
# Candidate classifiers to compare. Every estimator that uses randomness is
# seeded with random_state=2 so a fresh "Run All" reproduces the same scores.
svc = SVC(kernel='sigmoid', gamma=1.0)
knc = KNeighborsClassifier()
mnb = MultinomialNB()
# Decision trees permute features at each split, so ties between equally good
# splits are broken randomly unless seeded.
dtc = DecisionTreeClassifier(max_depth=5, random_state=2)
# The liblinear solver shuffles the data internally; seed it as well.
lrc = LogisticRegression(solver='liblinear', penalty='l1', random_state=2)
rfc = RandomForestClassifier(n_estimators=50, random_state=2)
abc = AdaBoostClassifier(n_estimators=50, random_state=2)
bc = BaggingClassifier(n_estimators=50, random_state=2)
etc = ExtraTreesClassifier(n_estimators=50, random_state=2)
gbdt = GradientBoostingClassifier(n_estimators=50,random_state=2)
xgb = XGBClassifier(n_estimators=50,random_state=2)

In [None]:
# Short display name -> estimator; insertion order is the evaluation order.
clfs = dict(
    SVC=svc,
    KN=knc,
    MNB=mnb,
    DT=dtc,
    LR=lrc,
    RF=rfc,
    AdaBoost=abc,
    BgC=bc,
    ETC=etc,
    GBDT=gbdt,
    xgb=xgb,
)

In [None]:
def train_classifier(clf,X_train,y_train,X_test,y_test):
    """Fit ``clf`` on the training split and score it on the test split.

    Returns a ``(accuracy, precision)`` tuple computed from the test-set
    predictions with ``accuracy_score`` and ``precision_score``.
    """
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)

    scores = (
        accuracy_score(y_test, predictions),
        precision_score(y_test, predictions),
    )
    return scores

In [None]:
# Train and score every candidate; the two lists stay aligned with clfs' key order.
accuracy_scores, precision_scores = [], []

for name, clf in clfs.items():
    current_accuracy, current_precision = train_classifier(
        clf, X_train, y_train, X_test, y_test
    )

    report = (
        ("For ", name),
        ("Accuracy - ", current_accuracy),
        ("Precision - ", current_precision),
    )
    for label, value in report:
        print(label, value)

    accuracy_scores.append(current_accuracy)
    precision_scores.append(current_precision)

In [None]:
# Collect the scores into a table, best precision first.
performance_df = pd.DataFrame({
    'Algorithm': clfs.keys(),
    'Accuracy': accuracy_scores,
    'Precision': precision_scores,
})
performance_df = performance_df.sort_values(by='Precision', ascending=False)

In [None]:
# Display the comparison table, sorted by precision (highest first).
performance_df

In [None]:
# Fresh, unfitted base models for the ensembles below.
# probability=True is needed for soft voting. It makes SVC estimate class
# probabilities through an internal, randomized cross-validation, so the model
# is seeded like every other stochastic estimator in this notebook.
svc = SVC(kernel='sigmoid', gamma=1.0,probability=True,random_state=2)
mnb = MultinomialNB()
etc = ExtraTreesClassifier(n_estimators=50, random_state=2)

from sklearn.ensemble import VotingClassifier

A Voting Classifier is an ensemble learning method that combines the predictions of multiple base estimators (machine learning models) and predicts the class label by taking a vote.

In [None]:
# Soft-voting ensemble over the three base models.
voting_members = [('svm', svc), ('mnb', mnb), ('etc', etc)]
voting = VotingClassifier(estimators=voting_members, voting='soft')
voting.fit(X_train, y_train)

In [None]:
# Test-set accuracy and precision of the voting ensemble.
y_pred = voting.predict(X_test)
for label, metric in (("Accuracy", accuracy_score), ("Precision", precision_score)):
    print(label, metric(y_test, y_pred))

A stacking classifier is an ensemble method where the output from multiple classifiers is passed as an input to a meta-classifier for the task of the final classification.

In [None]:
from sklearn.ensemble import StackingClassifier

In [None]:
# Base learners whose outputs feed the stacking meta-classifier.
estimators=[('svm', svc), ('nb', mnb), ('et', etc)]
# Seed the meta-classifier: an unseeded random forest gives different scores
# on every run, unlike the other models here, which all use random_state=2.
final_estimator=RandomForestClassifier(random_state=2)

In [None]:
# Stacking ensemble: the base estimators' outputs are fed to final_estimator.
clf = StackingClassifier(estimators=estimators, final_estimator=final_estimator)

In [None]:
# Fit the stacking ensemble and report its test-set accuracy and precision.
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
for label, metric in (("Accuracy", accuracy_score), ("Precision", precision_score)):
    print(label, metric(y_test, y_pred))

In [None]:
# Multinomial Naive Bayes gave the best performance, so it is the final model:
# refit it and print accuracy, confusion matrix and precision on the test split.
mnb.fit(X_train, y_train)
y_pred2 = mnb.predict(X_test)
for metric in (accuracy_score, confusion_matrix, precision_score):
    print(metric(y_test, y_pred2))