In [108]:
import numpy as np
import pandas as pd

In [109]:
df = pd.read_csv('spam.csv', encoding='ISO-8859-1')

In [110]:
df.sample(5)

In [111]:
df.shape

# 1. Data cleaning
# 2. EDA
# 3. Text Preprocessing
# 4. Model building
# 5. Evaluation
# 6. Improvement
# 7. Website
# 8. Deploy

1. DATA CLEANING

In [112]:
df.info()

In [113]:
# Discard the three 'Unnamed: N' columns; rebinding df avoids inplace mutation.
df = df.drop(
    columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'],
)

In [114]:
df.sample(5)

In [115]:
# Give the v1 / v2 columns descriptive names, then preview five random rows.
df = df.rename(
    columns={
        'v1': 'target',
        'v2': 'text',
    }
)
df.sample(5)

In [116]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

In [117]:
df['target'] = encoder.fit_transform(df['target'])

In [118]:
df.head()

In [119]:
# missing values
df.isnull().sum()

In [120]:
# check for duplicate values
df.duplicated().sum()

In [121]:
# remove duplicates
df = df.drop_duplicates(keep='first')

In [122]:
df.duplicated().sum()

In [123]:
df.shape

2. EDA

In [124]:
df.head()

In [125]:
df['target'].value_counts()

In [126]:
import matplotlib.pyplot as plt

# Class-balance pie chart.
# value_counts() orders its result by frequency, not by class, so each slice is
# labelled from its encoded value instead of from a fixed positional list.
# LabelEncoder assigns codes in sorted class order, hence ham -> 0 and spam -> 1.
class_names = {0: 'ham', 1: 'spam'}
class_counts = df['target'].value_counts()
plt.pie(
    class_counts,
    labels=[class_names[code] for code in class_counts.index],
    autopct="%0.2f",
)
plt.show()

In [127]:
%pip install nltk
import nltk

In [128]:
nltk.download('punkt')

In [129]:
df['num_characters'] = df['text'].apply(len)

In [130]:
df.head()

In [131]:
# num of words
df['num_words'] = df['text'].apply(lambda x:len(nltk.word_tokenize(x)))

In [132]:
df.head()

In [133]:
df['num_sentences'] = df['text'].apply(lambda x:len(nltk.sent_tokenize(x)))

In [134]:
df.head()

In [135]:
# ham
df[df['target'] == 0][['num_characters','num_words','num_sentences']].describe()

In [136]:
#spam
df[df['target'] == 1][['num_characters','num_words','num_sentences']].describe()

In [137]:
import seaborn as sns

In [138]:
# Overlay the character-count distributions: ham (default colour) and spam (red).
plt.figure(figsize=(12, 6))
ham_char_counts = df.loc[df['target'] == 0, 'num_characters']
spam_char_counts = df.loc[df['target'] == 1, 'num_characters']
sns.histplot(ham_char_counts)
sns.histplot(spam_char_counts, color='red')

In [139]:
# Overlay the word-count distributions: ham (default colour) and spam (red).
plt.figure(figsize=(12, 6))
ham_word_counts = df.loc[df['target'] == 0, 'num_words']
spam_word_counts = df.loc[df['target'] == 1, 'num_words']
sns.histplot(ham_word_counts)
sns.histplot(spam_word_counts, color='red')

3. DATA PREPROCESSING

In [140]:
from nltk.corpus import stopwords
# The stop-word list is a separate NLTK resource from 'punkt'. Fetch it here so
# this cell does not raise LookupError on an environment that lacks it; NLTK
# reports the package as up to date when it is already installed.
nltk.download('stopwords')
stopwords.words('english')

import string
# Displayed as the cell output: the ASCII punctuation characters.
string.punctuation

In [141]:
# Stemming: reduce a word to its stem (root form) by stripping suffixes, so
# inflected variants of a word are mapped to one token.
from nltk.stem.porter import PorterStemmer
# Shared stemmer instance; transform_text() uses it for every token.
ps = PorterStemmer()
# Quick sanity check of the stemmer on a single word.
ps.stem('loving')

In [142]:
_STOPWORDS_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, reading the corpus only once."""
    if 'english' not in _STOPWORDS_CACHE:
        _STOPWORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORDS_CACHE['english']


def transform_text(text):
    """Normalise a raw message into a space-separated string of stemmed tokens.

    Steps, in order:
      1. lower-case the text and tokenize it with ``nltk.word_tokenize``;
      2. keep only purely alphanumeric tokens (``str.isalnum``);
      3. drop English stop words;
      4. stem each surviving token with the module-level stemmer ``ps``.

    Args:
        text: The message to clean (a ``str``).

    Returns:
        The stemmed tokens joined by single spaces; an empty string when no
        token survives the filters.
    """
    # Looked up once per call from a cached set. The previous version called
    # stopwords.words('english') for every single token, which rebuilt the list
    # each time and then scanned it linearly.
    stop_words = _english_stopwords()

    tokens = nltk.word_tokenize(text.lower())

    # No separate punctuation check is needed: a token that passes isalnum()
    # contains no punctuation characters, so it can never be a substring of
    # string.punctuation.
    kept = [
        token for token in tokens
        if token.isalnum() and token not in stop_words
    ]

    return " ".join(ps.stem(token) for token in kept)

In [143]:
transform_text("I'm gonna be home soon and i don't want to talk about this stuff anymore tonight, k? I've cried enough today.")

In [144]:
# Show the raw text of the row labelled 10 (single label-based lookup).
df.loc[10, 'text']

In [145]:
df['transformed_text'] = df['text'].apply(transform_text)
df.head()

In [146]:
%pip install wordcloud

In [147]:
from wordcloud import WordCloud
wc = WordCloud(width = 500,height = 600, min_font_size=10,background_color='white')

In [148]:
spam_wc = wc.generate(df[df['target'] == 1]['transformed_text'].str.cat(sep=" "))

In [149]:
plt.figure(figsize=(12,6))
plt.imshow(spam_wc)

In [150]:
ham_wc = wc.generate(df[df['target'] == 0]['transformed_text'].str.cat(sep=" "))

In [151]:
plt.figure(figsize=(15,6))
plt.imshow(ham_wc)

In [152]:
df.head()

In [153]:
# Flatten every preprocessed spam message (target == 1) into one list of words.
spam_corpus = [
    word
    for message in df.loc[df['target'] == 1, 'transformed_text'].tolist()
    for word in message.split()
]
        

In [154]:
len(spam_corpus)

4. MODEL BUILDING

In [155]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
cv = CountVectorizer()
tfidf = TfidfVectorizer(max_features=3000)

In [156]:
X = tfidf.fit_transform(df['transformed_text']).toarray()

In [157]:
X.shape

In [158]:
y = df['target'].values

In [159]:
from sklearn.model_selection import train_test_split

In [160]:
# Hold out 20% of the samples for testing. The keyword is `random_state`
# (the misspelt `ranom_state` raised TypeError); fixing the seed makes the
# split reproducible.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=2)

In [None]:
print(X_train)
print(y_train)

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score

In [None]:
mnb = MultinomialNB()

In [None]:
mnb.fit(X_train,y_train)
y_pred2 = mnb.predict(X_test)
print(accuracy_score(y_test,y_pred2))
print(confusion_matrix(y_test,y_pred2))
print(precision_score(y_test,y_pred2))

In [None]:
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier



In [None]:
# Candidate classifiers to compare.
# SVC kernel: 'linear' and 'poly' are alternatives to 'sigmoid'; a larger gamma
# gives a more complex, more flexible decision boundary.
svc = SVC(kernel='sigmoid', gamma=1.0)
knc = KNeighborsClassifier()
mnb = MultinomialNB()
# A random forest is randomised; a fixed seed keeps its reported scores (and the
# model pickled at the end of the notebook) identical from run to run.
rfc = RandomForestClassifier(n_estimators=50, random_state=2)

In [None]:
clfs = {
    'KN':knc,
    'SVC':svc,
    'NB':mnb,
    'RF':rfc
}

In [None]:
def train_classifier(clf,X_train,y_train,X_test,y_test):
    """Fit ``clf`` on the training split and score it on the test split.

    Args:
        clf: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train, y_train: Training features and labels.
        X_test, y_test: Held-out features and labels.

    Returns:
        Tuple ``(accuracy, precision)`` obtained from ``accuracy_score`` and
        ``precision_score`` on the test-set predictions.
    """
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)

    # Evaluate both metrics against the same predictions, accuracy first.
    return tuple(
        metric(y_test, predictions)
        for metric in (accuracy_score, precision_score)
    )

In [None]:
train_classifier(svc,X_train,y_train,X_test,y_test)

In [None]:
# Train and score every candidate, collecting metrics in clfs' iteration order
# so they line up with clfs.keys() when tabulated.
accuracy_scores, precision_scores = [], []

for label, model in clfs.items():
    acc, prec = train_classifier(model, X_train, y_train, X_test, y_test)

    for prefix, value in (("For ", label), ("Accuracy - ", acc), ("Precision - ", prec)):
        print(prefix, value)

    accuracy_scores.append(acc)
    precision_scores.append(prec)

In [None]:
# One row per algorithm with its accuracy and precision, highest precision first.
performance_df = pd.DataFrame(
    {
        'Algorithm': clfs.keys(),
        'Accuracy': accuracy_scores,
        'Precision': precision_scores,
    }
).sort_values(by='Precision', ascending=False)

In [None]:
performance_df

In [None]:
performance_df1 = pd.melt(performance_df, id_vars = "Algorithm")

In [None]:
performance_df1

In [None]:
sns.catplot(x = 'Algorithm', y='value', 
               hue = 'variable',data=performance_df1, kind='bar',height=5)
plt.ylim(0.5,1.0)
plt.show()

In [None]:
# model improve


In [None]:
# Voting Classifier
# The ensemble built from these estimators uses voting='soft', which combines
# each estimator's predict_proba output. SVC only provides predict_proba when
# constructed with probability=True, so it must be enabled here.
svc = SVC(kernel='sigmoid', gamma=1.0, probability=True)
mnb = MultinomialNB()

from sklearn.ensemble import VotingClassifier

In [None]:
voting = VotingClassifier(estimators=[('svc', svc), ('nb', mnb), ],voting='soft')

In [None]:
voting.fit(X_train,y_train)

In [None]:
y_pred = voting.predict(X_test)
print("Accuracy",accuracy_score(y_test,y_pred))
print("Precision",precision_score(y_test,y_pred))

In [None]:
import pickle

# pickle.dump needs a file opened for binary writing. open() without a mode
# defaults to text read mode, which fails here. The with-blocks also guarantee
# that each file is flushed and closed.
with open('vectorizer2.pkl', 'wb') as vectorizer_file:
    pickle.dump(tfidf, vectorizer_file)

# NOTE(review): this persists the random forest (rfc), not the voting ensemble
# or the Naive Bayes model evaluated earlier. Confirm that rfc is the model
# intended for deployment.
with open('model2.pkl', 'wb') as model_file:
    pickle.dump(rfc, model_file)