In [24]:
import numpy as np
import pandas as pd

In [25]:
df=pd.read_csv('mail_data.csv', encoding="cp1252")

In [26]:
df.sample(5)

Unnamed: 0,Category,Message
496,ham,Got meh... When?
4267,ham,The greatest test of courage on earth is to be...
1431,ham,Don't look back at the building because you ha...
4909,ham,"I'm in solihull, | do you want anything?"
4395,ham,Baaaaaaaabe! Wake up ! I miss you ! I crave yo...


In [27]:
df.shape


(5572, 2)

In [28]:
#  1. Data cleaning
#  2. EDA
#  3. Text Preprocessing
#  4. Model Building
#  5. Evaluation
#  6. Improvements
#  7. website
#  8. Deploy

# 1. Data Cleaning


In [29]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [30]:
# drop last 3 cols
df.drop (columns=['Unnamed: 2','Unnamed: 3','Unnamed: 4'],inplace=True)

# inplace bcz its an permanent operation

KeyError: "['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'] not found in axis"

In [None]:
df.sample(5)

In [None]:
# renaming the cols
df.rename(columns={'v1': 'target','v2':'text'},inplace=True)

In [None]:
df.sample(5)

In [None]:
# applying label encoder
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

In [None]:
df['target']=encoder.fit_transform(df['target'])

In [None]:
df.head()

In [None]:
# missing values 
df.isnull().sum()

In [None]:
# check for duplicate values .
df.duplicated().sum()

In [None]:
# remove duplicates
df= df.drop_duplicates(keep='first')

In [None]:
df.duplicated().sum()

In [None]:
df.shape


# 2. EDA (Exploratery Data Analysis)

In [None]:
# checking what amount of ham or spam data is present in dataset
# Understanding of data

In [None]:
df.head()

In [None]:
df['target'].value_counts()

In [None]:
# so we can see ham is large //0-ham,1-spam

In [None]:
# reprsenting in pie chart 

import matplotlib.pyplot as plt
plt.pie(df['target'].value_counts(),labels=['ham','spam'],autopct="%0.2f")
plt.show()

In [None]:
# Data is imbalanced

In [None]:
import nltk

In [None]:
!pip install nltk

In [None]:
nltk.download('punkt')

In [None]:
# for deeper analysis creating 3 new cols 
# 1. no of characters in the sms,
# 2. no of words in the sms ,
# 3. no of sentences in the sms
# thats why we using nltk(natural language toolkit) library.

In [None]:
# for no of char

df['num_characters']=df['text'].apply(len)


In [None]:
df.head()

In [None]:
import nltk
nltk.download('punkt_tab')


In [None]:
# no of words
df['num_words'] = df['text'].apply(lambda x:len(nltk.word_tokenize(x)))

In [None]:
df.head()

In [None]:
df['num_sentences']=df['text'].apply(lambda x:len(nltk.sent_tokenize(x)))

In [None]:
df.head()

In [None]:
# So cols are created ,,already existing features are created as cols we named it


In [None]:
# so we use  describe operation on this 3 cols to see what's going on

df[['num_characters','num_words','num_sentences']].describe()

In [None]:

# so analyzing ham and spam separately


# for ham
df[df['target'] == 0][['num_characters','num_words','num_sentences']].describe()

In [None]:
# for spam

df[df['target'] == 1][['num_characters','num_words','num_sentences']].describe()


In [None]:
# so we can see that for mean spam msgs are bigger than to ham msg.


In [None]:
# lets see on the plot (Histogram)


In [None]:
import seaborn as sns

In [None]:
plt.figure(figsize=(12,8))

# for ham

sns.histplot(df[df['target'] == 0]['num_characters'])

# for spam

sns.histplot(df[df['target'] == 1]['num_characters'],color='red')

In [None]:
plt.figure(figsize=(12,8))

# for ham

sns.histplot(df[df['target'] == 0]['num_words'])

# for spam

sns.histplot(df[df['target'] == 1]['num_words'],color='red')

In [None]:
# we can see that mostly spam msgs  are made up of more words

In [None]:
# lets see how are  the relationship among 3 cols

In [None]:
sns.pairplot(df,hue='target')

In [None]:
# checking corelation coeffiecient
df.corr(numeric_only=True)

In [None]:
# for better understanding lets plot an heat map
sns.heatmap(df.corr(numeric_only=True),annot=True)

# 3. Data Preprocessing

  1. Lower case
  2. Tokenization
  3. Removing special characters
  4. Removing stop words and punctuation
  5. Stemming

In [None]:
# performing all above properties

def transform_text(text):
    text = text.lower()
    text = nltk.word_tokenize(text)
    
    y = []
    for i in text :
        if i.isalnum():
            y.append(i)
            
     # for stop words and puctuation
    
    
    text = y[:]
    y.clear()
    
    
    for i in text:
        if i not in stopwords.words('english') and i not in string.punctuation:
            y.append(i)
            
            
     
    # for stemming
    
    text = y[:]
    y.clear()
    for i in text:
        y.append(ps.stem(i))
            
    return " ".join(y) 

In [None]:
from nltk.corpus import stopwords
stopwords.words('english')

In [None]:
import nltk
nltk.download('all')

In [None]:
import string
string.punctuation

In [None]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
ps.stem('dancing')

In [None]:
transform_text(" I loved the YT Lectures on Machine Learning . How about you?") 

In [None]:
df['transformed_text']=df['text'].apply(transform_text)

In [None]:
df.head()

In [None]:
!pip3 install wordcloud


In [None]:
# generating a wordcloud bcz its show important spam or ham words larger

In [None]:
!pip install --user WordCloud


In [None]:
import sys
print(sys.executable)

In [None]:
pip install some_package

In [None]:
pip install wordcloud

In [None]:
from wordcloud import WordCloud
wc = WordCloud(width=500,height=500,min_font_size=10,background_color='white')

In [None]:
spam_wc = wc.generate(df[df['target'] == 1]['transformed_text'].str.cat(sep=" "))

In [None]:
plt.figure(figsize=(12,6)) #aur bada dekhne ke liye for (SPAM)
plt.imshow(spam_wc)

In [None]:
# for ham

ham_wc = wc.generate(df[df['target'] == 0]['transformed_text'].str.cat(sep=" "))



In [None]:
plt.figure(figsize=(12,6)) 
plt.imshow(ham_wc)

In [None]:
df.head()

In [None]:
# top 30 words that use for spam

spam_corpus = []
for msg in df[df['target']==1]['transformed_text'].tolist():
    for word in msg.split():
        spam_corpus.append(word)
    

In [None]:
len(spam_corpus)

In [None]:
from collections import Counter
Counter(spam_corpus).most_common(30)

In [None]:
spam_words = pd.DataFrame(Counter(spam_corpus).most_common(30), columns=['word', 'count'])

# Create the bar plot
sns.barplot(x='word', y='count', data=spam_words)
plt.xticks(rotation='vertical')
plt.show()

In [None]:
# top 30 words that use for ham

ham_corpus = []
for msg in df[df['target']==0]['transformed_text'].tolist():
    for word in msg.split():
        ham_corpus.append(word)
    

In [None]:
len(ham_corpus)

In [None]:
ham_words = pd.DataFrame(Counter(ham_corpus).most_common(30), columns=['word', 'count'])

# Create the bar plot
sns.barplot(x='word', y='count', data=ham_words)
plt.xticks(rotation='vertical')
plt.show()

In [None]:
# Text Vectorization
# using Bag of Words
df.head()

# 4. Model Building

In [None]:
# we use naive bayes algo

# process - . data vactorize using bag of words
             #  then as input give to naive bayes,
             #then checking accuracy for performance identify
            

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
cv = CountVectorizer()
tfidf = TfidfVectorizer(max_features=3000)

In [None]:
X = tfidf.fit_transform(df['transformed_text']).toarray()

In [None]:
X.shape

In [None]:
y = df['target'].values

In [None]:
y

In [None]:
pip install -U scikit-learn

In [None]:
import sklearn
print (sklearn.__version__)


In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2, random_state=2)

In [None]:
from sklearn.naive_bayes import GaussianNB,MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score

In [None]:

gnb = GaussianNB()
mnb = MultinomialNB()
bnb = BernoulliNB()

In [None]:

gnb.fit(X_train,y_train)
y_pred1 = gnb.predict(X_test)
print(accuracy_score(y_test,y_pred1))
print(confusion_matrix(y_test,y_pred1))
print(precision_score(y_test,y_pred1))


In [None]:

mnb.fit(X_train,y_train)
y_pred2 = mnb.predict(X_test)
print(accuracy_score(y_test,y_pred2))
print(confusion_matrix(y_test,y_pred2))
print(precision_score(y_test,y_pred2))

In [None]:

bnb.fit(X_train,y_train)
y_pred3 = bnb.predict(X_test)
print(accuracy_score(y_test,y_pred3))
print(confusion_matrix(y_test,y_pred3))
print(precision_score(y_test,y_pred3))

In [None]:
# tfidf --->>>MNB     # same thing maine niche ache se kiya hai

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier


In [None]:
svc = SVC(kernel='sigmoid', gamma=1.0)
knc = KNeighborsClassifier()
mnb = MultinomialNB()
dtc = DecisionTreeClassifier(max_depth=5)
lrc = LogisticRegression(solver='liblinear', penalty='l1')
rfc = RandomForestClassifier(n_estimators=50, random_state=2)
abc = AdaBoostClassifier(n_estimators=50, random_state=2)
bc = BaggingClassifier(n_estimators=50, random_state=2)
etc = ExtraTreesClassifier(n_estimators=50, random_state=2)
gbdt = GradientBoostingClassifier(n_estimators=50,random_state=2)


In [None]:
clfs = {
    'SVC' : svc,
    'KN' : knc, 
    'NB': mnb, 
    'DT': dtc, 
    'LR': lrc, 
    'RF': rfc, 
    'AdaBoost': abc, 
    'BgC': bc, 
    'ETC': etc,
    'GBDT':gbdt,
    
}

In [None]:

def train_classifier(clf,X_train,y_train,X_test,y_test):
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test,y_pred)
    precision = precision_score(y_test,y_pred)
    
    return accuracy,precision

In [None]:
train_classifier(svc,X_train,y_train,X_test,y_test)

In [None]:
accuracy_scores = []
precision_scores = []

for name,clf in clfs.items():
    
    current_accuracy,current_precision = train_classifier(clf, X_train,y_train,X_test,y_test)
    
    print("For ",name)
    print("Accuracy - ",current_accuracy)
    print("Precision - ",current_precision)
    
    accuracy_scores.append(current_accuracy)
    precision_scores.append(current_precision)

In [None]:
performance_df = pd.DataFrame({'Algorithm':clfs.keys(),'Accuracy':accuracy_scores,'Precision':precision_scores}).sort_values('Precision',ascending=False)

In [None]:
performance_df

In [None]:
performance_df1 = pd.melt(performance_df, id_vars = "Algorithm")

In [None]:
performance_df1

In [None]:
sns.catplot(x = 'Algorithm', y='value', 
               hue = 'variable',data=performance_df1, kind='bar',height=5)
plt.ylim(0.5,1.0)
plt.xticks(rotation='vertical')
plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Convert DataFrame into long format for Seaborn
performance_melted = performance_df.melt(id_vars="Algorithm", var_name="Metric", value_name="Value")

# Plot
sns.catplot(x="Algorithm", y="Value", hue="Metric", data=performance_melted, kind="bar", height=5)
plt.ylim(0.5, 1.0)
plt.xticks(rotation=90)  # Rotate labels for readability
plt.title("Performance of Different Algorithms")
plt.show()


In [None]:
# model improve
# 1. Change the max_features parameter of TfIdf

In [None]:
temp_df = pd.DataFrame({'Algorithm':clfs.keys(),'Accuracy_max_ft_3000':accuracy_scores,'Precision_max_ft_3000':precision_scores}).sort_values('Precision_max_ft_3000',ascending=False)

In [None]:
temp_df = pd.DataFrame({'Algorithm':clfs.keys(),'Accuracy_scaling':accuracy_scores,'Precision_scaling':precision_scores}).sort_values('Precision_scaling',ascending=False)

In [None]:
new_df = performance_df.merge(temp_df,on='Algorithm')

In [None]:
new_df_scaled = new_df.merge(temp_df,on='Algorithm')

In [None]:
temp_df = pd.DataFrame({'Algorithm':clfs.keys(),'Accuracy_num_chars':accuracy_scores,'Precision_num_chars':precision_scores}).sort_values('Precision_num_chars',ascending=False)

In [None]:
new_df_scaled.merge(temp_df,on='Algorithm')

In [None]:
# Voting Classifier
svc = SVC(kernel='sigmoid', gamma=1.0,probability=True)
mnb = MultinomialNB()
etc = ExtraTreesClassifier(n_estimators=50, random_state=2)

from sklearn.ensemble import VotingClassifier

In [None]:
voting = VotingClassifier(estimators=[('svm', svc), ('nb', mnb), ('et', etc)],voting='soft')

In [None]:
voting.fit(X_train,y_train)

In [None]:
y_pred = voting.predict(X_test)
print("Accuracy",accuracy_score(y_test,y_pred))
print("Precision",precision_score(y_test,y_pred))

In [None]:
# Applying stacking
estimators=[('svm', svc), ('nb', mnb), ('et', etc)]
final_estimator=RandomForestClassifier()

In [None]:
from sklearn.ensemble import StackingClassifier

In [None]:
clf = StackingClassifier(estimators=estimators, final_estimator=final_estimator)

In [None]:
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
print("Accuracy",accuracy_score(y_test,y_pred))
print("Precision",precision_score(y_test,y_pred))

In [None]:
# importing vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
tf_idf = TfidfVectorizer(max_features=3500)

# create vectors of transformed text feature and then stored it in x variable
x = tf_idf.fit_transform(df["transformed_text"]).toarray()
y = df["target"].values

# now try to train MultinomialNB(), it will work
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(accuracy_score(y_test, y_pred))
print(precision_score(y_test, y_pred))

In [None]:
import pickle
mnb.fit(X_train,y_train)
y_pred2 = mnb.predict(X_test)
pickle.dump(tfidf,open('vectorizer.pkl','wb'))
pickle.dump(mnb,open('model.pkl','wb'))

In [None]:
import pickle
pickle.dump(tfidf,open('vectorizer.pkl','wb'))
pickle.dump(mnb,open('model.pkl','wb'))