In [None]:
import pandas as pd
import numpy as np


In [None]:
# Source of the labelled spam/ham corpus (CSV with `Label` and `EmailText` columns).
SPAM_CSV_URL = "https://raw.githubusercontent.com/Sanjay-dev-ds/spam_ham_email_detector/master/spam.csv"

# Read the corpus into a DataFrame and preview the first rows.
df = pd.read_csv(SPAM_CSV_URL)
df.head()

Unnamed: 0,Label,EmailText
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# Show the complete text of the first message; `df.head()` truncates long strings.
# The column is selected by name instead of by position, and the original
# no-op `[0:]` slice (which returned the whole string anyway) is dropped.
df["EmailText"].iloc[0]

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

In [None]:
# Count the messages in each class to see how balanced the labels are.
df["Label"].value_counts()

Unnamed: 0_level_0,count
Label,Unnamed: 1_level_1
ham,4825
spam,747


In [None]:
# Encode the target as an integer flag: 1 where Label is 'spam', 0 for anything else.
# A vectorised comparison replaces the per-row Python lambda and yields the same
# int64 column.
df['spam'] = (df['Label'] == 'spam').astype('int64')

In [None]:
# Preview the frame again to confirm the new `spam` column matches `Label`.
df.head()

Unnamed: 0,Label,EmailText,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the messages for evaluation.
# - random_state pins the shuffle, so the split (and every metric computed from
#   it) is the same after Restart Kernel -> Run All.
# - stratify keeps the ham/spam ratio identical in both partitions; the classes
#   are imbalanced, so an unstratified draw can skew the spam share of the test set.
X_train, X_test, y_train, y_test = train_test_split(
    df.EmailText,
    df.Label,
    test_size=0.2,
    random_state=42,
    stratify=df.Label,
)


In [None]:
# Preview the first few training messages produced by the split.
X_train.head()

Unnamed: 0,EmailText
5284,Sent me ur email id soon
2054,Y so late but i need to go n get da laptop...
3999,This is the 2nd time we have tried to contact ...
1603,Have you heard about that job? I'm going to th...
4317,S...i will take mokka players only:)


In [None]:
# Preview the first few held-out (test) messages produced by the split.
X_test.head()


Unnamed: 0,EmailText
3738,Did you stitch his trouser
2883,Cbe is really good nowadays:)lot of shop and s...
2587,"Aight, see you in a bit"
1531,Oh dang! I didn't mean o send that to you! Lol!
1778,7 wonders in My WORLD 7th You 6th Ur style 5th...


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words step: learn the vocabulary from the training messages only and
# encode them as a document-term count matrix.
v = CountVectorizer()
train_docs = X_train.values
X_train_cv = v.fit_transform(train_docs)

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Train a multinomial Naive Bayes classifier on the word-count features.
model = MultinomialNB()
model.fit(X=X_train_cv, y=y_train)

### **Checking Model Performance on the Test Set**

In [None]:
from sklearn.metrics import classification_report

# Encode the held-out messages with the vocabulary already fitted on the
# training set (transform only, no refit), then predict their labels.
X_test_cv = v.transform(X_test)
y_pred = model.predict(X_test_cv)

# Per-class precision / recall / F1 on the test set.
test_report = classification_report(y_test, y_pred)
print(test_report)


              precision    recall  f1-score   support

         ham       0.98      0.99      0.99       964
        spam       0.96      0.87      0.91       151

    accuracy                           0.98      1115
   macro avg       0.97      0.93      0.95      1115
weighted avg       0.98      0.98      0.98      1115



### **Testing on Sample Emails: Spam or Not Spam**

In [None]:
# Two hand-written messages to sanity-check the trained model.
mails = [
    "upto 20 % discount on parking, exclusive offer just for you. Dont miss this reward!",
    "Hey mohan, can we get together to watch footbal game tomorrow?",
]

# Vectorise both messages with the fitted vocabulary and predict them in one call.
mails_count = v.transform(mails)
model.predict(mails_count)

array(['spam', 'ham'], dtype='<U4')

In [None]:
# Classify each sample message on its own and print it next to the prediction.
# `is_spam` holds the one-element label array returned by predict, not a boolean.
for mail in mails:
    is_spam = model.predict(v.transform([mail]))
    print(f"{mail} : {is_spam}")

upto 20 % discount on parking, exclusive offer just for you. Dont miss this reward! : ['spam']
Hey mohan, can we get together to watch footbal game tomorrow? : ['ham']


## **Same Procedure using Sklearn Pipeline**

In [None]:
from sklearn.pipeline import Pipeline

# Chain vectorisation and the classifier into one estimator so raw text can be
# passed straight to fit / predict.
clf = Pipeline(
    steps=[
        ('vectorizer', CountVectorizer()),
        ('nb', MultinomialNB()),
    ]
)

In [None]:
# Train the whole pipeline on raw text: the vectoriser step is fitted on
# X_train and its output feeds the Naive Bayes step.
clf.fit(X_train, y_train)

In [None]:
# Predict directly from the raw test messages (the pipeline vectorises them
# internally) and print the per-class metrics.
y_pred = clf.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

         ham       0.98      0.99      0.99       964
        spam       0.96      0.87      0.91       151

    accuracy                           0.98      1115
   macro avg       0.97      0.93      0.95      1115
weighted avg       0.98      0.98      0.98      1115



# **Exercise On Bag Of Words**

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

In [None]:
# The IMDB reviews file is read from the current working directory.
IMDB_CSV_PATH = 'IMDB Dataset.csv'

# Note: this rebinds `df`, replacing the spam frame loaded earlier in the notebook.
df = pd.read_csv(IMDB_CSV_PATH)
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [None]:
# (rows, columns) of the loaded reviews frame.
df.shape

(50000, 2)

In [None]:
# Encode the target as an integer flag: 1 where sentiment is 'positive', 0 otherwise.
# A vectorised comparison replaces the per-row Python lambda and yields the same
# int64 column.
df["Category"] = (df['sentiment'] == 'positive').astype('int64')
df.head()

Unnamed: 0,review,sentiment,Category
0,One of the other reviewers has mentioned that ...,positive,1
1,A wonderful little production. <br /><br />The...,positive,1
2,I thought this was a wonderful way to spend ti...,positive,1
3,Basically there's a family where a little boy ...,negative,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1


In [None]:
# Check the distribution of 'Category' to see whether the target labels are balanced.
df["Category"].value_counts()

Unnamed: 0_level_0,count
Category,Unnamed: 1_level_1
1,25000
0,25000


In [None]:
# Hold out 20% of the reviews for evaluation.
# random_state pins the shuffle so the split and all downstream metrics are
# reproducible; stratify keeps the positive/negative ratio equal in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    df.review,
    df.Category,
    test_size=0.2,
    random_state=42,
    stratify=df.Category,
)
X_train.shape

(40000,)

In [None]:
# Build the pipeline: bag-of-words counts followed by multinomial Naive Bayes.
clf = Pipeline(
    steps=[
        ('vectorizer', CountVectorizer()),
        ('nb', MultinomialNB()),
    ]
)

In [None]:
# Fit the pipeline on the raw training reviews (X_train) and their labels (y_train).
clf.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out reviews, keep them in y_pred, and print the
# per-class precision / recall / F1.
y_pred = clf.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.81      0.87      0.84      4961
           1       0.86      0.81      0.83      5039

    accuracy                           0.84     10000
   macro avg       0.84      0.84      0.84     10000
weighted avg       0.84      0.84      0.84     10000



## **Using Random Forest as the Classifier**

In [None]:
# Use Random Forest as the classifier with 50 estimators and the entropy criterion.
# random_state fixes the bootstrap samples and feature draws, so the forest (and
# the metrics reported for it) are reproducible from run to run.
clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('rf', RandomForestClassifier(n_estimators=50, criterion='entropy', random_state=42)),
])

In [None]:
# Fit the vectoriser + random-forest pipeline on the raw training reviews.
clf.fit(X_train, y_train)

In [None]:
# Score the random-forest pipeline on the held-out reviews.
y_pred = clf.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.83      0.84      0.83      4961
           1       0.84      0.83      0.84      5039

    accuracy                           0.84     10000
   macro avg       0.84      0.84      0.84     10000
weighted avg       0.84      0.84      0.84     10000



## **Using KNN as the Classifier**

In [None]:
# Bag-of-words counts followed by a k-nearest-neighbours classifier with 10 neighbours.
clf = Pipeline(
    steps=[
        ('vectorizer', CountVectorizer()),
        ('knn', KNeighborsClassifier(n_neighbors=10)),
    ]
)

In [None]:
# Fit the vectoriser + KNN pipeline on the raw training reviews.
clf.fit(X_train, y_train)

In [None]:
# Score the KNN pipeline on the held-out reviews.
y_pred = clf.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.64      0.67      0.66      4961
           1       0.66      0.63      0.65      5039

    accuracy                           0.65     10000
   macro avg       0.65      0.65      0.65     10000
weighted avg       0.65      0.65      0.65     10000

