In [1]:
import pandas as pd
import numpy
import matplotlib.pyplot as plt

In [5]:
data = pd.read_csv(r"C:\Users\91891\Downloads\SMSSpamCollection1.csv", encoding = 'latin-1')

In [9]:
data = data[['v1','v2']]
data = data.rename(columns = {'v1':'label', 'v2':'message'})

In [10]:
data.shape

(5572, 2)

In [11]:
data.isna().sum()

label      0
message    0
dtype: int64

In [14]:
data.describe()

Unnamed: 0,label,message
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [16]:
data['label'].value_counts()   # This is im-balance dataset

ham     4825
spam     747
Name: label, dtype: int64

In [17]:
data['label'].value_counts() / (len(data))

ham     0.865937
spam    0.134063
Name: label, dtype: float64

In [21]:
ham = data[data['label'] == 'ham']
spam = data[data['label'] == 'spam']

In [22]:
ham.shape , spam.shape

((4825, 2), (747, 2))

In [24]:
ham = ham.sample(spam.shape[0])

In [25]:
ham.shape

(747, 2)

In [55]:
df = ham.append(spam, ignore_index = True)

In [56]:
df   # Balanced Dataset

Unnamed: 0,label,message
0,ham,"Can meh? Thgt some will clash... Really ah, i ..."
1,ham,Ok no problem... Yup i'm going to sch at 4 if ...
2,ham,Hello hun how ru? Its here by the way. Im good...
3,ham,No need for the drug anymore.
4,ham,Are you plans with your family set in stone ?
...,...,...
1489,spam,Want explicit SEX in 30 secs? Ring 02073162414...
1490,spam,ASKED 3MOBILE IF 0870 CHATLINES INCLU IN FREE ...
1491,spam,Had your contract mobile 11 Mnths? Latest Moto...
1492,spam,REMINDER FROM O2: To get 2.50 pounds free call...


In [57]:
from sklearn.model_selection import train_test_split

In [119]:
x_train,x_test,y_train,y_test = train_test_split(df['message'],df['label'], test_size = 0.25, random_state = 42)

In [104]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

In [105]:
cls = Pipeline([("tfidf", TfidfVectorizer()),("classifier", RandomForestClassifier(n_estimators = 100))])

In [106]:
cls.fit(x_train,y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('classifier', RandomForestClassifier())])

In [107]:
y_pred = cls.predict(x_test)

In [109]:
from sklearn.metrics import classification_report , accuracy_score, confusion_matrix

In [110]:
accuracy_score(y_test, y_pred)

0.9438502673796791

In [111]:
confusion_matrix(y_test, y_pred)

array([[178,   2],
       [ 19, 175]], dtype=int64)

In [116]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         ham       0.90      0.99      0.94       180
        spam       0.99      0.90      0.94       194

    accuracy                           0.94       374
   macro avg       0.95      0.95      0.94       374
weighted avg       0.95      0.94      0.94       374



In [117]:
from sklearn.svm import SVC

In [121]:
svm = Pipeline([("tfidf",TfidfVectorizer()), ("classifier", SVC(C = 100, kernel = 'rbf', gamma = 'auto'))])

In [122]:
svm.fit(x_train,y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('classifier', SVC(C=100, gamma='auto'))])

In [123]:
y_pred = svm.predict(x_test)

In [125]:
accuracy_score(y_test,y_pred)

0.8796791443850267

In [126]:
confusion_matrix(y_test,y_pred)

array([[179,   1],
       [ 44, 150]], dtype=int64)

In [128]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

         ham       0.80      0.99      0.89       180
        spam       0.99      0.77      0.87       194

    accuracy                           0.88       374
   macro avg       0.90      0.88      0.88       374
weighted avg       0.90      0.88      0.88       374



In [142]:
# Testing with our own data

test1 = ['Hello, you are learning NLP']
test2 = ['Congratulations, you won a lottery ticket worth $1 Million ! To claim call on 446677']

In [143]:
print(cls.predict(test1))  # Random Forest Predictions
print(cls.predict(test2))

['ham']
['spam']


In [145]:
print(svm.predict(test1))   # Svm predictions
print(svm.predict(test2))

['ham']
['spam']
