In [1]:
import pandas as pd

In [2]:
# Download the dataset

import os
import wget

dataset_url = 'https://github.com/GopalSaraf/Practicals/releases/download/ML-Datasets/SMSSpamCollection'

if not os.path.exists('SMSSpamCollection'):
    wget.download(dataset_url)

In [3]:
df = pd.read_csv('SMSSpamCollection', sep="\t", header=None, names=["label", "sms"])

# Convert the labels to binary values
# 0 for ham and 1 for spam
df["label"] = df.label.map({"ham": 0, "spam": 1})

In [4]:
df

Unnamed: 0,label,sms
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will ü b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


In [5]:
# Check the distribution of the labels
df.label.value_counts()

label
0    4825
1     747
Name: count, dtype: int64

In [6]:
# Split the dataset into training and testing sets

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df["sms"], df["label"], test_size=0.2)

In [7]:
# Create a vectorizer to convert the text to a vector of numbers

from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()

# Learn the vocabulary of the training data
vectorizer.fit(X_train)

In [8]:
# Transform the training data
X_train = vectorizer.transform(X_train)

# Transform the testing data
X_test = vectorizer.transform(X_test)

In [9]:
print(X_train.shape)
print(X_test.shape)

(4457, 7705)
(1115, 7705)


In [10]:
# Train the model

from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()

model.fit(X_train, y_train)

In [11]:
# Evaluate the model

from sklearn.metrics import accuracy_score, confusion_matrix

y_pred = model.predict(X_test)

In [12]:
# Get the accuracy
accuracy_score(y_test, y_pred)

0.9901345291479821

In [13]:
# Confusion matrix

confusion_matrix(y_test, y_pred)

array([[974,   2],
       [  9, 130]])

In [14]:
# Predict on new data

sms = [
    "Congratulations! You have won Rs. 1,00,000. Please call 9999999999 to claim.",
    "Hi, how are you?",
    "Hi, what are you doing?"
]

sms = vectorizer.transform(sms)

model.predict(sms)

array([1, 0, 0])