In [38]:
"""
    IMPORT THE NECESSARY LIBRARIES
"""
import pandas as pd
import re
import nltk
# Fetch the NLTK resources used later in the notebook: the stop-word list and
# the WordNet data (WordNetLemmatizer is imported just below).
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [39]:
"""
    Importing the dataset
"""

# Tab-separated file, read with explicitly supplied column names: "label" and "message".
# NOTE(review): the path is absolute (looks like a Colab /content upload - confirm);
# adjust it when running elsewhere.
# NOTE(review): messages containing double-quote characters may be merged by the
# default CSV quoting rules; if the row count looks short, try
# quoting=csv.QUOTE_NONE - confirm against the dataset's documented size.
DATA = pd.read_csv('/content/SMSSpamCollection', sep='\t', names=["label", "message"])

In [40]:
"""
    UTILITY TO CHECK CORPUS
"""
def print_arr(arr, num):
  """Print the first `num` entries of `arr`, each prefixed with its index.

  If `arr` holds fewer than `num` entries, only the available ones are
  printed instead of raising IndexError part-way through.

  Args:
    arr: indexable, sized sequence (e.g. a list of strings).
    num: maximum number of entries to print.
  """
  # Clamp to the sequence length so short inputs are handled gracefully.
  for i in range(min(num, len(arr))):
    print("Line Number->",i," ->   ",arr[i], "\n")

In [52]:
# Show the loaded frame; the printed repr elides the middle rows.
print(DATA)

     label                                            message
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...
...    ...                                                ...
5567  spam  This is the 2nd time we have tried 2 contact u...
5568   ham               Will ü b going to esplanade fr home?
5569   ham  Pity, * was in mood for that. So...any other s...
5570   ham  The guy did some bitching but I acted like i'd...
5571   ham                         Rofl. Its true to its name

[5572 rows x 2 columns]


In [53]:
"""
    Data cleaning and preprocessing
"""
# Single lemmatizer instance shared by the cleaning code further down.
lemmatizer = WordNetLemmatizer()

def clean_and_preproccess_the_corpus(data):
  """Clean every message in `data` and return the cleaned texts as a list.

  Each value of data["message"] is processed in order:
    1. every character outside a-z / A-Z is replaced by a space,
    2. the text is lower-cased and split on whitespace into words,
    3. English stop words are dropped and the remaining words lemmatized,
    4. the surviving words are re-joined with single spaces.

  Up to the first 10 cleaned messages are printed as a sanity check.

  Args:
    data: object whose "message" entry is an iterable of strings
      (the DataFrame loaded in this notebook).

  Returns:
    list of str: one cleaned string per message, in input order.
  """
  # Build the stop-word set once; it is identical for every word, so
  # rebuilding it inside the comprehension only repeats the same work.
  stop_words = set(stopwords.words('english'))

  corpus = []
  # Iterate over the values directly rather than data["message"][i]:
  # label-based lookup fails when the index is not 0..n-1 (e.g. after
  # rows have been filtered out).
  for message in data["message"]:
    new = re.sub('[^a-zA-Z]', ' ', message)  # Keep only ascii alphabets
    new = new.lower()                        # Lowercasing the text
    new = new.split()                        # Splitting the message into words
    # Drop stop words, lemmatize everything else
    new = [lemmatizer.lemmatize(word) for word in new if word not in stop_words]
    corpus.append(' '.join(new))

  # Preview at most 10 entries so small inputs do not raise IndexError.
  print_arr(corpus, min(10, len(corpus)))
  return corpus

In [54]:
# Clean the whole dataset; the call also prints a preview of the first cleaned messages.
corpus = clean_and_preproccess_the_corpus(DATA)

Line Number-> 0  ->    go jurong point crazy available bugis n great world la e buffet cine got amore wat 

Line Number-> 1  ->    ok lar joking wif u oni 

Line Number-> 2  ->    free entry wkly comp win fa cup final tkts st may text fa receive entry question std txt rate c apply 

Line Number-> 3  ->    u dun say early hor u c already say 

Line Number-> 4  ->    nah think go usf life around though 

Line Number-> 5  ->    freemsg hey darling week word back like fun still tb ok xxx std chgs send rcv 

Line Number-> 6  ->    even brother like speak treat like aid patent 

Line Number-> 7  ->    per request melle melle oru minnaminunginte nurungu vettam set callertune caller press copy friend callertune 

Line Number-> 8  ->    winner valued network customer selected receivea prize reward claim call claim code kl valid hour 

Line Number-> 9  ->    mobile month u r entitled update latest colour mobile camera free call mobile update co free 



In [56]:
"""
   Creating the TF-IDF feature matrix
   (for plain bag-of-words counts use -> CountVectorizer)
"""

# max_features caps the vocabulary, so the model sees at most 4000 features.
cv = TfidfVectorizer(max_features=4000)
# NOTE(review): .toarray() materialises a dense (n_messages x n_features) matrix;
# kept because later cells may rely on X being a NumPy array.
X = cv.fit_transform(corpus).toarray()

# Encode the labels: spam -> 1, ham -> 0.
# Comparing against 'spam' directly avoids depending on the column order of
# pd.get_dummies, and the explicit cast keeps y as uint8 regardless of the
# pandas version (newer releases return boolean dummy columns).
y = (DATA['label'] == 'spam').astype('uint8').values


In [57]:
# Hold out 20% of the samples for testing; random_state is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [58]:
# Fit a multinomial Naive Bayes classifier on the training split.

from sklearn.naive_bayes import MultinomialNB
spam_detect_model = MultinomialNB().fit(X=X_train, y=y_train)

In [59]:
# Predict labels for the held-out test features.
y_pred=spam_detect_model.predict(X_test)

In [60]:
# Display the predicted label array.
y_pred

array([0, 0, 0, ..., 0, 1, 0], dtype=uint8)

In [62]:
"""
  Getting the accuracy of my model
"""

from sklearn.metrics import confusion_matrix
# Argument order matters: confusion_matrix(y_true, y_pred) puts the actual
# classes on the rows and the predicted classes on the columns. Passing the
# predictions first would transpose the matrix (false positives and false
# negatives would swap places).
cm = confusion_matrix(y_test, y_pred)

from sklearn.metrics import accuracy_score
# Fraction of test samples whose predicted label matches the true label.
accuracy = accuracy_score(y_test,y_pred)

print("Accuracy of our model is :",accuracy)

Accuracy of our model is : 0.9775784753363229


In [64]:
"""
    Making an example and testing it out as well
"""

# Candidate sentences; only input4 is classified below - swap the variable
# passed to re.sub to try another one.
input_sentence = "Hello customer, get 10,000$ discount for free, click on this link www.ikshan.com"
input2 = "Hello how are you bro?"
input3 = "Can we meet someday later?"
input4 = "your OTP is 4356, do not share this with anyone"

# Preprocess the input sentence with the same steps applied to the training corpus.
# The stop-word set is built once here instead of once per token.
english_stop_words = set(stopwords.words('english'))
preprocessed_sentence = re.sub('[^a-zA-Z]', ' ', input4)  # Remove non-alphabetic characters
preprocessed_sentence = preprocessed_sentence.lower()  # Convert to lowercase
preprocessed_sentence = preprocessed_sentence.split()  # Tokenize into words
preprocessed_sentence = [lemmatizer.lemmatize(word) for word in preprocessed_sentence if word not in english_stop_words]  # Remove stopwords and lemmatize
preprocessed_sentence = ' '.join(preprocessed_sentence)

# Convert the preprocessed sentence to TF-IDF representation using the
# already-fitted vectorizer (transform only, no refit).
sentence_tfidf = cv.transform([preprocessed_sentence]).toarray()

# Use the trained model to predict the label
prediction = spam_detect_model.predict(sentence_tfidf)

# Map the prediction to the corresponding label (1 -> spam, anything else -> ham)
label = "spam" if prediction[0] == 1 else "ham"

print("The sentence is classified as:", label)

The sentence is classified as: ham
