<a href="https://colab.research.google.com/github/HariBejju/ML_DL_AI/blob/main/BagOfWords.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Toy SMS dataset: one record per line, formatted as "<label>\t<message>".
data = """ham\tI'll call you later, I’m in a meeting now.
spam\tCongratulations! You’ve won a free iPhone. Click here to claim.
ham\tDon’t forget to bring your notebook tomorrow.
spam\tYou have been selected for a cash prize. Reply YES to claim.
ham\tCan we reschedule our lunch to next week?
spam\tGet cheap loans now with 0% interest. Apply today!
ham\tMeeting is confirmed for 3 PM today.
spam\tWin a brand new car! Just send WIN to 55555.
ham\tLet’s catch up this weekend if you're free.
spam\tYour account has been suspended. Verify now to avoid closure."""

# Trim surrounding whitespace, then break the blob into individual records.
lines = data.strip().split('\n')

# Split each record on its FIRST tab only, so a tab inside the message text
# would stay part of the message. The result is a list of (label, message) tuples.
messages = [
    (label, message)
    for label, message in (record.split('\t', 1) for record in lines)
]

# Echo every parsed pair as a quick sanity check.
for label, msg in messages:
    print(f"Label: {label} | Message: {msg}")


Label: ham | Message: I'll call you later, I’m in a meeting now.
Label: spam | Message: Congratulations! You’ve won a free iPhone. Click here to claim.
Label: ham | Message: Don’t forget to bring your notebook tomorrow.
Label: spam | Message: You have been selected for a cash prize. Reply YES to claim.
Label: ham | Message: Can we reschedule our lunch to next week?
Label: spam | Message: Get cheap loans now with 0% interest. Apply today!
Label: ham | Message: Meeting is confirmed for 3 PM today.
Label: spam | Message: Win a brand new car! Just send WIN to 55555.
Label: ham | Message: Let’s catch up this weekend if you're free.
Label: spam | Message: Your account has been suspended. Verify now to avoid closure.


## Data Cleaning and Preprocessing

In [2]:
# `re` is used to strip non-letter characters during cleaning; NLTK supplies
# the stopword list and the stemmer used in the following cells.
import re
import nltk
# Fetch the stopword corpus so that stopwords.words('english') can be read later.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
# Single shared stemmer instance; the cleaning loop calls ps.stem() on each kept word.
ps = PorterStemmer()

In [4]:
# Build the cleaned corpus: for every message, keep only the stemmed,
# non-stopword tokens and join them back into one space-separated string.

# Read the stopword list ONCE and keep it as a set. Calling
# stopwords.words('english') inside the comprehension re-reads the list and
# scans it linearly for every single word of every message.
english_stopwords = set(stopwords.words('english'))

corpus = []
for record in messages:
  # record is a (label, message) pair; only the message text is cleaned.
  # Anything that is not an ASCII letter (digits, punctuation, curly quotes)
  # becomes a space so it acts as a token separator.
  review = re.sub('[^a-zA-Z]', ' ', record[1])
  review = review.lower()
  review = review.split()
  # Drop stopwords first, then reduce each remaining word to its Porter stem.
  review = [ps.stem(word) for word in review if word not in english_stopwords]
  cleaned_review = " ".join(review)
  # Without this append the cleaned text is discarded and corpus stays empty.
  corpus.append(cleaned_review)
corpus

['call later meet',
 'congratul free iphon click claim',
 'forget bring notebook tomorrow',
 'select cash prize repli ye claim',
 'reschedul lunch next week',
 'get cheap loan interest appli today',
 'meet confirm pm today',
 'win brand new car send win',
 'let catch weekend free',
 'account suspend verifi avoid closur']

In [9]:
## Create the bag of words
from sklearn.feature_extraction.text import CountVectorizer

# max_features caps the vocabulary at the N most frequent terms across the
# corpus (N = 50 here). Without a cap, a large corpus (say 10,000 distinct
# terms) would produce a very wide matrix.
# binary=True records presence/absence (0 or 1) instead of raw counts.
# ngram_range=(1, 2) makes both single words and two-word phrases features.
cv = CountVectorizer(
    max_features=50,
    binary=True,
    ngram_range=(1, 2),
)
# Densify the sparse result so the matrix can be displayed directly.
X = cv.fit_transform(corpus).toarray()
X.shape

(10, 50)

In [10]:
# Display the bag-of-words matrix: one row per message, one column per vocabulary term.
X

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
        0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
        0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        1, 0, 0, 0, 0, 0],
       [0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0

## N-Grams


In [11]:
# Learned vocabulary: maps each term (single word or two-word phrase) to its column index in X.
cv.vocabulary_

{'call': np.int64(9),
 'later': np.int64(37),
 'meet': np.int64(45),
 'call later': np.int64(10),
 'later meet': np.int64(38),
 'congratul': np.int64(25),
 'free': np.int64(29),
 'iphon': np.int64(35),
 'click': np.int64(20),
 'claim': np.int64(19),
 'congratul free': np.int64(26),
 'free iphon': np.int64(30),
 'iphon click': np.int64(36),
 'click claim': np.int64(21),
 'forget': np.int64(27),
 'bring': np.int64(7),
 'forget bring': np.int64(28),
 'bring notebook': np.int64(8),
 'cash': np.int64(13),
 'cash prize': np.int64(14),
 'lunch': np.int64(43),
 'lunch next': np.int64(44),
 'get': np.int64(31),
 'cheap': np.int64(17),
 'loan': np.int64(41),
 'interest': np.int64(33),
 'appli': np.int64(2),
 'today': np.int64(49),
 'get cheap': np.int64(32),
 'cheap loan': np.int64(18),
 'loan interest': np.int64(42),
 'interest appli': np.int64(34),
 'appli today': np.int64(3),
 'confirm': np.int64(23),
 'meet confirm': np.int64(46),
 'confirm pm': np.int64(24),
 'brand': np.int64(6),
 'new': n