In [None]:
# # 📘 Bag of Words Practical (Spam Classification)
# # 📌 Step 1: Load the Data
import pandas as pd
import os
import zipfile
import urllib.request

# Download and extract the dataset if not already present.
# `url` was previously never defined, so the first run (no local copy of the
# data) failed with NameError before anything could be downloaded.
# NOTE(review): this is the UCI Machine Learning Repository archive for the
# SMS Spam Collection — confirm it is the intended source.
url = ('https://archive.ics.uci.edu/ml/machine-learning-databases/'
       '00228/smsspamcollection.zip')

if not os.path.exists('SMSSpamCollection'):
    zip_path = 'smsspamcollection.zip'
    # Reuse a previously downloaded archive instead of fetching it again.
    if not os.path.exists(zip_path):
        urllib.request.urlretrieve(url, zip_path)
    # Unpack into the current working directory, where read_csv looks for it.
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall()

# Read the tab-separated file into a DataFrame; the column names are supplied
# explicitly rather than taken from the file.
column_names = ["label", "message"]
messages = pd.read_csv('SMSSpamCollection', sep='\t', names=column_names)

# Preview the first rows (only displayed when run as a notebook cell).
messages.head()

# | Index | label    | message                                           |
# | ----- | -------- | ------------------------------------------------- |
# | 0     | **ham**  | Go until jurong point, crazy.. Available only ... |
# | 1     | **ham**  | Ok lar... Joking wif u oni...                     |
# | 2     | **spam** | Free entry in 2 a wkly comp to win FA Cup fina... |
# | 3     | **ham**  | U dun say so early hor... U c already then say... |
# | 4     | **ham**  | Nah I don't think he goes to usf, he lives aro... |

# 📌 Step 2: Preprocess the Data (Clean the Text)
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
# 💡 Explanation:
# •	We import libraries for text cleaning.
# •	stopwords are common words like "the", "is", etc., that don’t add much meaning.
# •	PorterStemmer reduces inflected words to a common stem, e.g. “running” and “runs” both become “run” (irregular forms such as “ran” are left unchanged).

# 📌 Step 3: Clean Each Message
corpus = []
ps = PorterStemmer()

# Build the stop-word lookup once. Calling stopwords.words('english') for every
# word of every message produces a new list each time and does a linear search
# in it; a set built up front gives the same filtering with O(1) membership tests.
english_stopwords = set(stopwords.words('english'))

# Matches every character that is not an ASCII letter.
non_letters = re.compile('[^a-zA-Z]')

for message in messages['message']:
    # Replace non-letters with spaces, lowercase, and split into words.
    words = non_letters.sub(' ', message).lower().split()
    # Drop stop words, then reduce each remaining word to its stem.
    stems = [ps.stem(word) for word in words if word not in english_stopwords]
    # Store the cleaned message as a single space-separated string.
    corpus.append(' '.join(stems))
    # print(corpus)
# 💡 Explanation:
# •	Removes non-alphabet characters.
# •	Converts message to lowercase.
# •	Splits the message into words.
# •	Removes stopwords.
# •	Applies stemming.
# •	Joins it back into a cleaned sentence.
# •	All cleaned messages are saved in corpus.
# 📌 Example:
# Original: "Congratulations!!! You've won a free ticket..."
# Cleaned: "congratul free ticket"  ("you", "ve", "won" and "a" are all in NLTK's English stop-word list, so they are removed)

# 📌 Create Bag of Words Model
from sklearn.feature_extraction.text import CountVectorizer
# Binary bag-of-words model limited to 100 features.
cv = CountVectorizer(binary=True, max_features=100)
X = cv.fit_transform(corpus)
X = X.toarray()  # convert the vectorizer output to a dense array
# Peek at the first five rows (only displayed when run as a notebook cell).
X[:5]
# array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
#         1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
#         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
#         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
#         0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
#        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
#         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
#         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
#         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
#         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
#        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
#         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
#         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
#         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
#         0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
#        [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
#         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
#         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
#         0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
#         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
#        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
#         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
#         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
#         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
#         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
# X.shape
#(5572, 100)
# 💡 Explanation:
# •	binary=True means we only care about presence (1) or absence (0) of the word.
# •	Even if a word is present 2 times, it will be recorded as 1.

#-------------------------------------------------------------------------------------------------------------

# # 📌 Step 2: Unigram (1,1)
# 🔹 Only single words will be considered.
# Unigram model (1,1)
# Single-word (unigram) features only, recorded as present/absent.
cv_uni = CountVectorizer(
    ngram_range=(1, 1),
    binary=True,
    max_features=100,
)
X_uni = cv_uni.fit_transform(corpus)
X_uni = X_uni.toarray()

# Show the learned term -> column-index mapping and the matrix dimensions.
print("Vocabulary (Unigram):", cv_uni.vocabulary_, sep="\n")
print("Shape of Unigram X:", X_uni.shape)

# Vocabulary (Unigram):
# {'go': np.int64(22), 'great': np.int64(25), 'got': np.int64(24), 'wat': np.int64(90), 'ok': np.int64(56), 'free': np.int64(18), 'win': np.int64(94), 'text': np.int64(77), 'txt': np.int64(85), 'say': np.int64(67), 'alreadi': np.int64(0), 'think': np.int64(80), 'hey': np.int64(28), 'week': np.int64(92), 'back': np.int64(3), 'like': np.int64(38), 'still': np.int64(73), 'send': np.int64(69), 'even': np.int64(15), 'friend': np.int64(19), 'prize': np.int64(62), 'claim': np.int64(7), 'call': np.int64(4), 'mobil': np.int64(47), 'co': np.int64(8), 'home': np.int64(30), 'want': np.int64(89), 'today': np.int64(82), 'cash': np.int64(6), 'day': np.int64(12), 'repli': np.int64(64), 'www': np.int64(96), 'right': np.int64(65), 'thank': np.int64(78), 'take': np.int64(75), 'time': np.int64(81), 'use': np.int64(87), 'messag': np.int64(44), 'oh': np.int64(55), 'ye': np.int64(97), 'make': np.int64(42), 'way': np.int64(91), 'feel': np.int64(16), 'dont': np.int64(14), 'miss': np.int64(46), 'ur': np.int64(86), 'tri': np.int64(84), 'da': np.int64(11), 'lor': np.int64(39), 'meet': np.int64(43), 'realli': np.int64(63), 'get': np.int64(20), 'know': np.int64(33), 'love': np.int64(40), 'let': np.int64(37), 'work': np.int64(95), 'wait': np.int64(88), 'yeah': np.int64(98), 'tell': np.int64(76), 'pleas': np.int64(61), 'msg': np.int64(49), 'see': np.int64(68), 'pl': np.int64(60), 'need': np.int64(51), 'tomorrow': np.int64(83), 'hope': np.int64(31), 'well': np.int64(93), 'lt': np.int64(41), 'gt': np.int64(26), 'ask': np.int64(1), 'morn': np.int64(48), 'happi': np.int64(27), 'sorri': np.int64(72), 'give': np.int64(21), 'new': np.int64(52), 'find': np.int64(17), 'year': np.int64(99), 'later': np.int64(35), 'pick': np.int64(59), 'good': np.int64(23), 'come': np.int64(9), 'said': np.int64(66), 'hi': np.int64(29), 'babe': np.int64(2), 'im': np.int64(32), 'much': np.int64(50), 'stop': np.int64(74), 'one': np.int64(57), 'night': np.int64(53), 'servic': np.int64(70), 'dear': np.int64(13), 'thing': 
# np.int64(79), 'contact': np.int64(10), 'last': np.int64(34), 'min': np.int64(45), 'number': np.int64(54), 'leav': np.int64(36), 'sleep': np.int64(71), 'care': np.int64(5), 'phone': np.int64(58)}
# Shape of Unigram X: (5572, 100)

# ✅ Explanation:
# •	ngram_range=(1,1) means only unigrams.
# •	binary=True tracks only presence (1) or absence (0).
# •	max_features=100 keeps only the 100 most frequent terms across the whole corpus.


# 📌 Step 3: Unigram + Bigram (1,2)
# Unigram + Bigram model (1,2)
# Features are single words plus two-word sequences, recorded as present/absent.
cv_uni_bi = CountVectorizer(
    ngram_range=(1, 2),
    binary=True,
    max_features=100,
)
X_uni_bi = cv_uni_bi.fit_transform(corpus)
X_uni_bi = X_uni_bi.toarray()

# Show the learned term -> column-index mapping and the matrix dimensions.
print("Vocabulary (Unigram + Bigram):", cv_uni_bi.vocabulary_, sep="\n")
print("Shape of Unigram+Bigram X:", X_uni_bi.shape)
# Vocabulary (Unigram + Bigram):
# {'go': np.int64(21), 'great': np.int64(24), 'got': np.int64(23), 'wat': np.int64(90), 'ok': np.int64(56), 'free': np.int64(17), 'win': np.int64(94), 'text': np.int64(77), 'txt': np.int64(85), 'say': np.int64(67), 'alreadi': np.int64(0), 'think': np.int64(80), 'hey': np.int64(27), 'week': np.int64(92), 'back': np.int64(3), 'like': np.int64(37), 'still': np.int64(73), 'send': np.int64(69), 'even': np.int64(14), 'friend': np.int64(18), 'prize': np.int64(62), 'claim': np.int64(7), 'call': np.int64(4), 'mobil': np.int64(47), 'co': np.int64(8), 'home': np.int64(29), 'want': np.int64(89), 'today': np.int64(82), 'cash': np.int64(6), 'day': np.int64(11), 'repli': np.int64(64), 'www': np.int64(96), 'right': np.int64(65), 'thank': np.int64(78), 'take': np.int64(75), 'time': np.int64(81), 'use': np.int64(87), 'messag': np.int64(44), 'oh': np.int64(55), 'ye': np.int64(97), 'make': np.int64(42), 'way': np.int64(91), 'feel': np.int64(15), 'dont': np.int64(13), 'miss': np.int64(46), 'ur': np.int64(86), 'tri': np.int64(84), 'da': np.int64(10), 'lor': np.int64(38), 'meet': np.int64(43), 'realli': np.int64(63), 'get': np.int64(19), 'know': np.int64(32), 'love': np.int64(39), 'let': np.int64(36), 'work': np.int64(95), 'wait': np.int64(88), 'yeah': np.int64(98), 'tell': np.int64(76), 'pleas': np.int64(61), 'msg': np.int64(49), 'see': np.int64(68), 'pl': np.int64(60), 'need': np.int64(51), 'tomorrow': np.int64(83), 'hope': np.int64(30), 'well': np.int64(93), 'lt': np.int64(40), 'gt': np.int64(25), 'lt gt': np.int64(41), 'ask': np.int64(1), 'morn': np.int64(48), 'happi': np.int64(26), 'sorri': np.int64(72), 'give': np.int64(20), 'new': np.int64(52), 'find': np.int64(16), 'year': np.int64(99), 'later': np.int64(34), 'pick': np.int64(59), 'good': np.int64(22), 'come': np.int64(9), 'said': np.int64(66), 'hi': np.int64(28), 'babe': np.int64(2), 'im': np.int64(31), 'much': np.int64(50), 'stop': np.int64(74), 'one': np.int64(57), 'night': np.int64(53), 'servic': np.int64(70), 'dear': 
# np.int64(12), 'thing': np.int64(79), 'last': np.int64(33), 'min': np.int64(45), 'number': np.int64(54), 'leav': np.int64(35), 'sleep': np.int64(71), 'care': np.int64(5), 'phone': np.int64(58)}
# Shape of Unigram+Bigram X: (5572, 100)

# ✅ Explanation:
# •	Now we get single words and 2-word combinations.
# •	Useful to capture basic context (e.g. "please call").

# 📌 Step 4: Bigram Only (2,2)
# Bigram only model (2,2)
# Features are two-word sequences only, recorded as present/absent.
cv_bi = CountVectorizer(
    ngram_range=(2, 2),
    binary=True,
    max_features=100,
)
X_bi = cv_bi.fit_transform(corpus)
X_bi = X_bi.toarray()

# Show the learned term -> column-index mapping and the matrix dimensions.
print("Vocabulary (Bigram only):", cv_bi.vocabulary_, sep="\n")
print("Shape of Bigram X:", X_bi.shape)
# Vocabulary (Bigram only):
# {'free entri': np.int64(31), 'claim call': np.int64(15), 'call claim': np.int64(3), 'claim code': np.int64(16), 'free call': np.int64(30), 'chanc win': np.int64(14), 'txt word': np.int64(88), 'let know': np.int64(53), 'go home': np.int64(35), 'pleas call': np.int64(67), 'lt gt': np.int64(57), 'want go': np.int64(96), 'like lt': np.int64(54), 'sorri call': np.int64(80), 'call later': np.int64(8), 'ur award': np.int64(90), 'call custom': np.int64(4), 'custom servic': np.int64(23), 'cash prize': np.int64(13), 'po box': np.int64(68), 'tri contact': np.int64(85), 'draw show': np.int64(28), 'show prize': np.int64(79), 'prize guarante': np.int64(73), 'guarante call': np.int64(42), 'valid hr': np.int64(94), 'select receiv': np.int64(76), 'privat account': np.int64(71), 'account statement': np.int64(0), 'statement show': np.int64(81), 'call identifi': np.int64(5), 'identifi code': np.int64(49), 'code expir': np.int64(20), 'urgent mobil': np.int64(93), 'call landlin': np.int64(7), 'wat time': np.int64(97), 'ur mob': np.int64(92), 'gud ni': np.int64(44), 'new year': np.int64(62), 'send stop': np.int64(78), 'get back': np.int64(33), 'co uk': np.int64(19), 'gud mrng': np.int64(43), 'nice day': np.int64(63), 'lt decim': np.int64(56), 'decim gt': np.int64(25), 'txt nokia': np.int64(86), 'good morn': np.int64(37), 'ur friend': np.int64(91), 'good night': np.int64(38), 'repli call': np.int64(74), 'last night': np.int64(52), 'camera phone': np.int64(12), 'pick phone': np.int64(65), 'right pl': np.int64(75), 'pl send': np.int64(66), 'send messag': np.int64(77), 'great day': np.int64(39), 'suit land': np.int64(82), 'land row': np.int64(51), 'good afternoon': np.int64(36), 'take care': np.int64(83), 'doubl min': np.int64(27), 'call mobileupd': np.int64(9), 'call optout': np.int64(10), 'gt min': np.int64(41), 'half price': np.int64(45), 'txt stop': np.int64(87), 'date servic': np.int64(24), 'pobox wq': np.int64(69), 'mobil number': np.int64(58), 'call land': np.int64(6), 'land line': 
# np.int64(50), 'line claim': np.int64(55), 'claim valid': np.int64(18), 'watch tv': np.int64(98), 'gt lt': np.int64(40), 'hope good': np.int64(48), 'free text': np.int64(32), 'holiday cash': np.int64(47), 'prize claim': np.int64(72), 'nd attempt': np.int64(60), 'attempt contact': np.int64(1), 'claim ur': np.int64(17), 'un redeem': np.int64(89), 'point call': np.int64(70), 'ok lor': np.int64(64), 'want come': np.int64(95), 'everi week': np.int64(29), 'come home': np.int64(22), 'new nokia': np.int64(61), 'happi new': np.int64(46), 'nation rate': np.int64(59), 'week txt': np.int64(99), 'tell ur': np.int64(84), 'gift voucher': np.int64(34), 'await collect': np.int64(2), 'dont know': np.int64(26), 'come back': np.int64(21), 'call per': np.int64(11)}
# Shape of Bigram X: (5572, 100)
# ✅ Explanation:
# •	Only two-word combinations (bigrams).
# •	No single words included.

# 📌 Step 5: Trigram Only (3,3)
# Trigram only model (3,3)
# Features are three-word sequences only; the feature cap is raised to 200 here.
cv_tri = CountVectorizer(
    ngram_range=(3, 3),
    binary=True,
    max_features=200,
)
X_tri = cv_tri.fit_transform(corpus)
X_tri = X_tri.toarray()

# Show the learned term -> column-index mapping and the matrix dimensions.
print("Vocabulary (Trigram only):", cv_tri.vocabulary_, sep="\n")
print("Shape of Trigram X:", X_tri.shape)
# Vocabulary (Trigram only):
# {'call claim code': np.int64(10), 'entitl updat latest': np.int64(51), 'updat latest colour': np.int64(179), 'free call mobil': np.int64(58), 'call mobil updat': np.int64(16), 'chanc win cash': np.int64(26), 'repli hl info': np.int64(138), 'like lt gt': np.int64(84), 'sorri call later': np.int64(152), 'call later meet': np.int64(15), 'pleas call custom': np.int64(123), 'call custom servic': np.int64(11), 'custom servic repres': np.int64(40), 'pm guarante cash': np.int64(125), 'guarante cash prize': np.int64(69), 'last weekend draw': np.int64(81), 'weekend draw show': np.int64(191), 'draw show prize': np.int64(47), 'show prize guarante': np.int64(148), 'prize guarante call': np.int64(135), 'guarante call claim': np.int64(67), 'valid hr ppm': np.int64(188), 'special select receiv': np.int64(154), 'speak live oper': np.int64(153), 'live oper claim': np.int64(86), 'privat account statement': np.int64(133), 'account statement show': np.int64(0), 'call identifi code': np.int64(12), 'identifi code expir': np.int64(76), 'bonu caller prize': np.int64(7), 'end select receiv': np.int64(50), 'select receiv award': np.int64(147), 'match pleas call': np.int64(100), 'invit friend repli': np.int64(77), 'friend repli ye': np.int64(62), 'repli ye see': np.int64(139), 'ye see www': np.int64(199), 'see www sm': np.int64(146), 'stop send stop': np.int64(158), 'urgent tri contact': np.int64(187), 'lt decim gt': np.int64(89), 'secret admir look': np.int64(145), 'admir look make': np.int64(1), 'look make contact': np.int64(88), 'make contact find': np.int64(99), 'contact find reveal': np.int64(33), 'find reveal think': np.int64(57), 'reveal think ur': np.int64(140), 'think ur special': np.int64(166), 'ur special call': np.int64(183), 'congratul ur award': np.int64(32), 'draw txt music': np.int64(48), 'txt music tnc': np.int64(174), 'tnc www ldew': np.int64(168), 'www ldew com': np.int64(197), 'ldew com win': np.int64(83), 'win ppmx age': np.int64(192), 'anytim network min': np.int64(2), 
# 'camcord repli call': np.int64(21), 'cant pick phone': np.int64(22), 'pick phone right': np.int64(120), 'phone right pl': np.int64(119), 'right pl send': np.int64(141), 'pl send messag': np.int64(122), 'latest colour camera': np.int64(82), 'ur cash balanc': np.int64(180), 'cash balanc current': np.int64(24), 'balanc current pound': np.int64(5), 'current pound maxim': np.int64(38), 'pound maxim ur': np.int64(131), 'maxim ur cash': np.int64(102), 'ur cash send': np.int64(181), 'hg suit land': np.int64(72), 'suit land row': np.int64(160), 'land row hl': np.int64(79), 'doubl min txt': np.int64(46), 'call mobileupd call': np.int64(17), 'mobileupd call optout': np.int64(108), 'lt gt min': np.int64(92), 'thank rington order': np.int64(165), 'free entri weekli': np.int64(60), 'urgent mobil number': np.int64(185), 'mobil number award': np.int64(107), 'guarante call land': np.int64(68), 'call land line': np.int64(13), 'land line claim': np.int64(78), 'line claim valid': np.int64(85), 'claim valid hr': np.int64(29), 'lt gt th': np.int64(95), 'lt gt dollar': np.int64(90), 'lt gt lt': np.int64(91), 'gt lt gt': np.int64(66), 'tenerif holiday cash': np.int64(163), 'meet call later': np.int64(103), 'caller prize nd': np.int64(20), 'prize nd attempt': np.int64(136), 'nd attempt contact': np.int64(110), 'attempt contact call': np.int64(3), 'claim ur worth': np.int64(28), 'ur worth discount': np.int64(184), 'worth discount voucher': np.int64(194), 'savamob member offer': np.int64(144), 'offer mobil cs': np.int64(116), 'statement show un': np.int64(156), 'show un redeem': np.int64(149), 'un redeem point': np.int64(178), 'redeem point call': np.int64(137), 'point call identifi': np.int64(130), 'new video phone': np.int64(112), 'custom servic announc': np.int64(39), 'dear voucher holder': np.int64(42), 'happi new year': np.int64(71), 'everi wk txt': np.int64(54), 'lt gt minut': np.int64(93), 'lt time gt': np.int64(96), 'free st week': np.int64(61), 'st week nokia': np.int64(155), 'week 
# nokia tone': np.int64(189), 'nokia tone ur': np.int64(114), 'everi week txt': np.int64(53), 'week txt nokia': np.int64(190), 'txt nokia get': np.int64(175), 'nokia get txting': np.int64(113), 'get txting tell': np.int64(63), 'txting tell ur': np.int64(176), 'tell ur mate': np.int64(162), 'mate www getz': np.int64(101), 'www getz co': np.int64(196), 'getz co uk': np.int64(64), 'co uk pobox': np.int64(30), 'uk pobox wq': np.int64(177), 'pobox wq norm': np.int64(129), 'wq norm tone': np.int64(195), 'good morn dear': np.int64(65), 'call landlin complimentari': np.int64(14), 'holiday cash await': np.int64(74), 'cash await collect': np.int64(23), 'await collect sae': np.int64(4), 'collect sae cs': np.int64(31), 'ts cs www': np.int64(173), 'half price line': np.int64(70), 'price line rental': np.int64(132), 'stop text call': np.int64(159), 'pleas call landlin': np.int64(124), 'tri contact today': np.int64(172), 'contact today draw': np.int64(34), 'today draw show': np.int64(170), 'tone ur mob': np.int64(171), 'ur mob everi': np.int64(182), 'mob everi week': np.int64(106), 'final attempt contact': np.int64(55), 'digit camera call': np.int64(45), 'landlin deliveri within': np.int64(80), 'deliveri within day': np.int64(44), 'lucki day find': np.int64(98), 'day find log': np.int64(41), 'find log onto': np.int64(56), 'log onto http': np.int64(87), 'onto http www': np.int64(117), 'http www urawinn': np.int64(75), 'row hl ldn': np.int64(142), 'lt gt rs': np.int64(94), 'entri weekli comp': np.int64(52), 'min stop text': np.int64(105), 'call per min': np.int64(18), 'per min ntt': np.int64(118), 'min ntt ltd': np.int64(104), 'ntt ltd po': np.int64(115), 'ltd po box': np.int64(97), 'po box croydon': np.int64(127), 'box croydon cr': np.int64(8), 'croydon cr wb': np.int64(37), 'cash everi wk': np.int64(25), 'nation rate call': np.int64(109), 'bt nation rate': np.int64(9), 'nd time tri': np.int64(111), 'time tri contact': np.int64(167), 'prize claim easi': np.int64(134), 'claim easi 
# call': np.int64(27), 'easi call per': np.int64(49), 'urgent pleas call': np.int64(186), 'costa del sol': np.int64(36), 'del sol holiday': np.int64(43), 'sol holiday await': np.int64(151), 'holiday await collect': np.int64(73), 'call toclaim sae': np.int64(19), 'toclaim sae tc': np.int64(169), 'sae tc pobox': np.int64(143), 'tc pobox stockport': np.int64(161), 'pobox stockport sk': np.int64(128), 'stockport sk xh': np.int64(157), 'sk xh cost': np.int64(150), 'xh cost pm': np.int64(198), 'cost pm max': np.int64(35), 'pm max min': np.int64(126), 'pl convey birthday': np.int64(121), 'birthday wish nimya': np.int64(6), 'wish nimya pl': np.int64(193), 'text free camcord': np.int64(164), 'free camcord repli': np.int64(59)}
# Shape of Trigram X: (5572, 200)
# ✅ Explanation:
# •	Extracts 3-word combinations (e.g. "call customer service").

# 📌 Step 6: Bigram + Trigram (2,3)
# Bigram + Trigram model (2,3)
# Features are two- and three-word sequences; the feature cap is raised to 300 here.
cv_bi_tri = CountVectorizer(
    ngram_range=(2, 3),
    binary=True,
    max_features=300,
)
X_bi_tri = cv_bi_tri.fit_transform(corpus)
X_bi_tri = X_bi_tri.toarray()

# Show the learned term -> column-index mapping and the matrix dimensions.
print("Vocabulary (Bigram + Trigram):", cv_bi_tri.vocabulary_, sep="\n")
print("Shape of Bigram+Trigram X:", X_bi_tri.shape)
# Vocabulary (Bigram + Trigram):
# {'free entri': np.int64(80), 'rate appli': np.int64(207), 'claim call': np.int64(42), 'call claim': np.int64(14), 'claim code': np.int64(43), 'free call': np.int64(78), 'call mobil': np.int64(25), 'chanc win': np.int64(41), 'win cash': np.int64(294), 'txt word': np.int64(267), 'let know': np.int64(136), 'feel like': np.int64(76), 'repli ye': np.int64(214), 'go home': np.int64(93), 'call repli': np.int64(31), 'mobil free': np.int64(163), 'free camcord': np.int64(79), 'pleas call': np.int64(192), 'lt gt': np.int64(148), 'miss call': np.int64(160), 'want go': np.int64(288), 'first time': np.int64(77), 'like lt': np.int64(138), 'like lt gt': np.int64(139), 'sm ac': np.int64(232), 'sorri call': np.int64(233), 'call later': np.int64(23), 'sorri call later': np.int64(234), 'award bonu': np.int64(9), 'prize call': np.int64(200), 'ur award': np.int64(272), 'call free': np.int64(17), 'that cool': np.int64(254), 'call custom': np.int64(15), 'custom servic': np.int64(58), 'servic repres': np.int64(227), 'guarante cash': np.int64(108), 'cash prize': np.int64(40), 'pleas call custom': np.int64(193), 'call custom servic': np.int64(16), 'custom servic repres': np.int64(59), 'guarante cash prize': np.int64(109), 'po box': np.int64(194), 'tri contact': np.int64(261), 'draw show': np.int64(64), 'show prize': np.int64(228), 'prize guarante': np.int64(203), 'guarante call': np.int64(106), 'valid hr': np.int64(283), 'draw show prize': np.int64(65), 'show prize guarante': np.int64(229), 'prize guarante call': np.int64(204), 'special select': np.int64(238), 'select receiv': np.int64(224), 'speak live': np.int64(235), 'live oper': np.int64(143), 'oper claim': np.int64(183), 'special select receiv': np.int64(239), 'speak live oper': np.int64(236), 'privat account': np.int64(198), 'account statement': np.int64(0), 'statement show': np.int64(241), 'call identifi': np.int64(18), 'identifi code': np.int64(126), 'code expir': np.int64(50), 'privat account statement': np.int64(199), 'account 
# statement show': np.int64(1), 'call identifi code': np.int64(19), 'identifi code expir': np.int64(127), 'urgent mobil': np.int64(279), 'bonu caller': np.int64(10), 'caller prize': np.int64(32), 'call landlin': np.int64(22), 'bonu caller prize': np.int64(11), 'receiv award': np.int64(209), 'match pleas': np.int64(157), 'match pleas call': np.int64(158), 'wat time': np.int64(290), 'give call': np.int64(90), 'ur mob': np.int64(275), 'go get': np.int64(92), 'gud ni': np.int64(111), 'repli stop': np.int64(213), 'new year': np.int64(175), 'send stop': np.int64(226), 'ur mobil': np.int64(277), 'urgent tri': np.int64(281), 'urgent tri contact': np.int64(282), 'get back': np.int64(86), 'reach home': np.int64(208), 'co uk': np.int64(48), 'keep touch': np.int64(128), 'gud mrng': np.int64(110), 'nice day': np.int64(177), 'lt decim': np.int64(146), 'decim gt': np.int64(61), 'lt decim gt': np.int64(147), 'like dat': np.int64(137), 'txt nokia': np.int64(265), 'good morn': np.int64(98), 'ur friend': np.int64(273), 'secret admir': np.int64(222), 'admir look': np.int64(3), 'look make': np.int64(144), 'make contact': np.int64(154), 'reveal think': np.int64(215), 'think ur': np.int64(255), 'special call': np.int64(237), 'secret admir look': np.int64(223), 'admir look make': np.int64(4), 'make contact find': np.int64(155), 'reveal think ur': np.int64(216), 'ur special call': np.int64(278), 'good night': np.int64(99), 'draw txt': np.int64(66), 'txt music': np.int64(264), 'www ldew': np.int64(298), 'ldew com': np.int64(135), 'draw txt music': np.int64(67), 'www ldew com': np.int64(299), 'anytim network': np.int64(5), 'network min': np.int64(171), 'camcord repli': np.int64(33), 'repli call': np.int64(212), 'anytim network min': np.int64(6), 'camcord repli call': np.int64(34), 'text stop': np.int64(252), 'last night': np.int64(133), 'camera phone': np.int64(35), 'cant pick': np.int64(36), 'pick phone': np.int64(188), 'phone right': np.int64(186), 'right pl': np.int64(217), 'pl send': 
# np.int64(190), 'send messag': np.int64(225), 'cant pick phone': np.int64(37), 'pick phone right': np.int64(189), 'phone right pl': np.int64(187), 'right pl send': np.int64(218), 'pl send messag': np.int64(191), 'great day': np.int64(100), 'mobil mth': np.int64(164), 'love ya': np.int64(145), 'hg suit': np.int64(117), 'suit land': np.int64(245), 'land row': np.int64(131), 'row hl': np.int64(219), 'hg suit land': np.int64(118), 'suit land row': np.int64(246), 'land row hl': np.int64(132), 'good afternoon': np.int64(96), 'take care': np.int64(248), 'doubl min': np.int64(63), 'min txt': np.int64(159), 'call mobileupd': np.int64(26), 'mobileupd call': np.int64(167), 'call optout': np.int64(28), 'call mobileupd call': np.int64(27), 'mobileupd call optout': np.int64(168), 'gt min': np.int64(103), 'lt gt min': np.int64(150), 'per week': np.int64(185), 'half price': np.int64(113), 'line rental': np.int64(142), 'txt stop': np.int64(266), 'date servic': np.int64(60), 'entri weekli': np.int64(71), 'free entri weekli': np.int64(81), 'pobox wq': np.int64(195), 'last week': np.int64(134), 'mobil number': np.int64(165), 'call land': np.int64(20), 'land line': np.int64(129), 'line claim': np.int64(140), 'claim valid': np.int64(46), 'urgent mobil number': np.int64(280), 'mobil number award': np.int64(166), 'guarante call land': np.int64(107), 'call land line': np.int64(21), 'land line claim': np.int64(130), 'line claim valid': np.int64(141), 'claim valid hr': np.int64(47), 'watch tv': np.int64(291), 'call min': np.int64(24), 'wc xx': np.int64(292), 'gt th': np.int64(105), 'lt gt th': np.int64(152), 'gt lt': np.int64(101), 'lt gt lt': np.int64(149), 'gt lt gt': np.int64(102), 'hope good': np.int64(124), 'good day': np.int64(97), 'free text': np.int64(85), 'text back': np.int64(251), 'holiday cash': np.int64(122), 'prize claim': np.int64(201), 'tone repli': np.int64(258), 'http www': np.int64(125), 'prize nd': np.int64(205), 'nd attempt': np.int64(170), 'attempt contact': np.int64(7), 
# 'prize nd attempt': np.int64(206), 'claim ur': np.int64(45), 'text ye': np.int64(253), 'show un': np.int64(230), 'un redeem': np.int64(270), 'redeem point': np.int64(210), 'point call': np.int64(196), 'statement show un': np.int64(242), 'show un redeem': np.int64(231), 'un redeem point': np.int64(271), 'redeem point call': np.int64(211), 'point call identifi': np.int64(197), 'goe day': np.int64(95), 'across sea': np.int64(2), 'happi birthday': np.int64(114), 'new video': np.int64(173), 'video phone': np.int64(284), 'new video phone': np.int64(174), 'one day': np.int64(182), 'ok lor': np.int64(181), 'want come': np.int64(287), 'make sure': np.int64(156), 'ts cs': np.int64(263), 'stop text': np.int64(243), 'say hi': np.int64(221), 'everi week': np.int64(72), 'come home': np.int64(54), 'new nokia': np.int64(172), 'next week': np.int64(176), 'happi new': np.int64(115), 'happi new year': np.int64(116), 'everi wk': np.int64(74), 'wk txt': np.int64(295), 'everi wk txt': np.int64(75), 'gt minut': np.int64(104), 'lt gt minut': np.int64(151), 'nation rate': np.int64(169), 'free st': np.int64(83), 'st week': np.int64(240), 'nokia tone': np.int64(179), 'tone ur': np.int64(259), 'week txt': np.int64(293), 'nokia get': np.int64(178), 'tell ur': np.int64(250), 'ur mate': np.int64(274), 'www getz': np.int64(296), 'getz co': np.int64(87), 'uk pobox': np.int64(268), 'free st week': np.int64(84), 'nokia tone ur': np.int64(180), 'everi week txt': np.int64(73), 'www getz co': np.int64(297), 'getz co uk': np.int64(88), 'co uk pobox': np.int64(49), 'uk pobox wq': np.int64(269), 'gift voucher': np.int64(89), 'cash await': np.int64(38), 'await collect': np.int64(8), 'sae cs': np.int64(220), 'holiday cash await': np.int64(123), 'cash await collect': np.int64(39), 'collect sae cs': np.int64(52), 'wan go': np.int64(286), 'dont know': np.int64(62), 'stop text call': np.int64(244), 'contact today': np.int64(56), 'today draw': np.int64(256), 'tri contact today': np.int64(262), 'contact today 
# draw': np.int64(57), 'today draw show': np.int64(257), 'come back': np.int64(53), 'wait till': np.int64(285), 'mob everi': np.int64(161), 'tone ur mob': np.int64(260), 'ur mob everi': np.int64(276), 'mob everi week': np.int64(162), 'gud nyt': np.int64(112), 'take part': np.int64(249), 'call per': np.int64(29), 'per min': np.int64(184), 'ltd po': np.int64(153), 'call per min': np.int64(30), 'sweet dream': np.int64(247), 'bt nation': np.int64(12), 'bt nation rate': np.int64(13), 'go back': np.int64(91), 'go sleep': np.int64(94), 'free msg': np.int64(82), 'dun wan': np.int64(68), 'come tomorrow': np.int64(55), 'claim easi': np.int64(44), 'easi call': np.int64(69), 'prize claim easi': np.int64(202), 'easi call per': np.int64(70), 'holiday await': np.int64(120), 'collect call': np.int64(51), 'holiday await collect': np.int64(121), 'hi darlin': np.int64(119), 'want new': np.int64(289)}
# Shape of Bigram+Trigram X: (5572, 300)





[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sahus\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Vocabulary (Unigram):
{'go': np.int64(22), 'great': np.int64(25), 'got': np.int64(24), 'wat': np.int64(90), 'ok': np.int64(56), 'free': np.int64(18), 'win': np.int64(94), 'text': np.int64(77), 'txt': np.int64(85), 'say': np.int64(67), 'alreadi': np.int64(0), 'think': np.int64(80), 'hey': np.int64(28), 'week': np.int64(92), 'back': np.int64(3), 'like': np.int64(38), 'still': np.int64(73), 'send': np.int64(69), 'even': np.int64(15), 'friend': np.int64(19), 'prize': np.int64(62), 'claim': np.int64(7), 'call': np.int64(4), 'mobil': np.int64(47), 'co': np.int64(8), 'home': np.int64(30), 'want': np.int64(89), 'today': np.int64(82), 'cash': np.int64(6), 'day': np.int64(12), 'repli': np.int64(64), 'www': np.int64(96), 'right': np.int64(65), 'thank': np.int64(78), 'take': np.int64(75), 'time': np.int64(81), 'use': np.int64(87), 'messag': np.int64(44), 'oh': np.int64(55), 'ye': np.int64(97), 'make': np.int64(42), 'way': np.int64(91), 'feel': np.int64(16), 'dont': np.int64(14), 'miss': np.int64(4