# In [None]:
# # 📘 Bag of Words Practical (Spam Classification)
# # 📌 Step 1: Load the Data
import pandas as pd
import os
import zipfile
import urllib.request

# Download and extract the dataset if it is not already present.
# The original cell passed `url` to urlretrieve without ever defining it, so a
# first run (no local copy of the data) failed with NameError. The address
# below is the UCI Machine Learning Repository archive for the SMS Spam
# Collection dataset.
# NOTE(review): confirm this is the mirror you want to depend on.
url = ('https://archive.ics.uci.edu/ml/machine-learning-databases/'
       '00228/smsspamcollection.zip')
if not os.path.exists('SMSSpamCollection'):
    zip_path = 'smsspamcollection.zip'
    # Reuse a previously downloaded archive instead of fetching it again.
    if not os.path.exists(zip_path):
        urllib.request.urlretrieve(url, zip_path)
    # Unpack into the current working directory, which is where the data
    # file is read from afterwards.
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall()

# Load the data: the file is tab-separated and has no header row, so the two
# column names are supplied explicitly.
messages = pd.read_csv(
    'SMSSpamCollection',
    sep='\t',
    header=None,
    names=["label", "message"],
)
# Preview the first few rows.
messages.head()
# | Index | label    | message                                           |
# | ----- | -------- | ------------------------------------------------- |
# | 0     | **ham**  | Go until jurong point, crazy.. Available only ... |
# | 1     | **ham**  | Ok lar... Joking wif u oni...                     |
# | 2     | **spam** | Free entry in 2 a wkly comp to win FA Cup fina... |
# | 3     | **ham**  | U dun say so early hor... U c already then say... |
# | 4     | **ham**  | Nah I don't think he goes to usf, he lives aro... |


# 📌 Step 2: Preprocess the Data (Clean the Text)
import re
import nltk
# Fetch NLTK's stopword corpus so that stopwords.words('english') can be
# used when cleaning the messages.
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
# 💡 Explanation:
# •	We import libraries for text cleaning.
# •	stopwords are common words like "the", "is", etc., that don’t add much meaning.
# •	PorterStemmer strips suffixes to reduce words to a common stem, e.g. “running” and “runs” both become “run” (irregular forms such as “ran” are left unchanged).

# 📌 Step 3: Clean Each Message
corpus = []
ps = PorterStemmer()
# Build the stopword lookup once. The original called
# stopwords.words('english') for every single token, rebuilding the list and
# scanning it linearly each time; a set gives constant-time membership tests
# and produces exactly the same cleaned text.
stop_words = set(stopwords.words('english'))

# Iterate over the message texts (the 'message' column; 'label' holds the
# ham/spam tag and is not touched here).
for message in messages['message']:
    # Replace every character that is not a letter (a-z, A-Z) with a space.
    review = re.sub(r'[^a-zA-Z]', ' ', message)
    # Lowercase everything and split into a list of words.
    review = review.lower().split()
    # Drop stopwords, then reduce each remaining word to its stem.
    review = [ps.stem(word) for word in review if word not in stop_words]
    # Re-join the stems into one cleaned sentence and collect it.
    review = ' '.join(review)
    corpus.append(review)
# 💡 Explanation:
# •	Removes non-alphabet characters.
# •	Converts message to lowercase.
# •	Splits the message into words.
# •	Removes stopwords.
# •	Applies stemming.
# •	Joins it back into a cleaned sentence.
# •	All cleaned messages are saved in corpus.
# 📌 Example:
# Original: "Congratulations!!! You've won a free ticket..."
# Cleaned: "congratul free ticket"  ("you", "ve", "won" and "a" are all in NLTK's English stopword list, so they are removed)

# 📌 Step 4: Create the Bag of Words Model
from sklearn.feature_extraction.text import CountVectorizer

# Bag of Words model: count single words (unigrams) and two-word
# combinations (bigrams), keeping only the 2500 most frequent terms.
cv = CountVectorizer(
    ngram_range=(1, 2),
    max_features=2500,
)

# Independent variable: learn the vocabulary from the cleaned corpus, count
# the terms, then convert the sparse count matrix to a dense array.
bow_counts = cv.fit_transform(corpus)
X = bow_counts.toarray()

import numpy as np


def _format_float(value):
    """Render a float with three significant digits."""
    return "%.3g" % value


# Widen NumPy's printing limits so that more of each row of the feature
# matrix is shown before it is elided.
np.set_printoptions(
    edgeitems=30,
    linewidth=100000,
    formatter={"float": _format_float},
)
# Display the feature matrix.
X
# array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
#        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], shape=(5572, 2500))

# 💡 Explanation
# •	CountVectorizer converts the cleaned text into a matrix of token counts.
# •	max_features=2500 limits the vocabulary to the 2500 most frequent terms (unigrams and bigrams combined).
# •	ngram_range=(1,2) allows both single words and pairs of words (bigrams).

# Inspect the fitted vocabulary: a dict mapping each kept term (unigram or
# bigram) to its column index in X.
cv.vocabulary_
# {'go': np.int64(823),
#  'point': np.int64(1627),
#  'crazi': np.int64(458),
#  'avail': np.int64(126),
#  'bugi': np.int64(237),
#  'great': np.int64(871),
#  'world': np.int64(2423),
#  'la': np.int64(1106),
#  'cine': np.int64(355),
#  'got': np.int64(862),
#  'wat': np.int64(2323),
#  'ok': np.int64(1476),
#  'lar': np.int64(1119),
#  'joke': np.int64(1065),
#  'wif': np.int64(2376),
#  'oni': np.int64(1497),
#  'free': np.int64(742),
#  'entri': np.int64(623),
#  'wkli': np.int64(2406),
#  'comp': np.int64(408),
#  'win': np.int64(2381),
#  'fa': np.int64(663),
#  'cup': np.int64(473),
#  'final': np.int64(699),
#  'st': np.int64(1962),
#  'may': np.int64(1275),
#  'text': np.int64(2088),
#  'receiv': np.int64(1712),
#  'question': np.int64(1681),
#  'std': np.int64(1978),
#  'txt': np.int64(2199),
#  'rate': np.int64(1693),
#  'appli': np.int64(88),
#  'free entri': np.int64(749),
#  'entri wkli': np.int64(625),
#  'wkli comp': np.int64(2407),
#  'std txt': np.int64(1979),
#  'txt rate': np.int64(2205),
#  'rate appli': np.int64(1694),
#  'dun': np.int64(581),
#  'say': np.int64(1800),
#  'earli': np.int64(587),
#  'alreadi': np.int64(61),
#  'nah': np.int64(1384),
#  'think': np.int64(2108),
#  'goe': np.int64(842),
#  'usf': np.int64(2265),
#  'live': np.int64(1185),
#  'around': np.int64(100),
#  'though': np.int64(2116),
#  'freemsg': np.int64(762),
#  'hey': np.int64(933),
#  'darl': np.int64(491),
#  'week': np.int64(2343),
#  'word': np.int64(2416),
#  'back': np.int64(143),
#  'like': np.int64(1171),
#  'fun': np.int64(779),
#  'still': np.int64(1981),
#  'tb': np.int64(2068),
#  'xxx': np.int64(2459),
#  'send': np.int64(1831),
#  'even': np.int64(634),
#  'brother': np.int64(228),
#  'speak': np.int64(1938),
#  'treat': np.int64(2176),
#  'per': np.int64(1555),
#  'request': np.int64(1744),
#  'mell': np.int64(1286),
#  'oru': np.int64(1522),
#  'set': np.int64(1847),
#  'callertun': np.int64(282),
#  'caller': np.int64(279),
#  'press': np.int64(1654),
#  'copi': np.int64(440),
#  'friend': np.int64(768),
#  'per request': np.int64(1561),
#  'set callertun': np.int64(1848),
#  'callertun caller': np.int64(283),
#  'caller press': np.int64(280),
#  'press copi': np.int64(1655),
#  'copi friend': np.int64(441),
#  'friend callertun': np.int64(769),
#  'winner': np.int64(2390),
#  'valu': np.int64(2272),
#  'network': np.int64(1401),
#  'custom': np.int64(478),
#  'select': np.int64(1825),
#  'prize': np.int64(1663),
#  'reward': np.int64(1755),
#  'claim': np.int64(358),
#  'call': np.int64(253),
#  'code': np.int64(381),
#  'valid': np.int64(2270),
#  'hour': np.int64(976),
#  'claim call': np.int64(359),
#  'call claim': np.int64(257),
#  'claim code': np.int64(360),
#  'mobil': np.int64(1328),
#  'month': np.int64(1348),
#  'entitl': np.int64(621),
#  'updat': np.int64(2230),
#  'latest': np.int64(1131),
#  'colour': np.int64(394),
#  'camera': np.int64(288),
#  'co': np.int64(378),
#  'entitl updat': np.int64(622),
#  'updat latest': np.int64(2232),
#  'latest colour': np.int64(1133),
#  'free call': np.int64(745),
#  'call mobil': np.int64(267),
#  'mobil updat': np.int64(1337),
#  'updat co': np.int64(2231),
#  'co free': np.int64(379),
#  'gonna': np.int64(847),
#  'home': np.int64(960),
#  'soon': np.int64(1925),
#  'want': np.int64(2311),
#  'talk': np.int64(2064),
#  'stuff': np.int64(2013),
#  'anymor': np.int64(77),
#  'tonight': np.int64(2159),
#  'cri': np.int64(462),
#  'enough': np.int64(618),
#  'today': np.int64(2142),
#  'want talk': np.int64(2319),
#  'chanc': np.int64(328),
#  'cash': np.int64(309),
#  'pound': np.int64(1639),
#  'cost': np.int64(443),
#  'day': np.int64(498),
#  'repli': np.int64(1736),
#  'hl': np.int64(945),
#  'info': np.int64(1018),
#  'chanc win': np.int64(330),
#  'win cash': np.int64(2382),
#  'repli hl': np.int64(1738),
#  'hl info': np.int64(946),
#  'urgent': np.int64(2254),
#  'www': np.int64(2447),
#  'net': np.int64(1399),
#  'pobox': np.int64(1623),
#  'ldnw': np.int64(1144),
#  'week free': np.int64(2345),
#  'txt word': np.int64(2207),
#  'search': np.int64(1810),
#  'right': np.int64(1757),
#  'thank': np.int64(2098),
#  'promis': np.int64(1673),
#  'wont': np.int64(2415),
#  'take': np.int64(2061),
#  'help': np.int64(931),
#  'wonder': np.int64(2413),
#  'bless': np.int64(193),
#  'time': np.int64(2128),
#  'date': np.int64(496),
#  'sunday': np.int64(2031),
#  'use': np.int64(2263),
#  'credit': np.int64(460),
#  'click': np.int64(372),
#  'wap': np.int64(2320),
#  'link': np.int64(1180),
#  'next': np.int64(1414),
#  'messag': np.int64(1292),
#  'http': np.int64(985),
#  'com': np.int64(397),
#  'oh': np.int64(1472),
#  'watch': np.int64(2327),
#  'eh': np.int64(598),
#  'rememb': np.int64(1729),
#  'spell': np.int64(1948),
#  'name': np.int64(1386),
#  'ye': np.int64(2470),
#  'naughti': np.int64(1391),
#  'make': np.int64(1248),
#  'wet': np.int64(2364),
#  'fine': np.int64(706),
#  'way': np.int64(2330),
#  'feel': np.int64(683),
#  'england': np.int64(614),
#  'dont': np.int64(561),
#  'miss': np.int64(1315),
#  'goal': np.int64(839),
#  'team': np.int64(2073),
#  'news': np.int64(1413),
#  'ur': np.int64(2233),
#  'nation': np.int64(1388),
#  'eg': np.int64(596),
#  'tri': np.int64(2177),
#  'wq': np.int64(2436),
#  'dont miss': np.int64(563),
#  'serious': np.int64(1843),
#  'ha': np.int64(899),
#  'ha ha': np.int64(900),
#  'pay': np.int64(1549),
#  'first': np.int64(711),
#  'da': np.int64(483),
#  'stock': np.int64(1984),
#  'comin': np.int64(406),
#  'aft': np.int64(34),
#  'finish': np.int64(709),
#  'lunch': np.int64(1237),
#  'str': np.int64(2006),
#  'lor': np.int64(1208),
#  'ard': np.int64(94),
#  'smth': np.int64(1905),
#  'alright': np.int64(62),
#  'meet': np.int64(1282),
#  'forc': np.int64(730),
#  'eat': np.int64(592),
#  'realli': np.int64(1708),
#  'hungri': np.int64(990),
#  'tho': np.int64(2115),
#  'suck': np.int64(2023),
#  'mark': np.int64(1261),
#  'get': np.int64(799),
#  'worri': np.int64(2425),
#  'know': np.int64(1098),
#  'sick': np.int64(1869),
#  'turn': np.int64(2195),
#  'pizza': np.int64(1591),
#  'lol': np.int64(1198),
#  'alway': np.int64(65),
#  'convinc': np.int64(436),
#  'catch': np.int64(318),
#  'bu': np.int64(233),
#  'fri': np.int64(766),
#  'egg': np.int64(597),
#  'tea': np.int64(2071),
#  'mom': np.int64(1343),
#  'left': np.int64(1152),
#  'dinner': np.int64(543),
#  'love': np.int64(1214),
#  'amp': np.int64(68),
#  'pack': np.int64(1531),
#  'car': np.int64(300),
#  'let': np.int64(1159),
#  'room': np.int64(1767),
#  'let know': np.int64(1160),
#  'work': np.int64(2419),
#  'feel like': np.int64(686),
#  'wait': np.int64(2299),
#  'clear': np.int64(371),
#  'sure': np.int64(2043),
#  'us': np.int64(2262),
#  'yeah': np.int64(2475),
#  'till': np.int64(2127),
#  'cheer': np.int64(344),
#  'yeah got': np.int64(2476),
#  'tell': np.int64(2078),
#  'anyth': np.int64(79),
#  'tell anyth': np.int64(2079),
#  'quick': np.int64(1682),
#  'subscript': np.int64(2021),
#  'rington': np.int64(1760),
#  'uk': np.int64(2214),
#  'charg': np.int64(333),
#  'pleas': np.int64(1607),
#  'confirm': np.int64(419),
#  'mobil charg': np.int64(1330),
#  'repli ye': np.int64(1740),
#  'yup': np.int64(2497),
#  'look': np.int64(1204),
#  'msg': np.int64(1368),
#  'learn': np.int64(1146),
#  'nd': np.int64(1392),
#  'lesson': np.int64(1158),
#  'yup ok': np.int64(2498),
#  'ok go': np.int64(1478),
#  'go home': np.int64(829),
#  'oop': np.int64(1501),
#  'roommat': np.int64(1768),
#  'done': np.int64(560),
#  'see': np.int64(1818),
#  'letter': np.int64(1161),
#  'decid': np.int64(515),
#  'anyth lor': np.int64(80),
#  'hello': np.int64(930),
#  'saturday': np.int64(1794),
#  'tomo': np.int64(2150),
#  'invit': np.int64(1027),
#  'pl': np.int64(1592),
#  'ahead': np.int64(43),
#  'weekend': np.int64(2349),
#  'abiola': np.int64(2),
#  'great weekend': np.int64(873),
#  'forget': np.int64(732),
#  'need': np.int64(1396),
#  'crave': np.int64(457),
#  'sweet': np.int64(2052),
#  'want need': np.int64(2316),
#  'burn': np.int64(240),
#  'sm': np.int64(1899),
#  'nokia': np.int64(1428),
#  'camcord': np.int64(284),
#  'deliveri': np.int64(525),
#  'tomorrow': np.int64(2151),
#  'call repli': np.int64(272),
#  'free nokia': np.int64(755),
#  'nokia mobil': np.int64(1433),
#  'mobil free': np.int64(1334),
#  'free camcord': np.int64(746),
#  'pleas call': np.int64(1608),
#  'deliveri tomorrow': np.int64(526),
#  'hope': np.int64(964),
#  'man': np.int64(1253),
#  'well': np.int64(2356),
#  'lt': np.int64(1227),
#  'gt': np.int64(878),
#  'inch': np.int64(1013),
#  'lt gt': np.int64(1229),
#  'miss call': np.int64(1318),
#  'nigeria': np.int64(1419),
#  'tyler': np.int64(2211),
#  'could': np.int64(448),
#  'mayb': np.int64(1276),
#  'ask': np.int64(108),
#  'bit': np.int64(188),
#  'hospit': np.int64(971),
#  'kept': np.int64(1083),
#  'weak': np.int64(2336),
#  'want go': np.int64(2314),
#  'saw': np.int64(1799),
#  'class': np.int64(368),
#  'first time': np.int64(712),
#  'usual': np.int64(2266),
#  'run': np.int64(1777),
#  'half': np.int64(905),
#  'almost': np.int64(59),
#  'whole': np.int64(2372),
#  'second': np.int64(1813),
#  'like lt': np.int64(1174),
#  'fyi': np.int64(782),
#  'ride': np.int64(1756),
#  'morn': np.int64(1353),
#  'place': np.int64(1601),
#  'wow': np.int64(2433),
#  'never': np.int64(1405),
#  'realiz': np.int64(1707),
#  'embarass': np.int64(604),
#  'thought': np.int64(2117),
#  'sinc': np.int64(1878),
#  'best': np.int64(177),
#  'seem': np.int64(1823),
#  'happi': np.int64(912),
#  'sorri': np.int64(1926),
#  'give': np.int64(815),
#  'offer': np.int64(1466),
#  'ac': np.int64(5),
#  'new': np.int64(1406),
#  'red': np.int64(1717),
#  'play': np.int64(1604),
#  'ice': np.int64(997),
#  'correct': np.int64(442),
#  'end': np.int64(607),
#  'sm ac': np.int64(1900),
#  'yesterday': np.int64(2484),
#  'find': np.int64(701),
#  'congrat': np.int64(420),
#  'year': np.int64(2478),
#  'special': np.int64(1941),
#  'cinema': np.int64(356),
#  'pass': np.int64(1545),
#  'etc': np.int64(630),
#  'bx': np.int64(244),
#  'ip': np.int64(1031),
#  'pm': np.int64(1617),
#  'etc free': np.int64(631),
#  'bx ip': np.int64(245),
#  'ip pm': np.int64(1032),
#  'later': np.int64(1128),
#  'sorri call': np.int64(1927),
#  'call later': np.int64(265),
#  'later meet': np.int64(1129),
#  'reach': np.int64(1700),
#  'pick': np.int64(1581),
#  'move': np.int64(1360),
#  'pain': np.int64(1535),
#  'kill': np.int64(1091),
#  'way home': np.int64(2331),
#  'good': np.int64(849),
#  'girl': np.int64(813),
#  'situat': np.int64(1888),
#  'part': np.int64(1541),
#  'check': np.int64(343),
#  'iq': np.int64(1035),
#  'took': np.int64(2161),
#  'forev': np.int64(731),
#  'come': np.int64(400),
#  'ok come': np.int64(1477),
#  'doubl': np.int64(566),
#  'hair': np.int64(904),
#  'said': np.int64(1785),
#  'wun': np.int64(2446),
#  'cut': np.int64(480),
#  'short': np.int64(1862),
#  'nice': np.int64(1417),
#  'advis': np.int64(28),
#  'follow': np.int64(725),
#  'recent': np.int64(1715),
#  'review': np.int64(1754),
#  'mob': np.int64(1326),
#  'award': np.int64(132),
#  'bonu': np.int64(199),
#  'award bonu': np.int64(133),
#  'prize call': np.int64(1664),
#  'song': np.int64(1921),
#  'frnd': np.int64(774),
#  'rpli': np.int64(1773),
#  'send ur': np.int64(1837),
#  'complimentari': np.int64(412),
#  'trip': np.int64(2182),
#  'di': np.int64(533),
#  'ls': np.int64(1226),
#  'ur award': np.int64(2234),
#  'hear': np.int64(922),
#  'plane': np.int64(1603),
#  'lucki': np.int64(1234),
#  'save': np.int64(1798),
#  'money': np.int64(1347),
#  'hee': np.int64(927),
#  'hi': np.int64(937),
#  'babe': np.int64(140),
#  'im': np.int64(1005),
#  'wanna': np.int64(2310),
#  'someth': np.int64(1916),
#  'xx': np.int64(2458),
#  'hi babe': np.int64(938),
#  'call free': np.int64(259),
#  'that': np.int64(2103),
#  'cool': np.int64(438),
#  'respect': np.int64(1745),
#  'that cool': np.int64(2104),
#  'peopl': np.int64(1554),
#  'much': np.int64(1374),
#  'pa': np.int64(1529),
#  'oper': np.int64(1503),
#  'job': np.int64(1060),
#  'ta': np.int64(2059),
#  'ah': np.int64(41),
#  'hi hi': np.int64(940),
#  'stop': np.int64(1988),
#  'urgnt': np.int64(2260),
#  'real': np.int64(1705),
#  'yo': np.int64(2490),
#  'ticket': np.int64(2125),
#  'one': np.int64(1493),
#  'start': np.int64(1971),
#  'came': np.int64(286),
#  'bed': np.int64(167),
#  'coin': np.int64(386),
#  'gotta': np.int64(868),
#  'kano': np.int64(1075),
#  'il': np.int64(1003),
#  'download': np.int64(568),
#  'wen': np.int64(2359),
#  'wen ur': np.int64(2360),
#  'stand': np.int64(1967),
#  'close': np.int64(374),
#  'anoth': np.int64(73),
#  'night': np.int64(1420),
#  'spent': np.int64(1950),
#  'late': np.int64(1127),
#  'afternoon': np.int64(36),
#  'mean': np.int64(1278),
#  'moro': np.int64(1356),
#  'includ': np.int64(1014),
#  'smile': np.int64(1902),
#  'pleasur': np.int64(1611),
#  'troubl': np.int64(2183),
#  'rain': np.int64(1687),
#  'sum': np.int64(2028),
#  'hurt': np.int64(993),
#  'becoz': np.int64(166),
#  'someon': np.int64(1914),
#  'servic': np.int64(1844),
#  'repres': np.int64(1742),
#  'guarante': np.int64(887),
#  'call custom': np.int64(258),
#  'custom servic': np.int64(479),
#  'servic repres': np.int64(1846),
#  'repres pm': np.int64(1743),
#  'pm guarante': np.int64(1619),
#  'guarante cash': np.int64(889),
#  'cash prize': np.int64(315),
#  'havent': np.int64(920),
#  'plan': np.int64(1602),
#  'buy': np.int64(242),
#  'show': np.int64(1863),
#  'collect': np.int64(390),
#  'simpli': np.int64(1877),
#  'password': np.int64(1547),
#  'verifi': np.int64(2274),
#  'fml': np.int64(724),
#  'po': np.int64(1621),
#  'box': np.int64(213),
#  'free rington': np.int64(758),
#  'po box': np.int64(1622),
#  'box mk': np.int64(215),
#  'movi': np.int64(1361),
#  'abt': np.int64(4),
#  'wat abt': np.int64(2324),
#  'load': np.int64(1189),
#  'loan': np.int64(1190),
#  'wk': np.int64(2402),
#  'hol': np.int64(952),
#  'forgot': np.int64(733),
#  'appoint': np.int64(89),
#  'shower': np.int64(1866),
#  'caus': np.int64(319),
#  'prob': np.int64(1668),
#  'need get': np.int64(1397),
#  'get home': np.int64(802),
#  'coffe': np.int64(385),
#  'noth': np.int64(1447),
#  'els': np.int64(601),
#  'okay': np.int64(1487),
#  'price': np.int64(1657),
#  'long': np.int64(1201),
#  'ave': np.int64(127),
#  'gone': np.int64(846),
#  'drive': np.int64(576),
#  'test': np.int64(2087),
#  'yet': np.int64(2485),
#  'guess': np.int64(895),
#  'gave': np.int64(789),
#  'boston': np.int64(208),
#  'men': np.int64(1289),
#  'chang': np.int64(331),
#  'locat': np.int64(1192),
#  'nyc': np.int64(1462),
#  'cuz': np.int64(482),
#  'page': np.int64(1533),
#  'umma': np.int64(2216),
#  'life': np.int64(1166),
#  'lot': np.int64(1212),
#  'dear': np.int64(511),
#  'love lot': np.int64(1219),
#  'wish': np.int64(2392),
#  'birthday': np.int64(186),
#  'truli': np.int64(2185),
#  'aight': np.int64(45),
#  'hit': np.int64(943),
#  'would': np.int64(2431),
#  'address': np.int64(21),
#  'consid': np.int64(425),
#  'comput': np.int64(413),
#  'old': np.int64(1490),
#  'better': np.int64(179),
#  'lie': np.int64(1165),
#  'busi': np.int64(241),
#  'go dinner': np.int64(827),
#  'thing': np.int64(2107),
#  'scare': np.int64(1803),
#  'mah': np.int64(1244),
#  'contact': np.int64(426),
#  'last': np.int64(1122),
#  'draw': np.int64(569),
#  'hr': np.int64(983),
#  'ppm': np.int64(1644),
#  'tri contact': np.int64(2178),
#  'contact last': np.int64(430),
#  'last weekend': np.int64(1126),
#  'weekend draw': np.int64(2350),
#  'draw show': np.int64(571),
#  'show prize': np.int64(1864),
#  'prize guarante': np.int64(1666),
#  'guarante call': np.int64(888),
#  'code valid': np.int64(384),
#  'valid hr': np.int64(2271),
#  'hr ppm': np.int64(984),
#  'wa': np.int64(2298),
#  'anyway': np.int64(83),
#  'juz': np.int64(1073),
#  'tt': np.int64(2191),
#  'eatin': np.int64(593),
#  'weight': np.int64(2354),
#  'haha': np.int64(902),
#  'happen': np.int64(911),
#  'enter': np.int64(619),
#  'cabin': np.int64(247),
#  'boss': np.int64(206),
#  'felt': np.int64(688),
#  'askd': np.int64(111),
#  'apart': np.int64(85),
#  'went': np.int64(2361),
#  'enter cabin': np.int64(620),
#  'cabin pa': np.int64(248),
#  'pa said': np.int64(1530),
#  'said happi': np.int64(1786),
#  'happi day': np.int64(914),
#  'day boss': np.int64(500),
#  'boss felt': np.int64(207),
#  'felt special': np.int64(689),
#  'special askd': np.int64(1942),
#  'askd lunch': np.int64(112),
#  'lunch lunch': np.int64(1239),
#  'lunch invit': np.int64(1238),
#  'invit apart': np.int64(1028),
#  'apart went': np.int64(86),
#  'holiday': np.int64(956),
#  'flight': np.int64(720),
#  'inc': np.int64(1012),
#  'min': np.int64(1299),
#  'winner special': np.int64(2391),
#  'special select': np.int64(1945),
#  'select receiv': np.int64(1826),
#  'flight inc': np.int64(721),
#  'speak live': np.int64(1939),
#  'live oper': np.int64(1187),
#  'oper claim': np.int64(1504),
#  'must': np.int64(1381),
#  'friday': np.int64(767),
#  'hmm': np.int64(948),
#  'uncl': np.int64(2220),
#  'inform': np.int64(1019),
#  'school': np.int64(1805),
#  'directli': np.int64(546),
#  'food': np.int64(727),
#  'privat': np.int64(1661),
#  'account': np.int64(9),
#  'statement': np.int64(1974),
#  'identifi': np.int64(1000),
#  'expir': np.int64(657),
#  'privat account': np.int64(1662),
#  'account statement': np.int64(10),
#  'statement show': np.int64(1975),
#  'call identifi': np.int64(261),
#  'identifi code': np.int64(1001),
#  'code expir': np.int64(382),
#  'landlin': np.int64(1114),
#  'urgent mobil': np.int64(2257),
#  'mobil award': np.int64(1329),
#  'bonu caller': np.int64(200),
#  'caller prize': np.int64(281),
#  'contact call': np.int64(427),
#  'call landlin': np.int64(264),
#  'voda': np.int64(2289),
#  'number': np.int64(1455),
#  'match': np.int64(1264),
#  'quot': np.int64(1685),
#  'standard': np.int64(1968),
#  'app': np.int64(87),
#  'today voda': np.int64(2147),
#  'voda number': np.int64(2290),
#  'number end': np.int64(1457),
#  'end select': np.int64(609),
#  'receiv award': np.int64(1713),
#  'match pleas': np.int64(1265),
#  'call quot': np.int64(271),
#  'quot claim': np.int64(1686),
#  'code standard': np.int64(383),
#  'standard rate': np.int64(1969),
#  'mu': np.int64(1373),
#  'predict': np.int64(1650),
#  'wat time': np.int64(2325),
#  'time finish': np.int64(2130),
#  'yetund': np.int64(2487),
#  'sent': np.int64(1839),
#  'bother': np.int64(209),
#  'del': np.int64(521),
#  'bak': np.int64(148),
#  'long time': np.int64(1202),
#  'give call': np.int64(816),
#  'dear call': np.int64(512),
#  'answer': np.int64(75),
#  'sunshin': np.int64(2035),
#  'quiz': np.int64(1684),
#  'top': np.int64(2162),
#  'soni': np.int64(1922),
#  'dvd': np.int64(586),
#  'player': np.int64(1606),
#  'countri': np.int64(450),
#  'sp': np.int64(1935),
#  'sunshin quiz': np.int64(2036),
#  'wkli win': np.int64(2409),
#  'win top': np.int64(2385),
#  'soni dvd': np.int64(1923),
#  'know countri': np.int64(1100),
#  'sp tyron': np.int64(1936),
#  'laid': np.int64(1110),
#  'dog': np.int64(557),
#  'direct': np.int64(545),
#  'join': np.int64(1063),
#  'largest': np.int64(1120),
#  'bt': np.int64(230),
#  'txting': np.int64(2209),
#  'nt': np.int64(1449),
#  'ec': np.int64(594),
#  'want get': np.int64(2313),
#  'sent direct': np.int64(1840),
#  'ur mob': np.int64(2245),
#  'join uk': np.int64(1064),
#  'largest dog': np.int64(1121),
#  'haf': np.int64(901),
#  'yiju': np.int64(2488),
#  'befor': np.int64(171),
#  'activ': np.int64(16),
#  'chat': np.int64(338),
#  'age': np.int64(39),
#  'yr': np.int64(2494),
#  'go get': np.int64(828),
#  'lazi': np.int64(1140),
#  'type': np.int64(2212),
#  'lect': np.int64(1150),
#  'sir': np.int64(1883),
#  'mail': np.int64(1246),
#  'sir wait': np.int64(1884),
#  'swt': np.int64(2056),
#  'tire': np.int64(2134),
#  'littl': np.int64(1184),
#  'lovabl': np.int64(1213),
#  'person': np.int64(1565),
#  'coz': np.int64(454),
#  'heart': np.int64(924),
#  'gud': np.int64(890),
#  'ni': np.int64(1416),
#  'gud ni': np.int64(893),
#  'open': np.int64(1502),
#  'ya': np.int64(2463),
#  'ye see': np.int64(2474),
#  'see ya': np.int64(1822),
#  'what': np.int64(2365),
#  'sexi': np.int64(1851),
#  'local': np.int64(1191),
#  'luv': np.int64(1240),
#  'ltd': np.int64(1231),
#  'per msg': np.int64(1559),
#  'repli stop': np.int64(1739),
#  'stop end': np.int64(1991),
#  'begin': np.int64(173),
#  'qatar': np.int64(1679),
#  'pray': np.int64(1649),
#  'hard': np.int64(916),
#  'delet': np.int64(523),
#  'got job': np.int64(864),
#  'wine': np.int64(2389),
#  'thk': np.int64(2113),
#  'need go': np.int64(1398),
#  'window': np.int64(2388),
#  'shirt': np.int64(1858),
#  'sometim': np.int64(1919),
#  'dream': np.int64(573),
#  'without': np.int64(2400),
#  'joy': np.int64(1067),
#  'tv': np.int64(2196),
#  'becom': np.int64(165),
#  'leav': np.int64(1148),
#  'hous': np.int64(977),
#  'leav hous': np.int64(1149),
#  'interview': np.int64(1025),
#  'boy': np.int64(218),
#  'arrang': np.int64(102),
#  'new year': np.int64(1412),
#  'receiv cash': np.int64(1714),
#  'cash holiday': np.int64(314),
#  'keep': np.int64(1078),
#  'safe': np.int64(1784),
#  'everyon': np.int64(643),
#  'miss alreadi': np.int64(1316),
#  'parent': np.int64(1537),
#  'hand': np.int64(908),
#  'new job': np.int64(1407),
#  'excit': np.int64(651),
#  'spend': np.int64(1949),
#  'invit friend': np.int64(1029),
#  'friend repli': np.int64(770),
#  'see www': np.int64(1821),
#  'www sm': np.int64(2451),
#  'stop send': np.int64(1996),
#  'send stop': np.int64(1836),
#  'stop frnd': np.int64(1993),
#  'order': np.int64(1517),
#  'content': np.int64(432),
#  'goto': np.int64(867),
#  'internet': np.int64(1024),
#  'menu': np.int64(1290),
#  'ur mobil': np.int64(2246),
#  'urgent tri': np.int64(2259),
#  'avoid': np.int64(128),
#  'wit': np.int64(2397),
#  'escap': np.int64(628),
#  'fanci': np.int64(671),
#  'complet': np.int64(411),
#  'form': np.int64(735),
#  'also': np.int64(64),
#  'wast': np.int64(2322),
#  'bank': np.int64(151),
#  'hmmm': np.int64(949),
#  'hop': np.int64(963),
#  'muz': np.int64(1382),
#  'discuss': np.int64(550),
#  'liao': np.int64(1162),
#  'time come': np.int64(2129),
#  'hell': np.int64(928),
#  'cant': np.int64(296),
#  'believ': np.int64(175),
#  'mr': np.int64(1364),
#  'ill': np.int64(1004),
#  'bath': np.int64(155),
#  'carlo': np.int64(306),
#  'got money': np.int64(865),
#  'stay': np.int64(1977),
#  'til': np.int64(2126),
#  'smoke': np.int64(1903),
#  'worth': np.int64(2428),
#  'doesnt': np.int64(556),
#  'log': np.int64(1193),
#  'spoke': np.int64(1954),
#  'satisfi': np.int64(1793),
#  'experi': np.int64(656),
#  'lift': np.int64(1167),
#  'especi': np.int64(629),
#  'studi': np.int64(2012),
#  'gr': np.int64(869),
#  'trust': np.int64(2186),
#  'guy': np.int64(897),
#  'bye': np.int64(246),
#  'toward': np.int64(2169),
#  'boytoy': np.int64(219),
#  'get back': np.int64(800),
#  'awesom': np.int64(139),
#  'minut': np.int64(1309),
#  'freephon': np.int64(763),
#  'freephon pm': np.int64(764),
#  'xma': np.int64(2457),
#  'ju': np.int64(1070),
#  'si': np.int64(1868),
#  'reach home': np.int64(1702),
#  'th': np.int64(2096),
#  'co uk': np.int64(380),
#  'touch': np.int64(2166),
#  'deal': np.int64(510),
#  'keep touch': np.int64(1082),
#  'cours': np.int64(452),
#  'howev': np.int64(980),
#  'suggest': np.int64(2025),
#  'abl': np.int64(3),
#  'everi': np.int64(637),
#  'settl': np.int64(1849),
#  'gr day': np.int64(870),
#  'mrng': np.int64(1365),
#  'hav': np.int64(919),
#  'gud mrng': np.int64(892),
#  'nice day': np.int64(1418),
#  'stori': np.int64(2004),
#  'person stori': np.int64(1568),
#  'dead': np.int64(509),
#  'tmr': np.int64(2137),
#  'orchard': np.int64(1516),
#  'mrt': np.int64(1366),
#  'kate': np.int64(1076),
#  'see tomorrow': np.int64(1820),
#  'found': np.int64(739),
#  'buck': np.int64(235),
#  'gt buck': np.int64(879),
#  'darlin': np.int64(492),
#  'ive': np.int64(1042),
#  'colleg': np.int64(393),
#  'success': np.int64(2022),
#  'decim': np.int64(516),
#  'balanc': np.int64(149),
#  'rs': np.int64(1774),
#  'transact': np.int64(2173),
#  'id': np.int64(998),
#  'lt decim': np.int64(1228),
#  'decim gt': np.int64(517),
#  'rs lt': np.int64(1775),
#  'goodmorn': np.int64(858),
#  'sleep': np.int64(1894),
#  'ga': np.int64(783),
#  'dat': np.int64(495),
#  'oso': np.int64(1523),
#  'cannot': np.int64(295),
#  'oredi': np.int64(1519),
#  'like dat': np.int64(1172),
#  'straight': np.int64(2007),
#  'connect': np.int64(424),
#  'bill': np.int64(183),
#  'give us': np.int64(817),
#  'big': np.int64(182),
#  'readi': np.int64(1704),
#  'get readi': np.int64(803),
#  'break': np.int64(222),
#  'semest': np.int64(1829),
#  'hope great': np.int64(967),
#  'noe': np.int64(1427),
#  'leh': np.int64(1153),
#  'sound': np.int64(1930),
#  'head': np.int64(921),
#  'slept': np.int64(1896),
#  'past': np.int64(1548),
#  'easi': np.int64(590),
#  'sen': np.int64(1830),
#  'exam': np.int64(648),
#  'march': np.int64(1260),
#  'atm': np.int64(118),
#  'regist': np.int64(1723),
#  'ok prob': np.int64(1481),
#  'import': np.int64(1010),
#  'file': np.int64(695),
#  'system': np.int64(2058),
#  'shop': np.int64(1860),
#  'romant': np.int64(1766),
#  'nite': np.int64(1425),
#  'tc': np.int64(2069),
#  'biz': np.int64(190),
#  'optout': np.int64(1511),
#  'gbp': np.int64(793),
#  'mtmsg': np.int64(1372),
#  'new mobil': np.int64(1408),
#  'txt nokia': np.int64(2203),
#  'appreci': np.int64(90),
#  'partner': np.int64(1544),
#  'career': np.int64(305),
#  'star': np.int64(1970),
#  'sign': np.int64(1873),
#  'compani': np.int64(409),
#  'bcoz': np.int64(159),
#  'teach': np.int64(2072),
#  'good morn': np.int64(855),
#  'walk': np.int64(2303),
#  'cross': np.int64(463),
#  'road': np.int64(1763),
#  'side': np.int64(1870),
#  'street': np.int64(2008),
#  'batteri': np.int64(156),
#  'die': np.int64(536),
#  'work place': np.int64(2421),
#  'flirt': np.int64(722),
#  'sam': np.int64(1791),
#  'print': np.int64(1660),
#  'bu stop': np.int64(234),
#  'wil': np.int64(2379),
#  'argument': np.int64(97),
#  'lose': np.int64(1210),
#  'argu': np.int64(96),
#  'kick': np.int64(1088),
#  'person dont': np.int64(1566),
#  'ur friend': np.int64(2237),
#  'secret': np.int64(1814),
#  'admir': np.int64(22),
#  'reveal': np.int64(1752),
#  'secret admir': np.int64(1815),
#  'admir look': np.int64(23),
#  'look make': np.int64(1206),
#  'make contact': np.int64(1249),
#  'contact find': np.int64(429),
#  'find reveal': np.int64(704),
#  'reveal think': np.int64(1753),
#  'think ur': np.int64(2111),
#  'ur special': np.int64(2250),
#  'special call': np.int64(1943),
#  'laptop': np.int64(1118),
#  'case': np.int64(308),
#  'tel': np.int64(2077),
#  'meant': np.int64(1279),
#  'told': np.int64(2149),
#  'face': np.int64(664),
#  'fr': np.int64(740),
#  'thanx': np.int64(2102),
#  'everyth': np.int64(644),
#  'websit': np.int64(2339),
#  'kalli': np.int64(1074),
#  'didnt': np.int64(534),
#  'goodnight': np.int64(860),
#  'fix': np.int64(714),
#  'wake': np.int64(2302),
#  'oh ok': np.int64(1474),
#  'good night': np.int64(856),
#  'congratul': np.int64(422),
#  'cd': np.int64(322),
#  'voucher': np.int64(2295),
#  'gift': np.int64(810),
#  'music': np.int64(1378),
#  'tnc': np.int64(2138),
#  'ldew': np.int64(1141),
#  'ppmx': np.int64(1645),
#  'congratul ur': np.int64(423),
#  'gift guarante': np.int64(811),
#  'wkli draw': np.int64(2408),
#  'draw txt': np.int64(572),
#  'txt music': np.int64(2202),
#  'music tnc': np.int64(1380),
#  'tnc www': np.int64(2139),
#  'www ldew': np.int64(2450),
#  'ldew com': np.int64(1142),
#  'com win': np.int64(399),
#  'win ppmx': np.int64(2384),
#  'ppmx age': np.int64(1646),
#  'cal': np.int64(252),
#  'hold': np.int64(953),
#  'bcum': np.int64(160),
#  'angri': np.int64(71),
#  'wid': np.int64(2373),
#  'dnt': np.int64(553),
#  'childish': np.int64(348),
#  'true': np.int64(2184),
#  'deep': np.int64(519),
#  'affect': np.int64(31),
#  ...}

# Output feature: one-hot encode the label column, giving one boolean
# indicator column per class.
y = pd.get_dummies(messages.loc[:, 'label'])
# Display the encoded labels.
y
# | Index | ham   | spam  |
# | ----- | ----- | ----- |
# | 0     | True  | False |
# | 1     | True  | False |
# | 2     | False | True  |
# | 3     | True  | False |
# | 4     | True  | False |
# | ...   | ...   | ...   |
# | 5567  | False | True  |
# | 5568  | True  | False |
# | 5569  | True  | False |
# | 5570  | True  | False |
# | 5571  | True  | False |

# Keep only the first dummy column, which corresponds to the 'ham' label, and
# convert it to a NumPy array. The result is a boolean vector in which True
# means the message is 'ham' and False means it is 'spam'.
first_column = y.iloc[:, 0]
y = first_column.values
# array([ True,  True, False,  True,  True, False,  True,  True, False, False,  True, False, False,  True,  True, False,  True,  True,  True, False,  True,  True,  True,  True,  True,  True,  True,  True,  True,  True, ...,  True,  True,  True,  True,  True, False,  True,  True,  True,  True,  True,  True,  True,  True,  True,  True,  True,  True,  True,  True,  True,  True,  True,  True, False, False,  True,  True,  True,  True], shape=(5572,))

# Splitting Data into Training and Testing Sets
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Hold out 20% of the samples for testing; the fixed random_state makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Training the Model (fit() returns the fitted estimator itself)
spam_detect_Model = MultinomialNB().fit(X_train, y_train)

# Making Predictions on the held-out messages
y_pred = spam_detect_Model.predict(X_test)
y_pred
# array([ True,  True,  True,  True,  True,  True,  True,  True,  True,  True,  True,  True,  True,  True, False,  True,  True, False,  True,  True,  True, False,  True,  True, False, False,  True,  True,  True,  True, ...,  True, False,  True,  True,  True,  True, False,  True,  True, False,  True,  True,  True,  True,  True,  True,  True, False,  True,  True, False,  True, False,  True,  True,  True,  True,  True,  True,  True], shape=(1115,))
y.shape
# (5572,)

# Evaluate the fitted model on the held-out test set
from sklearn.metrics import accuracy_score, classification_report

# Fraction of test messages whose predicted label equals the true label
accuracy_score(y_test, y_pred)
# 0.9865470852017937

# Per-class precision, recall, F1-score and support
print(
    "Classification report",
    classification_report(y_test, y_pred),
)

# 📊 Classification Report
# | Label     | precision | recall | f1-score | support |
# | --------- | --------- | ------ | -------- | ------- |
# | **False** | 0.96      | 0.94   | 0.95     | 149     |
# | **True**  | 0.99      | 0.99   | 0.99     | 966     |

# 🧮 Overall Metrics
# | Metric           | Score                                         |
# | ---------------- | --------------------------------------------- |
# | **accuracy**     | 0.99                                          |
# | **macro avg**    | Precision: 0.97, Recall: 0.97, F1-Score: 0.97 |
# | **weighted avg** | Precision: 0.99, Recall: 0.99, F1-Score: 0.99 |

# TF-IDF features: re-vectorize the cleaned corpus with a vocabulary capped at
# 1000 terms, built from unigrams and bigrams.
# NOTE(review): the vectorizer is fit on the entire corpus, i.e. before the
# train/test split that follows, so the vocabulary and IDF weights are computed
# with the test messages included. Fit on the training portion only if an
# unbiased estimate is required.
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=1000,
    lowercase=True,
    stop_words='english',
)
# Learn the vocabulary, transform the corpus, and convert the result to a
# dense array
X = tfidf.fit_transform(corpus).toarray()
X
# array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0.355, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
#        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
#        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0.251, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
#        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.376, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
#        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
#        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0.318, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.346, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
#        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
#        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
#        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
#        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
#        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
#        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
#        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0.31, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.281, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
#        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0.266, 0.273, 0.247, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
#        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
#        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
#        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
#        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.434, 0, 0, 0, 0, 0, 0, 0],
#        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
#        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.306, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
#        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
#        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
#        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
#        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.29, 0, 0, 0, 0, 0, 0, 0, 0.224, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
#        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.677, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
#        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0.32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
#        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.723, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
#        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
#        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.404, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
#        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0.351, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
#        ...,
#        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
#        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.494, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
#        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.478, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
#        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
#        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
#        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.211, 0, 0, 0, 0, 0, 0, 0],
#        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
#        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.542, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
#        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
#        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
#        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
#        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
#        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
#        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0.198, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
#        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.609, 0, 0, 0, 0, 0, 0, 0],
#        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.211, 0, 0, 0, 0, 0],
#        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
#        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
#        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
#        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
#        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
#        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
#        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
#        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
#        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
#        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
#        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
#        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
#        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
#        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], shape=(5572, 1000))

# Imports used by the TF-IDF experiment: splitter, classifier and metrics module
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# 80/20 train/test split of the TF-IDF features (same seed as the earlier split)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.20,
)

# Fit a Multinomial Naive Bayes model on the TF-IDF training data
spam_tfidf_Model = MultinomialNB()
spam_tfidf_Model.fit(X_train, y_train)

# Predict labels for the held-out messages
y_predict = spam_tfidf_Model.predict(X_test)

# Overall accuracy on the test set
print("Accuracy:", metrics.accuracy_score(y_test, y_predict))
#Accuracy: 0.97847533632287
# Per-class metrics; classification_report is imported earlier in the file
print(
    "classification Report",
    classification_report(y_test, y_predict),
)
#                Classification Report
# -------------------------------------------------
#               | precision | recall | f1-score | support
# -------------------------------------------------
#      False    |   0.96    |  0.87  |   0.92   |   149
#      True     |   0.98    |  0.99  |   0.99   |   966
# -------------------------------------------------
#   accuracy    |                   0.98         |  1115
# macro average |   0.97    |  0.93  |   0.95   |  1115
# weighted avg  |   0.98    |  0.98  |   0.98   |  1115





[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sahus\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Classification report               precision    recall  f1-score   support

       False       0.96      0.94      0.95       149
        True       0.99      0.99      0.99       966

    accuracy                           0.99      1115
   macro avg       0.97      0.97      0.97      1115
weighted avg       0.99      0.99      0.99      1115

Accuracy: 0.97847533632287
classification Report               precision    recall  f1-score   support

       False       0.96      0.87      0.92       149
        True       0.98      0.99      0.99       966

    accuracy                           0.98      1115
   macro avg       0.97      0.93      0.95      1115
weighted avg       0.98      0.98      0.98      1115

