In [2]:
!pip install gensim

Collecting gensim
  Using cached gensim-4.3.3-cp310-cp310-win_amd64.whl.metadata (8.2 kB)
Using cached gensim-4.3.3-cp310-cp310-win_amd64.whl (24.0 MB)
Installing collected packages: gensim
Successfully installed gensim-4.3.3




In [None]:
#⚙️ Installing Required Libraries
%pip install gensim


# 📦 Import Libraries
 
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from nltk.tokenize import sent_tokenize
from nltk.stem import WordNetLemmatizer
import re
import nltk
import numpy as np
from tqdm import tqdm

# 📌 Make sure to download NLTK data:

nltk.download('punkt')
nltk.download('wordnet')

# ✏️ Preprocessing the Text
# ✔️ Use Lemmatization instead of Stemming
# ❌ Don’t remove stop words (to test vector generation for every word)

 
lemmatizer = WordNetLemmatizer()
corpus = []

# Iterate over the actual message texts, not the DataFrame itself
for msg in messages['message']:
    msg = re.sub('[^a-zA-Z]', ' ', msg)
    msg = msg.lower()
    msg = msg.split()
    msg = [lemmatizer.lemmatize(word) for word in msg]
    corpus.append(' '.join(msg))

print(corpus[:2])  # Show first 2 preprocessed messages

# 🧹 Tokenize Sentences and Words Using simple_preprocess
# 📌 simple_preprocess() converts text to lowercase, removes short/long words

# 🔄 Loop over sentences and tokenize them

 
words = []
for sentence in corpus:
    for sent in sent_tokenize(sentence):
        words.append(simple_preprocess(sent))
print(words) # [['label'], ['message']]

# 🧠 Train Word2Vec from Scratch
 
model = Word2Vec(
    sentences=words,
    vector_size=100,  # each word becomes a 100-dimensional vector
    window=5,
    min_count=1
)
#gives all the words (vocabulary)
print(model.wv.key_to_index) # {'label': 0, 'message': 1}
# gives the index of each word in the vocabulary
print(model.wv.index_to_key) # ['label', 'message']

# 📌 Check vocabulary size:
print(model.corpus_count)
# 2


# 📌 Vector for word “good”:
# Print vector for a word that exists in the vocabulary, e.g., 'go'
print(model.wv['go'])  # vector for 'go'
# Example output: array([...])

# 🧮 Average Word2Vec Function
def avg_word2vec(doc, model):
    # Get all word vectors for words in vocab
    vectors = [model.wv[word] for word in doc if word in model.wv.index_to_key]
    if len(vectors) > 0:
        return np.mean(vectors, axis=0)
    else:
        # Return a zero vector if no known words
        return np.zeros(model.vector_size)

# ⚙️ Convert All Sentences to Vectors
X = []

for sentence in tqdm(words):
    X.append(avg_word2vec(sentence, model))

# 🔎 Now X is a list of 100-dimensional sentence vectors
X_new = np.vstack(X)
print(X_new.shape)  # Output: (number_of_sentences, 100)

# 🔢 First sentence vector:

print(X_new[0])
print(X_new[0].shape)  # Output: (100,)

# 🎯 Target Variable (Output / Labels)
# 📌 Assume y contains the output labels like 1 for spam and 0 for ham

# Example dataset
y = messages['label']  # Example: 'spam' or 'ham'
# Align y with X_new
y = y.iloc[:len(X_new)].reset_index(drop=True)
y
# 0        ham
# 1        ham
# 2       spam
# 3        ham
# 4        ham
#         ... 
# 5567    spam
# 5568     ham
# 5569     ham
# 5570     ham
# 5571     ham
# Name: label, Length: 5572, dtype: object

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# 🧪 Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.2, random_state=42)

# 🤖 Train Logistic Regression
model_lr = LogisticRegression()
model_lr.fit(X_train, y_train)

# 📈 Predict
y_pred = model_lr.predict(X_test)

# ✅ Accuracy
print("Accuracy:", accuracy_score(y_test, y_pred))
# Accuracy: 0.8608617594254937

Note: you may need to restart the kernel to use updated packages.


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sahus\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sahus\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


['go until jurong point crazy available only in bugis n great world la e buffet cine there got amore wat', 'ok lar joking wif u oni']
5569
[-0.3540085   0.5668885   0.33319598  0.10853042  0.34147796 -1.4479635
  0.48437807  1.44431    -0.5179593  -0.30191317 -0.5050502  -1.2364985
 -0.25691894  0.24994606  0.29288828 -0.4270051   0.03475523 -0.96510524
  0.00510019 -1.3603725   0.85502625  0.36961496  0.6264754  -0.45740116
 -0.06712193  0.26959568 -0.6210778  -0.2962576  -0.6169995  -0.00247167
  0.78616357 -0.03235633  0.32163328 -0.79864216 -0.2240598   0.70303226
 -0.03828476 -0.4573394  -0.48751938 -1.071856    0.28908253 -0.5087238
 -0.8167884   0.24906133  0.38966262 -0.2133913  -0.45362142 -0.19963275
  0.30879605  0.48122925  0.4595577  -0.67327994 -0.20983392 -0.14575629
 -0.46166423  0.2643227   0.29825243 -0.11015698 -0.7841294   0.37227923
  0.1949245   0.0176666   0.01456662  0.00322007 -0.5850657   0.82764894
  0.27049577  0.77612925 -1.0517025   1.0160667  -0.70023674 

100%|██████████| 5569/5569 [00:01<00:00, 5257.67it/s]

(5569, 100)
[-0.15430196  0.26905397  0.14129858  0.05302018  0.14662583 -0.64586079
  0.21779943  0.64853978 -0.23578474 -0.13678972 -0.2249057  -0.56048787
 -0.11315049  0.11277078  0.13334414 -0.19710308 -0.00232021 -0.43405256
  0.00383473 -0.60233235  0.39621189  0.17002256  0.27098566 -0.2017846
 -0.03338306  0.11456402 -0.26844823 -0.12848336 -0.27415302 -0.00979146
  0.34254381 -0.0135604   0.14537956 -0.36166683 -0.09539386  0.32493353
 -0.00500245 -0.2001639  -0.23147948 -0.4755463   0.11162233 -0.22452582
 -0.37382326  0.10077342  0.18028516 -0.094125   -0.20822495 -0.08312199
  0.14331204  0.21980548  0.22133939 -0.30393863 -0.1081342  -0.08570249
 -0.2050605   0.1183987   0.1375836  -0.05024294 -0.35131371  0.15802689
  0.07201726  0.01267604 -0.0037381  -0.02053454 -0.25956768  0.36930576
  0.12347971  0.3593353  -0.47746897  0.44457749 -0.3069551   0.23310536
  0.31084874 -0.17580107  0.41486934  0.09742448 -0.11419889 -0.04557159
 -0.10784844  0.02171759 -0.24834315 -0.


