In [None]:
import os
# Show the current working directory; the dataset is loaded further down through a path relative to it.
os.getcwd()

### Gensim
1. It is a Python library that provides ready-to-use implementations of models such as Word2Vec — you don’t have to build the neural network from scratch.
2. We can try pre-trained models such as Word2Vec, GloVe, or FastText using `gensim.downloader.load()` (imported below as `api`, so the call is `api.load()`).
3. If we want to train our own embeddings we can do that - from gensim.models import Word2Vec
4. There are different word2vec models. If we want to create our own embedding model we need to import the Word2Vec class.

In [None]:
pip install gensim

#### Let's see how we can load the google-news word2vec model

In [None]:
import gensim
from gensim.models import Word2Vec, KeyedVectors

In [None]:
import gensim.downloader as api
model = api.load("word2vec-google-news-300")

In [None]:
# Let's see how the model give an output for a word
# Google news word2vec model has 300 dimensions and it is trained on 3 million words/phrases. 300 dimensions means each word is represented as a vector of 300 elements.
# Let's see the vector for the word 'king'
vector = model['king']
print(vector)

In [None]:
vector_length = len(vector)
print(f"Length of the vector for the word 'king': {vector_length}")

#### Let's create our own Word2Vec model.

In [None]:
# Load the tab-separated SMS dataset from the data folder; the column names are supplied here.
import pandas as pd

file_path = os.path.join('..', 'data', 'SMSSpamCollection.txt')
messages = pd.read_csv(
    file_path,
    sep='\t',
    names=['label', 'message'],
)

In [None]:
# Display the loaded DataFrame
messages

In [None]:
# (number of rows, number of columns)
messages.shape

In [None]:
# First rows of the DataFrame
messages.head()

In [None]:
# Lets do the data cleaning and preprocessing
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
# Lemmatizer used by the cleaning loop in the next cell.
# NOTE(review): WordNetLemmatizer presumably needs the NLTK 'wordnet' data to be downloaded first — confirm on a fresh environment.
lemmatizer=WordNetLemmatizer()
# NOTE(review): `ps` (and the `stopwords` import) are not used in the visible cells.
ps = PorterStemmer()

In [None]:
# Build `corpus`: one cleaned *string* per message.
# String sentences are what CountVectorizer / TfidfVectorizer expect; Word2Vec needs lists of
# tokens instead, which are produced from `corpus` in a later cell.
corpus = []
# Iterate over the column values directly instead of looking them up by label
# (`messages['message'][i]`), so the loop does not depend on a default 0..n-1 index.
for message in messages['message']:
    # [^a-zA-Z] matches anything that is NOT a letter, so digits, punctuation and symbols are
    # replaced by spaces. Use [^a-zA-Z0-9] instead if numbers should be kept.
    letters_only = re.sub('[^a-zA-Z]', ' ', message)
    # Lower-case, then split on whitespace into words.
    tokens = letters_only.lower().split()
    # Lemmatize each word (this is lemmatization with WordNetLemmatizer, not stemming).
    lemmas = [lemmatizer.lemmatize(token) for token in tokens]
    # Join the words back into one sentence; a message without any letters gives ''
    # because ' '.join([]) is the empty string.
    corpus.append(' '.join(lemmas))

#### Meaning of the below list comprehension
| Expression          | Meaning                                                         |
| ------------------- | --------------------------------------------------------------- |
| `map(len, corpus)`  | Gets length of each cleaned message                             |
| `zip(...)`          | Pairs lengths, cleaned messages, and original messages together |
| `if i < 1`          | Filters messages with empty cleaned text                        |
| `[ [i, j, k] ... ]` | Builds list showing length, cleaned text, and original text     |


In [None]:
# Let's see if there are any empty messages in the corpus.
# This identifies the messages that became empty after cleaning, i.e. after everything that is not a letter was removed.
# Each result row is [length of cleaned text, cleaned text, original message].
[[i,j,k] for i,j,k in zip(list(map(len,corpus)),corpus, messages['message']) if i<1]

#### empty strings '' can appear in corpus if the original message has no letters.

In [None]:
# Print each original message whose cleaned version is empty, together with the cleaned text and its length
for cleaned, original in zip(corpus, messages['message']):
    if len(cleaned) >= 1:
        continue
    print(f"Original: {original}")
    print(f"Cleaned: '{cleaned}'")
    print(f"Length: {len(cleaned)}\n")
        


In [None]:
# Show only the first cleaned messages; printing the whole corpus floods the notebook output.
corpus[:10]

#### The 3 messages that became empty strings after cleaning produce no entry in `words`. So the total size is reduced from 5572 to 5569 sentences.

##### How the below logic works?
| Case    | Will it be added to `words`?                             | Reason                                        |
| ------- | -------------------------------------------------------- | --------------------------------------------- |
| `''`    | ❌ No                                                     | `sent_tokenize` → `[]`                        |
| `'   '` | ❌ No (inner loop skips) or empty list if passed directly | No valid sentence/tokens                      |
| `'u'`   | ✅ Added as `[]` by default (filtered)                    | Single char → filtered out unless `min_len=1` |
| `'hi'`  | ✅ Added as `['hi']`                                      | Valid token                                   |



In [None]:
# Turn every cleaned sentence into a list of tokens, the input format Word2Vec trains on.
from nltk.tokenize import sent_tokenize
from gensim.utils import simple_preprocess

# One token list per sentence found by sent_tokenize in each cleaned message.
words = [
    simple_preprocess(sentence)
    for message in corpus
    for sentence in sent_tokenize(message)
]

In [None]:
# Show only the first token lists; printing all of `words` floods the notebook output.
words[:10]

##### Below are the steps to check the empty strings in words list.

In [None]:
any(len(w) == 0 for w in words)

In [None]:
# Count empties
sum(len(w)==0 for w in words)

In [None]:
# Check the indexes of the empty words
[i for i,w in enumerate(words) if len(w)==0]

In [None]:
# This gives the 3 empty strings with white spaces from the corpus which I mentioned previously.
[i for i, x in enumerate(corpus) if x.strip() == '']

In [None]:
print(len(corpus))
print(len(words))

In [None]:
# Look up the corpus entries at two hard-coded positions.
# NOTE(review): 4291 and 5170 are presumably the indexes of the empty token lists in `words` reported by an earlier cell — confirm, they are not derived in code.
for i in [4291, 5170]:
    print(f"Index {i}: {repr(corpus[i])}")

In [None]:
# Token list stored at the same position in `words`
words[4291]

In [None]:
# Token list stored at the same position in `words`
words[5170]

In [None]:
# `words` is not aligned one-to-one with `corpus`: a corpus entry for which sent_tokenize yields
# no sentence never gets an entry in `words`, so an index into `words` does not necessarily
# point at the same message in `corpus`.
# To find which corpus entries produce an empty token list, walk the corpus itself and record
# the corpus index every time the tokens come back empty:
empty_indices = [
    corpus_idx
    for corpus_idx, message in enumerate(corpus)
    for sentence in sent_tokenize(message)
    if len(simple_preprocess(sentence)) == 0
]


In [None]:
empty_indices

In [None]:
for i in [4293, 5173]:
    print(f"Index {i}: {repr(corpus[i])}")

##### As you can see above, these are very short messages: 'g w r' and 'u'. Each contains only one or a few single-letter “words”.
What simple_preprocess() does:

gensim.utils.simple_preprocess() is not just a basic split —
1. it removes very short tokens by default.
2. By design, it ignores tokens that:  

Are shorter than 2 characters (default min_len=2)  

Or longer than 15 characters (default max_len=15)  


In [None]:
# we get 5569 because 3 messages became empty after cleaning.
len(words)

In [None]:
# Let's train our own Word2Vec model.
import gensim
# By default,min_count is 5, so words which are having frequency less than 5 will be ignored.
model=gensim.models.Word2Vec(words) # we can mention parameters like vector size, window size, min count etc. By default vector size is 100, window size is 5 and min count is 5.

#### 🧩 1️⃣ “100 dimensions” — are those also the words in my vocabulary?
1. ❌ No — the 100 dimensions are not words. They are the number of numeric features that capture relationships (like gender, tense, topic, etc.)
2. They are numerical features (latent semantic dimensions) that represent meaning or context of words — not actual words themselves.
3. These 100 numbers don’t correspond to specific words.Instead, they describe abstract properties — like:

a. masculine/feminine axis   
b. royalty/commoner axis  
c. age, emotion, topic, etc.  

The model learns these patterns automatically while training.



In [None]:
# It will give the count of the words in the vocabulary and their dimension(means no.of columns)
# since we have not mentioned any parameters, by default vector size is 100. so number of columns is 100.
model.wv.vectors.shape 

In [None]:
# To get all the vocabulary words in the model
# Since I mentioned min_count as 5, words which are having frequency less than 5 are ignored.
model.wv.index_to_key

#### 🧩 2️⃣ Should model.wv.vectors.shape and model.corpus_count be the same?
❌ No, they are not the same thing — and they almost never match.
| Attribute                   | Meaning                                                    |
| --------------------------- | ---------------------------------------------------------- |
| `model.wv.vectors.shape[0]` | Number of **unique words in the vocabulary** (rows)        |
| `model.corpus_count`        | Number of **sentences** (or “documents”) used for training |

🧠 Analogy

Think of Word2Vec as a language school:  

corpus_count → how many sentences it studied.  

wv.vectors.shape[0] → how many unique words it learned from them.  

wv.vectors.shape[1] → how many traits each word has learned (like tone, tense, meaning).  


In [None]:
# It indicates the number of sentences (or “documents”) used for training
model.corpus_count

In [None]:
# model.epochs tells how many times the Word2Vec model iterated over the entire training corpus during training
model.epochs

In [None]:
# Check the similar words
model.wv.most_similar('good')

In [None]:
# Shape of the vector learned for a single word
model.wv['good'].shape

In [None]:
# Token list of the first sentence
words[0]

##### What is model.wv in Word2Vec?
| Part       | Description                                                             |
| ---------- | ----------------------------------------------------------------------- |
| `model.wv` | **Word vectors** — this is your vocabulary + learned embeddings         |
| `model`    | Full model (includes training settings, negative sampling tables, etc.) |

✅ So yes — model.wv is the learned vocabulary of your Word2Vec model.
1. It stores all unique words that appeared in your training corpus (subject to min_count).
2. It also stores the vector representation for each of those words.

##### How we can get the NaNs?
If a sentence has no valid words in model.wv, like "12345", "!!!", "###", etc.,
then the list vectors becomes empty: vectors=[]  
and   

np.mean(vectors, axis=0) returns nan.



In [None]:
# Every word of a sentence has its own vector in model.wv. To represent the whole sentence
# we take the average of the vectors of its words.
import numpy as np
def avg_word2vec(words, keyed_vectors=None):
    """Return the mean word vector of one tokenized sentence.

    Parameters
    ----------
    words : iterable of str
        Tokens of a single sentence.
    keyed_vectors : optional
        Word-vector lookup that supports ``word in keyed_vectors``,
        ``keyed_vectors[word]`` and ``keyed_vectors.vector_size``.
        When omitted, ``model.wv`` of the Word2Vec model trained in this notebook is used.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the vectors of all tokens found in the vocabulary.
        If no token is in the vocabulary, a zero vector of length ``vector_size``
        is returned, because ``np.mean`` of an empty list would give NaN.
    """
    if keyed_vectors is None:
        keyed_vectors = model.wv
    # Collect the vector of every token the vocabulary knows; unknown tokens are skipped.
    vectors = [keyed_vectors[word] for word in words if word in keyed_vectors]
    if not vectors:
        # No known token at all: fall back to a zero vector of the same dimension.
        return np.zeros(keyed_vectors.vector_size)
    return np.mean(vectors, axis=0)

In [None]:
pip install tqdm

##### tqdm:
tqdm is a python library for progress bars. It shows you a real-time progress indicator in the console or notebook while loops are running.


In [None]:
from tqdm import tqdm

In [None]:
# Compute one averaged word vector per tokenized sentence in `words`.
import numpy as np
# X is a list of vectors, in the same order as `words`; tqdm shows the progress of the loop.
X = [avg_word2vec(words[idx]) for idx in tqdm(range(len(words)))]

In [None]:
# One vector per entry of `words`
len(X)

In [None]:
# X is still a plain Python list at this point
type(X)

In [None]:
len(X)  # number of sentences

In [None]:
# Independent features.
# X is a list of vectors. Not compatible with ML models. so convert it into a 2D NumPy array.
X_new=np.array(X)

In [None]:
# Shape of the original DataFrame, for comparison with the feature matrix below
messages.shape

In [None]:
# (number of sentences, vector size)
X_new.shape

In [None]:
# Shape of a single sentence vector
X_new[0].shape

In [None]:
# Numeric labels for every row of `messages`: 'ham' -> 0, 'spam' -> 1
y= messages['label'].map({'ham': 0, 'spam': 1}).values

In [None]:
# Number of labels; compare with the number of rows of X_new
len(y)

#### Execution of the below code:
| Step | Expression                          | What It Does                                                    | Result Type  |
| ---- | ----------------------------------- | --------------------------------------------------------------- | ------------ |
| 1    | `map(lambda x: len(x) > 0, corpus)` | Checks which corpus entries are non-empty                       | list of bool |
| 2    | `messages[...]`                     | Filters DataFrame to keep only rows with non-empty cleaned text | DataFrame    |
| 3    | `y['label']`                        | Selects the label column                                        | Series       |
| 4    | `.map({'ham': 0, 'spam': 1})`       | Converts text labels to numbers                                 | Series (int) |
| 5    | `.values`                           | Converts Series → NumPy array                                   | ndarray      |


In [None]:
# y currently covers all messages, but after cleaning with the regular expression '[^a-zA-Z]' only 5569 sentences remain, so the same filter is applied to y.
# messages is the DataFrame with the 2 columns label and message; y first holds the filtered DataFrame containing only the rows whose corpus entry is non-empty.
y=messages[list(map(lambda x:len(x)>0, corpus))] # Boolean mask: keep only the rows where the corresponding corpus entry is non-empty.
# From the filtered DataFrame y, take the label column and replace the string labels using a dictionary: 'ham' → 0 and 'spam' → 1
# .values converts the pandas Series to a NumPy array.
y= y['label'].map({'ham': 0, 'spam': 1}).values


In [None]:
# Number of labels after filtering
len(y)

#### Create Bag Of Words

In [None]:
# Output feature
# Labels for all rows of `messages` again: 'ham' -> 0, 'spam' -> 1
y = messages['label'].map({'ham': 0, 'spam': 1}).values
y

In [None]:
# Train Test Split
# The cleaned sentences in `corpus` are split together with their labels; 20% goes to the test set.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(corpus, y, test_size=0.20, random_state=42)

In [None]:
# Create the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
# For a binary BOW model use binary=True in CountVectorizer. max_features limits the number of features and ngram_range makes the vectorizer consider more than one word together.
cv = CountVectorizer(max_features=2500,ngram_range=(1,2))

In [None]:
# Sizes of the four splits
len(X_train), len(X_test), len(y_train), len(y_test)

In [None]:
X_train

In [None]:
y_train

#### 
When you used BOW or TF-IDF:

You had a cleaned corpus → tokenized text.

You directly fed it into CountVectorizer or TfidfVectorizer.

These vectorizers automatically handle:

Vocabulary building

Conversion of text → numerical matrix (sparse matrix)

Aligning train and test data into same feature space

In [None]:
# Now we will fit and transform the X_train data and transform the X_test data
# NOTE(review): this cell overwrites the text lists X_train / X_test with count matrices, so re-running it
# without first re-running the train/test split cell is expected to fail — confirm.
X_train = cv.fit_transform(X_train).toarray()
X_test = cv.transform(X_test).toarray()

In [None]:
# cv.vocabulary_ maps each word / n-gram of the BOW model to its column index in the feature vector.
# Only a sample of 20 entries is shown; the full dictionary has up to max_features entries.
dict(list(cv.vocabulary_.items())[:20])

In [None]:
# Import the model
from sklearn.naive_bayes import MultinomialNB
MultinomialNB = MultinomialNB()


In [None]:
# Train the model
spam_detect_model=MultinomialNB.fit(X_train,y_train)

In [None]:
# Predict the labels for test data
y_pred=spam_detect_model.predict(X_test)

In [None]:
# Model Evaluation
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Compute every metric on the test split first, then report them.
confusion_mtx = confusion_matrix(y_test, y_pred)
accuracy, precision, recall, f1 = (
    score(y_test, y_pred)
    for score in (accuracy_score, precision_score, recall_score, f1_score)
)

print(f"confusion matrix\n: {confusion_mtx}")
print(f" Accuracy : {accuracy}")
print(f" Precision : {precision}")
print(f" Recall : {recall}")
print(f" F1 Score : {f1}")