# **Advanced Word Embedding Benchmarking on Noisy Real-World Corpus**
- ### **Multi-Class Newsgroups Dataset**

In [1]:
!pip install datasets



In [2]:
import spacy
!python -m spacy download en_core_web_sm
nlp = spacy.load('en_core_web_sm')
from collections import Counter

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m162.2 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [32]:
from datasets import load_dataset

dataset = load_dataset("SetFit/20_newsgroups")

X_train = dataset['train']['text']
y_train = dataset['train']['label']
X_test = dataset['test']['text']
y_test = dataset['test']['label']

len(X_train), len(y_train), len(X_test), len(y_test)
len(set(y_train)), len(set(y_test))
print(set(y_train))


Repo card metadata block was not found. Setting CardData to empty.


{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19}


In [4]:
from collections import Counter

print("Train set class distribution:")
for label, count in sorted(Counter(y_train).items()):
    print(f"Class {label}: {count} samples")

print("\nTest set class distribution:")
for label, count in sorted(Counter(y_test).items()):
    print(f"Class {label}: {count} samples")

Train set class distribution:
Class 0: 480 samples
Class 1: 584 samples
Class 2: 591 samples
Class 3: 590 samples
Class 4: 578 samples
Class 5: 593 samples
Class 6: 585 samples
Class 7: 594 samples
Class 8: 598 samples
Class 9: 597 samples
Class 10: 600 samples
Class 11: 595 samples
Class 12: 591 samples
Class 13: 594 samples
Class 14: 593 samples
Class 15: 599 samples
Class 16: 546 samples
Class 17: 564 samples
Class 18: 465 samples
Class 19: 377 samples

Test set class distribution:
Class 0: 480 samples
Class 1: 584 samples
Class 2: 591 samples
Class 3: 590 samples
Class 4: 578 samples
Class 5: 593 samples
Class 6: 585 samples
Class 7: 594 samples
Class 8: 598 samples
Class 9: 597 samples
Class 10: 600 samples
Class 11: 595 samples
Class 12: 591 samples
Class 13: 594 samples
Class 14: 593 samples
Class 15: 599 samples
Class 16: 546 samples
Class 17: 564 samples
Class 18: 465 samples
Class 19: 377 samples


> ## Step 1 — Preprocessing




In [5]:
def preprocessing(text,
                  lowercase = True,
                  remove_stopword = False,
                  remove_punct = False,
                  lemmatized = False,
                  remove_metadata = False
                  ):
   preprocessed_text = []
   freq = Counter()

   current_texts = text
   if remove_metadata:
         # Assuming 'text' is an iterable of strings if remove_metadata is True
         current_texts = [t.split("\n\n", 1)[-1] for t in text]

   docs = nlp.pipe(current_texts, batch_size=32) # Corrected indentation

   for doc in docs:
     doc_tokens = [] # Renamed to avoid shadowing
     for token in doc:

       if remove_stopword and token.is_stop:
         continue
       if remove_punct and token.is_punct:
         continue

       if not token.is_alpha:
         continue

       word = token.lemma_ if lemmatized else token.text # Corrected 'word' to 'token'

       if lowercase:
         word = word.lower()

       doc_tokens.append(word)
       freq[word] += 1            # Moved inside the token loop for correct frequency counting

     preprocessed_text.append(doc_tokens)

   return preprocessed_text, freq

In [6]:
X, frequency = preprocessing(X_train, lowercase=True, remove_stopword=True, remove_punct=True, lemmatized=True, remove_metadata=True)


NameError: name 'x' is not defined

In [7]:
print(X[0:5])
print(frequency[1])

0


> ## Step 2 — Second Pass: Remove Rare Words

In [8]:
def remove_rare_words(tokenized_docs, freq_dict, min_threshold=5):

    filtered_docs = []

    for doc in tokenized_docs:
        filtered_doc = [
            word for word in doc
            if freq_dict[word] >= min_threshold
        ]
        filtered_docs.append(filtered_doc)

    return filtered_docs

In [9]:
X_final = remove_rare_words(X, frequency, min_threshold=5)
print(X_final[0:10])



> ## Step 3 — Train All Embedding Models

In [10]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m16.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


In [11]:
from gensim.models import Word2Vec, FastText
import os

def embedding_train(text, model_name):

  os.makedirs('models', exist_ok=True) # Ensure 'models' directory exists
  save_path = f'models/{model_name}'
  model = None # Initialize a variable to hold the trained model

  if model_name == 'word2vec-cbow':
    model = Word2Vec(
        sentences = text,
        window = 5,
        vector_size=300,
        min_count=5,
        sg=0,                      # 0 = cbow
        epochs=20
    )
    model.save(save_path)

  elif model_name == 'word2vec-sg': # Corrected indentation
    model = Word2Vec(
          sentences = text,
          window = 5,
          vector_size = 300,
          min_count = 5,
          sg = 1,                  # 1 = skipgram
          epochs = 20
      )
    model.save(save_path)

  elif model_name == 'fasttext': # Corrected indentation
    model = FastText(
            sentences = text,
            window = 5,
            vector_size = 100,
            min_n = 3,
            max_n = 5,
            min_count = 5,
            epochs = 10,
            sg = 1

        )
    model.save(save_path)

  else:
      raise ValueError("Invalid model name")

  return model # Return only the trained model

In [12]:
embedding_train(X_final, model_name= 'word2vec-cbow')
embedding_train(X_final, model_name='word2vec-sg')
embedding_train(X_final, model_name='fasttext')

<gensim.models.fasttext.FastText at 0x7833c5a063f0>

> ## Step 4 - Load and Test Models

> ### 1. Model Loader

In [13]:
import gensim.downloader as api

def load_model(model_name):

    if model_name == "word2vec-cbow":
        model = Word2Vec.load("/content/models/word2vec-cbow")
        return model.wv

    elif model_name == "word2vec-sg":
        model = Word2Vec.load("/content/models/word2vec-sg") # Corrected filename from word2vec_sg to word2vec-sg
        return model.wv

    elif model_name == "fasttext":
        model = FastText.load("/content/models/fasttext")
        return model.wv

    elif model_name == "glove":
        model = api.load("glove-wiki-gigaword-100")
        return model

    else:
        raise ValueError("Invalid model name")

    return model

In [14]:
cbow = load_model(model_name='word2vec-cbow')
sg = load_model(model_name='word2vec-sg')
fasttext = load_model(model_name='fasttext')

> ### 2. Word Similarity
- Cosine similarity
- Neighborhood consistency

In [15]:
def word_similarity(wv):

    pairs = [
    ("computer", "hardware"),
    ("windows", "microsoft"),
    ("baseball", "pitcher"),
    ("religion", "christian"),
    ("space", "nasa")
]

    for w1, w2 in pairs:
        if w1 in wv and w2 in wv:
            print(f"{w1} ~ {w2} :", wv.similarity(w1, w2))
        else:
            print(f"{w1} or {w2} not found in the vocabulary")

In [16]:
word_similarity(cbow)
word_similarity(sg)
word_similarity(fasttext)


print(len(cbow.index_to_key))
print(len(sg.index_to_key))
print(len(fasttext.index_to_key))


print("religion" in cbow)
print("religion" in sg)
print("religion" in fasttext)

computer ~ hardware : 0.42193297
windows ~ microsoft : 0.6925483
baseball ~ pitcher : 0.67440003
religion ~ christian : 0.6286671
space ~ nasa : 0.48824137
computer ~ hardware : 0.24264935
windows ~ microsoft : 0.57211584
baseball ~ pitcher : 0.41234586
religion ~ christian : 0.3764064
space ~ nasa : 0.40271625
computer ~ hardware : 0.4647874
windows ~ microsoft : 0.7184713
baseball ~ pitcher : 0.64109296
religion ~ christian : 0.6735169
space ~ nasa : 0.52174824
13455
13455
13455
True
True
True


> ## 3. Analogy Test

In [17]:
def analogy_test(wv):

    pairs = [
    ("baseball", "pitcher", "hockey"),
    ("windows", "microsoft", "linux"),
    ("space", "nasa", "earth")
]

    for a, b, c in pairs:

        if all(word in wv for word in [a, b, c]):

            result = wv.most_similar(
                positive=[b, c],   # b + c
                negative=[a],      # - a
                topn=1
            )

            print(f"{b} - {a} + {c} → {result[0][0]} (score={result[0][1]:.4f})")

        else:
            print(f"Missing words in vocab: {a}, {b}, {c}")

In [18]:
analogy_test(cbow)
analogy_test(sg)
analogy_test(fasttext)

pitcher - baseball + hockey → mvp (score=0.7014)
microsoft - windows + linux → isc (score=0.6568)
nasa - space + earth → mission (score=0.5621)
pitcher - baseball + hockey → tournament (score=0.4490)
microsoft - windows + linux → isc (score=0.4385)
nasa - space + earth → shafer (score=0.3411)
pitcher - baseball + hockey → gilkey (score=0.6812)
microsoft - windows + linux → ux (score=0.7030)
nasa - space + earth → earthly (score=0.6442)


> ### 4. OOV Testing

In [19]:
def oov_test(wv, model_name):

    test_words = [
        "computering",
        "religiosity",
        "baseballic",
        "xyzrandomword"
    ]

    print(f"\nOOV Testing for {model_name}")

    for word in test_words:
        if word in wv:
            print(word, "-> Exists")
        else:
            print(word, "-> OOV")


In [20]:
oov_test(cbow, "word2vec-cbow")
oov_test(sg, "word2vec-sg")
oov_test(fasttext, "fasttext")

print(len(cbow.index_to_key))


OOV Testing for word2vec-cbow
computering -> OOV
religiosity -> OOV
baseballic -> OOV
xyzrandomword -> OOV

OOV Testing for word2vec-sg
computering -> OOV
religiosity -> OOV
baseballic -> OOV
xyzrandomword -> OOV

OOV Testing for fasttext
computering -> Exists
religiosity -> Exists
baseballic -> Exists
xyzrandomword -> Exists
13455


> ### 5. Downstream Classification
- Step 1 : Document Embedding
- Step 2 : Build the Matrix

In [33]:
# document embedding
import numpy as np

def document_vector(doc, wv, dim):

    vectors = [wv[word] for word in doc if word in wv]

    if len(vectors) == 0:
        return np.zeros(dim)

    return np.mean(vectors, axis=0)

# bulid the matrix
def build_features(tokenized_docs, labels, wv, dim):

    X = []
    y = labels

    for doc in tokenized_docs:
        vec = document_vector(doc, wv, dim)
        X.append(vec)

    return np.array(X), np.array(y)

In [34]:
X_train_features, y_train_labels_for_clf = build_features(X_final, y_train, cbow, 300)
print(X_train_features[0:10], y_train_labels_for_clf[0:10])

[[-0.43881485 -0.19834305 -0.24444368 ...  0.02988239 -0.43765441
  -0.05863502]
 [-0.31540549 -0.03059752 -0.06693599 ...  0.25572109 -0.01122992
  -0.1991403 ]
 [-0.12215306  0.02826146 -0.07954741 ...  0.14835274 -0.22857264
  -0.08701497]
 ...
 [ 0.16630875  0.22080763  0.12913539 ...  0.62115443 -0.19018491
  -0.26410386]
 [ 0.38515699  0.02185475 -0.48475188 ... -0.3693566   0.42197284
   0.67591834]
 [-0.13825397  0.33182997 -0.02857118 ...  0.01054335 -0.12259699
  -0.27810469]] [ 7  4  4  1 14 16 13  3  2  4]


> ### Train the Classifier
- lr, svm
- Metric: accuracy, macro f1

In [38]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score

def downstream_classification(X_features, y_features):

  X_train, X_test, y_train, y_test = train_test_split(
    X_features,
    y_features,
    test_size=0.2,
    random_state=42,
    stratify=y_train_labels_for_clf   # Important for 20 classes
)

  clf = LogisticRegression(
    max_iter=2000,
    multi_class='multinomial',
    solver='lbfgs'
)

  clf.fit(X_train, y_train)

  preds = clf.predict(X_test)

  acc = accuracy_score(y_test, preds)
  f1 = f1_score(y_test, preds, average="macro")

  print("Accuracy:", acc)
  print("Macro F1:", f1)

  return acc, f1

In [37]:
print(X_train_features.shape)
print(y_train_labels_for_clf.shape)

(11314, 300)
(11314,)


In [39]:
# Pass the numerical features to the downstream_classification function
acc, f1 = downstream_classification(X_train_features, y_train_labels_for_clf)
print("Final Accuracy:", acc)
print("Final Macro F1:", f1)



Accuracy: 0.5603181617322138
Macro F1: 0.5518971605268959
Final Accuracy: 0.5603181617322138
Final Macro F1: 0.5518971605268959


> ### Final Testing Wrapper

In [40]:
def testing(model_name, tokenized_docs=None, labels=None, dim=100):

    wv = load_model(model_name)

    print(f"\n===== Testing {model_name} =====")

    # Intrinsic
    word_similarity_test(wv)
    analogy_test(wv)
    oov_test(wv, model_name)

    # Extrinsic (classification)
    if tokenized_docs is not None and labels is not None:

        X, y = build_features(tokenized_docs, labels, wv, dim)
        downstream_classification(X, y)

In [None]:
testing(
    model_name="word2vec-cbow",
    tokenized_docs=final_docs,
    labels=labels,
    dim=100
)