### Sklearn 20newsgroups dataset

In [1]:
pip install scikit-learn gensim nltk numpy

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


### TF-IDF (Term Frequency - Inverse Document Frequency)

In [None]:
import numpy as np
import gensim
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.datasets import fetch_20newsgroups
import nltk
import re

# Download necessary NLTK resources
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")

# Load dataset (using only first 1000 rows for efficiency)
dataset = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
texts = dataset.data[:1000]  

# Preprocessing function
stopwords_list = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters
    tokens = word_tokenize(text.lower())  # Tokenization & Lowercasing
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stopwords_list]  # Remove stopwords & lemmatize
    return " ".join(tokens)


# Apply preprocessing
cleaned_texts = [preprocess_text(text) for text in texts]

# TF-IDF Vectorization
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(cleaned_texts)
tfidf_array = tfidf_matrix.toarray()

# Print results
print("TF-IDF Matrix:")
print(tfidf_array)
print("TF-IDF Matrix Shape:", tfidf_array.shape)
print("Vocabulary:", vectorizer.get_feature_names_out()[:20])  # Print first 20 words




[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\khaye\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\khaye\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\khaye\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


TF-IDF Matrix:
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
TF-IDF Matrix Shape: (1000, 18204)
Vocabulary: ['aa' 'aaa' 'aacc' 'aadnsi' 'aamir' 'aaph' 'aapjj' 'aapp' 'aaron'
 'aausmausmaedu' 'aaverage' 'aaxclear' 'aaxdeoofsqfjmxhhls' 'ab'
 'ababspalestinians' 'abandoned' 'abates' 'abberant' 'abbreviation' 'abc']


In [3]:
import multiprocessing

# Prepare data for Word2Vec
tokenized_texts = [text.split() for text in cleaned_texts]

# Train Word2Vec model
cores = multiprocessing.cpu_count()
w2v_model = Word2Vec(min_count=4,
                         window=4,
                         vector_size=300, 
                         alpha=0.03, 
                         min_alpha=0.0007, 
                         sg = 1,
                         workers=cores-1)

w2v_model.build_vocab(tokenized_texts, progress_per=10000)
w2v_model.train(tokenized_texts, total_examples=w2v_model.corpus_count, epochs=100, report_delay=1)

# Save the model 
w2v_model.save("word2vec2.model")


# Test Word2Vec
word = "computer"

result = w2v_model.wv.most_similar(positive=['husband', 'woman'], negative=['man'])

if word in w2v_model.wv:
    print(f"Vector for '{word}':", w2v_model.wv[word][:10])  
    print(f"Words similar to '{word}':", w2v_model.wv.most_similar(positive=[word]))
    print("husband - man + woman ≈", result)
else:
    print(f"'{word}' not in vocabulary")

Vector for 'computer': [-0.7163971   0.33736396  0.6031848  -0.02010767  0.20525196 -0.39639473
  0.27783233  0.7710676   0.50691307  0.2177041 ]
Words similar to 'computer': [('corporation', 0.410052627325058), ('tempest', 0.4002803564071655), ('desc', 0.3827262818813324), ('toshiba', 0.3660683035850525), ('kenneth', 0.36397117376327515), ('laptop', 0.35860878229141235), ('typewriter', 0.3546794652938843), ('bundled', 0.35299059748649597), ('dyer', 0.35050448775291443), ('math', 0.33884990215301514)]
husband - man + woman ≈ [('aged', 0.36183658242225647), ('seeking', 0.35281890630722046), ('agdam', 0.3526656925678253), ('abusive', 0.3457013666629791), ('injured', 0.322365939617157), ('attacker', 0.3205150365829468), ('morgue', 0.3199331760406494), ('sasha', 0.31863734126091003), ('artillery', 0.31837305426597595), ('elderly', 0.3148309290409088)]


#### Combined algorithm

In [4]:
import numpy as np
import gensim
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.datasets import fetch_20newsgroups
import nltk
from sklearn.metrics.pairwise import cosine_similarity


# Download necessary NLTK resources
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")

# Load dataset
dataset = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
texts = dataset.data[:1000]  # Using a subset for faster training

# Preprocessing function
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters
    tokens = word_tokenize(text.lower())  # Lowercase and tokenize
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word.isalnum() and word not in stop_words]
    return tokens

# Apply preprocessing
tokenized_texts = [preprocess_text(text) for text in texts]

# Train Word2Vec model
cores = multiprocessing.cpu_count()
w2v_model = Word2Vec(sentences=tokenized_texts,
                      min_count=4,
                         window=4,
                         vector_size=300, 
                         alpha=0.03, 
                         min_alpha=0.0007, 
                         sg = 1,
                         workers=cores-1)
# Train TF-IDF model
corpus = [" ".join(tokens) for tokens in tokenized_texts]  # Convert tokenized text back to sentences
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)
tfidf_vocab = tfidf_vectorizer.vocabulary_

# Function to compute TF-IDF weighted Word2Vec vectors
def get_weighted_w2v_vector(text):
    tokens = preprocess_text(text)
    vector = np.zeros(300)  # Initialize empty vector of same size as Word2Vec embeddings
    total_weight = 0

    for word in tokens:
        if word in w2v_model.wv and word in tfidf_vocab:
            tfidf_weight = tfidf_matrix[0, tfidf_vocab[word]]  # Get TF-IDF weight
            vector += w2v_model.wv[word] * tfidf_weight  # Weighted vector sum
            total_weight += tfidf_weight

    if total_weight > 0:
        vector /= total_weight  # Normalize

    return vector


# Compute vectors for all documents
doc_vectors = np.array([get_weighted_w2v_vector(text) for text in texts])

# Compute similarity between the first document and all others
similarities = cosine_similarity([doc_vectors[0]], doc_vectors)[0]

# Rank documents by similarity
sorted_indices = np.argsort(similarities)[::-1]  # Sort in descending order

print("Top 5 most similar documents:")
for idx in sorted_indices[1:6]:  # Skip the first one (itself)
    print(f"Document {idx} - Similarity Score: {similarities[idx]:.4f}")


# Find the most similar words to the first document vector
similar_words = w2v_model.wv.similar_by_vector(doc_vectors[0], topn=10)  
print("Most relevant words to the first document:", similar_words)


# Word analogy: husband - man + woman ≈ ?
try:
    analogy_vector = w2v_model.wv['husband'] - w2v_model.wv['man'] + w2v_model.wv['woman']
    analogy_result = w2v_model.wv.similar_by_vector(analogy_vector, topn=5)
    print("husband - man + woman ≈", analogy_result)
except KeyError as e:
    print(f"Error: {e} (Some words may be missing from vocabulary)")



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\khaye\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\khaye\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\khaye\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Top 5 most similar documents:
Document 979 - Similarity Score: 0.9990
Document 491 - Similarity Score: 0.9989
Document 644 - Similarity Score: 0.9981
Document 941 - Similarity Score: 0.9977
Document 245 - Similarity Score: 0.9977
Most relevant words to the first document: [('start', 0.9987062811851501), ('wont', 0.998379647731781), ('havent', 0.9982263445854187), ('buy', 0.997989296913147), ('mine', 0.9979355335235596), ('sell', 0.997894823551178), ('ok', 0.9978894591331482), ('imagine', 0.9978559613227844), ('solution', 0.9978442192077637), ('driver', 0.9978318810462952)]
husband - man + woman ≈ [('national', 0.9823528528213501), ('april', 0.979651689529419), ('among', 0.9783909320831299), ('russian', 0.9755259156227112), ('university', 0.9753519296646118)]
