### Sklearn 20newsgroups dataset

In [None]:
pip install scikit-learn gensim nltk numpy

### TF-IDF (Term Frequency - Inverse Document Frequency)

In [5]:
import numpy as np
import gensim
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.datasets import fetch_20newsgroups
import nltk
import re

# Download necessary NLTK resources
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")

# Load dataset (using only first 100 rows for efficiency)
dataset = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
texts = dataset.data[:1000]  

# Preprocessing function
stopwords_list = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters
    tokens = word_tokenize(text.lower())  # Tokenization & Lowercasing
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stopwords_list]  # Remove stopwords & lemmatize
    return " ".join(tokens)

# Apply preprocessing
cleaned_texts = [preprocess_text(text) for text in texts]

# TF-IDF Vectorization
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(cleaned_texts)
tfidf_array = tfidf_matrix.toarray()

# Print results
print("TF-IDF Matrix:")
print(tfidf_array)
print("TF-IDF Matrix Shape:", tfidf_array.shape)
print("Vocabulary:", vectorizer.get_feature_names_out()[:20])  # Print first 20 words




[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\khaye\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\khaye\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\khaye\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


TF-IDF Matrix:
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
TF-IDF Matrix Shape: (1000, 18204)
Vocabulary: ['aa' 'aaa' 'aacc' 'aadnsi' 'aamir' 'aaph' 'aapjj' 'aapp' 'aaron'
 'aausmausmaedu' 'aaverage' 'aaxclear' 'aaxdeoofsqfjmxhhls' 'ab'
 'ababspalestinians' 'abandoned' 'abates' 'abberant' 'abbreviation' 'abc']


In [9]:
import multiprocessing

# Prepare data for Word2Vec
tokenized_texts = [text.split() for text in cleaned_texts]

# Train Word2Vec model
cores = multiprocessing.cpu_count()
w2v_model = Word2Vec(min_count=4,
                         window=4,
                         vector_size=300, 
                         alpha=0.03, 
                         min_alpha=0.0007, 
                         sg = 1,
                         workers=cores-1)

w2v_model.build_vocab(tokenized_texts, progress_per=10000)
w2v_model.train(tokenized_texts, total_examples=w2v_model.corpus_count, epochs=100, report_delay=1)

# Save the model 
w2v_model.save("word2vec2.model")


# Test Word2Vec
word = "computer"

result = w2v_model.wv.most_similar(positive=['husband', 'woman'], negative=['man'])

if word in w2v_model.wv:
    print(f"Vector for '{word}':", w2v_model.wv[word][:10])  
    print(f"Words similar to '{word}':", w2v_model.wv.most_similar(positive=[word]))
    print("husband - man + woman ≈", result)
else:
    print(f"'{word}' not in vocabulary")

'man' exists in the vocabulary
'woman' exists in the vocabulary
'wife' exists in the vocabulary
Vector for 'wife': [-0.07911736  0.072822   -0.80405515 -0.7014165   0.05498299 -0.05689457
 -0.5800458  -0.0318243  -0.04190193  0.75361246]
Words similar to 'wife': [('liturgy', 0.368179053068161), ('torture', 0.358430415391922), ('joseph', 0.35421454906463623), ('sunday', 0.3444647789001465), ('convenient', 0.3442094027996063), ('execution', 0.3381751477718353), ('employee', 0.3375227153301239), ('daughter', 0.3373296558856964), ('classic', 0.33692508935928345), ('dear', 0.3363684117794037)]
king - man + woman ≈ [('aged', 0.3783465325832367), ('abusive', 0.35764965415000916), ('seeking', 0.3521312177181244), ('massacred', 0.3485230505466461), ('agdam', 0.3473351299762726), ('effendi', 0.3434714376926422), ('azeri', 0.3377446234226227), ('prevalence', 0.3314957320690155), ('morgue', 0.33065593242645264), ('median', 0.32908132672309875)]


#### Combined algorithm

In [15]:
import numpy as np
import gensim
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.datasets import fetch_20newsgroups
import nltk
from sklearn.metrics.pairwise import cosine_similarity


# Download necessary NLTK resources
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")

# Load dataset
dataset = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
texts = dataset.data[:1000]  # Using a subset for faster training

# Preprocessing function
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters
    tokens = word_tokenize(text.lower())  # Lowercase and tokenize
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word.isalnum() and word not in stop_words]
    return tokens

# Apply preprocessing
tokenized_texts = [preprocess_text(text) for text in texts]

# Train Word2Vec model
cores = multiprocessing.cpu_count()
w2v_model = Word2Vec(sentences=tokenized_texts,
                      min_count=4,
                         window=4,
                         vector_size=300, 
                         alpha=0.03, 
                         min_alpha=0.0007, 
                         sg = 1,
                         workers=cores-1)
# Train TF-IDF model
corpus = [" ".join(tokens) for tokens in tokenized_texts]  # Convert tokenized text back to sentences
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)
tfidf_vocab = tfidf_vectorizer.vocabulary_

# Function to compute TF-IDF weighted Word2Vec vectors
def get_weighted_w2v_vector(text):
    tokens = preprocess_text(text)
    vector = np.zeros(300)  # Initialize empty vector of same size as Word2Vec embeddings
    total_weight = 0

    for word in tokens:
        if word in w2v_model.wv and word in tfidf_vocab:
            tfidf_weight = tfidf_matrix[0, tfidf_vocab[word]]  # Get TF-IDF weight
            vector += w2v_model.wv[word] * tfidf_weight  # Weighted vector sum
            total_weight += tfidf_weight

    if total_weight > 0:
        vector /= total_weight  # Normalize

    return vector


# Compute vectors for all documents
doc_vectors = np.array([get_weighted_w2v_vector(text) for text in texts])

# Compute similarity between the first document and all others
similarities = cosine_similarity([doc_vectors[0]], doc_vectors)[0]

# Rank documents by similarity
sorted_indices = np.argsort(similarities)[::-1]  # Sort in descending order

print("Top 5 most similar documents:")
for idx in sorted_indices[1:6]:  # Skip the first one (itself)
    print(f"Document {idx} - Similarity Score: {similarities[idx]:.4f}")


# Find the most similar words to the first document vector
similar_words = w2v_model.wv.similar_by_vector(doc_vectors[0], topn=10)  
print("Most relevant words to the first document:", similar_words)


# Word analogy: husband - man + woman ≈ ?
try:
    analogy_vector = w2v_model.wv['husband'] - w2v_model.wv['man'] + w2v_model.wv['woman']
    analogy_result = w2v_model.wv.similar_by_vector(analogy_vector, topn=5)
    print("husband - man + woman ≈", analogy_result)
except KeyError as e:
    print(f"Error: {e} (Some words may be missing from vocabulary)")



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\khaye\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\khaye\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\khaye\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Top 5 most similar documents:
Document 979 - Similarity Score: 0.9987
Document 491 - Similarity Score: 0.9983
Document 567 - Similarity Score: 0.9981
Document 384 - Similarity Score: 0.9978
Document 843 - Similarity Score: 0.9977
Most relevant words to the first document: [('start', 0.9987203478813171), ('havent', 0.998412549495697), ('wont', 0.9981712102890015), ('sell', 0.9979532957077026), ('driver', 0.9979207515716553), ('mine', 0.9979206323623657), ('solution', 0.9978811740875244), ('heard', 0.9978165626525879), ('detail', 0.9977374076843262), ('perhaps', 0.9975965023040771)]
husband - man + woman ≈ [('national', 0.9843640923500061), ('april', 0.9805154204368591), ('center', 0.9789007306098938), ('russian', 0.9752913117408752), ('army', 0.9752044081687927)]
