In [2]:
%matplotlib widget
import plotly.graph_objects as go
import numpy as np
from scipy.sparse import load_npz
from tqdm import tqdm


## Dataset

In [10]:
from datasets import load_dataset
# Load the dataset
ds = load_dataset("ccdv/arxiv-classification", "no_ref")

# Basic statistics
print("Splits:", ds.keys())
print("Train size:", len(ds["train"]))
print("Test size:", len(ds["test"]))
print("Features:", ds["train"].features)
print("Example record:", ds["train"][0])

# Dictionary mapping from numeric labels to class names
arxiv_subjects = {
    '0': 'Commutative Algebra',
    '1': 'Computer Vision',
    '2': 'Artificial Intelligence',
    '3': 'Systems and Control',
    '4': 'Group Theory',
    '5': 'Computational Engineering',
    '6': 'Programming Languages',
    '7': 'Information Theory',
    '8': 'Data Structures',
    '9': 'Neural and Evolutionary',
    '10': 'Statistics Theory'
}

Splits: dict_keys(['train', 'validation', 'test'])
Train size: 28388
Test size: 2500
Features: {'text': Value(dtype='string', id=None), 'label': ClassLabel(names=['math.AC', 'cs.CV', 'cs.AI', 'cs.SY', 'math.GR', 'cs.CE', 'cs.PL', 'cs.IT', 'cs.DS', 'cs.NE', 'math.ST'], id=None)}
Example record: {'text': 'Constrained Submodular Maximization via a\nNon-symmetric Technique\n\narXiv:1611.03253v1 [] 10 Nov 2016\n\nNiv Buchbinder∗\n\nMoran Feldman†\n\nNovember 11, 2016\n\nAbstract\nThe study of combinatorial optimization problems with a submodular objective has attracted\nmuch attention in recent years. Such problems are important in both theory and practice because\ntheir objective functions are very general. Obtaining further improvements for many submodular\nmaximization problems boils down to finding better algorithms for optimizing a relaxation of\nthem known as the multilinear extension.\nIn this work we present an algorithm for optimizing the multilinear relaxation whose guarantee impr

# TF IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

def size_mb(docs):
    return sum(len(s.encode("utf-8")) for s in docs) / 1e6

data = ds["train"][:]['text']

data_train_size_mb = size_mb(data)
print(f"Size of the data set: {data_train_size_mb:.2f} MB")

vectorizer = TfidfVectorizer(
        sublinear_tf=True, max_df=0.5, min_df=5, stop_words="english", token_pattern=r'\b[a-zA-Z]{2,}\b'
    )
tfidf_matrix = vectorizer.fit_transform(data)
print(f"Number of features: {tfidf_matrix.shape[1]}")

feature_names = vectorizer.get_feature_names_out()

feature_names

Size of the training set: 1650.25 MB
Number of features: 133428


array(['aa', 'aaa', 'aaaa', ..., 'zzu', 'zzz', 'zzzz'], dtype=object)

In [None]:
from scipy.sparse import save_npz

# Save the tfidf_matrix to a file
save_npz("TF-IDF embeddings/tfidf_matrix.npz", tfidf_matrix)

# Save the feature names to a file
with open("TF-IDF embeddings/feature_names.txt", "w") as f:
    for name in feature_names:
        f.write(f"{name}\n")

print("TF-IDF matrix saved successfully!")

TF-IDF matrix saved successfully!


In [6]:
try:
    tfidf_matrix
except NameError:
    print("TF-IDF matrix is not loaded. Loading now...")
    tfidf_matrix = load_npz("TF-IDF embeddings/tfidf_matrix.npz")
    with open("TF-IDF embeddings/feature_names.txt", "r") as f:
        feature_names = [line.strip() for line in f.readlines()]

TF-IDF matrix is not loaded. Loading now...


In [7]:
tfidf_matrix.shape

(28388, 133428)

# Word2Vec vectors

In [11]:
import gensim.downloader as api
import logging
import os
import gensim

# Set up logging to display information
logging.basicConfig(level=logging.INFO)

# Define the model name
model_name = 'word2vec-google-news-300'

if os.path.isfile("models/word2vec-google-news-300.model"):
    print("Model already exists, loading from file...")
    # Load the model from the file
    wv = gensim.models.KeyedVectors.load("models/word2vec-google-news-300.model")
else:
    # Attempt to load the model
    try:
        print(f"\nAttempting to download '{model_name}' using gensim downloader...")
        # Load the model using gensim's downloader
        wv = api.load(model_name)
        print("\nModel downloaded/loaded successfully!")
        print(f"It is now cached in: {api.BASE_DIR}")

        # Save the Word2Vec model
        if wv:
            model_path = "models/word2vec.model"
            print(f"Saving the Word2Vec model to '{model_path}'...")
            wv.save(model_path)
            print("Model saved successfully!")
        else:
            print("Word2Vec model is not loaded, so it cannot be saved.")

    except Exception as e:
        print(f"\nFailed to download or load using gensim downloader: {e}")
        wv = None  # Ensure wv is defined even if loading fails

# Check if the model was loaded successfully
if wv:
    print("Word2Vec model is ready to use.")
else:
    print("Word2Vec model could not be loaded.")

INFO:gensim.utils:loading KeyedVectors object from models/word2vec-google-news-300.model


Model already exists, loading from file...


INFO:gensim.utils:loading vectors from models/word2vec-google-news-300.model.vectors.npy with mmap=None
INFO:gensim.utils:KeyedVectors lifecycle event {'fname': 'models/word2vec-google-news-300.model', 'datetime': '2025-05-08T14:41:50.820002', 'gensim': '4.3.3', 'python': '3.11.7 | packaged by Anaconda, Inc. | (main, Dec 15 2023, 18:05:47) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.26100-SP0', 'event': 'loaded'}


Word2Vec model is ready to use.


In [8]:
from numba import njit 

In [None]:
try:
    tfidf_matrix
except NameError:
    print("TF-IDF matrix is not loaded. Loading now...")
    tfidf_matrix = load_npz("TF-IDF embeddings/tfidf_matrix.npz")
    with open("TF-IDF embeddings/feature_names.txt", "r") as f:
        feature_names = [line.strip() for line in f.readlines()]

word2vec_embedding = np.zeros((tfidf_matrix.shape[0],300))

print("loading dataset...")
texts = ds["train"][:]['text']
print("dataset loaded successfully!")

print("tokenizing texts...")
tokenized_texts = [text.split() for text in texts]
print("filtering texts...")
filtered_texts = [[word for word in text if word in wv.key_to_index] for text in tokenized_texts]
print("texts filtered successfully!")

print("calculating word2vec embeddings...")
for i, text in enumerate(tqdm(texts, desc="Processing texts")):
    text = text.split()
    text = [word for word in text if word in wv.key_to_index]  # Filter words not in the model's vocabulary

    # Get the word vectors for the words in the text
    word_vectors = np.stack([wv[word] for word in text])

    feature_index = {word: idx for idx, word in enumerate(feature_names)}
    # Get the corresponding TF-IDF weights for the words in the text
    tfidf_weights = np.array([tfidf_matrix[i, feature_index.get(word, 0)] for word in text])
    # Calculate the weighted average of the word vectors using TF-IDF weights
    weighted_average_vector = np.average(word_vectors, axis=0, weights=tfidf_weights+1e-10)  # Adding a small constant to avoid division by zero
    word2vec_embedding[i]=weighted_average_vector 

# save word2vec_embedding to a file
np.savez_compressed("Word2Vec embeddings/word2vec_embedding.npz", word2vec_embedding=word2vec_embedding)
print("Word2Vec embeddings saved successfully!")

TypingError: Failed in nopython mode pipeline (step: nopython frontend)
[1m[1mUntyped global name 'tqdm':[0m [1m[1mCannot determine Numba type of <class 'type'>[0m
[1m
File "..\..\AppData\Local\Temp\ipykernel_24096\431137983.py", line 15:[0m
[1m<source missing, REPL/exec in use?>[0m
[0m
[0m[1mDuring: Pass nopython_type_inference[0m

In [17]:
# save word2vec_embedding to a file
np.savez_compressed("Word2Vec embeddings/word2vec_embedding.npz", word2vec_embedding=word2vec_embedding)
print("Word2Vec embeddings saved successfully!")

Word2Vec embeddings saved successfully!


In [3]:
# Load the Word2Vec embeddings
word2vec_embeddings = np.load("Word2Vec embeddings/word2vec_embedding.npz")["word2vec_embedding"]
print(f"Word2Vec embeddings shape: {word2vec_embeddings.shape}")

Word2Vec embeddings shape: (28388, 300)


In [5]:
from sklearn.decomposition import PCA

# Assuming `data` is your 300-dimensional dataset (shape: [n_samples, 300])
pca = PCA(n_components=3)
reduced_data = pca.fit_transform(word2vec_embeddings)

# colors = np.array([wv.similarity(center_word, word) for word in words])
# Create a 3D scatter plot
fig = go.Figure(data=[go.Scatter3d(
    x=reduced_data[:, 0],
    y=reduced_data[:, 1],
    z=reduced_data[:, 2],
    mode='markers+text',
    marker=dict(
        size=5,
        # color=colors,
        colorscale='Plasma',
        opacity=0.8
    ),
    # text=words  # Add word labels
)])

fig.update_layout(
    # title=f"3D Scatter Plot of Word Vectors for '{center_word}'",
    scene=dict(
        xaxis_title='PCA 1',
        yaxis_title='PCA 2',
        zaxis_title='PCA 3'
    ),
    coloraxis_colorbar=dict(
        title="Similarity",
        thickness=20,
        len=0.75,
        x=1.1  # Position the colorbar slightly outside the plot
    ),
    width=1000,
    height=800,
)

fig.show()

In [None]:
try:
    tfidf_matrix
except NameError:
    print("TF-IDF matrix is not loaded. Loading now...")
    tfidf_matrix = load_npz("TF-IDF embeddings/tfidf_matrix.npz")
    with open("TF-IDF embeddings/feature_names.txt", "r") as f:
        feature_names = [line.strip() for line in f.readlines()]

word2vec_embedding = np.zeros((tfidf_matrix.shape[0], 300))

texts = ds["train"][:]['text']

# Tokenize and filter words in the vocabulary
tokenized_texts = [text.split() for text in texts]
filtered_texts = [[word for word in text if word in wv.key_to_index] for text in tokenized_texts]

# Create a mapping of feature names to indices
feature_index = {word: idx for idx, word in enumerate(feature_names)}

# Vectorize the process
for i, text in enumerate(tqdm(filtered_texts, desc="Processing texts")):
    if text:  # Ensure the text is not empty after filtering
        word_vectors = np.stack([wv[word] for word in text])
        tfidf_weights = np.array([tfidf_matrix[i, feature_index.get(word, 0)] for word in text]).flatten()
        weighted_average_vector = np.average(word_vectors, axis=0, weights=tfidf_weights)
        word2vec_embedding[i] = weighted_average_vector

In [None]:
# === 1. Get Closest Words (Most Similar Vectors) ===
try:
    print("\n--- Finding words closest to 'computer' ---")
    similar_words = wv.most_similar('computer', topn=10)

    print("Words most similar to 'computer':")
    for word, score in similar_words:
        print(f"- {word} (Similarity: {score:.4f})")

except NameError:
    print("Error: The 'wv' object is not loaded.")
except KeyError as e:
    print(f"Error: Word '{e}' not found in the vocabulary for similarity check.")
except Exception as e:
    print(f"An error occurred: {e}")

# === 2. Access a Specific Word's Vector ===
try:
    print("\n--- Accessing the vector for 'tree' ---")
    vector_tree = wv['tree']

    print(f"Successfully retrieved vector for 'tree'.")
    print(f"Vector dimensions (shape): {vector_tree.shape}")

except NameError:
    print("Error: The 'wv' object is not loaded.")
except KeyError as e:
    print(f"Error: Word '{e}' not found in the vocabulary.")
except Exception as e:
    print(f"An error occurred: {e}")


--- Finding words closest to 'computer' ---
Words most similar to 'computer':
- computers (Similarity: 0.7979)
- laptop (Similarity: 0.6640)
- laptop_computer (Similarity: 0.6549)
- Computer (Similarity: 0.6473)
- com_puter (Similarity: 0.6082)
- technician_Leonard_Luchko (Similarity: 0.5663)
- mainframes_minicomputers (Similarity: 0.5618)
- laptop_computers (Similarity: 0.5585)
- PC (Similarity: 0.5540)
- maker_Dell_DELL.O (Similarity: 0.5519)

--- Accessing the vector for 'tree' ---
Successfully retrieved vector for 'tree'.
Vector dimensions (shape): (300,)
