## Dataset

In [None]:
from datasets import load_dataset
# Fetch the "no_ref" configuration of the arXiv classification dataset.
ds = load_dataset("ccdv/arxiv-classification", "no_ref")

# Summarise what was loaded: the split names, the sizes of the train and test
# splits, the train split's feature schema, and its first record.
print("Splits:", ds.keys())
for size_label, split_name in (("Train size:", "train"), ("Test size:", "test")):
    print(size_label, len(ds[split_name]))
train_split = ds["train"]
print("Features:", train_split.features)
print("Example record:", train_split[0])

# Subject names listed in label order: the position in this list is the label id.
_subject_names = [
    'Commutative Algebra',
    'Computer Vision',
    'Artificial Intelligence',
    'Systems and Control',
    'Group Theory',
    'Computational Engineering',
    'Programming Languages',
    'Information Theory',
    'Data Structures',
    'Neural and Evolutionary',
    'Statistics Theory',
]

# Lookup table from label id to class name. Keys are the ids written as
# strings ('0' through '10'), so an integer label must be passed through
# str() before it is used as a key.
arxiv_subjects = {
    str(label_id): subject
    for label_id, subject in enumerate(_subject_names)
}

# Word2Vec vectors

In [None]:
import gensim.downloader as api
import logging

# Configure the root logger at INFO level so informational log messages are
# displayed while the model is being fetched.
logging.basicConfig(level=logging.INFO)

# Name of the pretrained model requested from gensim's downloader
model_name = 'word2vec-google-news-300'

# Start from None so that `wv` is always defined: later cells can test
# `wv is None` instead of depending on whether this load succeeded.
wv = None

print(f"\nAttempting to download '{model_name}' using gensim downloader...")
try:
    # Only the load itself is guarded; the success messages live in `else`
    # so an unrelated failure there is not reported as a download error.
    wv = api.load(model_name)
except Exception as e:
    # Notebook-level boundary: report the failure and carry on with wv = None
    # rather than aborting the whole cell.
    print(f"\nFailed to download or load using gensim downloader: {e}")
else:
    print("\nModel downloaded/loaded successfully!")
    print(f"It is now cached in: {api.BASE_DIR}")

# Compare against None explicitly: truthiness of a container-like object
# depends on its length, which is not the question being asked here.
if wv is not None:
    print("Word2Vec model is ready to use.")
else:
    print("Word2Vec model could not be loaded.")

In [None]:
# The loading cell leaves `wv = None` when the download fails, and `wv` is
# undefined if that cell was never run. Treat both cases as "not loaded" so
# the user gets the same clear message either way.
try:
    word_vectors = wv
except NameError:
    word_vectors = None

# === 1. Get Closest Words (Most Similar Vectors) ===
query_word = 'computer'
print(f"\n--- Finding words closest to '{query_word}' ---")
if word_vectors is None:
    print("Error: The 'wv' object is not loaded.")
else:
    try:
        similar_words = word_vectors.most_similar(query_word, topn=5)
    except KeyError:
        # Report the queried word itself: interpolating the KeyError would
        # print its repr-quoted message instead of the bare word.
        print(f"Error: Word '{query_word}' not found in the vocabulary for similarity check.")
    except Exception as e:
        print(f"An error occurred: {e}")
    else:
        print(f"Words most similar to '{query_word}':")
        for word, score in similar_words:
            print(f"- {word} (Similarity: {score:.4f})")

# === 2. Access a Specific Word's Vector ===
lookup_word = 'tree'
print(f"\n--- Accessing the vector for '{lookup_word}' ---")
if word_vectors is None:
    print("Error: The 'wv' object is not loaded.")
else:
    try:
        vector_tree = word_vectors[lookup_word]
    except KeyError:
        print(f"Error: Word '{lookup_word}' not found in the vocabulary.")
    except Exception as e:
        print(f"An error occurred: {e}")
    else:
        print(f"Successfully retrieved vector for '{lookup_word}'.")
        print(f"Vector dimensions (shape): {vector_tree.shape}")