In [3]:
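# One-time setup: uncomment and run only if gensim is not installed yet
# (%pip installs into the environment of the running kernel).
# %pip install gensim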

In [5]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
import string
from gensim.models import Word2Vec
import warnings

# Silence every warning category for the rest of the session
# (keeps the cell outputs below free of library warning noise).
warnings.simplefilter('ignore')

In [7]:
# Read the BBC News training set into a DataFrame.
# If the file is missing, only a message is printed and `df` stays undefined;
# the later cells check for `df` before using it.
file_path = 'data_files/BBC News Train.csv'
try:
    df = pd.read_csv(file_path)
except FileNotFoundError:
    print("Error: Dataset not found!!!! ")

In [9]:
# Plain-text preview of the first five rows, shown only when the load succeeded
if 'df' in locals():
    print(df.head(n=5))

   ArticleId                                               Text  Category
0       1833  worldcom ex-boss launches defence lawyers defe...  business
1        154  german business confidence slides german busin...  business
2       1101  bbc poll indicates economic gloom citizens in ...  business
3       1976  lifestyle  governs mobile choice  faster  bett...      tech
4        917  enron bosses in $168m payout eighteen former e...  business


In [11]:
# Notebook-rendered preview of the first five rows (last expression is displayed)
df.head(n=5)

Unnamed: 0,ArticleId,Text,Category
0,1833,worldcom ex-boss launches defence lawyers defe...,business
1,154,german business confidence slides german busin...,business
2,1101,bbc poll indicates economic gloom citizens in ...,business
3,1976,lifestyle governs mobile choice faster bett...,tech
4,917,enron bosses in $168m payout eighteen former e...,business


##### Preprocess for Word2Vec

In [14]:
# English stop words as a set, for O(1) membership tests during preprocessing.
# On a fresh environment the NLTK 'stopwords' corpus may not be installed yet,
# in which case stopwords.words() raises LookupError; fetch it once and retry.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords', quiet=True)
    stop_words = set(stopwords.words('english'))

In [16]:
def preprocess_text_w2v(text):
    """
    Turn one raw document into a list of clean tokens for Word2Vec.

    Steps:
    1. Tokenize with nltk.word_tokenize
    2. Lowercase each token
    3. Keep only purely alphabetic tokens (this drops punctuation, numbers
       and any token containing a non-letter character)
    4. Remove English stop words (the module-level ``stop_words`` set)
    5. Remove short words (<= 2 chars)

    Args:
        text: Raw document text. Any non-string value (e.g. None, or the
            float NaN pandas uses for an empty cell) is treated as an
            empty document.

    Returns:
        list of str: the surviving tokens, in their original order
        (repeated words are kept).
    """
    # Guard against missing values so one bad row cannot abort the whole corpus build.
    if not isinstance(text, str):
        return []

    lowered_tokens = (token.lower() for token in nltk.word_tokenize(text))

    # str.isalpha() already rejects punctuation and digits, so no separate
    # punctuation test is needed. The cheap string checks run before the
    # stop-word lookup.
    return [
        token
        for token in lowered_tokens
        if token.isalpha() and len(token) > 2 and token not in stop_words
    ]

In [18]:
if 'df' in locals():
    print("\nStarting text preprocessing for Word2Vec...")

    # One token list per article, in the same order as the rows of df
    corpus = list(map(preprocess_text_w2v, df['Text']))

    print("Preprocessing complete.")

    # Sanity check: show the first 20 tokens of the first document
    print("\n--- Example Processed Document (as a list of tokens) ---")
    first_document = corpus[0]
    print(first_document[:20])


Starting text preprocessing for Word2Vec...
Preprocessing complete.

--- Example Processed Document (as a list of tokens) ---
['worldcom', 'launches', 'defence', 'lawyers', 'defending', 'former', 'worldcom', 'chief', 'bernie', 'ebbers', 'battery', 'fraud', 'charges', 'called', 'company', 'whistleblower', 'first', 'witness', 'cynthia', 'cooper']


##### Train the Word2Vec Model

In [21]:
if 'corpus' in locals():
    print("\nTraining Word2Vec model...")

    # Train the model
    # Key parameters:
    # - sentences: Our corpus, a list of token lists (one list per document)
    # - vector_size: The dimensionality of the word vector (100 is a good default)
    # - window: The max distance between a target word and its neighbors (5)
    # - min_count: Ignores all words with a total frequency lower than this (5)
    # - workers: Number of training threads. Kept at 1 because multi-threaded
    #   training makes results vary from run to run.
    # - seed: Fixes the random number generator so that, together with
    #   workers=1, re-running the notebook reproduces the same embeddings.
    #   (Across separate interpreter launches, gensim additionally recommends
    #   setting the PYTHONHASHSEED environment variable.)
    model = Word2Vec(
        sentences=corpus,
        vector_size=100,
        window=5,
        min_count=5,
        workers=1,
        seed=42
    )

    print("Model training complete.")
    print(f"\nVocabulary size: {len(model.wv.key_to_index)} words")


Training Word2Vec model...
Model training complete.

Vocabulary size: 8076 words


##### Explore the Embeddings

In [24]:
if 'model' in locals():

    # --- 1. Find Most Similar Words ---
    # Probe one word per dataset topic. Each entry maps the probe word to the
    # message printed when that word is missing from the model vocabulary.
    missing_word_messages = {
        'government': "'government' not in vocabulary (or failed min_count).",
        'sport': "'sport' not in vocabulary.",
        'music': "'music' not in vocabulary.",
    }
    for probe_word, missing_message in missing_word_messages.items():
        print(f"\n--- Most similar to '{probe_word}' ---")
        try:
            print(model.wv.most_similar(probe_word, topn=5))
        except KeyError:
            print(missing_message)

    # --- 2. Find the Outlier ---
    print("\n--- Which word doesn't match? ---")
    candidate_words = ['film', 'music', 'show', 'election']
    try:
        # The model should identify 'election' as the outlier
        outlier = model.wv.doesnt_match(candidate_words)
        print(f"Outlier in ['film', 'music', 'show', 'election']: {outlier}")
    except KeyError:
        print("One or more words not in vocabulary.")


--- Most similar to 'government' ---
[('said', 0.9788783192634583), ('party', 0.9665631651878357), ('labour', 0.9626592397689819), ('tory', 0.9620104432106018), ('tories', 0.9597377777099609)]

--- Most similar to 'sport' ---
[('programme', 0.9977071285247803), ('today', 0.9939746260643005), ('correspondent', 0.992902398109436), ('website', 0.9894682168960571), ('reporters', 0.9870164394378662)]

--- Most similar to 'music' ---
[('digital', 0.9752787351608276), ('video', 0.9565134644508362), ('phones', 0.9436734318733215), ('phone', 0.9340487718582153), ('technology', 0.9312829375267029)]

--- Which word doesn't match? ---
Outlier in ['film', 'music', 'show', 'election']: election


##### The Analogy Task (Vector Arithmetic)

In [27]:
if 'model' in locals():
    print("\n--- Analogy Task ---")

    # Each triple (base, minus, plus) asks: base - minus + plus = ?
    analogy_triples = [
        # Analogy 1: the classic king - man + woman example
        ('king', 'man', 'woman'),
        # Analogy 2: more likely to be covered by this dataset (should be 'france')
        ('britain', 'london', 'paris'),
        # Analogy 3: should be 'she'. Note that 'he' is only two characters
        # long, so the len(token) > 2 filter in preprocess_text_w2v keeps it
        # out of the corpus and this analogy is expected to be skipped.
        ('he', 'man', 'woman'),
    ]

    for position, (base, minus, plus) in enumerate(analogy_triples):
        # Only the first result line is printed without a leading blank line
        lead = "" if position == 0 else "\n"
        try:
            result = model.wv.most_similar(positive=[base, plus], negative=[minus], topn=1)
            print(f"{lead}{base} - {minus} + {plus} = {result}")
        except KeyError:
            print(f"Skipping '{base}/{minus}/{plus}' analogy, words not in vocab.")


--- Analogy Task ---
king - man + woman = [('started', 0.9987601041793823)]

britain - london + paris = [('run', 0.9950436353683472)]
Skipping 'he/man/woman' analogy, words not in vocab.
