 Lab Assignment 6: Word Embeddings with word2vec
•	Train a word2vec model on a small corpus using Gensim.
•	Visualize word embeddings using t-SNE.
•	Find similar words using cosine similarity.

In [None]:
! pip install scipy==1.10.1 numpy==1.23.5 gensim==4.3.2
! pip install scikit-learn nltk matplotlib

In [None]:
# Step 2: Import necessary modules
import gensim
from gensim.models import Word2Vec
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
import re

# Step 3: Sample text corpus (you can replace it with your own)
corpus = """
Natural language processing enables machines to understand and interpret human language.
It involves various tasks like sentiment analysis, named entity recognition, and machine translation.
Deep learning has significantly improved NLP by using models like transformers and word embeddings.
Word2Vec is an efficient model to learn vector representations of words based on context.
"""

# Step 4: Preprocess text
# Fetch the NLTK tokenizer data, lowercase the corpus, split it into
# sentences, then strip punctuation from each sentence before word-tokenizing.
nltk.download('punkt_tab')
sentences = sent_tokenize(corpus.lower())
tokenized_sentences = [
    word_tokenize(re.sub(r'[^\w\s]', '', sentence_text))
    for sentence_text in sentences
]

# Step 5: Train Word2Vec model
model = Word2Vec(
    sentences=tokenized_sentences,
    vector_size=100,
    window=5,
    min_count=1,  # keep every word, even those seen only once
    workers=2,
    sg=1,  # sg=1 = skip-gram
)

# Step 6: Explore vocabulary
print("Vocabulary:", list(model.wv.key_to_index))

# Step 7: Find similar words
word = 'language'
print(f"\nWords similar to '{word}':")
print(model.wv.most_similar(word))

# Step 8: Visualize embeddings using t-SNE
import numpy as np
def visualize_embeddings(model, perplexity=30.0, random_state=0):
    """Project the model's word vectors to 2-D with t-SNE and plot them.

    Every word in ``model.wv.key_to_index`` is looked up via ``model.wv[word]``,
    the resulting vectors are reduced to two dimensions with t-SNE, and each
    word is drawn as a labelled point in a matplotlib figure.

    Args:
        model: Trained embedding model exposing ``model.wv.key_to_index`` and
            ``model.wv[word]`` (for example a gensim ``Word2Vec`` instance).
        perplexity: Requested t-SNE perplexity. It is capped at
            ``len(vocabulary) - 1`` because scikit-learn requires the
            perplexity to be smaller than the number of samples.
        random_state: Seed passed to ``TSNE`` so the layout is repeatable.

    Raises:
        ValueError: If the vocabulary contains fewer than two words, in which
            case a t-SNE projection cannot be computed.
    """
    words = list(model.wv.key_to_index)
    if len(words) < 2:
        raise ValueError(
            "Need at least two words in the vocabulary to run t-SNE."
        )

    word_vectors = np.array([model.wv[word] for word in words])

    # Small vocabularies would otherwise trip scikit-learn's
    # "perplexity must be less than n_samples" check.
    effective_perplexity = min(perplexity, len(words) - 1)
    tsne = TSNE(
        n_components=2,
        perplexity=effective_perplexity,
        random_state=random_state,
    )
    # Actually run the dimensionality reduction; without this call the plot
    # would only show the first two raw embedding dimensions.
    reduced_vectors = tsne.fit_transform(word_vectors)

    plt.figure(figsize=(10, 6))
    # One scatter call per word keeps a distinct colour per point.
    for word, (x, y) in zip(words, reduced_vectors):
        plt.scatter(x, y)
        plt.annotate(word, xy=(x, y))
    plt.title("t-SNE Visualization of Word Embeddings")
    plt.show()

# Plot the embeddings of the Word2Vec model trained in Step 5.
visualize_embeddings(model)