# NLP Project: Word2Vec and Embeddings Analysis

This notebook implements Word2Vec embeddings for a dataset of questions, visualizes the embeddings in both 2D and 3D, and calculates similarities between questions using cosine similarity. It also demonstrates how to retrieve similar questions for a given query.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import numpy as np
import gensim
import re
from sklearn.metrics.pairwise import cosine_similarity

## Data Preparation
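The dataset consists of Stack Overflow questions with titles, tags, and question bodies. We clean the titles and bodies by stripping HTML tags, removing non-alphanumeric characters, and lowercasing; tokenization happens later, when the Word2Vec training corpus is built.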

In [None]:
# Only these three columns are used in the rest of the notebook.
question_columns = ['Title', 'Tags', 'Body']

train_data = pd.read_csv('train.csv')[question_columns]
valid_data = pd.read_csv('valid.csv')[question_columns]

# Preprocessing function
def preprocess_text(text):
    """Normalize a raw question title or body into lowercase alphanumeric text.

    HTML tags are replaced with a space (rather than deleted) so that words on
    either side of a tag remain separate tokens: ``<p>foo</p><p>bar</p>``
    yields two words instead of ``foobar``. Every character that is not an
    ASCII letter, a digit, or whitespace is then removed, and the result is
    lowercased.

    Parameters
    ----------
    text : str or any
        Raw text. Non-string values (for example the float NaN that pandas
        uses for empty CSV cells) are treated as empty text.

    Returns
    -------
    str
        The cleaned text, or ``''`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        # re.sub raises TypeError on non-strings; map missing values to ''.
        return ''
    text = re.sub(r'<.*?>', ' ', text)  # Replace HTML tags with a space
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Remove non-alphanumeric characters
    return text.lower()

# Clean the free-text columns of both splits in place (train first, then valid).
for frame in (train_data, valid_data):
    for text_column in ('Title', 'Body'):
        frame[text_column] = frame[text_column].apply(preprocess_text)

## Training Word2Vec Model
We use the Gensim library to train a Word2Vec model on the titles and bodies of the questions.

In [None]:
# Corpus: every training title followed by every training body; each text is
# one whitespace-tokenized "sentence". Non-string entries are skipped.
corpus_texts = train_data['Title'].tolist()
corpus_texts.extend(train_data['Body'].tolist())
sentences = [text.split() for text in corpus_texts if isinstance(text, str)]

model = gensim.models.Word2Vec(
    sentences=sentences,
    vector_size=100,
    window=5,
    min_count=2,
    workers=4,
)
# Persist the trained model so it can be reloaded later.
model.save('word2vec_model.model')

## Embedding Visualization
We visualize the embeddings of the first 100 vocabulary words, using PCA for a 3D view and t-SNE for a 2D view.

In [None]:
# Stack the vectors of the first 100 vocabulary entries into one matrix.
word_vectors = model.wv
words = list(word_vectors.index_to_key[:100])
embeddings = np.array([word_vectors[token] for token in words])

# Reduce to three principal components for the 3D view.
pca = PCA(n_components=3)
reduced_embeddings = pca.fit_transform(embeddings)

# Scatter the points and label each one with its word.
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(*reduced_embeddings.T)
for (x_pos, y_pos, z_pos), word in zip(reduced_embeddings, words):
    ax.text(x_pos, y_pos, z_pos, word)
ax.set_title('Word Embeddings (3D)')
plt.show()

# Project the same word vectors to two dimensions with t-SNE (fixed random_state).
tsne = TSNE(n_components=2, random_state=42, perplexity=30)
reduced_embeddings_2d = tsne.fit_transform(embeddings)

# Scatter the points and label each one with its word.
plt.scatter(*reduced_embeddings_2d.T)
for (x_pos, y_pos), word in zip(reduced_embeddings_2d, words):
    plt.text(x_pos, y_pos, word)
plt.title('Word Embeddings (2D)')
plt.show()

## Finding Similar Questions
We compute document vectors by averaging the word vectors of each question and use cosine similarity to find the most similar questions.

In [None]:
def document_vector(doc):
    """Return the mean Word2Vec vector of the in-vocabulary words of ``doc``.

    The document is split on whitespace; words missing from the vocabulary of
    the notebook-level ``model`` are ignored.

    Parameters
    ----------
    doc : str or any
        Document text. Non-string values (e.g. NaN) are treated as empty.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``model.vector_size``: the element-wise mean of
        the known words' vectors, or all zeros when no word is known.
    """
    if not isinstance(doc, str):
        return np.zeros(model.vector_size)

    # key_to_index is a dict, so membership is O(1); index_to_key is a list
    # and would be scanned linearly for every token.
    vocabulary = model.wv.key_to_index
    known_vectors = [model.wv[word] for word in doc.split() if word in vocabulary]
    if not known_vectors:
        return np.zeros(model.vector_size)
    return np.mean(known_vectors, axis=0)

query = 'How to train a machine learning model?'
# Clean the query the same way as the training text; otherwise capitalized or
# punctuated tokens ("How", "model?") never match the lowercased vocabulary.
query_vector = document_vector(preprocess_text(query))

# One document per question: title and body joined by a space, then averaged
# into a single vector stored in the 'DocVector' column.
question_texts = train_data['Title'] + ' ' + train_data['Body']
train_data['DocVector'] = question_texts.apply(document_vector)

# Score all questions with one cosine_similarity call instead of one per row.
doc_matrix = np.vstack(train_data['DocVector'].tolist())
similarities = pd.Series(
    cosine_similarity(query_vector.reshape(1, -1), doc_matrix)[0],
    index=train_data.index,
)

# nlargest returns index *labels*, so rows are selected with .loc (iloc would
# only be right for a default 0..n-1 index).
top_scores = similarities.nlargest(5)
top_matches = train_data.loc[top_scores.index]

print('Query:', query)
print('Top 5 Similar Questions:')
# Number the results by rank (1-5) rather than by row label.
for rank, (title, score) in enumerate(zip(top_matches['Title'], top_scores), start=1):
    print(f"{rank}. {title} (Similarity: {score:.2f})")