In [1]:
import ast

import pandas as pd

# Load the tokenized data from CSV file
df = pd.read_csv(r"C:\Users\Anuz\OneDrive\Desktop\excel work\stemmed_tokenized_cleaned_dataset.csv")

# The 'stemmed_text' column is expected to hold the string representation of a
# token list in every row. Parse it with ast.literal_eval, which only accepts
# Python literals, rather than eval, which would execute any expression that
# happens to be stored in the CSV.
tokenized_sentences = df['stemmed_text'].apply(ast.literal_eval).tolist()

# Verify loaded sentences
print(f"Loaded {len(tokenized_sentences)} tokenized sentences.")

Loaded 8639 tokenized sentences.


In [2]:
import numpy as np

class GloVe:
    """Word-embedding trainer driven by a windowed co-occurrence matrix.

    NOTE: the objective implemented in ``train`` is a simplification: it pushes
    the dot product of two word vectors towards their raw co-occurrence count.
    It has no bias terms, no separate context vectors and no count weighting,
    so it is not the full published GloVe loss.

    Attributes:
        vocab_size: Number of rows in the embedding and co-occurrence matrices.
        embedding_dim: Length of every word vector.
        window_size: Number of neighbours considered on each side of a word.
        learning_rate: Step size applied to every update.
        vocab: Dict mapping each word seen so far to its row index.
        word_embeddings: ``(vocab_size, embedding_dim)`` array of word vectors.
        co_occurrence: ``(vocab_size, vocab_size)`` array of pair counts.
    """

    def __init__(self, vocab_size, embedding_dim, window_size, learning_rate=0.05, seed=None):
        """Allocate the embedding and co-occurrence matrices.

        Args:
            vocab_size: Maximum number of distinct words the model can hold.
            embedding_dim: Dimensionality of each word vector.
            window_size: Context radius (in tokens) used for co-occurrence.
            learning_rate: Step size for the embedding updates.
            seed: Optional integer seed for the embedding initialisation. When
                ``None`` the global NumPy random state is used, as before.
        """
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.window_size = window_size
        self.learning_rate = learning_rate
        self.vocab = {}  # word -> row index, filled in first-seen order
        rng = np.random if seed is None else np.random.RandomState(seed)
        self.word_embeddings = rng.rand(vocab_size, embedding_dim) * 0.01  # Small positive initial values
        self.co_occurrence = np.zeros((vocab_size, vocab_size))

    def build_co_occurrence_matrix(self, tokenized_sentences):
        """Accumulate windowed co-occurrence counts into ``self.co_occurrence``.

        For every token, each other token at most ``window_size`` positions
        away in the same sentence adds 1 to the (word, context) cell. Counts
        are added on top of whatever the matrix already contains.

        Args:
            tokenized_sentences: Iterable of sentences, each a sequence of tokens.

        Raises:
            ValueError: If the data contains more distinct words than
                ``vocab_size``.
        """
        for sentence in tokenized_sentences:
            # Resolve every token once per sentence instead of once per pair.
            indices = [self.word_to_index(word) for word in sentence]
            for i, word_index in enumerate(indices):
                start = max(0, i - self.window_size)
                end = min(len(indices), i + self.window_size + 1)
                for j in range(start, end):
                    if i != j:
                        self.co_occurrence[word_index, indices[j]] += 1

    def word_to_index(self, word):
        """Return the row index of ``word``, registering it on first use.

        Indices are handed out in first-seen order, so the mapping is the same
        on every run and two different words never share a row. (The built-in
        ``hash`` of a string is salted per interpreter process, so it cannot
        provide a stable mapping.)

        Args:
            word: Hashable token.

        Returns:
            Integer index in ``range(vocab_size)``.

        Raises:
            ValueError: If ``word`` is new and all ``vocab_size`` rows are
                already assigned.
        """
        index = self.vocab.get(word)
        if index is None:
            if len(self.vocab) >= self.vocab_size:
                raise ValueError(
                    f"Vocabulary is full ({self.vocab_size} words); cannot add {word!r}. "
                    "Increase vocab_size to cover every distinct token."
                )
            index = len(self.vocab)
            self.vocab[word] = index
        return index

    def train(self, epochs):
        """Run ``epochs`` passes of updates over all co-occurring word pairs.

        Args:
            epochs: Number of full passes over the positive co-occurrence cells.
        """
        # The matrix does not change during training, so extract the positive
        # cells once (row-major order, the same order a nested scan would
        # visit them) instead of scanning vocab_size**2 cells every epoch.
        rows, cols = np.nonzero(self.co_occurrence > 0)
        counts = self.co_occurrence[rows, cols]
        pairs = list(zip(rows.tolist(), cols.tolist(), counts.tolist()))

        for epoch in range(epochs):
            for i, j, count in pairs:
                vector_i = self.word_embeddings[i]
                vector_j = self.word_embeddings[j]

                # Residual between the observed count and the current dot product.
                loss = count - np.dot(vector_i, vector_j)

                # Both updates are computed from the pre-update vectors and
                # clipped element-wise to keep a single step bounded.
                update_i = np.clip(self.learning_rate * loss * vector_j, -1.0, 1.0)
                update_j = np.clip(self.learning_rate * loss * vector_i, -1.0, 1.0)

                self.word_embeddings[i] += update_i
                self.word_embeddings[j] += update_j

            if epoch % 10 == 0:
                print(f'Epoch {epoch}: Training GloVe model...')

In [3]:
# Gather every distinct token. Each entry of tokenized_sentences is iterated
# directly as a sequence of tokens; no string splitting takes place here.
unique_tokens = {token for tokens in tokenized_sentences for token in tokens}

vocab_size = len(unique_tokens)
print(f"Vocabulary Size: {vocab_size}")

Vocabulary Size: 43047


In [4]:
# Model hyperparameters (adjust these according to your dataset)
# The vocabulary size is derived from the loaded sentences rather than typed in
# by hand, so the model always has one embedding row per distinct token even
# when the dataset changes.
vocab_size = len({token for sentence in tokenized_sentences for token in sentence})
embedding_dim = 100
window_size = 5

# Initialize GloVe model
glove_model = GloVe(vocab_size=vocab_size, embedding_dim=embedding_dim, window_size=window_size)

# Build co-occurrence matrix
glove_model.build_co_occurrence_matrix(tokenized_sentences)

# Train GloVe model
glove_model.train(epochs=100)

print("GloVe training completed.")

Epoch 0: Training GloVe model...
Epoch 10: Training GloVe model...
Epoch 20: Training GloVe model...
Epoch 30: Training GloVe model...
Epoch 40: Training GloVe model...
Epoch 50: Training GloVe model...
Epoch 60: Training GloVe model...
Epoch 70: Training GloVe model...
Epoch 80: Training GloVe model...
Epoch 90: Training GloVe model...
GloVe training completed.


In [27]:
# Save the embeddings after training
output_file_path = r"C:\Users\Anuz\OneDrive\Desktop\excel work\Embeddings.txt"  # Update with your desired file path

# Distinct words in first-seen order (dict keys keep insertion order), so the
# file is laid out the same way on every run.
vocabulary = dict.fromkeys(word for sentence in tokenized_sentences for word in sentence)

# Ask the trained model which row each word was assigned during training.
# Numbering the words independently (e.g. by enumerating a set) would pair
# every vector with an unrelated word.
word_to_index = {word: glove_model.word_to_index(word) for word in vocabulary}
index_to_word = {idx: word for word, idx in word_to_index.items()}

with open(output_file_path, 'w', encoding='utf-8') as f:
    for word, row in word_to_index.items():
        embedding_vector = ' '.join(map(str, glove_model.word_embeddings[row]))  # Convert vector to string
        f.write(f"{word} {embedding_vector}\n")  # One "word v1 v2 ... vN" line per word

print("Embeddings saved to", output_file_path)

Embeddings saved to C:\Users\Anuz\OneDrive\Desktop\excel work\Embeddings.txt
