In [None]:
from tqdm import tqdm
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pandas as pd

In [None]:
!pip install transformers
!pip install tensorflow

In [None]:
# Load the lyrics table.
# NOTE(review): the path lives under /content/drive, so this presumably
# requires Google Drive to be mounted beforehand -- no mount cell is visible.
df = pd.read_csv('/content/drive/MyDrive/CL_final/data/tokenized_rest_lyrics_for_embeddings.csv')

# Preprocess data and create vocabulary
# Missing lyrics are replaced by an empty string *before* the string cast:
# a bare astype(str) turns NaN into the literal text "nan", which would then
# be tokenized as a real word and leak into the vocabulary and training pairs.
lyrics_data = df['clean'].fillna('').astype(str)
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lyrics_data)
# Keras word indices start at 1; index 0 is reserved (padding), hence the +1.
vocabulary_size = len(tokenizer.word_index) + 1
print('start')

start


In [None]:
# Display the vocabulary size (len(tokenizer.word_index) + 1, as computed when the tokenizer was fitted)
vocabulary_size

288380

In [None]:
# Access the word index (word -> integer id mapping built by fit_on_texts)
word_index = tokenizer.word_index
print("Word Index:")
# Show only the first few entries: the mapping holds one entry per vocabulary
# word, so echoing the whole dict floods the notebook output.
# (Per-word frequencies are available as tokenizer.word_counts if needed.)
list(word_index.items())[:10]

In [None]:
# Build (context, target) pairs for CBOW training
window_size = 2
training_data = []
for text in tqdm(lyrics_data):
  token_ids = tokenizer.texts_to_sequences([text])[0]
  # Only positions with a full window on both sides become targets, so
  # sequences shorter than 2 * window_size + 1 contribute nothing.
  last_center = len(token_ids) - window_size
  training_data.extend(
    (
      token_ids[center - window_size: center] + token_ids[center + 1: center + window_size + 1],
      token_ids[center],
    )
    for center in range(window_size, last_center)
  )

# Split the pairs into parallel input (contexts) and label (targets) lists
X = [context for context, _ in training_data]
y = [target for _, target in training_data]



100%|██████████| 115636/115636 [01:24<00:00, 1368.69it/s]


In [None]:
# Convert the Python lists into NumPy arrays for Keras
X = np.array(X)
y = np.array(y)

# CBOW-style network: embed each context word, concatenate the embeddings
# (Flatten), then predict the centre word with a softmax over the vocabulary.
embedding_dim = 300  # size of each learned word vector
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(
  input_dim=vocabulary_size,
  output_dim=embedding_dim,
  input_length=window_size*2,  # context words per sample: window_size on each side
))
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(vocabulary_size, activation='softmax'))

# Targets are integer word ids rather than one-hot vectors, hence the sparse loss
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')
model.fit(X, y, epochs=1, batch_size=512)


# Extract word embeddings: one row per vocabulary index, taken from the
# Embedding layer (the first layer of the model).
word_embeddings = model.layers[0].get_weights()[0]

# Save word embeddings as .txt file, one "<word> <v1> <v2> ..." line per word.
# Rows are looked up by each word's own index: tokenizer.word_index starts at 1
# and row 0 is the reserved padding slot, so zipping the words against the
# matrix rows would pair every word with the previous word's vector.
with open('word.txt', 'w', encoding='utf-8') as f:
  for word, index in tokenizer.word_index.items():
    embedding_str = ' '.join(str(val) for val in word_embeddings[index])
    f.write(f'{word} {embedding_str}\n')