In [1]:
import pandas as pd
import tensorflow as tf
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [None]:
# Load your dataset: a headerless CSV whose single column is named 'text'.
# The path is a raw string so the Windows backslashes are taken literally;
# sequences such as '\A' or '\d' in a normal literal are invalid escapes
# that newer Python versions warn about and will eventually reject.
df = pd.read_csv(r'C:\AIDI\SEM2\APPLIED MACHINE LEARNING\LAB3\data.csv', header=None, names=['text'])
# Extract words as plain Python strings. Missing cells are dropped and
# non-string cells (e.g. numbers) are coerced, because the tokenizer used
# later in the notebook only accepts text.
words = df['text'].dropna().astype(str).tolist()

In [None]:
# Tokenize the words
# Fit a Keras Tokenizer (default settings) on the loaded texts so that each
# distinct token is assigned an integer id.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(words)
# token -> integer id mapping; later cells use it to size the embedding
# table and to write the projector metadata file.
word_index = tokenizer.word_index
# Encode every text as a list of token ids, then pad them into one 2-D array
# (pad_sequences is called with its default settings).
sequences = tokenizer.texts_to_sequences(words)
padded_sequences = pad_sequences(sequences)

In [None]:
embedding_dim = 16
vocab_size = len(word_index) + 1  # Add 1 because indexing starts from 1

# Small embedding model: token ids -> embedding vectors -> mean over the
# sequence axis -> hidden layer -> one raw logit.
model = tf.keras.Sequential([
    # Variable-length integer input. The previous hard-coded `input_length=1`
    # on the Embedding layer failed as soon as any entry tokenized to more
    # than one id, and the argument is deprecated in recent Keras releases.
    # The Input entry is not listed in `model.layers`, so `model.layers[0]`
    # is still the Embedding layer.
    tf.keras.Input(shape=(None,), dtype='int32'),
    tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(1),  # no activation: the output is a logit
])

# Compile the model
model.compile(
    optimizer='adam',
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    # The model emits logits, so the decision boundary is 0.0, not the 0.5
    # that the plain 'accuracy' shortcut would apply to the raw output.
    # The metric keeps the name 'accuracy' so history keys are unchanged.
    metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.0)],
)


In [None]:
from sklearn.model_selection import train_test_split

# Placeholder targets: one zero per encoded sample, used only so that the
# model has something to fit against.
labels = [0 for _ in sequences]

# Hold out 20% of the samples for validation (fixed seed for repeatability)
# and materialise each of the four resulting pieces as a numpy array.
train_sequences, val_sequences, train_labels, val_labels = (
    np.array(part)
    for part in train_test_split(
        padded_sequences,
        labels,
        test_size=0.2,
        random_state=42,
    )
)


In [None]:
# Fit the model for five epochs, evaluating on the validation split after
# each epoch; the returned History object is kept for later inspection.
history = model.fit(
    x=train_sequences,
    y=train_labels,
    epochs=5,
    validation_data=(val_sequences, val_labels),
)


In [None]:
from sklearn.manifold import TSNE
# Extract embeddings: the weight matrix of the first layer (the Embedding
# layer), one row per token id.
embeddings = model.layers[0].get_weights()[0]

# Use t-SNE for dimensionality reduction down to 3 components.
# - The iteration count is left at the library default (1000, the value that
#   was previously passed explicitly): the keyword was renamed from `n_iter`
#   to `max_iter` in newer scikit-learn, so naming either one breaks on some
#   installed versions.
# - perplexity must be smaller than the number of rows; cap the default of 30
#   so that small vocabularies do not raise a ValueError.
tsne = TSNE(
    n_components=3,
    perplexity=min(30.0, embeddings.shape[0] - 1),
    random_state=0,
)
embeddings_3d = tsne.fit_transform(embeddings)


In [None]:
import os
import tensorflow as tf
from tensorboard.plugins import projector

log_dir = 'logs'
# exist_ok=True makes the cell safely re-runnable and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(log_dir, exist_ok=True)


metadata = tokenizer.word_index

# Write metadata to a TSV file: one word per line, ordered by tokenizer id so
# that line k corresponds to row k of the weights saved below.
# UTF-8 is set explicitly; the platform default encoding (e.g. cp1252 on
# Windows) can fail on, or garble, non-ASCII words.
with open(os.path.join(log_dir, 'meta.tsv'), 'w', encoding='utf-8') as f:
    for word, index in sorted(metadata.items(), key=lambda item: item[1]):
        f.write(f"{word}\n")

# Save the weights we want to analyze as a variable. Tokenizer ids start at
# 1, so row 0 of the embedding matrix has no word in the metadata (it is the
# index used for padding); drop it to keep rows and metadata lines aligned.
weights = tf.Variable(model.layers[0].get_weights()[0][1:])

# Create a checkpoint from embedding, the filename and key are the
# name of the tensor.
checkpoint = tf.train.Checkpoint(embedding=weights)
checkpoint_path = checkpoint.save(os.path.join(log_dir, "embedding.ckpt"))

# Retrieve the tensor name for the embedding layer (informational only; the
# projector config below refers to the checkpoint key instead).
embedding_tensor_name = model.layers[0].embeddings.name

# Set up config
config = projector.ProjectorConfig()
embedding = config.embeddings.add()
# Path of the `embedding=` attribute inside the object-based checkpoint.
embedding.tensor_name = "embedding/.ATTRIBUTES/VARIABLE_VALUE"
# Resolved relative to log_dir, where meta.tsv was written above.
embedding.metadata_path = 'meta.tsv'
projector.visualize_embeddings(log_dir, config)

# Write event files for TensorBoard
summary_writer = tf.summary.create_file_writer(log_dir)
with summary_writer.as_default():
    tf.summary.scalar('embedding/metadata', 0, step=0)

print(f"TensorBoard log directory: {log_dir}")