In [2]:
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding, Conv1D, GlobalMaxPooling1D, Reshape
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.datasets import fetch_20newsgroups

# Load the dataset
data = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))
documents = data.data

"""
# Sample data
documents = [
    "The quick brown fox jumps over the lazy dog",
    "Never jump over the lazy dog quickly",
    "A quick brown dog outpaces a fast fox",
    "The quick brown fox",
    "Jumping over quick dogs"
]
"""

# Parameters
max_features = 10000  # vocabulary size
max_len = 10          # max length of text sequences
embedding_dim = 50    # dimensionality of embedding space
encoding_dim = 32     # dimensionality of the encoded representation

"""
# Tokenize text
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(documents)
sequences = tokenizer.texts_to_sequences(documents)
data = pad_sequences(sequences, maxlen=max_len)
"""

# Split data
x_train, x_test = train_test_split(data, test_size=0.2, random_state=42)

# Building the autoencoder
input_text = Input(shape=(max_len,))
x = Embedding(max_features, embedding_dim, input_length=max_len)(input_text)
x = Conv1D(16, 3, activation='relu')(x)
x = GlobalMaxPooling1D()(x)
encoded = Dense(encoding_dim, activation='relu')(x)
decoded = Dense(embedding_dim, activation='sigmoid')(encoded)

from tensorflow.keras.layers import Flatten

# Modify the decoder part of the network
decoded = Dense(max_len, activation='sigmoid')(encoded)  # Assuming max_len is the size of the flattened input vector

# Now the autoencoder model output will match the input shape
autoencoder = Model(input_text, decoded)
encoder = Model(input_text, encoded)

autoencoder.compile(optimizer='adam', loss='binary_crossentropy')
autoencoder.summary()

# Training
autoencoder.fit(x_train, x_train, epochs=50, batch_size=256, validation_data=(x_test, x_test))

# Predicting encoded documents
encoded_docs = encoder.predict(data)

# Example retrieval using cosine similarity
similarities = cosine_similarity([encoded_docs[0]], encoded_docs)
print(similarities)



KeyError: 4

In [3]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding, Conv1D, GlobalMaxPooling1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load the dataset
data = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))
documents = data.data

# Parameters
max_features = 10000  # vocabulary size
max_len = 300         # max length of text sequences
embedding_dim = 50    # dimensionality of embedding space
encoding_dim = 32     # dimensionality of the encoded representation

# Tokenize text
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(documents)
sequences = tokenizer.texts_to_sequences(documents)
data = pad_sequences(sequences, maxlen=max_len)

# Split data
x_train, x_test = train_test_split(data, test_size=0.2, random_state=42)

# Building the autoencoder
input_text = Input(shape=(max_len,))
x = Embedding(max_features, embedding_dim, input_length=max_len)(input_text)
x = Conv1D(16, 3, activation='relu')(x)
x = GlobalMaxPooling1D()(x)
encoded = Dense(encoding_dim, activation='relu')(x)
decoded = Dense(max_len, activation='sigmoid')(encoded)  # Adjusted output to match input dimension

# Autoencoder model
autoencoder = Model(input_text, decoded)
encoder = Model(input_text, encoded)

autoencoder.compile(optimizer='adam', loss='binary_crossentropy')
autoencoder.summary()

# Training
autoencoder.fit(x_train, x_train, epochs=50, batch_size=256, validation_data=(x_test, x_test))

# Predicting encoded documents
encoded_docs = encoder.predict(data)

# Example retrieval using cosine similarity
similarities = cosine_similarity([encoded_docs[0]], encoded_docs)
print(similarities)




Epoch 1/50
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 62ms/step - loss: -234.7127 - val_loss: -7236.7896
Epoch 2/50
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 57ms/step - loss: -43498.7891 - val_loss: -501942.3125
Epoch 3/50
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 58ms/step - loss: -1402137.3750 - val_loss: -7699042.5000
Epoch 4/50
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 56ms/step - loss: -13735545.0000 - val_loss: -43023788.0000
Epoch 5/50
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 56ms/step - loss: -62444180.0000 - val_loss: -142502528.0000
Epoch 6/50
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 52ms/step - loss: -187545696.0000 - val_loss: -355237024.0000
Epoch 7/50
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 78ms/step - loss: -440649408.0000 - val_loss: -739887104.0000
Epoch 8/50
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [