In [90]:
import numpy as np
import json
import glob
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

import pandas as pd
from generate_dataset import word_set
from gvsm import perform_gvsm

In [91]:
DATASET_GLOB = 'data/dataset_*.json'


def load_datasets(pattern):
    """Load every JSON dataset whose path matches ``pattern``.

    Paths are sorted because ``glob.glob`` returns them in arbitrary,
    filesystem-dependent order; without sorting, the order of the samples (and
    therefore the seeded train/test split built from them) would differ from
    machine to machine.

    Args:
        pattern: glob pattern of the dataset files.

    Returns:
        A ``(paths, datasets)`` tuple: the sorted matching paths and the parsed
        JSON content of each file, in the same order. Both lists are empty
        when nothing matches.
    """
    paths = sorted(glob.glob(pattern))
    datasets = []
    for path in paths:
        # JSON is UTF-8 by specification; do not depend on the platform default.
        with open(path, 'r', encoding='utf-8') as handle:
            datasets.append(json.load(handle))
    return paths, datasets


file_paths, all_data = load_datasets(DATASET_GLOB)


In [92]:
def preprocess(data):
    """Turn one dataset into per-document training samples.

    The documents and the query are vectorised with TF-IDF over the fixed
    ``word_set`` vocabulary; the vectoriser is fitted on the documents plus the
    query of this dataset only. A sample's input is the document's TF-IDF
    vector followed by the query's TF-IDF vector, and its output is the entry
    of ``data["gvsm_ranked_docs"]`` at the document's position.

    Args:
        data: dict with ``"documents"`` (list of token lists), ``"query"``
            (token list) and ``"gvsm_ranked_docs"`` (one target per document,
            in document order). Targets beyond the number of documents are
            ignored.

    Returns:
        A list with one ``{"input": [...], "output": target}`` dict per
        document, in document order.

    Raises:
        ValueError: if there are fewer targets than documents.
    """
    documents = data["documents"]
    targets = data["gvsm_ranked_docs"]
    if len(targets) < len(documents):
        # Fail before vectorising, with a message that names the mismatch,
        # instead of an IndexError half-way through the loop.
        raise ValueError(
            f"gvsm_ranked_docs has {len(targets)} entries "
            f"but there are {len(documents)} documents"
        )

    # The query goes last so it can be split off the matrix again below.
    doc_strings = [" ".join(tokens) for tokens in documents + [data["query"]]]
    vectorizer = TfidfVectorizer(vocabulary=word_set)
    tfidf_matrix = vectorizer.fit_transform(doc_strings).toarray()

    doc_vectors = tfidf_matrix[:-1]
    # The query part is the same for every sample, so build it once.
    query_features = list(tfidf_matrix[-1])

    return [
        {"input": list(doc_vector) + query_features, "output": target}
        for doc_vector, target in zip(doc_vectors, targets)
    ]

In [93]:
# Flatten the samples of every dataset into one list, keeping dataset order.
samples = []
for dataset in all_data:
    samples.extend(preprocess(dataset))

# Feature matrix (one row per document/query pair) and matching target vector.
X = np.array([sample["input"] for sample in samples])
Y = np.array([sample["output"] for sample in samples])

# Hold out 20% of the samples for testing; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [94]:

import tensorflow as tf
from tensorflow.keras import layers, models

# Seed Python's `random`, NumPy and TensorFlow in one call so that weight
# initialisation and batch shuffling are repeatable across kernel restarts.
# (train_test_split above passes its own random_state and is unaffected.)
tf.keras.utils.set_random_seed(42)

# Define the model
model = models.Sequential([
    # The input width follows the data rather than a hard-coded number:
    # each row of X is a document TF-IDF vector followed by the query TF-IDF
    # vector, as concatenated by preprocess().
    layers.Input(shape=(X.shape[1],)),
    layers.Dense(64, activation='relu'),
    layers.Dense(128, activation='relu'),
    layers.Dense(64, activation='relu'),
    # Single sigmoid unit: the prediction is bounded to [0, 1].
    layers.Dense(1, activation='sigmoid'),
])

# MAE is used because the targets (and the sigmoid output) are in the 0-1 range.
model.compile(optimizer='adam', loss='mae')
model.summary()

In [95]:
# --- Train the Model ---
# The held-out split is passed as validation_data purely for monitoring: no
# callbacks are attached, so val_loss never influences the weights.
# The returned History is kept so the per-epoch losses stay available (e.g. for
# plotting) and the cell no longer ends on a bare object repr.
history = model.fit(X_train, Y_train, epochs=100, validation_data=(X_test, Y_test))

Epoch 1/100
[1m271/271[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 552us/step - loss: 0.1018 - val_loss: 0.0500
Epoch 2/100
[1m271/271[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 386us/step - loss: 0.0493 - val_loss: 0.0482
Epoch 3/100
[1m271/271[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 326us/step - loss: 0.0456 - val_loss: 0.0479
Epoch 4/100
[1m271/271[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 371us/step - loss: 0.0456 - val_loss: 0.0481
Epoch 5/100
[1m271/271[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 361us/step - loss: 0.0484 - val_loss: 0.0477
Epoch 6/100
[1m271/271[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 391us/step - loss: 0.0474 - val_loss: 0.0478
Epoch 7/100
[1m271/271[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 352us/step - loss: 0.0457 - val_loss: 0.0478
Epoch 8/100
[1m271/271[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 357us/step - loss: 0.0488 - val_loss: 0.0476
Epoch 9/100
[1m

<keras.src.callbacks.history.History at 0x7f919010a320>

In [96]:
# --- Evaluate on the Test Set ---
# The model was compiled with a loss and no extra metrics, so evaluate()
# yields a single number: the mean absolute error on the held-out split.
test_loss = model.evaluate(x=X_test, y=Y_test)
print("Test loss:", test_loss)

[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 218us/step - loss: 0.0481
Test loss: 0.04796917364001274


## Test the model on your own documents

In [97]:
# Hand-written example: edit the documents and the query freely.
my_documents = [
    ['bird', 'cat', 'bird', 'cat', 'dog', 'dog', 'bird'],
    ['cat', 'tiger', 'cat', 'dog'],
    ['dog', 'bird', 'bird'],
    ['cat', 'tiger'],
    ['tiger', 'tiger', 'dog', 'tiger', 'cat'],
    ['cat', 'cat', 'tiger', 'tiger'],
    ['bird', 'cat', 'dog'],
    ['dog', 'cat', 'bird'],
    ['cat', 'dog', 'tiger'],
    ['tiger', 'cat', 'tiger'],
    ['cat'],
    ['cat'],
    ['cat'],
    ['cat'],
    ['cat'],
]

mystuff = {
    "documents": my_documents,
    "query": ["cat", "tiger", "tiger", "tiger"],
    # preprocess() reads one target per document. The values are not used for
    # prediction, so the placeholder zeros are sized from the document list
    # instead of being counted by hand.
    "gvsm_ranked_docs": [0] * len(my_documents),
}

# Build the model inputs (document TF-IDF + query TF-IDF) and predict.
model_input = preprocess(mystuff)
cleanX = [sample["input"] for sample in model_input]
predicted_cosim = model.predict(np.array(cleanX))

# Documents are numbered from 1, in the order they were listed above.
ranked_docs = pd.DataFrame({
    "Document": list(range(1, len(predicted_cosim) + 1)),
    "CosineSimilarity": [cosim[0] for cosim in predicted_cosim],
}).sort_values(by="CosineSimilarity", ascending=False).reset_index(drop=True)
print("OUR MODEL")
print(ranked_docs)

# Reference ranking from the real GVSM implementation, using the same 1-based ids.
print("REAL GVSM")
perform_gvsm(
    {str(doc_id): doc for doc_id, doc in enumerate(mystuff["documents"], start=1)},
    mystuff["query"],
)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
OUR MODEL
    Document  CosineSimilarity
0          4          0.998407
1          6          0.998407
2         10          0.997040
3          5          0.995476
4          2          0.994505
5          9          0.991090
6         11          0.871699
7         14          0.871699
8         13          0.871699
9         15          0.871699
10        12          0.871699
11         3          0.846014
12         1          0.840607
13         7          0.837287
14         8          0.837287
REAL GVSM

Ranked Documents by Cosine Similarity:
    Document  CosineSimilarity  Minterm
0        10          0.997369        4
1         5          0.979960        2
2         4          0.975516        4
3         6          0.975516        4
4         9          0.900074        2
5         2          0.888183        2
6        11          0.798895        5
7        14          0.798895        5
8        13         

  return freq_table.applymap(lambda x: int(x > 0))


array([0.48652646, 0.88818328, 0.25778659, 0.97551597, 0.97995983,
       0.97551597, 0.54665779, 0.54665779, 0.90007367, 0.99736876,
       0.79889477, 0.79889477, 0.79889477, 0.79889477, 0.79889477])