In [1]:
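# BERT Cosine Similarity Search Engine
# Sentence-level model: the query is embedded as a single BERT vector and ranked against precomputed document embeddings.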

import numpy as np
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BertTokenizer, BertModel
import torch
from datasets import load_dataset

# Load the precomputed BERT document embeddings from the .npz archive and
# L2-normalize each row (sklearn's `normalize` defaults to norm="l2", axis=1).
# NOTE(review): the indices returned by search_bert are used to index `dataset`,
# so row i here presumably embeds document i of the train split — confirm
# against whatever produced this file.
bert_data = np.load("BERT embeddings/bert_embedding.npz")
bert_embeddings = normalize(bert_data["bert_embedding"])

# Load the tokenizer and encoder used to embed queries at search time.
# NOTE(review): presumably the same checkpoint that produced the stored
# document embeddings — confirm, otherwise the similarities are not meaningful.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
# Evaluation mode disables dropout so repeated queries embed identically.
model.eval()

# Function to embed a query
def embed_query(query):
    """Embed a query with the module-level BERT `tokenizer` and `model`.

    The query is tokenized (truncated and padded, PyTorch tensors), passed
    through the model without gradient tracking, and the final-layer hidden
    state at sequence position 0 (the [CLS] position for BERT inputs) is
    returned.

    Args:
        query: Text to embed, passed straight to the tokenizer.

    Returns:
        A NumPy array with one row per encoded input sequence.
    """
    encoded = tokenizer(query, padding=True, truncation=True, return_tensors="pt")
    # Inference only: skip autograd bookkeeping so `.numpy()` is allowed.
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
        first_token = hidden_states[:, 0, :]
    return first_token.numpy()

# Function to perform search
def search_bert(query, top_k=5):
    """Rank the stored BERT document embeddings against a query.

    The query is embedded with `embed_query`, normalized, and scored by cosine
    similarity against every row of the module-level `bert_embeddings`.

    Args:
        query: Query text.
        top_k: Number of best-scoring documents to return.

    Returns:
        A tuple `(indices, scores)`: the row indices of the `top_k` most
        similar embeddings in descending score order, and their similarities.
    """
    query_embedding = normalize(embed_query(query))
    similarity = cosine_similarity(query_embedding, bert_embeddings).flatten()
    # argsort is ascending; reverse it to get best-first, then keep the head.
    best_first = np.argsort(similarity)[::-1]
    winners = best_first[:top_k]
    return winners, similarity[winners]

# Load the dataset text.
# `["train"][:]` materializes the whole train split as a dict of columns;
# `["text"]` then keeps only the list of raw document strings.
dataset = load_dataset("ccdv/arxiv-classification", "no_ref")["train"][:]["text"]

# Test search
query = "machine learning for vision"
indices, scores = search_bert(query, top_k=5)

# Print results: the number after "Result" is the document's index in
# `dataset` (not its rank), followed by the first 500 characters of its text.
for i, score in zip(indices, scores):
    print(f"\n--- Result {i} (Score: {score:.4f}) ---\n{dataset[i][:500]}...")



--- Result 17391 (Score: 0.8350) ---
Effective Quotation
Relating approaches to language-integrated query
James Cheney

Sam Lindley

arXiv:1310.4780v3 [] 11 Apr 2014

The University of Edinburgh
jcheney@inf.ed.ac.uk,
Sam.Lindley@ed.ac.uk

Gabriel Radanne

Philip Wadler

ENS Cachan
gabriel.radanne@zoho.com

The University of Edinburgh
wadler@inf.ed.ac.uk

Abstract
Language-integrated query techniques have been explored in a
number of different language designs. We consider two different, type-safe approaches employed by Links and F#...

--- Result 25653 (Score: 0.8049) ---
Lienert 1

Simulation of Genetic
Algorithm: Traffic
Light Efficiency
Senior Research Paper
By: Eric Lienert

Lienert 2

Abstract:
Traffic is a problem in many urban areas worldwide. Traffic flow is dictated by certain devices
such as traffic lights. The traffic lights signal when each lane is able to pass through the
intersection. Often, static schedules interfere with ideal traffic flow. The purpose of this project

In [None]:
import numpy as np
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import KeyedVectors
from datasets import load_dataset

# Load Word2Vec model
# Make sure the .model and .vectors.npy files are in the same folder
wv_model = KeyedVectors.load("models/word2vec-google-news-300.model")

# Load Dataset
# Slicing the split with [:1000] yields a dict of columns for the first 1000
# rows; `documents` is the list of their raw texts.
# NOTE: this rebinds `dataset`, which the BERT cell binds to a plain list of
# all train texts — re-running that cell's print loop after this one will fail.
dataset = load_dataset("ccdv/arxiv-classification", "no_ref")["train"][:1000]  # Limit for speed
documents = dataset["text"]

# Function to embed a document
def document_to_w2v(doc, model):
    """Average the word vectors of a document's in-vocabulary tokens.

    The text is split on whitespace; tokens not found in `model` are ignored.

    Args:
        doc: Document (or query) text.
        model: Word-vector lookup supporting `token in model`, `model[token]`
            and a `vector_size` attribute.

    Returns:
        The element-wise mean of the matched word vectors, or an all-zero
        vector of length `model.vector_size` when no token is in the vocabulary.
    """
    known_vectors = [model[term] for term in doc.split() if term in model]
    if known_vectors:
        return np.mean(known_vectors, axis=0)
    # Nothing matched: fall back to a zero vector of the model's dimension.
    return np.zeros(model.vector_size)

# Build Document Embedding Matrix: one averaged Word2Vec vector per document
# (all-zero for documents with no in-vocabulary tokens), stacked row-wise.
doc_embeddings = np.array([document_to_w2v(doc, wv_model) for doc in documents])
# L2-normalize each row (sklearn's `normalize` defaults); all-zero rows stay zero.
doc_embeddings = normalize(doc_embeddings)

# Function to embed a query
def query_to_w2v(query, model):
    """Embed a query as a normalized single-row matrix.

    Args:
        query: Query text.
        model: Word-vector lookup forwarded to `document_to_w2v`.

    Returns:
        The query's averaged word vector, reshaped to one row and passed
        through `normalize`.
    """
    averaged = document_to_w2v(query, model)
    # `normalize` works on 2-D input, so present the vector as a single row.
    as_row = averaged.reshape(1, -1)
    return normalize(as_row)

# Function to search
def search_word2vec(query, top_k=5):
    """Rank the Word2Vec document embeddings against a query.

    The query is embedded with `query_to_w2v` using the module-level
    `wv_model` and scored by cosine similarity against every row of
    `doc_embeddings`.

    Args:
        query: Query text.
        top_k: Number of best-scoring documents to return.

    Returns:
        A list of `(index, score, document_text)` tuples in descending score
        order, where `index` refers to a position in `documents`.
    """
    query_embedding = query_to_w2v(query, wv_model)
    scores = cosine_similarity(query_embedding, doc_embeddings).flatten()
    # argsort is ascending; reverse it to get best-first.
    ranked = np.argsort(scores)[::-1]
    hits = []
    for doc_id in ranked[:top_k]:
        hits.append((doc_id, scores[doc_id], documents[doc_id]))
    return hits

# Test search
query = "deep learning for robotics"
results = search_word2vec(query, top_k=5)

# Display results: the number after "Result" is the document's index in
# `documents` (not its rank), followed by the first 500 characters of its text.
for i, score, doc in results:
    print(f"\n--- Result {i} (Score: {score:.4f}) ---\n{doc[:500]}...")



--- Result 602 (Score: 0.5626) ---
Double Deep Machine Learning
Moshe BenBassat (moshe.benbassat@plataine.com)

Arison School of Business, Interdisciplinary Center (IDC), Herzliya, Israel

Abstract
Very important breakthroughs in data-centric machine learning algorithms led to impressive performance in ‘transactional’
point applications such as detecting anger in speech, alerts from a Face Recognition system, or EKG interpretation. Nontransactional applications, e.g. medical diagnosis beyond the EKG results, require AI algorithms...

--- Result 214 (Score: 0.5540) ---
Human-in-the-loop Artificial Intelligence
Fabio Massimo Zanzotto
University of Rome Tor Vergata

arXiv:1710.08191v1 [] 23 Oct 2017

fabio.massimo.zanzotto@uniroma2.it

Abstract
Little by little, newspapers are revealing the bright future that Artiﬁcial Intelligence (AI) is building. Intelligent machines will help everywhere. However, this
bright future has a dark side: a dramatic job market contraction before its unpredi