In [None]:
# Colab-specific helper module used to attach Google Drive to this runtime.
from google.colab import drive

# Mount Drive at /content/drive; later cells read the dataset from a folder
# underneath this mount point.
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os
import pandas as pd
from bs4 import BeautifulSoup
import re
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss


  from tqdm.autonotebook import tqdm, trange


In [None]:
# Folder on the mounted Drive that holds the CSV splits.
dataset_folder = '/content/drive/My Drive/Semantic/'

# List the folder contents first so a missing file is obvious before loading.
files = os.listdir(dataset_folder)
print("Files in dataset folder:", files)

# Resolve both CSV locations in one step, then read each split into a frame.
train_path, valid_path = (
    os.path.join(dataset_folder, csv_name)
    for csv_name in ('train.csv', 'valid.csv')
)
train_data, valid_data = pd.read_csv(train_path), pd.read_csv(valid_path)

# Show the first rows of each split as a quick sanity check.
print("Preview of train.csv:")
print(train_data.head())

print("\nPreview of valid.csv:")
print(valid_data.head())


Files in dataset folder: ['train.csv', 'valid.csv']
Preview of train.csv:
         Id                                              Title  \
0  34552656             Java: Repeat Task Every Random Seconds   
1  34553034                  Why are Java Optionals immutable?   
2  34553174  Text Overlay Image with Darkened Opacity React...   
3  34553318         Why ternary operator in swift is so picky?   
4  34553755                 hide/show fab with scale animation   

                                                Body  \
0  <p>I'm already familiar with repeating tasks e...   
1  <p>I'd like to understand why Java 8 Optionals...   
2  <p>I am attempting to overlay a title over an ...   
3  <p>The question is very simple, but I just cou...   
4  <p>I'm using custom floatingactionmenu. I need...   

                                                Tags         CreationDate  \
0                                     <java><repeat>  2016-01-01 00:21:59   
1                                   <j

In [None]:
def clean_text(text):
    """Normalize a raw HTML/text snippet into lowercase, punctuation-free text.

    Steps: strip HTML markup with BeautifulSoup, delete every character that
    is neither a word character nor whitespace, collapse each whitespace run
    into a single space, then lowercase and trim.

    Args:
        text: The raw value to clean. ``None`` and float NaN are treated as
            missing input; any other non-string value is converted with
            ``str()`` before parsing.

    Returns:
        The cleaned string, or ``""`` when the input is missing.
    """
    # Missing cells are not strings and cannot be parsed as markup. NaN is
    # the only float that does not compare equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = BeautifulSoup(str(text), "html.parser").get_text()

    # Drop punctuation BEFORE collapsing whitespace: in the opposite order,
    # removing a standalone punctuation mark leaves a double space behind
    # (e.g. "a - b" -> "a  b").
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text)

    return text.lower().strip()

# Add a cleaned counterpart of each raw text column to both splits
# (train first, then valid; title before body).
for frame in (train_data, valid_data):
    for raw_column in ('Title', 'Body'):
        frame['Cleaned_' + raw_column] = frame[raw_column].apply(clean_text)

# Columns to show when previewing the cleaning result.
cleaned_columns = ['Cleaned_Title', 'Cleaned_Body']

print("Preview of cleaned train.csv:")
print(train_data[cleaned_columns].head())

print("\nPreview of cleaned valid.csv:")
print(valid_data[cleaned_columns].head())


  text = BeautifulSoup(text, "html.parser").get_text()
  text = BeautifulSoup(text, "html.parser").get_text()
  k = self.parse_starttag(i)
  self.handle_startendtag(tag, attrs)
  text = BeautifulSoup(text, "html.parser").get_text()
  text = BeautifulSoup(text, "html.parser").get_text()
  k = self.parse_starttag(i)


Preview of cleaned train.csv:
                                       Cleaned_Title  \
0              java repeat task every random seconds   
1                   why are java optionals immutable   
2  text overlay image with darkened opacity react...   
3          why ternary operator in swift is so picky   
4                  hideshow fab with scale animation   

                                        Cleaned_Body  
0  im already familiar with repeating tasks every...  
1  id like to understand why java 8 optionals wer...  
2  i am attempting to overlay a title over an ima...  
3  the question is very simple but i just could n...  
4  im using custom floatingactionmenu i need to i...  

Preview of cleaned valid.csv:
                                       Cleaned_Title  \
0  how to get all the child records from differen...   
1  retrieve all except some data of the another t...   
2                                   pandas read_html   
3                           reader always gimme 

In [None]:
# Instantiate the sentence-embedding model by name. The resulting `model`
# object is the one whose `.encode(...)` is used below for the datasets and
# later for search queries.
model = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')

def get_combined_text(row):
    """Join a row's cleaned title and cleaned body with a single space.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) exposing the
            'Cleaned_Title' and 'Cleaned_Body' keys.

    Returns:
        The title, one space, then the body.
    """
    title_part = row['Cleaned_Title']
    body_part = row['Cleaned_Body']
    return title_part + " " + body_part

# Attach the encoder input (cleaned title + cleaned body) to each split.
for frame in (train_data, valid_data):
    frame['Combined_Text'] = frame.apply(get_combined_text, axis=1)

# Encode every combined text of each split, showing a progress bar.
train_texts = train_data['Combined_Text'].tolist()
train_embeddings = model.encode(train_texts, show_progress_bar=True)

valid_texts = valid_data['Combined_Text'].tolist()
valid_embeddings = model.encode(valid_texts, show_progress_bar=True)

# Report the dimensions of both embedding sets.
train_shape = np.shape(train_embeddings)
valid_shape = np.shape(valid_embeddings)
print("Shape of train embeddings:", train_shape)
print("Shape of valid embeddings:", valid_shape)

# Persist both embedding sets as .npy files under /content.
np.save('/content/train_embeddings.npy', train_embeddings)
np.save('/content/valid_embeddings.npy', valid_embeddings)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Batches:   0%|          | 0/1407 [00:00<?, ?it/s]

Batches:   0%|          | 0/469 [00:00<?, ?it/s]

Shape of train embeddings: (45000, 768)
Shape of valid embeddings: (15000, 768)


In [None]:
# Derive the vector width from the embeddings themselves instead of
# hard-coding it, so the index keeps matching the encoder output if the model
# is ever swapped. With the current embeddings this evaluates to 768.
embedding_dimension = int(np.shape(train_embeddings)[1])

# Flat L2 index: stores the vectors as-is and compares by L2 distance.
index = faiss.IndexFlatL2(embedding_dimension)

# FAISS expects C-contiguous float32 input. ascontiguousarray converts when
# needed but, unlike np.array(...), does not copy data already in that form.
index.add(np.ascontiguousarray(train_embeddings, dtype=np.float32))

print(f"Number of embeddings in the index: {index.ntotal}")


Number of embeddings in the index: 45000


In [None]:
def search_similar_documents(query, k=5):
    """Print the ``k`` training documents closest to a free-text query.

    The query is cleaned with ``clean_text``, embedded with the module-level
    ``model``, and looked up in the module-level FAISS ``index``. For each hit
    the function prints its rank, its row position in ``train_data``, its
    distance to the query, its title, and the first 300 characters of its body.

    Args:
        query: Free-text search string.
        k: Number of nearest neighbours to request from the index.

    Returns:
        None. Results are written to stdout only.
    """
    query_cleaned = clean_text(query)

    query_embedding = model.encode([query_cleaned], show_progress_bar=False)

    distances, indices = index.search(query_embedding.astype(np.float32), k)

    # Report the k that was actually requested rather than a fixed "5".
    print(f"\nTop {k} most similar documents:")
    for i, idx in enumerate(indices[0]):
        # FAISS pads the result with -1 when fewer than k neighbours exist;
        # iloc[-1] would silently display the last row instead, so skip those.
        if idx < 0:
            continue

        # Look the row up once and reuse it for both title and body.
        row = train_data.iloc[idx]

        # The index in this notebook is an IndexFlatL2, so the reported value
        # is an L2 distance (FAISS returns it squared): smaller means closer.
        print(f"Rank {i + 1}: Document ID {idx}, L2 Distance (lower is closer): {distances[0][i]}")
        print(f"Document Title: {row['Title']}")
        print(f"Document Body: {row['Body'][:300]}...")
        print("="*80)


In [None]:
# Read the search text interactively; this cell blocks until the user answers.
query = input("Enter a search query: ")

# Print the 5 nearest training documents for the entered query.
search_similar_documents(query, k=5)


Enter a search query: what is deep learning?

Top 5 most similar documents:
Rank 1: Document ID 15281, Similarity Score: 216.79537963867188
Document Title: Image preprocessing in deep learning
Document Body: <p>I am experimenting with deep learning on images. I have about ~4000 images from different cameras with different light conditions, image resolutions and view angle. </p>

<p>My question is: <strong>What kind of image preprocessing would be helpful for improving object detection?</strong> (For exa...
Rank 2: Document ID 39621, Similarity Score: 226.35037231445312
Document Title: How to generate new image using deep learning, from new features
Document Body: <p>If i have a dataset consisting by a list of images each associated with a series of features; there is a model that, once trained, generates new images upon entering a new list of features?</p>
...
Rank 3: Document ID 25707, Similarity Score: 241.96725463867188
Document Title: Computer Vision - Image Fundamentals
Document B