In [11]:
import os
import faiss
import numpy as np
import pandas as pd
from pathlib import Path
from transformers import AutoTokenizer, AutoModel
import torch
from tqdm import tqdm
import pyarrow as pa
import pyarrow.parquet as pq
import sys
# sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath('.'))))  # /src

from semantic_search.misc import LocalEmbeddingModel, FAISSDocumentStore



### Create DB

In [1]:
# Re-execute semantic_search/misc.py in the running kernel so that edits to
# the module are picked up without a restart.
from importlib import reload
from semantic_search import misc
reload(misc)
# Import the class names again so they are bound to the reloaded module's
# objects rather than to the ones imported before the reload.
from semantic_search.misc import LocalEmbeddingModel, FAISSDocumentStore



In [27]:
# Embedding backend handed to the document store below.
embedding_model = LocalEmbeddingModel(
    batch_size=8,
    device='cuda',
    model_name='sentence-transformers/all-MiniLM-L6-v2',
)

# Vector store whose on-disk state lives under `db_dir`.
document_store = FAISSDocumentStore(
    db_dir='/cluster/home/lcarretero/workspace/dsl/dsl-research-assistant/db/chunk1-txt-all',
    embedding_model=embedding_model,
)

# Only build the index from the raw text directory when load_index() returns
# a falsy value; otherwise the loaded index is used as-is.
index_loaded = document_store.load_index()
if not index_loaded:
    document_store.create_index('/cluster/home/lcarretero/workspace/dsl/dsl-research-assistant/raw-data/Conversions/opencvf-data/txt')

Loaded index with 73809 vectors


In [26]:
# Smoke test: print the chunk text of the first hit returned for a one-word query.
hits = document_store.search("visual", top_k=5)
first_hit = hits[0]
print(first_hit['chunk_text'])

er-efficient visual instruction model. arXiv 2304.15010 , 2023. 1, 3, 4
- [9] Yash Goyal, Tejas Khot, Douglas Summers-Stay, Dhruv Batra, and Devi Parikh. Making the V in VQA matter: Elevating the role...


### Ground truth citations

In [15]:
# Widen pandas' rendering for this notebook in one call (pattern/value pairs):
#   - display.max_colwidth=None -> never truncate cell text
#   - display.width=1000        -> total display width in characters
#   - display.max_columns=20    -> show up to 20 columns
pd.set_option(
    'display.max_colwidth', None,
    'display.width', 1000,
    'display.max_columns', 20,
)

In [49]:
import re
import pyalex
from tqdm.notebook import tqdm
from concurrent.futures import ThreadPoolExecutor
from functools import partial

raw_dir = '/cluster/home/lcarretero/workspace/dsl/dsl-research-assistant/raw-data/Conversions/opencvf-data/txt'

# One row per *.txt file found directly under raw_dir:
# the full path (as a string) and the bare file name.
raw_txt_paths = Path(raw_dir).glob("*.txt")
path_name_pairs = [(str(txt_path), txt_path.name) for txt_path in raw_txt_paths]
df = pd.DataFrame(path_name_pairs, columns=['fpath', 'fname'])

def get_title(fpath: str):
    """Return the text following the first ``## `` marker in a text file.

    The file is read as UTF-8. The capture starts right after the first
    occurrence of ``"## "`` anywhere in the text (so the tail of a deeper
    heading such as ``### Foo`` also matches) and runs up to, but not
    including, the next newline or ``#`` character.

    Args:
        fpath: Path of the text file to inspect.

    Returns:
        The captured heading text (not stripped), or ``None`` when the file
        contains no such marker.
    """
    with open(fpath, encoding="utf-8") as handle:
        contents = handle.read()

    heading = re.search(r'## ([^\n#]+)', contents)
    if heading is None:
        return None
    return heading.group(1)

def get_metadata(title: str):
    """Look up a paper on OpenAlex by title and return its identifiers.

    Only the first search hit is requested (``page=1, per_page=1``) and only
    the ``id``, ``doi`` and ``referenced_works`` fields are selected.

    Args:
        title: Paper title to search for. Anything that is not a non-blank
            string (e.g. the ``None`` that ``get_title`` returns for files
            without a heading, or a NaN placeholder) is treated as
            "no title" and is not sent to the API.

    Returns:
        A ``(doi, openalex_id, referenced_works)`` tuple taken from the first
        hit, or ``(None, None, None)`` when there is no usable title or the
        search returns nothing.

    NOTE(review): the top hit is accepted without checking that its title
    actually matches, so an unrelated work can be returned; confirm whether
    a similarity check is needed downstream.
    """
    # Searching for a missing/blank title cannot identify a paper; skip the
    # network round-trip and report "not found" instead.
    if not isinstance(title, str) or not title.strip():
        return (None, None, None)

    search_results = (
        pyalex.Works()
        .search(title)
        .select(['id', 'doi', 'referenced_works'])
        .get(page=1, per_page=1)
    )
    if not search_results:
        return (None, None, None)

    top_hit = search_results[0]
    return (top_hit['doi'], top_hit['id'], top_hit['referenced_works'])

# Extract one title per file (None where get_title finds no heading).
df['title'] = df['fpath'].apply(get_title)

# Query OpenAlex with a small thread pool. executor.map yields results in
# input order, so results[i] belongs to row i of df.
with ThreadPoolExecutor(max_workers=5) as executor:
    results = list(tqdm(
        executor.map(get_metadata, df['title'].values),
        total=len(df)
    ))

# `results` is attached to `df` as the 'doi' / 'oaid' / 'referenced_works'
# columns by the pd.concat cell that follows. The former
# `df['doi', 'oaid', 'referenced_works'] = results` was removed: a
# comma-separated key is a single tuple label, so it produced one
# tuple-named column instead of three separate columns.
assert len(results) == len(df), "expected exactly one metadata tuple per paper"

  0%|          | 0/1142 [00:00<?, ?it/s]

Unnamed: 0,fpath,fname,title,doi,oaid
0,/cluster/home/lcarretero/workspace/dsl/dsl-research-assistant/raw-data/Conversions/opencvf-data/txt/Bai_Fixed_Point_Diffusion_Models_CVPR_2024_paper.txt,Bai_Fixed_Point_Diffusion_Models_CVPR_2024_paper.txt,Fixed Point Diffusion Models,https://doi.org/10.1063/1.2121687,https://openalex.org/W2000456051
1,/cluster/home/lcarretero/workspace/dsl/dsl-research-assistant/raw-data/Conversions/opencvf-data/txt/Li_BEVNeXt_Reviving_Dense_BEV_Frameworks_for_3D_Object_Detection_CVPR_2024_paper.txt,Li_BEVNeXt_Reviving_Dense_BEV_Frameworks_for_3D_Object_Detection_CVPR_2024_paper.txt,BEVNeXt: Reviving Dense BEV Frameworks for 3D Object Detection,https://doi.org/10.1109/cvpr52733.2024.01901,https://openalex.org/W4402727763
2,/cluster/home/lcarretero/workspace/dsl/dsl-research-assistant/raw-data/Conversions/opencvf-data/txt/Savchenko_Leveraging_Pre-trained_Multi-task_Deep_Models_for_Trustworthy_Facial_Analysis_in_CVPRW_2024_paper.txt,Savchenko_Leveraging_Pre-trained_Multi-task_Deep_Models_for_Trustworthy_Facial_Analysis_in_CVPRW_2024_paper.txt,Leveraging Pre-trained Multi-task Deep Models for Trustworthy Facial Analysis in Affective Behaviour Analysis in-the-Wild,https://doi.org/10.1109/cvprw63382.2024.00473,https://openalex.org/W4402916217


In [94]:
# Attach the OpenAlex lookups as three columns. Any existing copies are
# dropped first so that re-running this cell replaces them instead of
# appending duplicate 'doi' / 'oaid' / 'referenced_works' columns.
# Building the metadata frame on df.index makes the row alignment explicit
# and raises immediately if `results` and `df` differ in length.
metadata_columns = ['doi', 'oaid', 'referenced_works']
metadata_df = pd.DataFrame(results, columns=metadata_columns, index=df.index)
df = pd.concat([df.drop(columns=metadata_columns, errors='ignore'), metadata_df], axis=1)

In [96]:
# Persist the per-paper metadata table as CSV, without the row index.
metadata_csv_path = '/cluster/home/lcarretero/workspace/dsl/dsl-research-assistant/src/semantic_search/dev/paper-metadata.csv'
df.to_csv(metadata_csv_path, index=False)

### Misc

In [5]:
import re

raw_dir = '/cluster/home/lcarretero/workspace/dsl/dsl-research-assistant/raw-data/Conversions/opencvf-data/txt'

def check_single_doc(fpath: "str | Path"):
    """Return the abstract of a converted paper, or ``None`` if none is found.

    The abstract is the text between a ``## Abstract`` heading (followed by
    a blank line) and the blank line preceding the first numbered section
    heading of the form ``## <digits>.``. Documents whose layout differs
    (no ``## Abstract`` heading, or no numbered section after it) yield
    ``None``.

    Args:
        fpath: Location of the UTF-8 text file; a plain string or a
            ``pathlib.Path`` are both accepted.

    Returns:
        The abstract text, or ``None`` when the pattern does not match.
    """
    # Wrap in Path so string paths work too; the signature advertises `str`,
    # but a bare str has no read_text().
    doc_text = Path(fpath).read_text(encoding="utf-8")
    # DOTALL lets the lazy group span multiple lines; the lookahead stops it
    # at the first numbered section heading without consuming it.
    abstract_match = re.search(r'## Abstract\n\n(.*?)(?=\n\n## \d+\.)', doc_text, re.DOTALL)
    return abstract_match.group(1) if abstract_match else None

# Extract every abstract once. `abstracts` is kept at notebook scope because
# the type-count cell further down reads it; it used to exist only in a
# commented-out line, so that cell failed with NameError on a fresh kernel.
doc_paths = list(Path(raw_dir).glob("*.txt"))
abstracts = [check_single_doc(fpath) for fpath in doc_paths]

# Report the files for which no abstract could be located.
for fpath, abstract in zip(doc_paths, abstracts):
    if abstract is None:
        print(fpath)

/cluster/home/lcarretero/workspace/dsl/dsl-research-assistant/raw-data/Conversions/opencvf-data/txt/Korycki_Class-Incremental_Mixture_of_Gaussians_for_Deep_Continual_Learning_CVPRW_2024_paper.txt
/cluster/home/lcarretero/workspace/dsl/dsl-research-assistant/raw-data/Conversions/opencvf-data/txt/Hoque_IrrNet_Spatio-Temporal_Segmentation_Guided_Classification_for_Irrigation_Mapping_CVPRW_2024_paper.txt
/cluster/home/lcarretero/workspace/dsl/dsl-research-assistant/raw-data/Conversions/opencvf-data/txt/Zhang_MOHO_Learning_Single-view_Hand-held_Object_Reconstruction_with_Multi-view_Occlusion-Aware_Supervision_CVPR_2024_paper.txt
/cluster/home/lcarretero/workspace/dsl/dsl-research-assistant/raw-data/Conversions/opencvf-data/txt/Paissan_Structured_Sparse_Back-propagation_for_Lightweight_On-Device_Continual_Learning_on_Microcontroller_CVPRW_2024_paper.txt
/cluster/home/lcarretero/workspace/dsl/dsl-research-assistant/raw-data/Conversions/opencvf-data/txt/Halawa_Multi-Task_Multi-Modal_Self-Super

In [4]:
# Tally the extraction results by Python type: str entries are abstracts
# that were found, NoneType entries are documents without a match.
type_labels = [str(type(abstract)) for abstract in abstracts]
pd.Series(type_labels).value_counts()

<class 'str'>         1133
<class 'NoneType'>       9
Name: count, dtype: int64

In [21]:
# Example search helper
def search_documents(query, top_k=5):
    """Run a query against the notebook-level ``document_store`` and print the hits.

    Args:
        query: Free-text search string.
        top_k: Number of hits to request from the store.

    Returns:
        The list returned by ``document_store.search``. Each hit is read as a
        mapping with ``rank``, ``score``, ``document_name`` and
        ``chunk_text`` entries.
    """
    hits = document_store.search(query, top_k=top_k)
    separator = "-" * 80

    print(f"Search results for: '{query}'")
    for hit in hits:
        # One print per hit, one field per line, closed by a rule.
        print(
            f"Rank {hit['rank']} (Score: {hit['score']:.4f})",
            f"Document: {hit['document_name']}",
            f"Preview: {hit['chunk_text']}",
            separator,
            sep="\n",
        )
    return hits

# Try the helper on a domain-specific query and keep the hits for inspection.
demo_query = "visual odometry in robotics applications"
search_results = search_documents(demo_query, top_k=3)


Search results for: 'visual odometry in robotics applications'
Rank 1 (Score: 0.7618)
Document: Abouee_Weakly_Supervised_End2End_Deep_Visual_Odometry_CVPRW_2024_paper.txt
Preview: omain (real data) while achieving state-of-the-art performance on the KITTI dataset.

## 1. Introduction

Visual odometry (VO) is a crucial aspect of robotics that enables machines to measure the ego-...
--------------------------------------------------------------------------------
Rank 2 (Score: 0.7586)
Document: Abouee_Weakly_Supervised_End2End_Deep_Visual_Odometry_CVPRW_2024_paper.txt
Preview: current neural network for (un-) supervised learning of monocular video visual odometry and depth. In Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition , pages 5555-556...
--------------------------------------------------------------------------------
Rank 3 (Score: 0.7444)
Document: Abouee_Weakly_Supervised_End2End_Deep_Visual_Odometry_CVPRW_2024_paper.txt
Preview: This CVPR Worksh

In [None]:
# Stress test: repeat the query 100000 times to see how search() copes with
# an extremely long input string. The bare call is the cell's last
# expression, so the returned hits are displayed.
oversized_query = "visual odometry in robotics applications" * 100000
document_store.search(oversized_query, top_k=3)

[{'rank': 1,
  'score': np.float32(0.506165),
  'document_id': 0,
  'document_name': 'Abouee_Weakly_Supervised_End2End_Deep_Visual_Odometry_CVPRW_2024_paper.txt',
  'document_path': 'example_data/Abouee_Weakly_Supervised_End2End_Deep_Visual_Odometry_CVPRW_2024_paper.txt',
  'chunk_text': 'omain (real data) while achieving state-of-the-art performance on the KITTI dataset.\n\n## 1. Introduction\n\nVisual odometry (VO) is a crucial aspect of robotics that enables machines to measure the ego-...'},
 {'rank': 2,
  'score': np.float32(0.47257516),
  'document_id': 0,
  'document_name': 'Abouee_Weakly_Supervised_End2End_Deep_Visual_Odometry_CVPRW_2024_paper.txt',
  'document_path': 'example_data/Abouee_Weakly_Supervised_End2End_Deep_Visual_Odometry_CVPRW_2024_paper.txt',
  'chunk_text': 'ons on robotics , 33(5):1255-1262, 2017. 2, 6\n- [14] David Nist´ er, Oleg Naroditsky, and James Bergen. Visual\n\n- odometry. In Proceedings of the 2004 IEEE Computer Society Conference on Computer Visi...'