In [1]:
import os
from dotenv import load_dotenv

# Pull variables from a local .env file into the process environment.
load_dotenv()
# None when the variable is not set in the environment / .env file.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Active model name. Exactly one assignment so the chosen value is unambiguous;
# alternatives tried previously: "gpt-3.5-turbo", "mixtral:8x7b".
MODEL = "llama3"

In [2]:
from langchain_openai.chat_models import ChatOpenAI
from langchain_community.llms import Ollama

# if MODEL.startswith('gpt'):
#     model = ChatOpenAI(api_key = OPENAI_API_KEY, model = MODEL)
# else:
#     model = Ollama(model = MODEL)
    
# model.invoke("Tell me a joke")

In [3]:
import pandas as pd

# Location of the arXiv metadata snapshot (JSON Lines: one record per line).
file_path = '../arxiv-metadata-oai-snapshot.json'

# Only these fields are kept from each record.
columns_to_read = ['id','title', 'authors', 'abstract']

# Rows parsed per chunk; tune to the memory available.
chunk_size = 100

# Stream the first 1000 rows in chunks, trimming each chunk to the wanted
# columns as it arrives.
reader = pd.read_json(file_path, lines=True, chunksize=chunk_size, nrows=1000)
dfs = [piece[columns_to_read] for piece in reader]

# Stitch the chunks together under a fresh 0..n-1 index.
df = pd.concat(dfs, ignore_index=True)

In [4]:
# df.shape

In [5]:
# df.head()

In [6]:
# df.iloc[1,2]

In [7]:
import json

from itertools import islice

# Upper bound on how many records are taken from the snapshot.
num_records = 1000

# Read the JSON Lines file lazily and keep only the configured columns.
with open(file_path, 'r', encoding='utf-8') as f:
    # Whitespace-only lines are skipped so they cannot crash json.loads.
    parsed = (json.loads(line) for line in f if line.strip())
    # islice stops *before* parsing record num_records + 1, so a limit of 0
    # yields an empty list instead of one stray record.
    records = [
        {key: record[key] for key in columns_to_read}
        for record in islice(parsed, num_records)
    ]

In [8]:
# Build the frame from the extracted records and, in the same pass, swap every
# newline inside string cells for a single space.
df = pd.DataFrame(records).replace(to_replace=r'\n', value=' ', regex=True)
df.head()

Unnamed: 0,id,title,authors,abstract
0,704.0001,Calculation of prompt diphoton production cros...,"C. Bal\'azs, E. L. Berger, P. M. Nadolsky, C.-...",A fully differential calculation in perturba...
1,704.0002,Sparsity-certifying Graph Decompositions,Ileana Streinu and Louis Theran,"We describe a new algorithm, the $(k,\ell)$-..."
2,704.0003,The evolution of the Earth-Moon system based o...,Hongjun Pan,The evolution of Earth-Moon system is descri...
3,704.0004,A determinant of Stirling cycle numbers counts...,David Callan,We show that a determinant of Stirling cycle...
4,704.0005,From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...,Wael Abu-Shammala and Alberto Torchinsky,In this paper we show how to compute the $\L...


In [9]:
# Show full cell contents instead of truncating long strings.
pd.options.display.max_colwidth = None
# First 25 titles.
df['title'].iloc[:25]

0                                            Calculation of prompt diphoton production cross sections at Tevatron and   LHC energies
1                                                                                           Sparsity-certifying Graph Decompositions
2                                                The evolution of the Earth-Moon system based on the dark matter field   fluid model
3                                          A determinant of Stirling cycle numbers counts unlabeled acyclic   single-source automata
4                                                                               From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\alpha}$
5                                                                         Bosonic characters of atomic Cooper pairs across resonance
6                                                                                  Polymer Quantum Mechanics and its Continuum Limit
7                                                 Numerical solution 

In [10]:
# Sanity check: (rows, columns) of the cleaned frame.
df.shape

(1000, 4)

In [11]:
from langchain.embeddings import FakeEmbeddings
from langchain_openai import OpenAIEmbeddings

# NOTE: despite the legacy "fake_" prefix these are real OpenAI embeddings; the
# names are kept because later cells refer to them. For an offline dry run,
# swap in FakeEmbeddings(size=1536) here instead.
fake_embeddings = OpenAIEmbeddings()

# One embed_documents call over all abstracts replaces a Python loop issuing a
# separate embed_query request per row; the result is still a list with one
# vector (list of floats) per abstract, in DataFrame order.
fake_embeddings_list = fake_embeddings.embed_documents(df['abstract'].tolist())

In [12]:
# Peek at the first 15 components of the first abstract's embedding.
fake_embeddings_list[0][:15]

[-0.019489128751307233,
 0.007486256538949653,
 -0.021519534961965334,
 -0.016367553313181165,
 0.0020338583907193804,
 0.024461551402097298,
 -0.016008435317192957,
 -0.02182340503845662,
 -0.052320925290386094,
 -0.01647805287746542,
 0.019793000690443728,
 0.024779232992801513,
 -0.02839805088227111,
 0.004216198941131173,
 -3.9845196959817e-05]

In [13]:
import numpy as np
# Pack the vectors into a float32 matrix (one row per abstract) for FAISS.
fake_embeddings_list = np.asarray(fake_embeddings_list, dtype="float32")

In [14]:
import faiss

# Take the vector width from the data rather than hard-coding 1536, so the
# index stays correct if the embedding model (and its output size) changes.
embedding_dim = fake_embeddings_list.shape[1]

# Exact (brute-force) L2 index; rows are added in DataFrame order.
index = faiss.IndexFlatL2(embedding_dim)
index.add(fake_embeddings_list)

In [15]:
# Embed the free-text query with the same embedder used for the abstracts.
query_embedding = fake_embeddings.embed_query("biology, plant, species")
# Wrap it as a single-row float32 matrix for the search call.
test_sample = np.asarray([query_embedding], dtype="float32")

In [16]:
# Number of nearest neighbours to retrieve.
k = 5
# Search the index for the k closest vectors to the query; the result is
# unpacked into the distances and the positions of the matches.
distances, indices = index.search(test_sample, k)

In [17]:
# Positions of the matched vectors, one row per query.
indices

array([[303,  33, 321, 511, 831]], dtype=int64)

In [18]:
# Look up title and abstract for each hit by indexing the frame's array form
# with the neighbour positions.
df[['title', 'abstract']].to_numpy()[indices]

array([[['The World as Evolving Information',
         '  This paper discusses the benefits of describing the world as information, especially in the study of the evolution of life and cognition. Traditional studies encounter problems because it is difficult to describe life and cognition in terms of matter and energy, since their laws are valid only at the physical scale. However, if matter and energy, as well as life and cognition, are described in terms of information, evolution can be described consistently as information becoming more complex.   The paper presents eight tentative laws of information, valid at multiple scales, which are generalizations of Darwinian, cybernetic, thermodynamic, psychological, philosophical, and complexity principles. These are further used to discuss the notions of life, cognition and their evolution. '],
        ['Origin of adaptive mutants: a quantum measurement?',
         '  This is a supplement to the paper arXiv:q-bio/0701050, containing the te

In [19]:
# Persist the FAISS index to disk (path is relative to the working directory).
faiss.write_index(index, 'abstractIndex.index')

In [20]:
# Export the cleaned frame. The DataFrame index is written as an unnamed first
# column (pandas default).
# NOTE(review): presumably consumers rely on that column to map FAISS hit
# positions back to rows — confirm before adding index=False.
df.to_csv("ThousandCornellAbstracts.csv")