In [1]:
!pip -qqq install pinecone-client --progress-bar off
!pip -qqq install sentence_transformers --progress-bar off
!pip -qqq install langchain_community --progress-bar off
!pip -qqq install fitz --progress-bar off
!pip -qqq install pymupdf
!pip -qqq install python-dotenv --progress-bar off

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cudf 23.8.0 requires cubinlinker, which is not installed.
cudf 23.8.0 requires cupy-cuda11x>=12.0.0, which is not installed.
cudf 23.8.0 requires ptxcompiler, which is not installed.
cuml 23.8.0 requires cupy-cuda11x>=12.0.0, which is not installed.
dask-cudf 23.8.0 requires cupy-cuda11x>=12.0.0, which is not installed.
keras-cv 0.8.2 requires keras-core, which is not installed.
keras-nlp 0.8.2 requires keras-core, which is not installed.
tensorflow-decision-forests 1.8.1 requires wurlitzer, which is not installed.
apache-beam 2.46.0 requires dill<0.3.2,>=0.3.1.1, but you have dill 0.3.8 which is incompatible.
apache-beam 2.46.0 requires numpy<1.25.0,>=1.14.3, but you have numpy 1.26.4 which is incompatible.
apache-beam 2.46.0 requires pyarrow<10.0.0,>=3.0.0, but you have pyarrow 15.0.2 which is incompatible.

In [3]:
from pinecone import Pinecone, PodSpec
import os
import numpy as np
from langchain_community.document_loaders import PyMuPDFLoader
from sentence_transformers import SentenceTransformer
import re
from dotenv import load_dotenv

In [None]:
# Tunable defaults used by the helper functions defined further down.
MODEL_NAME = 'mixedbread-ai/mxbai-embed-large-v1'  # default model for generate_embeddings
BATCH_SIZE = 1  # default batch size handed to model.encode
TOP_K = 5  # default number of matches requested from the index

# Load the .env file from the parent directory, then read the Pinecone API key
# from the environment (None when the variable is not set).
load_dotenv('../.env')
PINE_CONE_API = os.environ.get("PINE_CONE_API")

In [4]:
# Initialize Pinecone
# Build the client from the API key read in the config cell, then obtain a handle to
# the index named 'papers-index' (assumed to exist already; nothing in this notebook
# creates it). query_pinecone() reads this module-level `index`.
pc = Pinecone(api_key=PINE_CONE_API)
index = pc.Index('papers-index')

In [12]:
# Already-loaded models keyed by model name, so repeated calls reuse the same
# SentenceTransformer instance instead of re-loading it on every call.
_EMBEDDING_MODELS = {}


def generate_embeddings(texts, model_name = MODEL_NAME, batch_size=BATCH_SIZE):
    """Encode `texts` with a SentenceTransformer model.

    Args:
        texts: Input forwarded unchanged to `model.encode`; this notebook passes
            a single string.
        model_name: Name of the model to load. The loaded model is cached per name.
        batch_size: Batch size forwarded to `model.encode`.

    Returns:
        The result of `model.encode(..., convert_to_numpy=True)`.
    """
    model = _EMBEDDING_MODELS.get(model_name)
    if model is None:
        # First request for this model name: load it once and remember it.
        model = SentenceTransformer(model_name)
        _EMBEDDING_MODELS[model_name] = model
    return model.encode(texts, batch_size=batch_size, show_progress_bar=False, convert_to_numpy=True)


In [16]:
def query_pinecone(embedding, top_k=TOP_K):
    """Query the module-level Pinecone `index` with one embedding.

    Args:
        embedding: Object exposing `.tolist()`; the resulting list is sent as
            the query vector.
        top_k: Number of matches to request.

    Returns:
        A one-element list holding the query response, or an empty list when
        the query raised (the error is printed, not re-raised).
    """
    try:
        response = index.query(
            vector=embedding.tolist(),
            top_k=top_k,
            include_metadata=True,
        )
    except Exception as e:
        print(f"Error querying with embedding: {e}")
        return []
    return [response]


In [10]:
def extract_text(FILE_PATH):
    """Return the abstract-like portion of the first page of a PDF.

    Only the first document returned by the loader is inspected. The result is
    chosen as follows (marker search is case-sensitive):

    1. If 'Abstract' occurs before 'Introduction', the text from 'Abstract' up
       to (not including) 'Introduction'.
    2. Otherwise, if 'Introduction' occurs, everything before it.
    3. Otherwise, the first 300 space-separated tokens of the page.

    Args:
        FILE_PATH: Path of the PDF, handed to `PyMuPDFLoader`.

    Returns:
        The extracted text, or "" when the PDF could not be loaded (an error
        line is printed in that case).
    """
    try:
        pages = PyMuPDFLoader(FILE_PATH).load()
        text = pages[0].page_content
    except Exception:
        # Best effort: report and return an empty result. `Exception` (not a bare
        # `except`) keeps KeyboardInterrupt / SystemExit propagating.
        print("ERROR: Could not load PDF")
        return ""

    start_index = text.find('Abstract')
    end_index = text.find('Introduction')

    # `start_index < end_index` also implies 'Introduction' was found, and it
    # rejects the reversed order that would produce an empty slice.
    if start_index != -1 and start_index < end_index:
        return text[start_index:end_index]
    if end_index != -1:
        return text[:end_index]

    # get first 300 words by default.
    # abstract usually has a max limit of 250 words
    return ' '.join(text.split(' ')[:300])



In [8]:
# PDF whose abstract is used as the search query. This is an absolute path under
# /kaggle/input, so it must be changed when the notebook runs anywhere else.
FILE_PATH = '/kaggle/input/llms-in-machine-translation/LLMS_in_Machine_Translation.pdf'

In [17]:
# Pull the abstract-like text out of the PDF; extract_text returns "" when the
# PDF cannot be loaded.
data = extract_text(FILE_PATH)
# Only query when more than 5 characters were extracted; otherwise the cell
# finishes silently without printing anything.
if len(data) > 5:
    embeddings = generate_embeddings(data)  # embed the extracted text
    query_results = query_pinecone(embeddings)  # list holding the index response
    print(query_results)

[{'matches': [{'id': '10.1109/TASL.2013.2245649',
              'metadata': {'authors': 'Ciprian Chelba, Peng Xu, Fernando '
                                      'Pereira, Thomas Richardson',
                           'doi': '10.1109/TASL.2013.2245649',
                           'latest_creation_date': '2013-02-05 17:09:49',
                           'title': 'Large Scale Distributed Acoustic Modeling '
                                    'With Back-off N-grams'},
              'score': 0.818493128,
              'values': []},
             {'id': '10.1609/aaai.v34i05.6448',
              'metadata': {'authors': 'Yu Wan and Baosong Yang and Derek F. '
                                      'Wong and Lidia S. Chao and Haihua\n'
                                      '  Du and Ben C.H. Ao',
                           'doi': '10.1609/aaai.v34i05.6448',
                           'latest_creation_date': '2019-12-11 06:21:16',
                           'title': 'Unsupervised Neural Diale