In [17]:
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain_community.document_loaders.merge import MergedDataLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings

import pandas as pd
import numpy as np

In [11]:
loader_1 = CSVLoader(
    file_path='../data/processed_data_part1.csv', 
    source_column='Abstract',
    metadata_columns=['PMID', 'Title', 'Authors', 'Publication Date', 'DOI'])

loader_2 = CSVLoader(
    file_path='../data/processed_data_part2.csv', 
    source_column='Abstract',
    metadata_columns=['PMID', 'Title', 'Authors', 'Publication Date', 'DOI'])

loader_all = MergedDataLoader(loaders=[loader_1, loader_2])
docs_all = loader_all.load()

print(len(docs_all))
docs_all[:2]

58850


[Document(page_content='Abstract: SUMMARY Several lines of evidence support the involvement of inflammatory and immunologic abnormalities in chronic fatigue syndrome CFS Since recent studies have shown that α1 antitrypsin AAT possesses antiinflammatory properties the potential therapeutic effect of AAT treatment on CFS has been investigated A 49yearold woman diagnosed with CFS was treated with intravenous infusions of a human plasmaderived AAT concentrate 60 mgkg body weight weekly for 8 consecutive weeks The patients monocyte elastase a regulator of inflammatory processes was 1170 Umg At completion of treatment improvement in maximal workload was observed 540717 of predicted Additionally amelioration in working memory scores 8394 and perceptual organization scores 7583 were detected on the Wechsler Adult Intelligence ScaleIII test Monocyte elastase decreased to a normal range 150 Umg Improvement in functional capacity allowed the patient to work in parttime employment These findings s

In [14]:
# Apply text splitting into chunks to prevent truncation of longer abstracts
# Chunk size: since BERT's max token length is 512, a safe chunk size might be 200-300
# Overlap size: to maintain context around 50-100 words

text_splitter = RecursiveCharacterTextSplitter(chunk_size=250, chunk_overlap=75)
docs = text_splitter.split_documents(docs_all)

docs[0]

Document(page_content='Abstract: SUMMARY Several lines of evidence support the involvement of inflammatory and immunologic abnormalities in chronic fatigue syndrome CFS Since recent studies have shown that α1 antitrypsin AAT possesses antiinflammatory properties the', metadata={'source': 'SUMMARY Several lines of evidence support the involvement of inflammatory and immunologic abnormalities in chronic fatigue syndrome CFS Since recent studies have shown that α1 antitrypsin AAT possesses antiinflammatory properties the potential therapeutic effect of AAT treatment on CFS has been investigated A 49yearold woman diagnosed with CFS was treated with intravenous infusions of a human plasmaderived AAT concentrate 60 mgkg body weight weekly for 8 consecutive weeks The patients monocyte elastase a regulator of inflammatory processes was 1170 Umg At completion of treatment improvement in maximal workload was observed 540717 of predicted Additionally amelioration in working memory scores 8394 and

In [None]:
# Try out SentenceTransformersTokenTextSplitter: 
# splits the text into chunks that fit the token window of the sentence transformer model

# from langchain.text_splitter import SentenceTransformersTokenTextSplitter

# splitter = SentenceTransformersTokenTextSplitter(chunk_overlap=0)

# count_start_and_stop_tokens = 2
# text = "Lorem "
# text_token_count = splitter.count_tokens(text=text) - count_start_and_stop_tokens
# print(text_token_count)

In [24]:
# Generate embeddings using different embedding models

embeddings_model = OpenAIEmbeddings(openai_api_key="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")

embeddings = embeddings_model.embed_documents(docs)

print(len(embeddings))
print(len(embeddings[0]))

RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}

In [None]:
# Convert the list of arrays to a numpy array
embeddings_array = np.array(embeddings)

# Save the embeddings to a .npy file
np.save("../embeddings/open_ai_embeddings.npy", embeddings_array)