In [2]:
from langchain.document_loaders.csv_loader import CSVLoader
from langchain_community.document_loaders.directory import DirectoryLoader
from langchain.document_loaders import PyPDFLoader, DirectoryLoader

In [3]:
def load_documents(data):
    """Load the CSV and PDF documents found under ``data``.

    Two ``DirectoryLoader`` instances are created over the same directory:
    one with ``glob='*.csv'`` parsed by ``CSVLoader`` and one with
    ``glob='*.pdf'`` parsed by ``PyPDFLoader``.

    Args:
        data: Directory path handed to both ``DirectoryLoader`` instances.

    Returns:
        The CSV loader's documents followed by the PDF loader's documents,
        joined with ``+``.
    """
    csv_source = DirectoryLoader(data, glob='*.csv', loader_cls=CSVLoader)
    pdf_source = DirectoryLoader(data, glob='*.pdf', loader_cls=PyPDFLoader)

    # CSV files are read first, then PDFs; the result keeps that order.
    csv_docs, pdf_docs = csv_source.load(), pdf_source.load()
    return csv_docs + pdf_docs

In [5]:
# Load every matching CSV/PDF document from the data folder.
# NOTE(review): hard-coded absolute path (looks like a Colab Drive mount) —
# adjust when running in another environment.
extracted_data = load_documents('/content/drive/MyDrive/final_data')

In [6]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

def text_split(extracted_data, chunk_size=1000, chunk_overlap=200):
  """Split loaded documents into overlapping text chunks.

  Args:
    extracted_data: Documents to split; passed unchanged to
      ``RecursiveCharacterTextSplitter.split_documents``.
    chunk_size: Forwarded as the splitter's ``chunk_size``. Defaults to
      1000, the value that used to be hard-coded here.
    chunk_overlap: Forwarded as the splitter's ``chunk_overlap``. Defaults
      to 200, the value that used to be hard-coded here.

  Returns:
    The result of ``split_documents`` for ``extracted_data``.
  """
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size,
                                                 chunk_overlap=chunk_overlap)
  return text_splitter.split_documents(extracted_data)

In [7]:
# Chunk the loaded documents and report how many chunks were produced.
text_chunks = text_split(extracted_data)
print("length of my chunk:", len(text_chunks))

length of my chunk: 10174


In [8]:
# Peek at a single chunk (index 1) to check its content and metadata.
text_chunks[1]

Document(page_content=': 1\nquestion: How does NLP relate to Computer Vision?\nanswer: NLP and Computer Vision often complement each other in tasks where understanding both textual and visual information is crucial. For example, in image captioning, the combination of NLP and Computer Vision is used to generate descriptive captions for images.', metadata={'source': '/content/drive/MyDrive/final_data/new_train.csv', 'row': 1})

In [10]:
from langchain.embeddings import HuggingFaceEmbeddings

# Embedding model download helper
def download_hugging_face_embeddings():
    """Create the Hugging Face embedding model used by this notebook.

    Returns:
        A ``HuggingFaceEmbeddings`` instance built with
        ``model_name='sentence-transformers/all-MiniLM-L6-v2'``.
    """
    return HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')

In [11]:
# Instantiate the embedding model once; the same object is used both to
# build the vector store and to reopen it later.
embeddings = download_hugging_face_embeddings()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [12]:
# Display the embeddings object to inspect its configuration.
embeddings

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

Create the vector database

In [14]:
from langchain.vectorstores import Chroma

In [15]:
def create_db(text_chunks, embeddings):
  """Build a Chroma vector store from the chunks, persisted under 'database'.

  Args:
    text_chunks: Documents passed as ``documents`` to ``Chroma.from_documents``.
    embeddings: Embedding object passed as ``embedding``.

  Returns:
    The vector store returned by ``Chroma.from_documents``.
  """
  return Chroma.from_documents(
      documents=text_chunks,
      embedding=embeddings,
      persist_directory='database',
  )

In [16]:
# Embed all chunks and build the vector store (the persist directory is
# chosen inside create_db).
vectordb = create_db(text_chunks, embeddings)

In [17]:
# Persist the DB to disk, then drop the in-memory reference so that the
# store must be reopened from the persisted files before further use.
vectordb.persist()
vectordb = None

In [18]:
# Reopen the persisted Chroma store from its on-disk directory; from here on
# it is used exactly like the freshly built one.
directory = 'database'
vectordb = Chroma(embedding_function=embeddings, persist_directory=directory)

Make a retriever

In [19]:
# Wrap the vector store in a retriever using the default search settings.
retriever = vectordb.as_retriever()

In [20]:
# Run a sample query through the retriever.
doc = retriever.get_relevant_documents("What is CNN")

In [21]:
# Show the documents retrieved for the sample query.
doc

[Document(page_content=': 943\nquestion: What is the role of the term convolutional layer in convolutional neural networks (CNNs)?\nanswer: A convolutional layer in CNNs applies convolution operations to input data, extracting features such as edges, textures, and patterns. It involves learning a set of filters or kernels to convolve over the input, capturing hierarchical representations. Convolutional layers are fundamental in image processing tasks, enabling CNNs to automatically learn relevant features from visual data.', metadata={'row': 943, 'source': '/content/drive/MyDrive/final_data/new_train.csv'}),
 Document(page_content=': 11\nquestion: How does a Convolutional Neural Network (CNN) work in Computer Vision?\nanswer: A Convolutional Neural Network (CNN) is designed to process and recognize visual patterns in data. It uses convolutional layers to learn hierarchical features from images, enabling it to capture spatial relationships and patterns. CNNs are widely used in tasks lik

In [25]:
# Count how many documents the sample query returned.
# NOTE(review): this cell's execution count is out of sequence with its
# position; `doc` still holds the results of the earlier query, not of any
# retriever created afterwards.
len(doc)

4

In [22]:
# Recreate the retriever, passing k=2 through search_kwargs
# (presumably caps the number of returned documents at 2 — confirm in the
# retriever documentation).
retriever = vectordb.as_retriever(search_kwargs={"k":2})

In [23]:
# Check which search strategy the retriever is configured with.
retriever.search_type

'similarity'

In [24]:
# Confirm the search kwargs passed when the retriever was created.
retriever.search_kwargs

{'k': 2}