In [2]:
!pip install faiss-gpu-cu11==1.10.0

Collecting faiss-gpu-cu11==1.10.0
  Downloading faiss_gpu_cu11-1.10.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting numpy<2 (from faiss-gpu-cu11==1.10.0)
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-runtime-cu11>=11.8.89 (from faiss-gpu-cu11==1.10.0)
  Downloading nvidia_cuda_runtime_cu11-11.8.89-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cublas-cu11>=11.11.3.6 (from faiss-gpu-cu11==1.10.0)
  Downloading nvidia_cublas_cu11-11.11.3.6-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Downloading faiss_gpu_cu11-1.10.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (47.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.8/47.8 MB[0m [31m43.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading nu

In [1]:
import os
import faiss
import pickle
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter

def _load_text_documents(source_dir):
    """Read every regular ``.txt`` file located directly inside ``source_dir``.

    Args:
        source_dir: Directory to scan (not recursive).

    Returns:
        A list of ``{'source': <filename>, 'content': <file text>}`` dicts,
        ordered by filename.

    Raises:
        FileNotFoundError: If ``source_dir`` is not an existing directory.
    """
    if not os.path.isdir(source_dir):
        raise FileNotFoundError(f"Source directory not found: '{source_dir}'")

    documents = []
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # vector ids in the index reproducible from one run/machine to the next.
    for filename in sorted(os.listdir(source_dir)):
        if not filename.endswith(".txt"):
            continue
        file_path = os.path.join(source_dir, filename)
        # Skip directories (or other non-files) that merely end in ".txt";
        # open() on them would abort the whole run.
        if not os.path.isfile(file_path):
            continue
        with open(file_path, 'r', encoding='utf-8') as f:
            documents.append({'source': filename, 'content': f.read()})
    return documents


def create_vector_store_from_text_files(
    source_dir='reliance_source_documents',
    output_dir='reliance_report_vector_store',
    model_name='GIST-large-Embedding-v0',
    chunk_size=1000,
    chunk_overlap=200,
):
    """Build and persist a FAISS vector store from a directory of ``.txt`` files.

    Pipeline: load the text files, split each into overlapping character
    chunks, embed the chunks with a SentenceTransformer model, add the
    embeddings to a ``faiss.IndexFlatL2`` index, then write the index and the
    pickled chunk metadata into ``output_dir``. Row ``i`` of the index
    corresponds to element ``i`` of the pickled chunk list.

    Args:
        source_dir: Directory containing the ``.txt`` source documents.
        output_dir: Directory that receives ``reliance_reports.index`` and
            ``reliance_report_chunks.pkl`` (created if missing).
        model_name: Name or path passed to ``SentenceTransformer``.
            NOTE(review): the default has no organisation prefix, so it must
            resolve as a local path or as-is on the Hub — confirm it loads
            (the Hub repo may need an ``<org>/`` prefix).
        chunk_size: Maximum chunk length in characters (``length_function=len``).
        chunk_overlap: Number of characters shared by consecutive chunks.

    Returns:
        None. Progress and errors are reported on stdout; exceptions are
        caught and printed rather than propagated. The completion banner is
        printed only when the store was actually written.
    """
    print(f"--- Starting Vector Store Creation from Text Documents in '{source_dir}' ---")

    try:
        print(f"Scanning for .txt files in '{source_dir}'...")
        all_docs = _load_text_documents(source_dir)

        if not all_docs:
            print("No .txt files found. Exiting.")
            return

        print(f"Loaded {len(all_docs)} documents.")

        print("\nChunking documents into smaller pieces...")
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len
        )

        chunked_documents = []
        for doc in tqdm(all_docs, desc="Chunking Documents"):
            chunks = text_splitter.split_text(doc['content'])
            # chunk_id restarts at 0 for every source file.
            chunked_documents.extend(
                {'source': doc['source'], 'content': chunk_text, 'chunk_id': i}
                for i, chunk_text in enumerate(chunks)
            )

        print(f"Split {len(all_docs)} documents into {len(chunked_documents)} chunks.")
        if not chunked_documents:
            print("No content to process after chunking. Exiting.")
            return

        print("Example chunk:", chunked_documents[0]['content'])

        documents_for_embedding = [chunk['content'] for chunk in chunked_documents]

        print(f"\nLoading sentence transformer model: '{model_name}'...")
        model = SentenceTransformer(model_name)

        print("Generating embeddings for all chunks... (This may be slow for many documents)")
        embeddings = model.encode(documents_for_embedding, show_progress_bar=True)

        d = embeddings.shape[1]
        print(f"Embeddings created with dimension: {d}")

        print("\nBuilding FAISS index...")
        index = faiss.IndexFlatL2(d)
        # FAISS expects float32 input.
        index.add(embeddings.astype('float32'))
        print(f"Index built successfully. Total vectors: {index.ntotal}")

        # exist_ok avoids the check-then-create race of a separate exists() test.
        os.makedirs(output_dir, exist_ok=True)

        index_path = os.path.join(output_dir, 'reliance_reports.index')
        documents_path = os.path.join(output_dir, 'reliance_report_chunks.pkl')

        print(f"\nSaving FAISS index to: {index_path}")
        faiss.write_index(index, index_path)

        print(f"Saving chunked documents with metadata to: {documents_path}")
        # NOTE: pickle is only safe to load back from trusted locations.
        with open(documents_path, 'wb') as f:
            pickle.dump(chunked_documents, f)

    except FileNotFoundError as e:
        print(f"Error: {e}. Please check your source directory path.")
    except Exception as e:
        import traceback
        traceback.print_exc()
        print(f"\nAn unexpected error occurred: {e}")
    else:
        # Reached only when the try body ran to its end: not after an error
        # and not after one of the early "Exiting." returns.
        print("\n--- Vector Store Creation from Text Files Completed ---")

# Entry point: build the vector store when this cell/script is executed
# directly (skipped if the code is imported as a module).
if __name__ == "__main__":
    create_vector_store_from_text_files()



--- Starting Vector Store Creation for Reliance Industries ---
Loading data from reliance_master_dataset.csv...
Data loaded successfully.
Creating text documents from data rows...


Processing rows: 100%|██████████| 4900/4900 [00:00<00:00, 11096.23it/s]



Created 4900 text documents.
Example document: On 2005-02-21: Reliance Industries closing price was 36.03, with a high of 36.32 and a low of 35.84. The trading volume was 46948572. Average news sentiment score was 0.00. Average Reddit sentiment score was 0.00. Basic EPS was 36.27. The Earnings Yield was 0.07. Return on Equity was 15.96%. Total Debt to Equity ratio was 0.66. 

Loading sentence transformer model: 'all-MiniLM-L6-v2'...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Generating embeddings for all documents... (This may take a moment)


Batches:   0%|          | 0/154 [00:00<?, ?it/s]

Embeddings created with dimension: 384

Building FAISS index...
Index built successfully. Total vectors in index: 4900

Saving FAISS index to: reliance_vector_store_faiss/reliance_faiss.index
Saving documents to: reliance_vector_store_faiss/reliance_documents.pkl

--- Vector Store Creation Completed ---
