### Vector storage

content= text /image summary / table summary

metadata 

{
    topic:
    type:
    image_url{optional}:
    json_data{optional}:
    
}



In [None]:
import getpass
import base64
from PIL import Image
from io import BytesIO
from uuid import uuid4
from langchain_core.documents import Document
import os
from langchain_openai import AzureOpenAIEmbeddings
from langchain_astradb import AstraDBVectorStore
import csv
import json
import os
import re

from langchain_text_splitters import RecursiveCharacterTextSplitter


os.environ["OPENAI_API_VERSION"] = ""
os.environ["AZURE_OPENAI_ENDPOINT"] = ""
os.environ["AZURE_OPENAI_API_KEY"] = ""

embeddings = AzureOpenAIEmbeddings(
    azure_deployment="embeddingada",
    openai_api_version="2024-02-01",
)


vector_store = AstraDBVectorStore(
    collection_name="rpf_data",
    embedding=embeddings,
    api_endpoint="",
    token="",
    namespace="default_keyspace",
)

desired_namespace = getpass.getpass("RPF")


In [24]:

def create_page_docs(directory,topic):
    try:
    ## Returns the text content of the document
        # Regex pattern to match the file format page_{n}.txt
        pattern = re.compile(r"^page_\d+\.txt$")

        # Iterate over files in the directory
        for filename in os.listdir(directory):
            if pattern.match(filename):  # Check if the file matches the pattern
                file_path = os.path.join(directory, filename)
                with open(file_path, 'r',encoding='utf-8', errors='ignore') as file:
                    content = file.read().replace("\n"," ")
                    # print(f"Content of {filename}:\n{content}\n")
        # doc=Document(content)
        # doc.metadata['type']="text"

        text_splitter = RecursiveCharacterTextSplitter(
        # Set a really small chunk size, just to show.
        chunk_size=250,
        chunk_overlap=30,
        length_function=len,
        is_separator_regex=False,)

        # print(Document(content))

        splitted_docs=text_splitter.split_documents([Document(content)])

        page_docs=[]
        for doc in splitted_docs:
            doc.metadata['type']="text"
            doc.metadata['topic']=topic

            page_docs.append(doc)


        return page_docs
    except Exception as e:
        print(e)

In [25]:
import os
import re
from langchain_core.documents import Document

def create_image_data_docs(directory, topic):
     # Replace with the actual import for `Document`
    try:
    # Regex patterns to match summary and base64 files with any page number
        summary_pattern = re.compile(r"^page_(\d+)_img_(\d+)_summary\.txt$")
        base64_pattern = re.compile(r"^page_(\d+)_img_(\d+)_base64\.txt$")

        # Dictionary to store summaries and base64 file paths
        data_dict = {}

        # Iterate over files in the directory
        for filename in os.listdir(directory):
            summary_match = summary_pattern.match(filename)
            base64_match = base64_pattern.match(filename)
            
            if summary_match:
                page_num, img_num = summary_match.groups()
                key = f"{page_num}_{img_num}"
                with open(os.path.join(directory, filename), 'r') as file:
                    if key not in data_dict:
                        data_dict[key] = {"summary": "", "base64_path": ""}
                    data_dict[key]["summary"] = file.read()
            
            elif base64_match:
                page_num, img_num = base64_match.groups()
                key = f"{page_num}_{img_num}"
                base64_path = os.path.join(directory, filename)
                if key not in data_dict:
                    data_dict[key] = {"summary": "", "base64_path": ""}
                data_dict[key]["base64_path"] = base64_path

        # Convert the dictionary to a list of dictionaries
        result = [value for key, value in data_dict.items() if value["summary"] and value["base64_path"]]

        image_data = []
        for dict_item in result:
            data = Document(dict_item["summary"])
            data.metadata["base64_path"] = dict_item["base64_path"]
            data.metadata["type"] = "image"
            data.metadata["topic"] = topic
            image_data.append(data)

        return image_data
    except Exception as e:
        print(e)


In [26]:


def create_csv_docs(directory,topic):
    # Regex patterns to match summary and csv files
    try:
        summary_pattern = re.compile(r"^page_(\d+)_table_(\d+)_summary\.txt$")
        csv_pattern = re.compile(r"^page_(\d+)_table_(\d+)\.csv$")

        # Dictionary to store summaries and CSV data
        data_dict = {}

        # Iterate over files in the directory
        for filename in os.listdir(directory):
            summary_match = summary_pattern.match(filename)
            csv_match = csv_pattern.match(filename)
            
            if summary_match:
                page_num, table_num = summary_match.groups()
                key = f"{page_num}_{table_num}"
                with open(os.path.join(directory, filename), 'r') as file:
                    if key not in data_dict:
                        data_dict[key] = {"summary": "", "csv_data": ""}
                    data_dict[key]["summary"] = file.read()
            
            elif csv_match:
                page_num, table_num = csv_match.groups()
                key = f"{page_num}_{table_num}"
                with open(os.path.join(directory, filename), 'r') as file:
                    reader = csv.DictReader(file)
                    csv_data = [row for row in reader]  # Convert CSV rows to a list of dictionaries
                    if key not in data_dict:
                        data_dict[key] = {"summary": "", "csv_data": ""}
                    data_dict[key]["csv_data"] = json.dumps(csv_data)  # Convert to JSON string

        # Convert the dictionary to a list of dictionaries
        result = [value for key, value in data_dict.items() if value["summary"] and value["csv_data"]]

        csv_docs=[]
        for csv_data in result:
            doc=Document(csv_data['summary'])
            doc.metadata["table_data"]=csv_data["csv_data"]
            doc.metadata['type']="table"
            doc.metadata["topic"]=topic
            csv_docs.append(doc)
        return csv_docs
    except Exception as e:
        print(e)

# Print the result


In [27]:
# D:\Hackthon\NetApp StorageGrid\Data to be added\1.Get_started_with_StorageGRID system
# Function to work on the page folder

def form_documents_from_page(page_folder,topic):

    page_docs=create_page_docs(directory=page_folder,topic=topic)

    image_docs=create_image_data_docs(directory=page_folder,topic=topic)

    csv_docs=create_csv_docs(directory=page_folder,topic=topic)


    return page_docs+image_docs+csv_docs





In [9]:
# page_folder=r"Extracted_Data\1.Get_started_with_StorageGRID system\page_19"
# topic="get started with storageGRID system"

# form_documents_from_page(page_folder,topic)

In [28]:

def embed_docs(page_folder_path,topic):

    docs=form_documents_from_page(page_folder=page_folder_path,topic=topic)

    uuids = [str(uuid4()) for _ in range(len(docs))]

    vector_store.add_documents(documents=docs, ids=uuids)

In [40]:
import time
from pathlib import Path
base_path = Path(r'Extracted_Data\8.Recovery or Replace Nodes')



absolute_folders = [str(f) for f in base_path.rglob('*') if f.is_dir()]
# print(absolute_folders)

topic = "Recovery or Replace Nodes"


In [41]:
i=0
for folder in absolute_folders: 
    try:
        embed_docs(page_folder_path=folder, topic=topic)
        i=i+1
        print(i)
      # Exit the loop if the function succeeds
    except Exception as e:
        print(f"Error processing folder '{folder}': {e}. Retrying in 60 seconds...")
            # time.sleep(60)  # Wait for 60 seconds before retrying

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113


In [3]:
import base64
import zlib

# Read the base64 string from a file
with open(r'Extracted_Data\1.Get_started_with_StorageGRID system\page_19\page_19_img_1_base64.txt', 'r') as file:
    base64_string = file.read()

print("Original size:", len(base64_string))

# Compress using zlib
compressed_data = zlib.compress(base64_string.encode())

# Encode the compressed data back to base64 (optional)
compressed_base64 = base64.b64encode(compressed_data).decode()
print(compressed_base64)

print("Compressed size:", len(compressed_base64))

# # Save the compressed string to a file
# with open('compressed_file.txt', 'w') as file:
#     file.write(compressed_base64)


Original size: 13492
eJyleteyq1i25QfxgHePCIQQXnh4wwrv/df32iezKk9m1Y2+Ha0IxUZoLjfNmGOgDbM1TH854+Nbe9i5XMT9ej1gduE+B2fw30Xnj8YQPof+5BjjaYWNeIau+AiXN/+MZeGz6Xd4q/X7MhrRseqBdWri0pqysl3S90U0D5xSjDq09bvT8VE0+mPup/l8cP7zDCfxTCffWS4DbSPf+/97wwzHPT9fbvuKO3c8K457W9xHs7gCfnIP8CX3FLkPWPnn/dfrwX215+MTSjzXKDwMbFX3A8Y+uEPjH5wmPj6f549d8Xny3ME9nx/Xfjvi41HqVvhJq9T6XN/oVfK8zi/p03NX6ZAFvUqHt8yXYfB6NLFsh73CN5MuuJh+f0mj0Z+WG0n2s1Vc1PK8Ngt9b6yjl9dFWDnGvY5kQYTnUkt9CVssh/xdhZPdkLbXjnnYRVM1UHY7TvkwxdO10Da6zjmxJTN/MI54LsX7SmcbYR0PXYsQy+aS2MyGzJOeMvqJKdSbNQOMhWsS2n/5B/hEe/zbP7+O/D/6x/rlny8wOj7SH/558tnxy9Z6cG6pfUL74z2lMHg671p58A+Lb+LXeXyaWkPVQ/TlC/kMviw4pVw8//AR8JhiD6hep4ThjIL19F4WUsp2q7ueFwW+2MYhZrVRlw2xP96Z5GEZXpL5oAjl23+VRCVXo+K0su+3ZBV1k1KPit+NVDVOs3Kvqo+tdEVuiyqcWvA6mVq+VtVB9cBH2TrCNrUmjaAjoXqk9r/5J+a4Q+A+HP/8ySeB4Vh5AC9OUcZ4nzSjoYI0QCC7UUisnk6KK6qYnBIffrLzOKA5seiGXq/P+qPQUDpJm1208I0MHK1Um7UXopatM9ZfNv4tkY5YauNzCbJsSolRhN4Q10RZGt/aXbWNtob7PfF2Fii1fHw+A87BpQUHFJegYojCxIAEznwADxTneTORq9xJCo3cWyJQfcr3OBi11G3Z3Dp