<a href="https://colab.research.google.com/github/NeuralNetworkNecromancer/Alrik-Abenteurer/blob/main/Alrik_Abenteurer_Rule_Data_Loader.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
####INITIATE####
!pip install qdrant-client openai google-auth google-auth-oauthlib google-auth-httplib2 google-api-python-client requests PyPDF2 pytz

# utils
import os
import json
import openai
from datetime import datetime
from IPython.display import display, clear_output, Markdown, HTML
from google.colab import drive, files, auth, userdata, output
import ipywidgets as widgets
import pytz

#data loading packages
from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials
import google.auth
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
import PyPDF2
import re

from googleapiclient.http import MediaIoBaseDownload
import io

# Mount Google Drive so the project folders below are reachable from this runtime
drive.mount('/content/drive')

# Project locations on the mounted drive and the prefix used for output file names
folder_dir = '/content/drive/MyDrive/DSA/Alrik Abenteurer'
app_dir = f'{folder_dir}/App'
targeted_folder = f'{folder_dir}/Rules'
company_name = 'alrik'

# OpenAI: read the key through google.colab's userdata and register it with the library
OPENAI_API_KEY = userdata.get('openai')
openai.api_key = OPENAI_API_KEY

# OAuth 2.0 scope requested for the Google APIs.
# When changing SCOPES, delete token.json so previously stored credentials are not reused.
SCOPES = ['https://www.googleapis.com/auth/drive']

# Start from credentials cached in token.json, if that file exists
creds = None
if os.path.exists('token.json'):
    creds = Credentials.from_authorized_user_file('token.json', SCOPES)

# Without usable credentials: refresh expired ones when a refresh token is
# available, otherwise fall back to the interactive Colab login
if not (creds and creds.valid):
    if not (creds and creds.expired and creds.refresh_token):
        auth.authenticate_user()
        creds, _ = google.auth.default()
    else:
        creds.refresh(Request())

# Build the Drive, Docs and Sheets API clients (in that order) from the same credentials
drive_service, docs_service, sheets_service = (
    build(api_name, api_version, credentials=creds)
    for api_name, api_version in (('drive', 'v3'), ('docs', 'v1'), ('sheets', 'v4'))
)

# Hide the installation / authentication output of this cell
clear_output()

In [None]:
####LOAD DATA####

# Function to recursively fetch files from a folder and its subfolders
def fetch_files_from_folder(folder_id, path=""):
    """Recursively collect file data from a Drive folder and all of its subfolders.

    Args:
        folder_id: Drive ID of the folder to list.
        path: Slash-separated path of this folder relative to the starting
            folder; extended with each subfolder name during recursion.

    Returns:
        A list with one entry per file for which ``fetch_file_data`` returned
        a truthy value.
    """
    files_data = []
    page_token = None

    # files.list returns results in pages; keep requesting until the API no
    # longer hands back a nextPageToken, otherwise large folders are truncated.
    while True:
        results = drive_service.files().list(
            # Skip trashed items, matching the filter used by fetch_folder_id
            q=f"'{folder_id}' in parents and trashed = false",
            fields="nextPageToken, files(id, name, mimeType, parents)",
            pageToken=page_token,
        ).execute()

        for item in results.get('files', []):
            # Check if item is a folder, recursively fetch its contents
            if item['mimeType'] == 'application/vnd.google-apps.folder':
                subfolder_id = item['id']
                subfolder_path = f"{path}/{item['name']}"
                files_data.extend(fetch_files_from_folder(subfolder_id, subfolder_path))
            else:
                # Fetch file details and content for non-folder items
                file_data = fetch_file_data(item, path)
                if file_data:
                    files_data.append(file_data)

        page_token = results.get('nextPageToken')
        if not page_token:
            break

    return files_data

# Function to fetch data for a single file
def fetch_file_data(file_item, folder_path):
    """Build the export record for a single Drive file.

    Args:
        file_item: Drive file resource dict; ``id``, ``name`` and ``mimeType``
            are read from it.
        folder_path: Path of the containing folder, stored in the record as-is.

    Returns:
        A dict with ``file_id``, ``file_name``, ``folder_path``, ``file_url``
        and ``content_text`` when the item is a Google Doc; ``None`` for any
        other MIME type or when processing raised an exception (the error is
        printed, not re-raised).
    """
    try:
        doc_id = file_item.get('id')
        doc_name = file_item.get('name')
        doc_url = f"https://docs.google.com/document/d/{doc_id}"

        # Progress log, emitted for every item regardless of its type
        print(f"Processing file: {doc_name}, URL: {doc_url}")

        # Only Google Docs carry content this loader can read
        if file_item['mimeType'] != 'application/vnd.google-apps.document':
            return None

        document = docs_service.documents().get(documentId=doc_id).execute()
        return {
            "file_id": doc_id,
            "file_name": doc_name,
            "folder_path": folder_path,
            "file_url": doc_url,
            "content_text": extract_text_from_doc(document),
        }
    except Exception as e:
        # Best effort: report the failure and let the caller skip this file
        print(f"Error processing file {file_item.get('name')}: {e}")
        return None

# Function to extract text from a Google Doc
def extract_text_from_doc(doc):
    """Return the plain text of a Google Docs API document resource.

    Text runs are concatenated in document order. Besides top-level
    paragraphs, text nested in table cells and in a table of contents is
    included; elements without text (e.g. section breaks) contribute nothing.

    Args:
        doc: Document dict as returned by ``documents().get(...).execute()``.

    Returns:
        The concatenated text, or ``""`` when the document has no body content.
    """
    return "".join(_iter_text_runs(doc.get('body', {}).get('content', [])))


def _iter_text_runs(elements):
    """Yield the text of every text run in a list of structural elements, recursing into nested content."""
    for element in elements:
        if 'paragraph' in element:
            for part in element['paragraph'].get('elements', []):
                # Non-text paragraph elements have no textRun and yield ''
                yield part.get('textRun', {}).get('content', '')
        elif 'table' in element:
            # Each table cell holds its own list of structural elements
            for row in element['table'].get('tableRows', []):
                for cell in row.get('tableCells', []):
                    yield from _iter_text_runs(cell.get('content', []))
        elif 'tableOfContents' in element:
            yield from _iter_text_runs(element['tableOfContents'].get('content', []))

# Drive API lookups start at the Drive root, so drop the local mount prefix
# from targeted_folder to obtain a path relative to "My Drive"
_DRIVE_MOUNT_PREFIX = '/content/drive/MyDrive/'
relative_targeted_folder = targeted_folder.replace(_DRIVE_MOUNT_PREFIX, '')

# Function to fetch the ID of the root folder and then the specific target folder
def fetch_folder_id(target_folder):
    """Resolve a slash-separated folder path, relative to the Drive root, to a folder ID.

    The path is walked one component at a time, each lookup being restricted
    to non-trashed folders directly below the previously resolved folder.

    Args:
        target_folder: Path such as ``"A/B/C"``; leading and trailing slashes
            are ignored.

    Returns:
        The ID of the last folder in the path, or ``None`` as soon as one
        component cannot be found (a message is printed in that case).
    """
    # Split the target folder path
    folder_names = target_folder.strip('/').split('/')

    # Start from the root of the drive
    parent_id = 'root'
    folder_id = None

    for folder_name in folder_names:
        # Query values are delimited by single quotes, so backslashes and
        # quotes inside the folder name must be escaped or the query breaks.
        escaped_name = folder_name.replace('\\', '\\\\').replace("'", "\\'")
        query = (
            f"name = '{escaped_name}' and '{parent_id}' in parents "
            "and mimeType = 'application/vnd.google-apps.folder' and trashed = false"
        )
        results = drive_service.files().list(
            q=query,
            fields="files(id, name, mimeType)"
        ).execute()

        items = results.get('files', [])
        if not items:
            print(f"Folder '{folder_name}' not found under parent ID {parent_id}.")
            return None

        folder_id = items[0]['id']  # Assume the first result is the correct one
        print(f"Found folder '{folder_name}' with ID: {folder_id}")
        parent_id = folder_id  # Set the current folder as the parent for the next iteration

    return folder_id

# Resolve the targeted folder to its Drive ID; nothing is exported when it cannot be found
folder_id = fetch_folder_id(relative_targeted_folder)
if not folder_id:
    print(f"Target folder '{relative_targeted_folder}' not found.")
else:
    # Walk the targeted folder and all of its subfolders
    files_data = fetch_files_from_folder(folder_id)

    # Persist the collected records as JSON in the app directory
    with open(f'{app_dir}/{company_name}_drive_data.json', 'w') as drive_dump:
        json.dump(files_data, drive_dump, indent=4)
    print(f"Saved data for {len(files_data)} files to {company_name}_drive_data.json")




In [None]:
####CREATE EMBEDDINGS####
import openai
import hashlib

# Because the hash is consistent (deterministic), there is a risk of ID collisions when multiple customer datasets are stored in the same environment. A solution is to add a customer-unique value before or after hashing.
def consistent_hash(s):
    """Map a string to a deterministic integer derived from its SHA-256 digest.

    The result is the first four digest bytes read as a big-endian unsigned
    integer, i.e. the same value as the first eight hex digits of the digest.
    """
    leading_bytes = hashlib.sha256(s.encode()).digest()[:4]
    return int.from_bytes(leading_bytes, 'big')

# Read back the Drive export ({company_name}_drive_data.json in the app directory)
with open(f'{app_dir}/{company_name}_drive_data.json', 'r') as drive_dump:
    data = json.loads(drive_dump.read())

# OpenAI client instance used for the embedding requests
client = openai.OpenAI(api_key=OPENAI_API_KEY)

# Function to create embeddings
def get_embedding(text, model="text-embedding-ada-002"):
    """Request an embedding for ``text`` from the module-level OpenAI client.

    Args:
        text: Input passed to the embeddings endpoint.
        model: Name of the embedding model to use.

    Returns:
        The ``embedding`` attribute of the first entry in the response data.
    """
    return client.embeddings.create(input=text, model=model).data[0].embedding



# Create embeddings and prepare for upload
embeddings_data = []

for item in data:
    text = (item.get("content_text") or "").strip()

    if not text:
        print(f"Skipped item '{item['file_name']}' due to empty 'content_text'.")
        continue

    # Use the file ID directly when it is numeric, otherwise derive an integer ID from it
    try:
        item_id = int(float(item["file_id"]))
    except (ValueError, OverflowError):
        # ValueError: not a number (or "nan"); OverflowError: parses as an
        # infinite float (e.g. "inf", "1e999"), which int() cannot convert
        item_id = consistent_hash(item["file_id"])

    # A failed request for one document must not abort the run and discard
    # the embeddings already computed: report it and move on.
    try:
        embedding = get_embedding(text)
    except openai.OpenAIError as e:
        print(f"Failed to embed item '{item['file_name']}': {e}")
        continue

    embeddings_data.append({
        "id": item_id,
        "vector": embedding,
        # Shallow copy so the stored payload is independent of the loaded record
        "payload": item.copy(),
    })

    print(f"Successfully embedded item '{item['file_name']}' with original ID {item['file_id']} (hashed ID: {item_id}).")

# Save embeddings to JSON file
with open(f'{app_dir}/{company_name}_embeddings_data.json', 'w') as file:
    json.dump(embeddings_data, file, indent=4)

# Report the file name that was actually written
print(f"\n\n------------------------------------------------------------------\n\nSaved {len(embeddings_data)} embeddings to {company_name}_embeddings_data.json.\n\n")






In [None]:
# UPSERT TO VECTOR DB

from qdrant_client import QdrantClient
from qdrant_client.http import models
import json

# Qdrant connection: API key and URL are both read through google.colab's userdata
QDRANT_API_KEY = userdata.get('qdrant')
qdrant_client = QdrantClient(url=userdata.get('TBU_qdrant_url'), api_key=QDRANT_API_KEY)

# Read back the embeddings written by the embedding step
with open(f'{app_dir}/{company_name}_embeddings_data.json', 'r') as embeddings_file:
    embeddings_data = json.loads(embeddings_file.read())

# Function to split data into smaller batches
def split_into_batches(data, batch_size):
    """Lazily yield consecutive slices of ``data`` holding at most ``batch_size`` items each.

    The final slice is shorter when ``len(data)`` is not a multiple of
    ``batch_size``; an empty ``data`` yields nothing.
    """
    yield from (
        data[start:start + batch_size]
        for start in range(0, len(data), batch_size)
    )

# Batch size - you can adjust this based on your requirements and limitations
BATCH_SIZE = 30  # Example batch size

# Define the collection name you want to use
collection_name = f"{company_name}_embeddings"

try:
    # get_collections() returns a response object, not a list of names:
    # the names have to be read from its `collections` entries.
    all_collections = qdrant_client.get_collections()
    existing_names = {collection.name for collection in all_collections.collections}

    if collection_name in existing_names:
        # If it exists, delete the current collection
        qdrant_client.delete_collection(collection_name=collection_name)
        print(f"Deleted existing collection '{collection_name}'.")
    else:
        print(f"Collection '{collection_name}' does not exist. Creating new collection.")

    # Define vectors configuration for the new collection
    vectors_config = models.VectorParams(size=1536, distance=models.Distance.COSINE)

    # Create the new collection
    qdrant_client.recreate_collection(
        collection_name=collection_name,
        vectors_config=vectors_config
    )
    print(f"Created new collection '{collection_name}'.")

    # Split embeddings data into batches and upsert each batch; a failing
    # batch is reported and remembered, the remaining batches are still tried
    failed_batches = []
    for i, batch in enumerate(split_into_batches(embeddings_data, BATCH_SIZE)):
        try:
            qdrant_client.upsert(points=batch, collection_name=collection_name)
            print(f"Uploaded batch {i + 1} ({len(batch)} embeddings) to Qdrant in the '{collection_name}' collection.")
        except Exception as e:
            failed_batches.append(i + 1)
            print(f"An error occurred while uploading batch {i + 1}: {e}")
            # Not every exception carries a response with a text body
            response_text = getattr(getattr(e, 'response', None), 'text', None)
            if response_text:
                print("Error response:", response_text)

    # Only claim success when every batch actually went through
    if failed_batches:
        print(f"Finished with {len(failed_batches)} failed batch(es) {failed_batches}; the '{collection_name}' collection is incomplete.")
    else:
        print(f"Finished uploading all batches to Qdrant in the '{collection_name}' collection.")

except Exception as e:
    print(f"An error occurred: {e}")
    response_text = getattr(getattr(e, 'response', None), 'text', None)
    if response_text:
        print("Error response:", response_text)
