In [15]:
import os
import shutil

def clear_folder(folder_path):
    """Delete everything inside ``folder_path`` while keeping the folder itself.

    Regular files and symbolic links are unlinked; sub-directories are removed
    recursively. Deletion is best-effort: a failure on one entry is reported
    on stdout and the remaining entries are still processed.

    Args:
        folder_path: Path of the directory to empty. If it does not exist or
            is not a directory there is nothing to clear, so a message is
            printed and the function returns without raising.

    Returns:
        None.
    """
    # A missing folder is already "clear". Without this guard os.listdir
    # raises and aborts a fresh run before the folder has ever been created.
    if not os.path.isdir(folder_path):
        print(f"Nothing to clear, folder not found: {folder_path}")
        return

    for item in os.listdir(folder_path):
        item_path = os.path.join(folder_path, item)
        try:
            # Symlinks are unlinked (never followed), even when they point
            # at a directory, so the link target is left untouched.
            if os.path.isfile(item_path) or os.path.islink(item_path):
                os.remove(item_path)
            elif os.path.isdir(item_path):
                shutil.rmtree(item_path)
        except OSError as e:
            print(f"Failed to delete {item_path}. Reason: {e}")

# usage
# Empty the folder that the CSV export step further down writes its record files into.
folder_to_clear = "../data/output_texts"
clear_folder(folder_to_clear)
# NOTE: this confirmation is printed even if individual deletions failed;
# clear_folder only reports those failures on stdout.
print(f"Cleared folder: {folder_to_clear}")

Cleared folder: ../data/output_texts


In [16]:
# Batching the CSV content into text files

import pandas as pd
import os

# Name of the CSV column whose values are exported, one text file per row.
COLUMN_NAME = 'Tweet'

def read_csv_with_fallback_encodings(file_path, encodings=("utf-8", "cp1252", "latin1")):
    """Read a CSV file, trying several text encodings in order.

    Each encoding is attempted with ``pd.read_csv``; the first one that
    decodes the file without a ``UnicodeDecodeError`` wins. Progress is
    reported on stdout for every attempt.

    Args:
        file_path: Path of the CSV file to load.
        encodings: Encoding names to try, in priority order. A single
            encoding may also be given as a plain string.

    Returns:
        pandas.DataFrame: The parsed CSV content.

    Raises:
        ValueError: If ``encodings`` is empty.
        UnicodeDecodeError: If the file cannot be decoded with any of the
            given encodings. The error carries the details of the last
            failed attempt.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(encodings, str):
        encodings = [encodings]
    encodings = list(encodings)
    if not encodings:
        raise ValueError("No encodings were provided.")

    last_error = None
    for enc in encodings:
        try:
            df = pd.read_csv(file_path, encoding=enc)
        except UnicodeDecodeError as exc:
            print(f"⚠️ Failed with encoding: {enc}")
            last_error = exc
        else:
            print(f"✅ Loaded CSV using encoding: {enc}")
            return df

    # UnicodeDecodeError cannot be built from a message alone: its constructor
    # requires (encoding, object, start, end, reason). Reuse the details of
    # the last failure so the intended exception type is actually raised.
    raise UnicodeDecodeError(
        last_error.encoding,
        last_error.object,
        last_error.start,
        last_error.end,
        f"All tried encodings failed ({', '.join(encodings)}); "
        f"last error: {last_error.reason}",
    ) from last_error

def save_csv_to_txt(csv_file, output_folder, column_name=COLUMN_NAME):
    """Write the ``column_name`` value of every CSV row to its own text file.

    The CSV is loaded with ``read_csv_with_fallback_encodings`` and one UTF-8
    file named ``record_<n>.txt`` is created per row, numbered from 1 in row
    order. A summary line is printed when done.

    Args:
        csv_file: Path of the CSV file to read.
        output_folder: Directory that receives the text files; created if it
            does not exist.
        column_name: Column whose values are exported. If the column is
            missing a warning is printed and every record is written empty.

    Returns:
        None.
    """
    # Try to read the CSV with multiple encodings
    df = read_csv_with_fallback_encodings(csv_file)

    # Ensure the output folder exists
    os.makedirs(output_folder, exist_ok=True)

    if column_name in df.columns:
        values = df[column_name]
    else:
        # Keep the best-effort behaviour (empty records) but make it visible
        # instead of silently producing a folder of empty files.
        print(f"⚠️ Column '{column_name}' not found in '{csv_file}'; writing empty records.")
        values = [''] * len(df)

    for record_number, value in enumerate(values, start=1):
        # Missing cells are read as NaN; str(NaN) would store the literal
        # text "nan" as if it were real content, so write an empty file.
        content = '' if pd.isna(value) else str(value)
        file_path = os.path.join(output_folder, f'record_{record_number}.txt')

        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(content)

    print(f"\n🎉 Saved {len(df)} records to '{output_folder}'")

# Example usage
# Export the COLUMN_NAME column of the tweets CSV into the output folder,
# one text file per row.
csv_file = "../data/csv/tweets_3.csv"
output_folder = "../data/output_texts/"
save_csv_to_txt(csv_file, output_folder, COLUMN_NAME)


✅ Loaded CSV using encoding: utf-8

🎉 Saved 1188 records to '../data/output_texts/'


In [17]:
# Uploading the text files to FastAPI (SQLAlchemy) database

import os
import requests

# Path to the folder containing text files
FOLDER_PATH = r"../data/output_texts"
# Base URL of the upload endpoint; the project ID is appended as the last path segment.
# NOTE: later cells reassign API_URL and PROJECT_ID, and the functions in this
# cell read both at call time, so re-run this cell before uploading again.
API_URL = "http://127.0.0.1:5000/api/v1/data/upload"
PROJECT_ID = "12"  # Change this to the correct project ID

def upload_file(file_path, api_url=None, project_id=None):
    """Upload a single file to the upload endpoint as multipart form data.

    The file is sent under the form field ``file`` with content type
    ``text/plain`` to ``<api_url>/<project_id>``. The outcome is reported on
    stdout.

    Args:
        file_path: Path of the file to upload.
        api_url: Base URL of the upload endpoint. Defaults to the module-level
            ``API_URL`` as bound when the call is made. A later cell in this
            notebook rebinds ``API_URL``, so pass this explicitly when calling
            after that cell has run.
        project_id: Project identifier appended to the URL. Defaults to the
            module-level ``PROJECT_ID``.

    Returns:
        bool: True when the server answered with HTTP 200, False otherwise.
    """
    file_name = os.path.basename(file_path)
    base_url = API_URL if api_url is None else api_url
    project = PROJECT_ID if project_id is None else project_id
    url = f"{base_url}/{project}"

    # Open inside a context manager so the handle is closed after the request;
    # leaving it open leaks one file descriptor per uploaded file.
    with open(file_path, "rb") as fh:
        files = {
            "file": (file_name, fh, "text/plain")  # Force correct content-type
        }
        response = requests.post(url, files=files)

    if response.status_code == 200:
        print(f"✅ Successfully uploaded: {file_name}")
        return True

    print(f"❌ Failed to upload {file_name}. Response: {response.text}")
    return False

def upload_all_files():
    """Upload every ``.txt`` file found directly inside ``FOLDER_PATH``.

    Files are uploaded one by one in sorted name order via ``upload_file``.
    Prints a message and returns early when the folder is missing or holds
    no ``.txt`` files. The closing summary reports how many uploads failed
    instead of claiming success unconditionally.

    Returns:
        None.
    """
    if not os.path.exists(FOLDER_PATH):
        print(f"❌ Folder not found: {FOLDER_PATH}")
        return

    # Sorted so the upload order is deterministic across platforms.
    txt_files = sorted(f for f in os.listdir(FOLDER_PATH) if f.endswith(".txt"))

    if not txt_files:
        print("❌ No .txt files found in the folder.")
        return

    print(f"📂 Found {len(txt_files)} files. Uploading...")

    failed = []
    for file in txt_files:
        file_path = os.path.join(FOLDER_PATH, file)
        if not upload_file(file_path):
            failed.append(file)

    if failed:
        print(f"❌ {len(failed)} of {len(txt_files)} files failed to upload.")
    else:
        print("🎉 All files uploaded successfully!")

# Entry point: upload every .txt file when this code runs as the top-level module.
if __name__ == "__main__":
    upload_all_files()


📂 Found 1188 files. Uploading...
✅ Successfully uploaded: record_1.txt
✅ Successfully uploaded: record_10.txt
✅ Successfully uploaded: record_100.txt
✅ Successfully uploaded: record_1000.txt
✅ Successfully uploaded: record_1001.txt
✅ Successfully uploaded: record_1002.txt
✅ Successfully uploaded: record_1003.txt
✅ Successfully uploaded: record_1004.txt
✅ Successfully uploaded: record_1005.txt
✅ Successfully uploaded: record_1006.txt
✅ Successfully uploaded: record_1007.txt
✅ Successfully uploaded: record_1008.txt
✅ Successfully uploaded: record_1009.txt
✅ Successfully uploaded: record_101.txt
✅ Successfully uploaded: record_1010.txt
✅ Successfully uploaded: record_1011.txt
✅ Successfully uploaded: record_1012.txt
✅ Successfully uploaded: record_1013.txt
✅ Successfully uploaded: record_1014.txt
✅ Successfully uploaded: record_1015.txt
✅ Successfully uploaded: record_1016.txt
✅ Successfully uploaded: record_1017.txt
✅ Successfully uploaded: record_1018.txt
✅ Successfully uploaded: record

In [18]:
# Chunking the text files and preparing for Embedding

import requests

# Set FastAPI Base URL
# NOTE: this rebinds API_URL, which the upload cell also defines (with the
# upload endpoint) and whose functions read it at call time.
API_URL = "http://127.0.0.1:5000/api/v1/data/process"
PROJECT_ID = "12"  # Change this to the actual project ID

# Processing Parameters
CHUNK_SIZE = 15000  # sent as "chunk_size"; unit is not visible here — presumably bytes, TODO confirm
OVERLAP_SIZE = 0  # sent as "overlap_size"; the progress message reports it in bytes
DO_RESET = 1  # Set to 1 to remove old chunks before processing

def process_uploaded_files():
    """Ask the API to process every uploaded file of the configured project.

    Posts the chunking parameters (``CHUNK_SIZE``, ``OVERLAP_SIZE``,
    ``DO_RESET``) as JSON to ``<API_URL>/<PROJECT_ID>`` and prints either the
    decoded JSON response (HTTP 200) or the status code and raw body.
    """
    endpoint = f"{API_URL}/{PROJECT_ID}"
    request_body = dict(
        chunk_size=CHUNK_SIZE,
        overlap_size=OVERLAP_SIZE,
        do_reset=DO_RESET,
    )

    print(f"📂 Processing files for Project ID: {PROJECT_ID} with overlap {OVERLAP_SIZE} bytes...")
    reply = requests.post(endpoint, json=request_body)

    # Guard clause: report the failure and stop.
    if reply.status_code != 200:
        print(f"❌ Processing failed: {reply.status_code}")
        print(reply.text)
        return

    print("✅ Successfully processed all files!")
    print(reply.json())

# Entry point: trigger processing when this code runs as the top-level module.
if __name__ == "__main__":
    process_uploaded_files()


📂 Processing files for Project ID: 12 with overlap 0 bytes...
✅ Successfully processed all files!
{'signal': 'processing_success', 'inserted_chunks': 1188, 'processed_files': 1188}


In [19]:
# Pushing the Documents into Vector Database

import requests

# NOTE: an int here, while the earlier cells assign the string "12"; both
# format to the same URL path segment.
PROJECT_ID = 12
# Full URL of the push-index endpoint for this project.
ENDPOINT_URL = f"http://127.0.0.1:5000/api/v1/nlp/index/push/{PROJECT_ID}"

def push_index():
    """Request an index push for the configured project.

    Posts an empty JSON object to ``ENDPOINT_URL`` and prints either the
    decoded JSON response (HTTP 200) or the status code and raw body.
    """
    # The body is an empty JSON object in case the endpoint requires one.
    reply = requests.post(ENDPOINT_URL, json={})

    # Guard clause: report the failure and stop.
    if reply.status_code != 200:
        print(f"❌ Failed to push the index: {reply.status_code}")
        print(reply.text)
        return

    print("✅ Successfully pushed the index!")
    print(reply.json())


# Trigger the index push when this cell runs.
push_index()


✅ Successfully pushed the index!
{'signal': 'insert_into_vectordb_success', 'inserted_items_count': 1188}
