In [5]:
import pandas as pd
import random

def save_random_articles(csv_file, output_csv, sample_size=40, random_state=42):
    """Write a random sample of rows from one CSV file to another CSV file.

    The input is read as UTF-8 first; on a decoding error cp1252 is tried and
    then latin1 as a last resort (latin1 maps every byte to a character, so it
    cannot raise a decoding error).

    Args:
        csv_file: Path of the CSV file to sample from.
        output_csv: Path of the CSV file to write (UTF-8, without the index).
        sample_size: Number of rows to sample. Clamped to the number of rows
            available in the input.
        random_state: Seed forwarded to ``DataFrame.sample``. The default of 42
            keeps the sample reproducible; pass ``None`` for a different sample
            on every call.
    """
    try:
        df = pd.read_csv(csv_file, encoding='utf-8')
    except UnicodeDecodeError:
        print("UTF-8 decoding failed. Trying with cp1252 encoding...")
        try:
            df = pd.read_csv(csv_file, encoding='cp1252')
        except UnicodeDecodeError:
            # cp1252 leaves a handful of byte values undefined, so it can fail
            # as well; fall back to latin1 instead of aborting.
            print("cp1252 decoding failed. Trying with latin1 encoding...")
            df = pd.read_csv(csv_file, encoding='latin1')

    # Never ask for more rows than the file contains (sample() would raise).
    sample_size = min(sample_size, len(df))

    # Randomly select sample_size rows.
    sampled_df = df.sample(n=sample_size, random_state=random_state)

    # Save the sampled DataFrame to a new CSV.
    sampled_df.to_csv(output_csv, index=False, encoding='utf-8')

    print(f"Saved {sample_size} random articles to '{output_csv}'")

# Example usage: sample rows from the source CSV using the function's default
# sample_size and write them to a new CSV.
# NOTE: csv_file is a notebook-level name that another cell also assigns, so
# its value depends on which cell ran last.
csv_file = "data/final_dis.csv"
output_csv = "data/random_40_articles.csv"
save_random_articles(csv_file, output_csv)


UTF-8 decoding failed. Trying with cp1252 encoding...
Saved 40 random articles to 'data/random_40_articles.csv'


In [1]:
import pandas as pd
import os

def read_csv_with_fallback_encodings(file_path, encodings=("utf-8", "cp1252", "latin1")):
    """Read a CSV file, trying each encoding in turn until one decodes it.

    Args:
        file_path: Path of the CSV file to read.
        encodings: Iterable of encoding names, tried in the given order.

    Returns:
        pandas.DataFrame: The frame parsed with the first encoding that works.

    Raises:
        ValueError: If ``encodings`` is empty.
        UnicodeDecodeError: If every encoding fails. The error carries the
            details of the last failed attempt and lists all encodings tried.
    """
    tried = []
    last_error = None

    for enc in encodings:
        tried.append(enc)
        try:
            df = pd.read_csv(file_path, encoding=enc)
        except UnicodeDecodeError as exc:
            last_error = exc
            print(f"⚠️ Failed with encoding: {enc}")
        else:
            print(f"✅ Loaded CSV using encoding: {enc}")
            return df

    if last_error is None:
        raise ValueError("No encodings were provided.")

    # UnicodeDecodeError requires five constructor arguments (encoding, object,
    # start, end, reason); building it from a single message string raises
    # TypeError instead of the intended error.
    raise UnicodeDecodeError(
        last_error.encoding,
        last_error.object,
        last_error.start,
        last_error.end,
        f"All tried encodings failed ({', '.join(map(str, tried))}); "
        f"last error: {last_error.reason}",
    ) from last_error

def save_csv_to_txt(csv_file, output_folder, column='Translated_Tweet'):
    """Write one text file per CSV row, containing the value of one column.

    Files are named ``record_<n>.txt`` where ``n`` is the row's index label
    plus one, and are written as UTF-8 into ``output_folder`` (created if it
    does not exist).

    Args:
        csv_file: Path of the CSV file to read.
        output_folder: Directory that receives the text files.
        column: Name of the column whose values are written. Defaults to
            'Translated_Tweet'.

    Notes:
        Missing values are written as empty files rather than the literal text
        "nan". If ``column`` is absent from the CSV a warning is printed and
        every file is written empty.
    """
    # Try to read the CSV with multiple encodings
    df = read_csv_with_fallback_encodings(csv_file)

    # Ensure the output folder exists
    os.makedirs(output_folder, exist_ok=True)

    has_column = column in df.columns
    if not has_column:
        # Keep the best-effort behaviour (empty files) but make it visible.
        print(f"⚠️ Column '{column}' not found in '{csv_file}'; writing empty files.")

    # Iterate through each record and write the selected column's value
    for idx, row in df.iterrows():
        value = row[column] if has_column else ''
        # str(NaN) would put the text "nan" into the file; treat missing as empty.
        content = '' if pd.isna(value) else str(value)
        # idx + 1 relies on the integer index that read_csv produces by default.
        file_path = os.path.join(output_folder, f'record_{idx + 1}.txt')

        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(content)

    print(f"\n🎉 Saved {len(df)} records to '{output_folder}'")

# Example usage: write one text file per row of the CSV into output_folder.
# NOTE: csv_file is a notebook-level name that another cell also assigns, so
# its value depends on which cell ran last.
csv_file = "data/cleaned_tweets.csv"
output_folder = "data/output_texts/"
save_csv_to_txt(csv_file, output_folder)


✅ Loaded CSV using encoding: utf-8

🎉 Saved 732 records to 'data/output_texts/'


In [2]:
import os
import requests

# Path to the folder containing text files; upload_all_files() scans it for
# names ending in ".txt".
FOLDER_PATH = r"data/output_texts"
# Upload endpoint; upload_file() appends "/<PROJECT_ID>" to this URL.
# NOTE: API_URL and PROJECT_ID are assigned again by the processing cell, so
# re-run this cell before uploading if that cell has been executed.
API_URL = "http://127.0.0.1:5000/api/v1/data/upload"
PROJECT_ID = "5"  # Change this to the correct project ID

def upload_file(file_path):
    """Upload a single file to the project's upload endpoint.

    The file is posted as multipart form data under the field name "file" to
    ``f"{API_URL}/{PROJECT_ID}"`` and the outcome is printed.

    Args:
        file_path: Path of the file to upload.

    Returns:
        bool: True when the server answered with HTTP 200, otherwise False.
    """
    file_name = os.path.basename(file_path)
    url = f"{API_URL}/{PROJECT_ID}"

    # Open the file in a context manager so the handle is closed as soon as the
    # request has been sent; otherwise every upload leaks an open file handle.
    with open(file_path, "rb") as file_handle:
        files = {
            "file": (file_name, file_handle, "text/plain")  # Force correct content-type
        }
        response = requests.post(url, files=files)

    succeeded = response.status_code == 200
    if succeeded:
        print(f"✅ Successfully uploaded: {file_name}")
    else:
        print(f"❌ Failed to upload {file_name}. Response: {response.text}")
    return succeeded

def upload_all_files():
    """Upload every .txt file found directly inside FOLDER_PATH.

    Prints a message and returns early when the folder does not exist or holds
    no .txt files. Otherwise each file is passed to upload_file() and a summary
    is printed that reports failures instead of always claiming success.
    """
    if not os.path.exists(FOLDER_PATH):
        print(f"❌ Folder not found: {FOLDER_PATH}")
        return

    txt_files = [f for f in os.listdir(FOLDER_PATH) if f.endswith(".txt")]

    if not txt_files:
        print("❌ No .txt files found in the folder.")
        return

    print(f"📂 Found {len(txt_files)} files. Uploading...")

    failed_files = []
    for file in txt_files:
        file_path = os.path.join(FOLDER_PATH, file)
        if not upload_file(file_path):
            failed_files.append(file)

    if failed_files:
        uploaded_count = len(txt_files) - len(failed_files)
        print(
            f"⚠️ Uploaded {uploaded_count} of {len(txt_files)} files; "
            f"{len(failed_files)} failed."
        )
    else:
        print("🎉 All files uploaded successfully!")

# Start the upload when this code runs as the main module; the captured cell
# output shows that this branch executes when the cell is run.
if __name__ == "__main__":
    upload_all_files()


📂 Found 732 files. Uploading...
✅ Successfully uploaded: record_1.txt
✅ Successfully uploaded: record_10.txt
✅ Successfully uploaded: record_100.txt
✅ Successfully uploaded: record_101.txt
✅ Successfully uploaded: record_102.txt
✅ Successfully uploaded: record_103.txt
✅ Successfully uploaded: record_104.txt
✅ Successfully uploaded: record_105.txt
✅ Successfully uploaded: record_106.txt
✅ Successfully uploaded: record_107.txt
✅ Successfully uploaded: record_108.txt
✅ Successfully uploaded: record_109.txt
✅ Successfully uploaded: record_11.txt
✅ Successfully uploaded: record_110.txt
✅ Successfully uploaded: record_111.txt
✅ Successfully uploaded: record_112.txt
✅ Successfully uploaded: record_113.txt
✅ Successfully uploaded: record_114.txt
✅ Successfully uploaded: record_115.txt
✅ Successfully uploaded: record_116.txt
✅ Successfully uploaded: record_117.txt
✅ Successfully uploaded: record_118.txt
✅ Successfully uploaded: record_119.txt
✅ Successfully uploaded: record_12.txt
✅ Successfull

In [3]:
import requests

# Set FastAPI Base URL; process_uploaded_files() appends "/<PROJECT_ID>" to it.
# NOTE: this reassigns the API_URL and PROJECT_ID names that the upload cell
# also defines, so the value in effect depends on which cell ran last.
API_URL = "http://127.0.0.1:5000/api/v1/data/process"
PROJECT_ID = "5"  # Change this to your actual project ID

# Processing Parameters (sent as the JSON body of the processing request)
CHUNK_SIZE = 15000  # Adjust based on your needs; unit presumably matches OVERLAP_SIZE (bytes) - TODO confirm
OVERLAP_SIZE = 0  # Adjust the overlap size (e.g., 1000 bytes)
DO_RESET = 1  # Set to 1 to remove old chunks before processing

def process_uploaded_files():
    """Ask the server to process every uploaded file of the configured project.

    Sends a POST request to ``f"{API_URL}/{PROJECT_ID}"`` whose JSON body holds
    the module-level CHUNK_SIZE, OVERLAP_SIZE and DO_RESET values. On HTTP 200
    the decoded JSON response is printed; for any other status the status code
    and the raw response text are printed instead.
    """
    endpoint = f"{API_URL}/{PROJECT_ID}"
    request_body = dict(
        chunk_size=CHUNK_SIZE,
        overlap_size=OVERLAP_SIZE,
        do_reset=DO_RESET,
    )

    print(f"📂 Processing files for Project ID: {PROJECT_ID} with overlap {OVERLAP_SIZE} bytes...")
    resp = requests.post(endpoint, json=request_body)

    # Handle the failure case first so the success path reads straight through.
    if resp.status_code != 200:
        print(f"❌ Processing failed: {resp.status_code}")
        print(resp.text)
        return

    print("✅ Successfully processed all files!")
    print(resp.json())

# Trigger processing when this code runs as the main module; the captured cell
# output shows that this branch executes when the cell is run.
if __name__ == "__main__":
    process_uploaded_files()


📂 Processing files for Project ID: 5 with overlap 0 bytes...
✅ Successfully processed all files!
{'signal': 'processing_success', 'inserted_chunks': 732, 'processed_files': 732}
