In [1]:
import json
import math
import os
from concurrent.futures import ThreadPoolExecutor

import pandas as pd
import requests

# Connection settings. Each value can be overridden through an environment
# variable so real credentials do not have to be written into this file; when
# a variable is unset the local-development default is used.
username = os.environ.get("COUCHDB_USER", "admin")  # CouchDB username
password = os.environ.get("COUCHDB_PASSWORD", "admin")  # CouchDB password
# Endpoints are built by plain concatenation (couchdb_url + "groups/..."), so
# the base URL is normalised to end with exactly one slash.
couchdb_url = os.environ.get("COUCHDB_URL", "http://127.0.0.1:5984/").rstrip("/") + "/"

def handle_nan(value, default='Unknown'):
    """Return *default* when *value* is a missing scalar, otherwise *value*.

    A value counts as missing when it is ``None`` or when ``pd.isna`` reports
    it as null (for example ``float('nan')`` or ``pd.NA``).

    Non-scalar values (dict, list, tuple, arrays, ...) are returned unchanged.
    ``pd.isna`` answers element-wise for list-likes, and using that array in
    an ``if`` raises ``ValueError`` because its truth value is ambiguous.

    Args:
        value: The value to check.
        default: Replacement returned for a missing value.

    Returns:
        ``default`` if ``value`` is missing, else ``value`` itself.
    """
    if value is None:
        return default
    # Only scalars can be "missing"; containers are passed through untouched.
    if not pd.api.types.is_scalar(value):
        return value
    return default if pd.isna(value) else value

def _post_group_batch(docs, remaining=False):
    """POST one batch of documents to ``groups/_bulk_docs`` and print the outcome.

    Args:
        docs: List of document dicts sent as the ``"docs"`` array of the
            request payload.
        remaining: True for the final, partially filled batch; this only
            changes the wording of the printed messages.
    """
    response = requests.post(
        couchdb_url + "groups/_bulk_docs",
        json={"docs": docs},
        auth=(username, password),
    )

    if response.status_code != 201:
        target = "remaining groups" if remaining else "batch"
        print(f"Failed to add {target} to CouchDB: {response.text}")
        return

    # A 201 only covers the request as a whole: _bulk_docs reports documents
    # it rejected (e.g. update conflicts) as entries carrying an "error" key
    # in the JSON array it returns, so inspect the body before claiming success.
    try:
        results = response.json()
    except ValueError:
        results = []
    if not isinstance(results, list):
        results = []
    rejected = [
        entry for entry in results
        if isinstance(entry, dict) and "error" in entry
    ]

    label = f"remaining {len(docs)}" if remaining else f"batch of {len(docs)}"
    if rejected:
        print(
            f"Added {label} groups to CouchDB, but {len(rejected)} documents "
            f"were rejected (first: {rejected[0]})."
        )
    else:
        print(f"Successfully added {label} groups to CouchDB.")


def upload_group_to_couchdb(groups, batch_size=50000):
    """Upload group documents to the ``groups`` database in batches.

    Documents are collected until ``batch_size`` of them are pending, then
    sent in one ``_bulk_docs`` request. Whatever is left after the loop is
    sent as a final, smaller batch. The outcome of every request is printed.

    Args:
        groups: Iterable of document dicts to upload.
        batch_size: Number of documents per request. A value below 1 never
            matches the pending count, so everything is sent in a single
            final request.
    """
    pending = []
    for group in groups:
        pending.append(group)
        if len(pending) == batch_size:
            _post_group_batch(pending)
            pending = []

    # Flush the documents that did not fill a complete batch.
    if pending:
        _post_group_batch(pending, remaining=True)

# Function to upload groups, each carrying a reference to its company
def upload_groups(group_file, company_file):
    """Convert the rows of a groups CSV into documents and bulk-upload them.

    Every row becomes one document with the keys ``_id``, ``name``,
    ``debut_date``, ``company``, ``fanclub_name``, ``active`` and ``type``.
    Missing scalar values are replaced via ``handle_nan``. ``company`` is
    ``{'_id': <company id>}`` when the row has a ``company_id`` and ``{}``
    otherwise. The finished list is passed to ``upload_group_to_couchdb``.

    Args:
        group_file: Path of the CSV file with the columns ``id``,
            ``company_id``, ``name``, ``debut`` (or ``debut_date``),
            ``fanclub_name``, ``active`` and ``type``.
        company_file: Kept so existing callers continue to work. The file is
            not read, because a document only references its company by id
            and no other company field is used.
    """
    groups_data = pd.read_csv(group_file)

    groups_json = []  # Documents to upload, one per CSV row

    for _, group in groups_data.iterrows():
        row = group.to_dict()

        # Reference the company by its id only; rows without one get {}.
        if pd.notna(row['company_id']):
            company = {'_id': str(int(row['company_id']))}
        else:
            company = {}

        # The CSV may name the column 'debut'; when it does, that value wins.
        debut_date = row['debut'] if 'debut' in row else row['debut_date']

        groups_json.append({
            # Normalise through int() exactly like the company id above, so a
            # float-typed id column cannot produce ids such as '12.0'.
            '_id': str(int(row['id'])),
            'name': handle_nan(row['name']),
            'debut_date': handle_nan(debut_date),
            'company': company,
            'fanclub_name': handle_nan(row['fanclub_name']),
            'active': handle_nan(row['active']),
            'type': handle_nan(row['type']),
        })

    upload_group_to_couchdb(groups_json)

# Run the upload only when this file is executed directly, so that importing
# it for its helpers does not trigger network requests.
if __name__ == "__main__":
    upload_groups("groups.csv", "companies.csv")

  groups_data = pd.read_csv(group_file)


Successfully added batch of 50000 groups to CouchDB.
Successfully added batch of 50000 groups to CouchDB.
Successfully added batch of 50000 groups to CouchDB.
Successfully added batch of 50000 groups to CouchDB.
Successfully added batch of 50000 groups to CouchDB.
Successfully added batch of 50000 groups to CouchDB.
Successfully added batch of 50000 groups to CouchDB.
Successfully added batch of 50000 groups to CouchDB.
Successfully added remaining 41053 groups to CouchDB.
