In [None]:
from huggingface_hub import snapshot_download

# Dataset repository on the Hugging Face Hub to mirror locally.
repo_id = 'allenai/pixmo-cap'
# Flatten "org/name" into "org_name" so the download lands in one directory.
target_dir = '/lambdafs/jacob/data/{}'.format(repo_id.replace('/', '_'))

# Download the dataset repository into target_dir.
snapshot_download(
    repo_id=repo_id,
    repo_type='dataset',
    local_dir=target_dir,
)


In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from DataCentral.data_manager import DataManager

In [9]:
# Function to generate sample data entries
def generate_data_entries(N, seed=None):
    """Build ``N`` synthetic data entries for exercising the data manager.

    Each entry is a dict with the keys ``data_source``, ``group_id``,
    ``data_type``, ``data`` and ``files``. The first three are picked at
    random from small fixed pools, ``data`` is a JSON string of the form
    ``{"value": <int between 1 and 1000000>}``, and ``files`` is a list of
    file names picked from a fixed set of templates.

    Args:
        N: Number of entries to generate. Zero or a negative value yields an
            empty list.
        seed: Optional seed. When given, a private ``random.Random(seed)`` is
            used, so the output is reproducible and the global ``random``
            state is left untouched. When ``None`` (the default), the
            module-level ``random`` generator is used.

    Returns:
        A list of entry dicts. Every entry owns its own ``files`` list, so
        mutating one entry never affects another.
    """
    import json
    import random

    # Fall back to the shared module-level generator when no seed is given,
    # so callers that seed ``random`` globally keep getting the same stream.
    rng = random if seed is None else random.Random(seed)

    data_sources = ['source1', 'source2', 'source3']
    group_ids = ['group1', 'group2', 'group3']
    data_types = ['type1', 'type2', 'type3']
    files_list = [['file1.txt', 'file2.txt'], ['file3.txt'], ['file4.txt', 'file5.txt', 'file6.txt']]

    data_entries = []
    for _ in range(N):
        # Dict values are evaluated top to bottom, so the random draws happen
        # in a fixed order: source, group, type, value, files.
        data_entries.append({
            'data_source': rng.choice(data_sources),
            'group_id': rng.choice(group_ids),
            'data_type': rng.choice(data_types),
            'data': json.dumps({'value': rng.randint(1, 1000000)}),
            # Copy the template so entries never alias the same list object.
            'files': list(rng.choice(files_list)),
        })
    return data_entries

# Initialize the DataManager
# NOTE(review): 'my_database.db' presumably names a persistent store, in which
# case re-running this cell would add another N entries — confirm.
data_manager = DataManager('my_database.db')

# Generate and insert data entries
N = 1000  # Adjust N to the desired number of entries, e.g., 10000000 for 10 million
print(f"Generating {N} data entries...")
data_entries = generate_data_entries(N)

# The progress message names the method that is actually called on the next line.
print("Inserting data entries using insert_data...")
data_manager.insert_data(data_entries)
print("Data insertion complete.")

# Define an index function
def index_by_data_type(row):
    """Index function that keys a row by its ``data_type`` field.

    Args:
        row: Mapping-like row that has a ``'data_type'`` key.

    Returns:
        A single-element list holding ``row['data_type']``.
    """
    data_type = row['data_type']
    return [data_type]

# Create an index
print("Creating index on data_type...")
data_manager.create_index('data_type_index', index_by_data_type)
print("Index creation complete.")

# Fetch a small sample of rows, filtered by data source and group ID.
# This query filters on source/group only, not on data_type, so the messages
# describe the filter that is actually applied. The variable keeps its
# historical name so any later cell that refers to it still works.
print("Getting up to 10 rows from source2 and group2...")
rows_with_type1 = data_manager.get_data(data_sources=["source2"], group_ids=["group2"], limit=10)
print(f"Number of rows from source2 and group2: {len(rows_with_type1)}")

# Get index values for a specific row
# Guard against an empty result so the cell does not fail with IndexError
# when no row matches the filter.
if len(rows_with_type1) > 0:
    some_uuid = rows_with_type1[0]['uuid']
    index_values = data_manager.get_index_values_for_row('data_type_index', some_uuid)
    print(f"Index values for UUID {some_uuid}: {index_values}")
else:
    print("No rows matched source2/group2; skipping index value lookup.")

Generating 1000 data entries...
Inserting data entries using batch_insert_data...
Data insertion complete.
Creating index on data_type...
Index creation complete.
Getting rows with data_type 'type1'...
Number of rows with data_type 'type1': 10
Index values for UUID f77af353-c3a6-4113-a9b1-fb1070ba7c4b: ['type1']


In [None]:
# Get data by data source and group ID
# NOTE(review): offset=1000000 is far beyond the N = 1000 entries inserted by
# the earlier cell, so this presumably returns no rows unless the database
# already holds much more data — confirm the offset is intentional.
print("Getting data from source1 and group1...")
data = data_manager.get_data(data_sources=['source1'], group_ids=['group1'], limit=10, offset=1000000)
print(f"Number of rows from source1 and group1: {len(data)}")

# Delete a group
# NOTE(review): looks destructive (presumably removes every row whose group ID
# is 'group1'); later queries for that group would then come back empty — confirm.
print("Deleting group 'group1'...")
data_manager.delete_group('group1')
print("Group 'group1' deleted.")

# Delete an index
# Drops the 'data_type_index' index created earlier in the notebook; the
# create_index cell must be re-run before index lookups are used again.
print("Deleting index 'data_type_index'...")
data_manager.delete_index('data_type_index')
print("Index 'data_type_index' deleted.")

Number of rows from source1 and group1: 0
Group 'group1' deleted.
Index 'data_type_index' deleted.


No records found for group ID 'group3'


0