In [None]:
# Standard library
import gzip
import os

# Third-party
import pandas as pd
import requests

# Project-local helper around the CxG API
from cellxgene_mods import CxG_API


# Initialise the CxG_API helper once, before any later cell calls into it
# (see cellxgene_mods for what config() sets up).
CxG_API.config()

In [None]:
def group_fragments(filename, chunk_size=10000000):
    """Summarise a gzip-readable fragment file per (chrom, barcode) pair.

    The file is parsed as header-less, tab-separated text with the columns
    chrom, start, end, barcode, readSupport; only chrom, barcode and
    readSupport are loaded. It is read in chunks so the whole file never
    has to sit in memory at once.

    Args:
        filename: Path of the gzip-compressed fragment file.
        chunk_size: Number of rows parsed per chunk.

    Returns:
        dict mapping ``(chrom, barcode)`` to
        ``{'row_count': <number of rows>, 'readSupport_sum': <sum of readSupport>}``.
        Empty dict when the reader yields no rows.
    """
    print(f'reading {filename}')
    totals = None

    with gzip.open(filename, 'rt') as f:
        reader = pd.read_csv(
            f, sep='\t', chunksize=chunk_size,
            names=['chrom', 'start', 'end', 'barcode', 'readSupport'],
            usecols=['chrom', 'barcode', 'readSupport'],
            # dtype inference happens per chunk: a chunk holding only numeric
            # chromosome names would yield int keys (1) while a mixed chunk
            # yields str keys ('1'). Force strings so keys never depend on
            # where the chunk boundaries fall.
            dtype={'chrom': str, 'barcode': str},
        )
        for chunk in reader:
            # Per-chunk summary: one row per (chrom, barcode)
            partial = chunk.groupby(['chrom', 'barcode'])['readSupport'].agg(
                row_count='count', readSupport_sum='sum'
            )
            if totals is None:
                totals = partial
            else:
                # Fold the chunk into the running totals in one vectorised
                # step; the running frame stays bounded by the number of
                # distinct (chrom, barcode) pairs.
                totals = pd.concat([totals, partial]).groupby(level=[0, 1]).sum()

    if totals is None:
        return {}

    # MultiIndex rows become tuple keys: {(chrom, barcode): {column: value}}
    return totals.to_dict(orient='index')


def compare_dicts(up, down):
    """Print every difference between two fragment summaries.

    Both arguments map a key to a dict holding 'row_count' and
    'readSupport_sum'. Reports, in order: keys present only in ``up``
    (count plus up to five examples), keys present only in ``down``
    (same format), and one line per field that differs for a shared key.
    Nothing is returned.
    """
    print('comparing summaries\n')

    uploaded_only = [*(up.keys() - down.keys())]
    if len(uploaded_only) > 0:
        print(f'{len(uploaded_only)} only in uploaded:{uploaded_only[:5]}')

    downloaded_only = [*(down.keys() - up.keys())]
    if len(downloaded_only) > 0:
        print(f'{len(downloaded_only)} only in downloaded:{downloaded_only[:5]}')

    for key in up.keys() & down.keys():
        for field in ('row_count', 'readSupport_sum'):
            up_value = up[key][field]
            down_value = down[key][field]
            if up_value != down_value:
                print(f'inconsistent {field}:{key}, uploaded:{up_value}, downloaded:{down_value}')


def compare_frag_files(up_file,down_file):
    """Summarise two fragment files and print how the summaries differ.

    ``up_file`` is summarised first, then ``down_file``; both summaries are
    then handed to ``compare_dicts`` as the uploaded and downloaded side
    respectively.
    """
    summaries = [group_fragments(path) for path in (up_file, down_file)]
    compare_dicts(*summaries)


def download_cxg_fragments(collection_id,dataset_id):
    """Download the ATAC fragment file of a dataset into the working directory.

    Looks up the dataset through ``CxG_API.get_dataset`` and takes the first
    asset whose filetype is 'ATAC_FRAGMENT'. The file is saved as
    ``<dataset_id>.tsv.bgz``; when a file with that name already exists the
    download is skipped.

    Args:
        collection_id: Collection identifier passed to ``CxG_API.get_dataset``.
        dataset_id: Dataset identifier passed to ``CxG_API.get_dataset``.

    Returns:
        The local file name, or None when the dataset has no ATAC_FRAGMENT
        asset.

    Raises:
        requests.HTTPError: when the download responds with an error status.
    """
    my_dataset = CxG_API.get_dataset(collection_id,dataset_id)
    for a in my_dataset['assets']:
        if a['filetype'] != 'ATAC_FRAGMENT':
            continue

        down_file = my_dataset['dataset_id'] + '.tsv.bgz'
        print(down_file)
        if os.path.exists(down_file):
            print('already Downloaded')
            return down_file

        print('Downloading')
        # Stream into a side file and move it into place only once complete.
        # Otherwise an interrupted download would leave a truncated file that
        # the exists() check above accepts as finished on the next run.
        part_file = down_file + '.part'
        try:
            with requests.get(a['url'], stream=True) as res:
                res.raise_for_status()
                with open(part_file, "wb") as df:
                    for chunk in res.iter_content(chunk_size=1024 * 1024):
                        df.write(chunk)
            os.replace(part_file, down_file)
        except BaseException:
            # BaseException so a kernel interrupt (KeyboardInterrupt) also
            # cleans up the partial file before propagating.
            if os.path.exists(part_file):
                os.remove(part_file)
            raise
        print('Done')
        return down_file

    # No ATAC_FRAGMENT asset on this dataset
    return None

In [None]:
# Path of the uploaded fragment file to check. It is opened with gzip and
# parsed as tab-separated text by group_fragments, so it must be
# gzip-readable.
up_file = ''

# Collection and dataset identifiers of the dataset to compare with; they are
# passed to download_cxg_fragments (and on to CxG_API.get_dataset) to fetch
# the fragment file that up_file is compared against.
collection = ''
dataset = ''

In [None]:
# Fetch the fragment file of the configured dataset, or reuse a local copy.
down_file = download_cxg_fragments(collection, dataset)

# download_cxg_fragments returns None when the dataset has no ATAC_FRAGMENT
# asset. Stop here instead of summarising the whole uploaded file first and
# only then failing inside gzip.open(None).
if down_file is None:
    raise ValueError(f'no ATAC_FRAGMENT asset found for dataset {dataset!r}')

compare_frag_files(up_file,down_file)