# Collect Matches into Flat Files
This notebook collects every mention that matches a molecule in a chosen subset, then writes those matches to a gzip-compressed CSV file.

In [1]:
from pymongo import MongoClient
from csv import DictWriter
from pathlib import Path
from tqdm import tqdm
import gzip

In [2]:
# Name of the molecule subset whose mentions are exported
subset = "known-pareto"

# Directory that receives the exported flat file
output_dir = Path("mentions")

Derived settings: values computed from the configuration above.

In [3]:
# Create the output directory if it is not there yet (no error when it already exists)
output_dir.mkdir(exist_ok=True)

# The export is named after the subset and lives inside the output directory
output_file = output_dir.joinpath(f'{subset}-mentions.csv.gz')

## Connect to the Database
We need access to both the molecule record and mention collections, which live in the same database.

In [4]:
# Connect to the MongoDB server on port 27894 (default host) and select the 'cfree' database
client = MongoClient(port=27894)
db = client['cfree']

In [5]:
# Collection holding the molecule records
molecules = db['molecule_record']

# Report the estimated number of documents in the collection
n_molecules = molecules.estimated_document_count()
print(f'Our database has {n_molecules} molecules')

Our database has 111995381 molecules


We also need the mentions

In [6]:
# Collection holding the mentions
mentions = db['mention']

# Report the estimated number of documents in the collection
n_mentions = mentions.estimated_document_count()
print(f'Our database has {n_mentions} mentions')

Our database has 97841130 mentions


## Get the IDs of the Molecules that Match
We are going to query all of the IDs to start with

In [7]:
%%time
# Fetch only the `_id` field of every molecule record whose `subsets` matches the chosen subset
subset_cursor = molecules.find({'subsets': subset}, projection=['_id'])

# Collect the IDs into a list
subset_keys = []
for record in subset_cursor:
    subset_keys.append(record['_id'])

print(f'Total of {len(subset_keys)} molecules in our subset')

Total of 8 molecules in our subset
CPU times: user 1.21 ms, sys: 2.03 ms, total: 3.24 ms
Wall time: 261 ms


## Write Mentions to Disk
Get all of the mentions and write them to disk

In [8]:
# Running totals reported by the cells below
total_matches = 0
matched_keys = set()

# Membership is tested once per match, so use a set for constant-time lookups.
# The original list is still what gets passed to the `$in` query.
subset_key_set = set(subset_keys)

# newline='' leaves line endings to the csv module (avoids '\r\r\n' rows on platforms that
# translate '\n'); the explicit encoding keeps the file independent of the machine's locale.
with gzip.open(output_file, 'wt', newline='', encoding='utf-8') as fp:
    # Start writing
    writer = DictWriter(fp, ['filename', 'line', 'name', 'molecule', 'text'])
    writer.writeheader()

    # Gather all mentions that have at least one match whose key is in the subset
    for mention in tqdm(mentions.find({'matches.key': {'$in': subset_keys}})):
        # The query returns whole mention documents, so keep only the matches
        # whose key belongs to the subset; each kept match becomes one CSV row
        to_write = []
        for match in mention['matches']:
            if match['key'] in subset_key_set:
                matched_keys.add(match['key'])
                total_matches += 1
                to_write.append({
                    'filename': mention['filename'],
                    'line': mention['line'],
                    # Drops the final character of the text
                    # (presumably a trailing newline -- TODO confirm)
                    'text': mention['text'][:-1],
                    'molecule': match['key'],
                    'name': match['name']
                })
        writer.writerows(to_write)

1190450it [05:23, 3683.00it/s] 


In [9]:
# Compare how many subset molecules were mentioned at least once against the subset size
n_matched, n_subset = len(matched_keys), len(subset_keys)
print(f'Found mention of {n_matched}/{n_subset} molecules')

Found mention of 8/8 molecules


In [10]:
# Use true division so the average is not truncated before formatting, and
# fall back to 0 when nothing matched instead of raising ZeroDivisionError
average_mentions = total_matches / len(matched_keys) if matched_keys else 0.0
print(f'Printed a total of {total_matches} matches on {len(matched_keys)} molecules. Average of {average_mentions:.1e} mention per molecule')

Printed a total of 1198768 matches on 8 molecules. Average of 1.5e+05 mention per molecule
