# Duplicate outputs

Some outputs have been recorded multiple times: sometimes within the same SYNTH round and sometimes in separate rounds. This notebook briefly analyses these duplicates.

NB: if re-running this, the Outputs table should be _filled_ but not _cleaned_.

## Loading

In [1]:
import itertools
import json
from collections import Counter

import sqlitedict
import yaml
from sqlalchemy.orm import sessionmaker

from synth.model.analysis import Output
from synth.utils import Config, Context, clean_string

Create a connection to the MySQL database:

In [2]:
# Read the YAML settings, then unpack them into the project's Config object.
with open('../config.yml', 'r') as config_file:
    settings = yaml.safe_load(config_file)
config = Config(**settings)

# Open an ORM session bound to the context's target engine.
context = Context(config)
session_factory = sessionmaker(bind=context.target_engine)
db = session_factory()

Load the DOI and metadata resources:

In [3]:
# Open both stores read-only. With sqlitedict's default flag ('c') a mistyped
# filename silently creates a new, empty database, and the analysis below would
# then report zero duplicates instead of failing. The cells visible in this
# notebook only read from these stores, so read-only mode is sufficient.
metadata = sqlitedict.SqliteDict('../synth/data/doi_metadata.db', flag='r')
cache = sqlitedict.SqliteDict('../synth/data/output_dois.db', flag='r')

Load a JSON dump of "old" (i.e. in the separate SYNTH NHMOutputs tables) IDs mapped to "new" (i.e. in the combined Outputs table) IDs. _This should be recreated if the table is regenerated._

In [4]:
# Read the raw dump first; its keys are JSON-encoded strings.
with open('mappings.json', 'r') as mappings_file:
    raw_mappings = json.load(mappings_file)

# Decode each key and turn it into a tuple so it can be used for dict lookups.
mappings = {tuple(json.loads(old_key)): new_id for old_key, new_id in raw_mappings.items()}

## Processing

In [5]:
# Invert the cache: each item is a (JSON-encoded record key, DOI) pair, and we
# want DOI -> list of decoded record keys. Iterating in DOI-sorted order keeps
# the resulting dict in the same order a sort + groupby would produce.
by_doi = {}
for record_key, doi in sorted(cache.items(), key=lambda entry: entry[1]):
    by_doi.setdefault(doi, []).append(tuple(json.loads(record_key)))

# Keep only the DOIs that were recorded against more than one record.
duplicated_dois = {doi: keys for doi, keys in by_doi.items() if len(keys) > 1}

# Flatten to the set of every record key involved in a duplication.
duplicated_records = {key for keys in duplicated_dois.values() for key in keys}

In [6]:
# Number of distinct DOIs that are attached to more than one record.
len(duplicated_dois)

426

In [7]:
# Number of distinct records that share their DOI with at least one other record.
len(duplicated_records)

936

In [8]:
# For every duplicated DOI, collect a (record count, distinct rounds, distinct
# cleaned titles) triple. The first element of each record key is the round.
duplicates = []
for doi, output_keys in duplicated_dois.items():
    outputs = [db.query(Output).get(mappings.get(key)) for key in output_keys]
    # A missing title is treated as an empty string before cleaning.
    titles = {clean_string(output.title or '') for output in outputs}
    rounds = {key[0] for key in output_keys}
    duplicates.append((len(output_keys), len(rounds), len(titles)))

# Group the triples by record count and tally each rounds/titles combination.
duplicates = {
    record_count: Counter(f'{n_rounds} rounds; {n_titles} titles' for _, n_rounds, n_titles in group)
    for record_count, group in itertools.groupby(sorted(duplicates), key=lambda row: row[0])
}

# Print the tallies, one section per record count.
for record_count, breakdown in duplicates.items():
    print(f'{record_count} records:')
    for label, occurrences in breakdown.items():
        print(f'\t{label} x {occurrences}')


2 records:
	1 rounds; 1 titles x 62
	1 rounds; 2 titles x 114
	2 rounds; 1 titles x 96
	2 rounds; 2 titles x 89
3 records:
	1 rounds; 1 titles x 2
	1 rounds; 2 titles x 9
	1 rounds; 3 titles x 10
	2 rounds; 1 titles x 2
	2 rounds; 2 titles x 11
	2 rounds; 3 titles x 5
	3 rounds; 1 titles x 6
	3 rounds; 2 titles x 2
	3 rounds; 3 titles x 5
4 records:
	1 rounds; 1 titles x 1
	1 rounds; 4 titles x 1
	2 rounds; 2 titles x 3
	2 rounds; 3 titles x 1
	2 rounds; 4 titles x 1
	3 rounds; 2 titles x 1
	3 rounds; 3 titles x 1
5 records:
	1 rounds; 4 titles x 1
	2 rounds; 2 titles x 1
6 records:
	2 rounds; 2 titles x 1
	3 rounds; 6 titles x 1
