<a href="https://colab.research.google.com/github/siherm/PUMA/blob/main/PUMA_Duplicate_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pydantic



In [11]:
import dictdiffer
import os
import pandas as pd
import re
import requests

from itertools import compress
from pydantic import BaseModel, Field, PrivateAttr, validator
from typing import Optional, List, Union

from pyPUMA import PUMACollection, PUMAEntry

## Configuration

In [12]:
# 1 - Read the PUMA credentials from the environment.
# Both lookups raise KeyError when the variable is unset, so export
# PUMA_USERNAME and PUMA_TOKEN before starting the kernel (or assign the
# values here by hand, but never commit a real token).
username = os.environ["PUMA_USERNAME"]
token = os.environ["PUMA_TOKEN"]

## Utilizing the PUMACollection
### Application: Find duplicates

In [4]:
# Build the collection through the web API with the credentials read above.
# NOTE(review): the original note says this is the SimTech group's collection - confirm in pyPUMA.
collection = PUMACollection.from_web_api(username, token)

# Gather all duplicates, using the "doi" field as the duplicate criterion.
# The result is indexed by a DOI string and then by position further below,
# so it presumably maps DOI -> sequence of entries - verify against pyPUMA.
duplicates = collection.gather_duplicates("doi")

In [14]:
# Get an example: the first two entries recorded for one duplicated DOI
record = duplicates["10.1007/s10494-016-9704-y"]
one, two = record[0], record[1]

# Get difference between the two entries; the "bibtex" key is excluded.
# dictdiffer.diff() yields its results lazily and can only be consumed once,
# so it is materialised into a list: `diff` stays inspectable after the
# print loop below instead of being left as an exhausted generator.
diff = list(dictdiffer.diff(one.dict(), two.dict(), ignore=["bibtex"]))

# Print changes, one line per (operation, field, changes) triple
for operation, field, changes in diff:
    print(f"{operation} in {field}: {changes}")

change in author: ('Beck, Andrea D. and Flad, David G. and Tonh{\\"a}user, Claudia and Gassner, Gregor and Munz, Claus-Dieter', 'Beck, A. and Flad, D. and Tonhäuser, C. and Gassner, G. and Munz, C.-D.')
change in ['tag', 1]: ('from:katharinafuchs', 'from:simtechpuma')
change in ['tag', 2]: ('from:simtechpuma', 'imported')
remove in tag: [(3, 'pn1')]
change in journal: ('Flow, Turbulence and Combustion', 'Flow Turbulence Combustion')
change in url: ('https://doi.org/10.1007/s10494-016-9704-y', 'http://doi.org/10.1007/s10494-016-9704-y')
change in issn: ('1573-1987', None)


In [7]:
# Merge the two duplicate entries into a single PUMAEntry.
# Only the list of entries is passed; no `recursion` preferences are given.
PUMAEntry.from_multiple_entries(entries=[one, two])

PUMAEntry(title='On the Influence of Polynomial De-aliasing on Subgrid Scale Models', user='simtech', author='Beck, Andrea D. and Flad, David G. and Tonh{\\"a}user, Claudia and Gassner, Gregor and Munz, Claus-Dieter', entrytype='article', group='SimTech', tag=['from:katharinafuchs', 'EXC310', 'from:simtechpuma', 'imported', 'pn1'], journal='Flow, Turbulence and Combustion', preprint_id=None, url='https://doi.org/10.1007/s10494-016-9704-y', doi='10.1007/s10494-016-9704-y', isbn=None, issn='1573-1987', misc=None)

In [16]:
# Show the raw docstring of the merge method used above.
print(PUMAEntry.from_multiple_entries.__doc__)

Merges multiple entries to a single one

        This method is used to merge multiple entries to a single one in the
        duplicate analysis given within PyPUMA. It should be noted, that the
        static fields from the first entry will be preferred if not specified
        in the recursion dictionary. Fields that are lists will be added and
        duplicate values merged to a single.

        In order to apply preferences, a recursion dictionary can be given
        from which the fields value that matches will be used. For instance,
        if the entry_type attribute is mixed but specified as 'article' the
        latter will be used for the new entry. If no values apply to the
        recursion, the first will be used.


        Args:
          entries (list[PUMAEntry]): List of PUMAEntries that will be merged
          recursion (dict): Prefrences that will be used to merge.
        
