<a href="https://colab.research.google.com/github/siherm/PUMA/blob/main/PUMA_Duplicate_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pydantic



In [114]:
import dictdiffer
import os

from pyPUMA import PUMACollection, PUMAEntry

## Configuration

In [115]:
# Credentials used for the PUMA web API, taken from the environment variables
# PUMA_USERNAME and PUMA_TOKEN (set them before starting the kernel, or
# replace the lookups if necessary). A missing variable raises KeyError.
username, token = (
    os.environ["PUMA_USERNAME"],
    os.environ["PUMA_TOKEN"],
)

## Utilizing the PUMACollection
### Application: Find duplicates

In [116]:
# Fetch the remote collection (SimTech group) through the PUMA web API,
# passing the credentials read in the configuration cell.
collection = PUMACollection.from_web_api(username, token)

# Find all duplicates keyed on the "doi" field; the result is looked up by
# DOI string in the following cell.
duplicates = collection.gather_duplicates("doi")

In [117]:
# Pick one duplicated DOI as a worked example; `record` holds the entries
# sharing that DOI (accessed as record[0] and record[1] below).
record = duplicates["10.1007/s10494-016-9704-y"]

In [118]:
# Compare the two duplicates field by field, excluding the "bibtex" field,
# and print every difference dictdiffer reports.
first_fields = record[0].dict(exclude={"bibtex"})
second_fields = record[1].dict(exclude={"bibtex"})
for change in dictdiffer.diff(first_fields, second_fields):
    print(change)

('change', 'author', ('Beck, Andrea D. and Flad, David G. and Tonh{\\"a}user, Claudia and Gassner, Gregor and Munz, Claus-Dieter', 'Beck, A. and Flad, D. and Tonhäuser, C. and Gassner, G. and Munz, C.-D.'))
('change', ['tag', 1], ('from:katharinafuchs', 'from:simtechpuma'))
('change', ['tag', 2], ('from:simtechpuma', 'imported'))
('remove', 'tag', [(3, 'pn1')])
('change', 'journal', ('Flow, Turbulence and Combustion', 'Flow Turbulence Combustion'))
('change', 'url', ('https://doi.org/10.1007/s10494-016-9704-y', 'http://doi.org/10.1007/s10494-016-9704-y'))
('change', 'issn', ('1573-1987', None))


In [122]:
# The author field joins the individual names with " and ", so splitting on
# that separator yields one "Surname, Given" string per author.
authors = record[0].author.split(" and ")

In [123]:
# Print each author as "Given Surname".
# partition() splits on the first comma only and never fails, so names
# without a comma or with several commas no longer raise ValueError the way
# a two-way unpack of split(",") does. strip() removes the blank that
# follows the comma, which previously showed up as a leading space.
for author in authors:
    surname, name = author.partition(",")[::2]
    print(f"{name.strip()} {surname.strip()}".strip())

 Andrea D. Beck
 David G. Flad
 Claudia Tonh{\"a}user
 Gregor Gassner
 Claus-Dieter Munz


In [24]:
# Merge both duplicates into a single entry with the PUMAEntry helper and
# pretty-print the result as JSON.
merged_entry = PUMAEntry.from_multiple_entries(entries=[record[0], record[1]])
print(merged_entry.json(indent=2))

{
  "title": "On the Influence of Polynomial De-aliasing on Subgrid Scale Models",
  "user": "simtech",
  "author": "Beck, Andrea D. and Flad, David G. and Tonh{\\\"a}user, Claudia and Gassner, Gregor and Munz, Claus-Dieter",
  "entrytype": "article",
  "group": "SimTech",
  "tag": [
    "imported",
    "pn1",
    "from:simtechpuma",
    "EXC310",
    "from:katharinafuchs"
  ],
  "journal": "Flow, Turbulence and Combustion",
  "preprint_id": null,
  "url": "https://doi.org/10.1007/s10494-016-9704-y",
  "doi": "10.1007/s10494-016-9704-y",
  "isbn": null,
  "issn": "1573-1987",
  "misc": null
}


In [16]:
# Show the docstring of the merge helper used above.
print(PUMAEntry.from_multiple_entries.__doc__)

Merges multiple entries to a single one

        This method is used to merge multiple entries to a single one in the
        duplicate analysis given within PyPUMA. It should be noted, that the
        static fields from the first entry will be preferred if not specified
        in the recursion dictionary. Fields that are lists will be added and
        duplicate values merged to a single.

        In order to apply preferences, a recursion dictionary can be given
        from which the fields value that matches will be used. For instance,
        if the entry_type attribute is mixed but specified as 'article' the
        latter will be used for the new entry. If no values apply to the
        recursion, the first will be used.


        Args:
          entries (list[PUMAEntry]): List of PUMAEntries that will be merged
          recursion (dict): Prefrences that will be used to merge.
        


In [95]:
import re

In [100]:
def filter_regex(result: list[tuple]):
    """Drop regex matches whose groups are all empty.

    Args:
        result: Tuples of captured groups, e.g. the return value of
            ``Pattern.findall`` for a pattern with several groups.

    Returns:
        A new list holding, in their original order, only the tuples that
        contain at least one truthy element.
    """
    kept = []
    for groups in result:
        # A non-empty tuple is always truthy, so inspect its elements instead.
        if any(groups):
            kept.append(groups)
    return kept

In [105]:
# Pattern for tags of the form "pn<digits>" with an optional "-<char>" suffix
# (e.g. "pn3", "pn3-8"); group 1 is the number, group 2 the suffix.
# The prefix must be a non-capturing group: the former `[pn|PN]` was a
# character class matching ONE of p, n, |, P, N, which produced a spurious
# empty match on the "p" and also accepted inputs such as "n5" or "|5".
regex = re.compile(r"(?:pn|PN)(\d*)(-\w?)?")

In [112]:
# Sample tag with the angle brackets removed, ready for the tag pattern.
string = re.sub(r">|<", "", "pn<7>")

'pn7'

In [107]:
# Extract the captured groups from the cleaned tag and drop all-empty matches.
# NOTE(review): the execution counts show this cell (107) last ran before the
# cell that defines `string` (112), so the saved output may be stale — re-run.
filter_regex(regex.findall(string))

[('3', '-8')]

In [58]:
# Collect, lower-cased, every tag in the collection that contains "pn".
# A comprehension keeps its loop variables local: the former `for record in
# collection` loop overwrote the global `record` (the list of duplicates
# selected earlier), breaking those cells when re-run after this one.
tags = [
    tag.lower()
    for entry in collection
    for tag in entry.tag
    if "pn" in tag.lower()
]
    

In [59]:
# Distinct "pn" tags found in the collection.
set(tags)

{'pn1',
 'pn1-2b',
 'pn2',
 'pn2-6',
 'pn3',
 'pn3-6',
 'pn3-8send:unibiblio',
 'pn4',
 'pn5',
 'pn6',
 'pn6-1',
 'pn7',
 'pn<7>',
 'pn_missing'}

In [54]:
# List the attribute names stored on a single entry of the collection.
collection.records[0].__dict__.keys()

dict_keys(['bibtex', 'title', 'user', 'author', 'entrytype', 'group', 'tag', 'journal', 'preprint_id', 'url', 'doi', 'isbn', 'issn', 'misc'])

In [92]:
# A non-empty tuple is truthy even when every element is an empty string,
# which is why filter_regex inspects the elements rather than the tuple.
bool(("", ""))

True