In [1]:
# load and inspect results
import pandas as pd
import matplotlib as plt

latest_results = pd.read_csv("/home/matnic/Projects/citationNotebook/Results/v3/latest_results.csv")
len(latest_results['data_publication_year'])

12882

In [8]:
latest_results.columns

Index(['data_doi', 'data_publisher', 'data_title', 'data_publication_year',
       'data_authors', 'relation_type_id', 'publication_doi',
       'publication_title', 'publication_date', 'publication_authors',
       'citation_event_source', 'pub_publisher', 'publication_type',
       'PubCitationStr', 'data_doi_url', 'publication_doi_url',
       'publicationYear', 'date_added'],
      dtype='object')

In [2]:
# how many rows and how many "Info not given" per publisher
summary = (
    latest_results
    .groupby("data_publisher")
    .agg(
        total_rows=("data_doi", "count"),
        missing_pub_title=("publication_title", lambda x: (x == "Info not given").sum()),
        missing_pub_date=("publication_date", lambda x: (x == "Info not given").sum()),
        missing_pub_authors=("publication_authors", lambda x: (x == "Info not given").sum()),
    )
)

# Add percentages to make it easy to interpret
for col in summary.columns[1:]:
    summary[f"{col}_pct"] = summary[col] / summary["total_rows"] * 100

summary.sort_values("total_rows", ascending=False)


Unnamed: 0_level_0,total_rows,missing_pub_title,missing_pub_date,missing_pub_authors,missing_pub_title_pct,missing_pub_date_pct,missing_pub_authors_pct
data_publisher,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Environmental Information Data Centre (EIDC),4938,68,165,69,1.377076,3.341434,1.397327
Polar Data Centre (PDC),3623,859,883,859,23.709633,24.372067,23.709633
Centre for Environmental Data Analysis (CEDA),3298,0,0,0,0.0,0.0,0.0
British Oceanographic Data Centre (BODC),697,0,0,0,0.0,0.0,0.0
National Geoscience Data Centre (NGDC),286,0,0,0,0.0,0.0,0.0


In [None]:
# see which publishers have have no pub info
cols_to_check = ["publication_title", "publication_date", "publication_authors"]
latest_results["any_missing"] = latest_results[cols_to_check].apply(
    lambda row: any(v == "Info not given" for v in row), axis=1
)
publisher_missing = latest_results.groupby("pub_publisher")["any_missing"].mean() * 100
print(publisher_missing.sort_values(ascending=False))


In [None]:
# check BODC publication year is ok
bodc = latest_results[latest_results['data_publisher'] == "British Oceanographic Data Centre (BODC)"]
bodc["publicationYear"].value_counts()

In [None]:
# check no gbif downloads are getting in
gbif = latest_results[latest_results['publication_doi'].str.contains("10.15468")]
gbif

Unnamed: 0,data_doi,data_publisher,data_title,data_publication_year,data_authors,relation_type_id,publication_doi,publication_title,publication_date,publication_authors,citation_event_source,pub_publisher,publication_type,PubCitationStr,data_doi_url,publication_doi_url,publicationYear,date_added,any_missing


In [3]:
# see most cited dataset dois for each publisher

for dc in latest_results['data_publisher'].unique():

    dc_results = latest_results[latest_results['data_publisher'] == dc]
    print(dc)
    print(dc_results['data_doi'].value_counts()[0:3])

Environmental Information Data Centre (EIDC)
data_doi
10.5285/6c6c9203-7333-4d96-88ab-78925e7a4e73    82
10.5285/33604ea0-c238-4488-813d-0ad9ab7c51ca    64
10.5285/bb15e200-9349-403c-bda9-b430093807c7    49
Name: count, dtype: int64
Polar Data Centre (PDC)
data_doi
10.5285/324137d3-cfc5-4cf6-a360-1a293a3e9ed6    32
10.5285/a89028f1-76d0-41f4-bee0-021633a8b0ed    32
10.5285/2fd95199-365e-4da1-ae26-3b6d48b3e6ac    30
Name: count, dtype: int64
nan
Series([], Name: count, dtype: int64)
Centre for Environmental Data Analysis (CEDA)
data_doi
10.5285/03cc44f98b0e4a4b97df37662e62be79    21
10.5285/cf006675070548359e22e36d354d0f92    21
10.5285/ceae289f1a56414ea708f43db83fc2c6    21
Name: count, dtype: int64
British Oceanographic Data Centre (BODC)
data_doi
10.5285/cd857050-0c6d-3a71-e053-6c86abc08527    12
10.5285/bbaf922a-e46e-7e94-e053-6c86abc0089a    11
10.5285/7a150d33-956b-0fec-e053-6c86abc0b35c    10
Name: count, dtype: int64
National Geoscience Data Centre (NGDC)
data_doi
10.5285/bbbacf

In [35]:
# view citations for a particular data_doi
doi = "10.5285/2d0e4791-8e20-46a3-80e4-f5f6716025d2"
result = latest_results[latest_results['data_doi'] == doi]
print(result['data_title'].iloc[0])
result = result[['relation_type_id', 
                 'publication_doi',
       'publication_title', # 'publication_date', 'publication_authors',
       'citation_event_source', 'pub_publisher', 
       #'publication_type'
       ]]

with pd.option_context('display.max_colwidth', 400):
    display(result)


BEDMAP3 - Ice thickness, bed and surface elevation for Antarctica - gridding products


Unnamed: 0,relation_type_id,publication_doi,publication_title,citation_event_source,pub_publisher
6224,IsRelatedTo,10.7910/dvn/x7ndny,"The Reference Elevation Model of Antarctica - Strips, Version 4.1",scholex,Harvard Dataverse
6225,IsRelatedTo,10.5281/zenodo.3874654,"Radar observations of an active subglacial lake system in the David Glacier catchment, Antarctica",scholex,ZENODO
6226,IsRelatedTo,10.1594/pangaea.972094,Collection of datasets from AWI's radio-echo sounding systems on ice sheets and glaciers,scholex,PANGAEA - Data Publisher for Earth and Environmental Science
6227,IsRelatedTo,10.5067/9ebr2t0vxudg,"IceBridge HiCARS 2 L2 Geolocated Ice Thickness, Version 1",scholex,Unknown Repository
6228,IsRelatedTo,10.15784/601001,"Ice thickness and related data over central Marie Byrd Land, West Antarctica (GIMBLE.GR2HI2)",scholex,Unknown Repository
6229,IsRelatedTo,10.5285/0f90d926-99ce-43c9-b536-0c7791d1728b,"BEDMAP2 - Ice thickness, bed and surface elevation for Antarctica - standardised shapefiles and geopackages",scholex,Unknown Repository
6230,IsRelatedTo,10.15784/601437,"Titan Dome, East Antarctica, Aerogeophysical Survey",scholex,Unknown Repository
6231,IsRelatedTo,10.5285/a72a50c6-a829-4e12-9f9a-5a683a1acc4a,"BEDMAP3 - Ice thickness, bed and surface elevation for Antarctica - standardised shapefiles and geopackages",scholex,Unknown Repository
6232,IsRelatedTo,10.6084/m9.figshare.c.5692678.v1,The International Bathymetric Chart of the Southern Ocean Version 2 (IBCSO v2),scholex,figshare
6233,IsRelatedTo,10.5285/925ac4ec-2a9d-461a-bfaa-6314eb0888c8,"BEDMAP1 - Ice thickness, bed and surface elevation for Antarctica - standardised shapefiles and geopackages",scholex,Unknown Repository


In [4]:
# function to compare latest with previous results for a particular DOI
v2_results = pd.read_csv("/home/matnic/Projects/citationNotebook/Results/v2/latest_results.csv")

def compare_results(doi, latest_results, v2_results):

    latest_result = latest_results[latest_results['data_doi'] == doi]
    print(latest_result['data_title'].iloc[0])
    latest_result = latest_result[[# 'relation_type_id', 
                    'publication_doi',
        'publication_title', # 'publication_date', 'publication_authors',
        'citation_event_source', 'pub_publisher', 
        #'publication_type'
        ]]
    
    latest_result_n_citations = len(latest_result)
    
    print(f"Latest result citations: {latest_result_n_citations} ")
    with pd.option_context('display.max_colwidth', 400):
        display(latest_result.sort_values(['publication_title']))

    v2_result = v2_results[v2_results['data_doi'] == doi]
    print(v2_result['data_Title'].iloc[0])
    v2_result = v2_result[[# 'relation_type_id', 
                    'publication_doi',
        'publication_title', # 'publication_date', 'publication_authors',
        'citation_event_source', # 'pub_publisher', 
        #'publication_type'
        ]]
    
    v2_result_n_citations = len(v2_result)
    
    print(f"v2result citations: {v2_result_n_citations} ")
    with pd.option_context('display.max_colwidth', 400):
        display(v2_result.sort_values(['publication_title']))


In [36]:
doi = "10.5285/2d0e4791-8e20-46a3-80e4-f5f6716025d2"
compare_results(doi, latest_results, v2_results)

BEDMAP3 - Ice thickness, bed and surface elevation for Antarctica - gridding products
Latest result citations: 27 


Unnamed: 0,publication_doi,publication_title,citation_event_source,pub_publisher
6243,10.5194/essd-15-2695-2023,"Antarctic Bedmap data: Findable, Accessible, Interoperable, and Reusable (FAIR) sharing of 60 years of ice bed, surface, and thickness data",scholex,Earth System Science Data (ESSD)
6247,10.5285/f64815ec-4077-4432-9f55-0ce230f46029,"BEDMAP1 - Ice thickness, bed and surface elevation for Antarctica - standardised data points",scholex,Unknown Repository
6233,10.5285/925ac4ec-2a9d-461a-bfaa-6314eb0888c8,"BEDMAP1 - Ice thickness, bed and surface elevation for Antarctica - standardised shapefiles and geopackages",scholex,Unknown Repository
6245,10.5285/2fd95199-365e-4da1-ae26-3b6d48b3e6ac,"BEDMAP2 - Ice thickness, bed and surface elevation for Antarctica - standardised data points",scholex,Unknown Repository
6229,10.5285/0f90d926-99ce-43c9-b536-0c7791d1728b,"BEDMAP2 - Ice thickness, bed and surface elevation for Antarctica - standardised shapefiles and geopackages",scholex,Unknown Repository
6242,10.5285/91523ff9-d621-46b3-87f7-ffb6efcd1847,"BEDMAP3 - Ice thickness, bed and surface elevation for Antarctica - standardised data points",scholex,Unknown Repository
6231,10.5285/a72a50c6-a829-4e12-9f9a-5a683a1acc4a,"BEDMAP3 - Ice thickness, bed and surface elevation for Antarctica - standardised shapefiles and geopackages",scholex,Unknown Repository
6236,10.1029/2000jb900449,BEDMAP: A new ice thickness and subglacial topographic model of Antarctica,scholex,Journal of Geophysical Research Atmospheres
6244,10.5285/6549203d-da8b-4a22-924b-a9e1471ea7f1,"Bed, surface elevation and ice thickness measurements derived from Radar acquired during the ICEGRAV-2013 airborne geophysics campaign",scholex,Unknown Repository
6237,10.5285/7c12898d-7e55-458c-ba7d-ecec8252f3b5,"Bed, surface elevation and ice thickness measurements derived from radar data acquired during the Thwaites Glacier airborne survey (2019/2020)",scholex,Unknown Repository


IndexError: single positional indexer is out-of-bounds

In [37]:
# code to compare results for a data doi - list the new pub_doi, the pub_dois that have gone, and the number pub_dois that have remained the same
doi = "10.5285/fa5d606c-dc95-47ee-9016-7a82e446f2f2"
latest_result = latest_results[latest_results['data_doi'] == doi].copy()
v2_result = v2_results[v2_results['data_doi'] == doi].copy()

# latest_result['index'] = "latest"
# v2_result['index'] = "old"
# combined_df = pd.concat([latest_result, v2_result], ignore_index=True)
# deduped_df = combined_df.drop_duplicates(subset=list(("data_doi", "publication_doi")), keep="first")
# deduped_df[deduped_df['index'] == "old"]

# sets of DOIs
latest_dois = set(latest_result['publication_doi'])
old_dois = set(v2_result['publication_doi'])

# compare
new_dois = latest_dois - old_dois
disappeared_dois = old_dois - latest_dois
unchanged_dois = latest_dois & old_dois

print(f"New DOIs: {len(new_dois)}")
print(f"Disappeared DOIs: {len(disappeared_dois)}")
print(f"Unchanged DOIs: {len(unchanged_dois)}")

new_pubs = latest_result[latest_result['publication_doi'].isin(new_dois)][['publication_doi', 'publication_title']]
disappeared_pubs = v2_result[v2_result['publication_doi'].isin(disappeared_dois)][['publication_doi', 'publication_title']]
unchanged_pubs = latest_result[latest_result['publication_doi'].isin(unchanged_dois)][['publication_doi', 'publication_title']]

print("\nNew Publications:")
with pd.option_context('display.max_colwidth', 400):
    display(new_pubs)

print("\nDisappeared Publications:")
with pd.option_context('display.max_colwidth', 400):
    display(disappeared_pubs)

print("\nUnchanged Publications:")
with pd.option_context('display.max_colwidth', 400):
    display(unchanged_pubs)


summary = pd.concat([
    new_pubs.assign(status='new'),
    disappeared_pubs.assign(status='disappeared'),
    unchanged_pubs.assign(status='unchanged')
])

with pd.option_context('display.max_colwidth', 400):
    display(summary.sort_values('status'))



New DOIs: 21
Disappeared DOIs: 2
Unchanged DOIs: 6

New Publications:


Unnamed: 0,publication_doi,publication_title
7259,10.5285/5e2cf315-9265-4605-8643-382f2557009b,"Processed bed elevation picks from airborne radar depth sounding across the Dufek Massif, Pensacola Mountains (1998/99 season)"
7260,10.5067/f5fgut9f5089,"IceBridge HiCARS 1 L2 Geolocated Ice Thickness, Version 1"
7261,10.13127/ires,IRES Italian Radio Echo Sounding
7262,10.5285/0f6f5a45-d8af-4511-a264-b0b35ee34af6,Antarctica's Gamburtsev Province (AGAP) Project - Radio-echo sounding data (2007-2009)
7263,10.15784/601047,Radar Depth Sounder Echograms and Ice Thickness
7264,10.5194/tc-7-375-2013,"Bedmap2: improved ice bed, surface and thickness datasets for Antarctica"
7265,10.7265/n5w95730,"AGASEA Ice Thickness Profile Data from the Amundsen Sea Embayment, Antarctica"
7266,10.5067/gdq0cucvte2q,"IceBridge MCoRDS L2 Ice Thickness, Version 1"
7267,10.5285/84a273d9-8191-4316-b8f6-dc907eb0947a,Processed bed elevation picks from airborne radar depth sounding over the Jutulstraumen rift area (2001/02 season)
7271,10.5067/9ebr2t0vxudg,"IceBridge HiCARS 2 L2 Geolocated Ice Thickness, Version 1"



Disappeared Publications:


Unnamed: 0,publication_doi,publication_title
940,10.5194/tcd-6-4305-2012,"Bedmap2: improved ice bed, surface and thickness datasets for Antarctica"
945,10.5194/essd-2022-355,"Antarctic Bedmap data: FAIR sharing of 60 years of ice bed, surface and thickness data"



Unchanged Publications:


Unnamed: 0,publication_doi,publication_title
7268,10.5285/908bb17f-467c-42bf-ae00-f03bb0feea23,"BEDMAP1 - Ice thickness, bed and surface elevation for Antarctica - gridding products"
7269,10.5285/2fd95199-365e-4da1-ae26-3b6d48b3e6ac,"BEDMAP2 - Ice thickness, bed and surface elevation for Antarctica - standardised data points"
7270,10.5285/0f90d926-99ce-43c9-b536-0c7791d1728b,"BEDMAP2 - Ice thickness, bed and surface elevation for Antarctica - standardised shapefiles and geopackages"
7273,10.5285/925ac4ec-2a9d-461a-bfaa-6314eb0888c8,"BEDMAP1 - Ice thickness, bed and surface elevation for Antarctica - standardised shapefiles and geopackages"
7276,10.1029/2000jb900449,BEDMAP: A new ice thickness and subglacial topographic model of Antarctica
7281,10.5285/f64815ec-4077-4432-9f55-0ce230f46029,"BEDMAP1 - Ice thickness, bed and surface elevation for Antarctica - standardised data points"


Unnamed: 0,publication_doi,publication_title,status
945,10.5194/essd-2022-355,"Antarctic Bedmap data: FAIR sharing of 60 years of ice bed, surface and thickness data",disappeared
940,10.5194/tcd-6-4305-2012,"Bedmap2: improved ice bed, surface and thickness datasets for Antarctica",disappeared
7259,10.5285/5e2cf315-9265-4605-8643-382f2557009b,"Processed bed elevation picks from airborne radar depth sounding across the Dufek Massif, Pensacola Mountains (1998/99 season)",new
7285,10.5285/4efa688e-7659-4cbf-a72f-facac5d63998,"Airborne radar bed elevation along flow lines covering the Evans, and Rutford Ice Streams, and ice rises in the Ronne Ice Shelf (2006/07)",new
7284,10.5285/7946c497-72fc-41cb-a9b2-bf9980efe156,Processed bed elevation picks from airborne radar depth sounding across the Institute and Moller Glacier catchments in 2010/11,new
7283,10.5285/3adb739a-9eda-434d-9883-03ab092cabae,Processed bed elevation picks from airborne radar depth sounding across the Pine Island Glacier basin (2004/05 season),new
7282,10.1594/ieda/313685,Processed Ice Penetrating Radar Altimeter Data from the Gamburtsev Mountainsin Antarctica acquired during the GAMBIT Twin Otter expedition AGAP_GAMBIT (2008),new
7280,10.5285/2c261013-9a0e-447d-a5bb-b506610b14ff,"Processed bed elevation picks from airborne radar depth sounding across the Evans Ice Stream, Southern Palmer Land (1994/95 season)",new
7279,10.1594/ieda/306567,Processed Ice LayerThickness Data acquired during the SOAR Twin Otter expedition SOAR-LVS (2000),new
7277,10.1594/pangaea.972094,Collection of datasets from AWI's radio-echo sounding systems on ice sheets and glaciers,new


In [None]:
latest_result

In [26]:
len(latest_results)

12725

In [5]:
# compare number of citations for new and old results

# count citations per DOI in each dataset
latest_counts = latest_results.groupby('data_doi').size().reset_index(name='latest_count')
v2_counts = v2_results.groupby('data_doi').size().reset_index(name='v2_count')

# merge them together (outer join keeps all DOIs)
merged = pd.merge(latest_counts, v2_counts, on='data_doi', how='outer').fillna(0)

# calculate the difference
merged['diff'] = merged['latest_count'] - merged['v2_count']

# How many DOIs changed by how much?
merged['diff'].value_counts().sort_index()




diff
-250.0     1
-129.0     1
-114.0     1
-97.0      1
-86.0      1
          ..
 17.0      1
 18.0     21
 19.0      2
 20.0      1
 27.0      1
Name: count, Length: 65, dtype: int64

In [6]:

#merged['diff'].hist(bins=20)
pd.cut(merged['diff'], bins=[-10, -1, 0, 1, 2, 5, 10, 100]).value_counts().sort_index()



diff
(-10, -1]    798
(-1, 0]      762
(0, 1]       412
(1, 2]       185
(2, 5]       296
(5, 10]       34
(10, 100]     32
Name: count, dtype: int64

In [7]:
merged.nlargest(10, 'diff')

Unnamed: 0,data_doi,latest_count,v2_count,diff
406,10.5285/2d0e4791-8e20-46a3-80e4-f5f6716025d2,27.0,0.0,27.0
431,10.5285/2fd95199-365e-4da1-ae26-3b6d48b3e6ac,30.0,10.0,20.0
563,10.5285/3c636579-0389-4ba1-bf3d-d53f32892079,19.0,0.0,19.0
2566,10.5285/fa5d606c-dc95-47ee-9016-7a82e446f2f2,27.0,8.0,19.0
136,10.5285/0f90d926-99ce-43c9-b536-0c7791d1728b,30.0,12.0,18.0
183,10.5285/14b142da-9341-471b-af7a-8613f0fbbb25,18.0,0.0,18.0
331,10.5285/236cbe3a-5f05-4cef-8e92-7145b0190f0b,18.0,0.0,18.0
417,10.5285/2e870dac-9eb1-4f1d-a5ff-fe18090afd79,18.0,0.0,18.0
646,10.5285/463ea06f-6c49-4d6e-a80b-21c88a71c3da,18.0,0.0,18.0
666,10.5285/47d0718d-7146-44d3-965c-60e62a48b8cc,18.0,0.0,18.0


In [23]:
results_list[0]

[0, '10.5285/2641515f-5b76-445c-a936-1da51bf365ad']