In [None]:
from pathlib import Path
import aiohttp, asyncio
import nest_asyncio
nest_asyncio.apply()
import pandas as pd

In [None]:
# Image directory; its entry names are listed further down to obtain the species keys.
flm_pth = Path("/home/george/codes/lepinet/data/flemming_ucloud") / "images"

In [None]:
async def get_key(session, scientificName=None, usageKey=None, rank='SPECIES', order='Lepidoptera'):
    """Look up the GBIF species key for a scientific name and/or usage key.

    Sends a GET request to the GBIF ``species/match`` endpoint and extracts
    ``speciesKey`` from the decoded JSON response.

    Args:
        session: Object whose ``get(url, params=...)`` is an async context
            manager yielding a response with an awaitable ``json()``
            (an ``aiohttp.ClientSession`` in this notebook).
        scientificName: Name to match; sent as the ``scientificName`` parameter.
        usageKey: Key to match; sent as the ``usageKey`` parameter.
        rank: Value of the ``rank`` parameter; left out of the query when ``None``.
        order: Value of the ``order`` parameter; left out of the query when ``None``.

    Returns:
        The ``speciesKey`` value when the response contains one; otherwise the
        whole decoded response, so the caller can inspect the failed match.

    Raises:
        AssertionError: If both ``scientificName`` and ``usageKey`` are ``None``.
    """
    assert usageKey is not None or scientificName is not None, "One of scientificName or usageKey must be defined."

    # Hard-coded answer for this one name (flagged as a "bug fix" by the author);
    # no request is sent for it.
    if scientificName == 'Tethea or':
        return 5142971

    url = "https://api.gbif.org/v1/species/match"
    candidates = {
        'usageKey': usageKey,
        'scientificName': scientificName,
        'rank': rank,
        'order': order,
    }
    # Let the HTTP client encode the query string instead of concatenating it
    # by hand, so characters such as '&', '#' or '+' inside a name cannot
    # corrupt the request.
    params = {name: str(value) for name, value in candidates.items() if value is not None}

    async with session.get(url, params=params) as response:
        r = await response.json()
    return r['speciesKey'] if 'speciesKey' in r else r

async def get_all_keys(vocab):
    """Resolve every scientific name in ``vocab`` to a GBIF key concurrently.

    Opens one ``aiohttp.ClientSession``, schedules a ``get_key`` lookup per
    name with the rank filter disabled (``rank=None``), and returns the
    gathered results in the same order as ``vocab``.
    """
    async with aiohttp.ClientSession() as client:
        lookups = (get_key(client, scientificName=name, rank=None) for name in vocab)
        return await asyncio.gather(*lookups)

async def get_sn(session, usageKey):
    """Fetch the name record GBIF returns for a usage key.

    Requests ``https://api.gbif.org/v1/species/<usageKey>/name`` and returns
    the decoded JSON response unchanged.

    Args:
        session: Object whose ``get(url)`` is an async context manager yielding
            a response with an awaitable ``json()`` (an
            ``aiohttp.ClientSession`` in this notebook).
        usageKey: Key inserted into the request path.

    Returns:
        The decoded JSON response.

    Raises:
        ValueError: If ``usageKey`` is ``None``. Previously the request was
            still sent, with a literal ``{}`` left in the path.
    """
    if usageKey is None:
        raise ValueError("usageKey must be defined.")
    url = "https://api.gbif.org/v1/species/{}/name".format(usageKey)

    async with session.get(url) as response:
        return await response.json()

async def get_all_sn(vocab):
    """Fetch the GBIF name record for every usage key in ``vocab`` concurrently.

    Opens one ``aiohttp.ClientSession``, schedules a ``get_sn`` request per
    key, and returns the gathered responses in the same order as ``vocab``.
    """
    async with aiohttp.ClientSession() as client:
        requests = (get_sn(client, usageKey=key) for key in vocab)
        return await asyncio.gather(*requests)

In [None]:
# Sanity check of get_all_sn: fetch the raw name records for two GBIF IDs,
# one expected to be wrong and one good, to compare what the API returns.
test_keys = ['8237987', '1811896']
res = asyncio.run(get_all_sn(test_keys))

In [None]:
# Display the two raw responses.
res

In [None]:
# Sanity check of get_all_keys on a single species name (overwrites `res`).
res = asyncio.run(get_all_keys(['Spilosoma lubricipeda']))

In [None]:
# Display the result of the name match.
res

In [None]:
spc_sn1 = asyncio.run(get_all_sn(spc_k1))

In [None]:
spc_k1 = [f.name for f in flm_pth.glob('*')]

In [None]:
# Print every record whose rank marker is not 'sp.', and every record whose
# canonical name is 'Tethea spec.' (a record matching both is printed twice).
for record in spc_sn1:
    has_species_marker = record['rankMarker'] == 'sp.'
    if not has_species_marker:
        print(record)
    if record['canonicalName'] == 'Tethea spec.':
        print(record)

In [None]:
# Keep only the first two space-separated words of each record's scientificName.
spc_sn2 = []
for record in spc_sn1:
    words = record['scientificName'].split(' ')
    spc_sn2.append(" ".join(words[:2]))

In [None]:
# Match the two-word names in spc_sn2 back to GBIF keys, one result per name.
spc_k2 = asyncio.run(get_all_keys(spc_sn2))

In [None]:
# Map each original key to the key obtained from the name re-match, keeping
# only the pairs that differ.
old2new = {}
unresolved = []  # (original key, raw GBIF response) pairs without a usable key
for old_key, new_key in zip(spc_k1, spc_k2):
    if isinstance(new_key, dict):
        # get_key hands back the whole response dict when it contains no
        # 'speciesKey'; int() on it would raise, so record it and move on.
        unresolved.append((old_key, new_key))
        continue
    if str(old_key) != str(new_key):
        old2new[int(old_key)] = int(new_key)

if unresolved:
    print(f"{len(unresolved)} key(s) could not be re-matched:", unresolved)

In [None]:
# These are the mismatching cases: original key -> key returned by the name re-match.
old2new

In [None]:
# Load the ukdk predictions whose labels are going to be renamed.
ukdk_path = Path("/home/george/codes/lepinet/data/flemming_ucloud/ukdk/ukdk.csv")
ukdk_df = pd.read_csv(ukdk_path)

In [None]:
# Preview the first rows of the loaded table.
ukdk_df.head()

In [None]:
n_replaced = (ukdk_df['label'].isin(old2new)).sum()
n_replaced

In [None]:
ukdk_df['label'] = ukdk_df['label'].replace(old2new)

In [None]:
ukdk_path.with_name('ukdk_corrected.csv')

In [None]:
ukdk_df.to_csv(ukdk_path.with_name('ukdk_corrected.csv'), index=False)