In [22]:
import numpy as np
import pandas as pd

# Input: Open Tree taxonomy dump. Swap the active line to run on a smaller
# sample or on the full file.
# file_path = '~/gnharvester/data-migrate/logs/open-tree-data/taxonomy-10000.tsv'
file_path = '~/gnharvester/data-migrate/logs/open-tree-data/taxonomy-100000.tsv'
# file_path = '~/gnharvester/data-migrate/logs/open-tree-data/taxonomy.tsv'

# Fields are separated by "<TAB>|<TAB>" and each line ends with "<TAB>|", hence
# the optional trailing tab in the pattern. A raw string keeps `\|` from being
# an invalid string escape; a regex separator requires the python engine.
df = pd.read_csv(file_path, delimiter=r'\t\|\t?', engine='python')

# The terminator at the end of every line yields one extra, empty, auto-named
# column; drop it (reassigning instead of mutating in place).
df = df.drop('Unnamed: 7', axis=1)

In [23]:
# Sanity check: (rows, columns) of the loaded frame.
df.shape

(100000, 7)

In [24]:
# Number of comma-separated entries in `sourceinfo` for each row.
# `.str.len()` returns the list length per row and propagates NaN for rows
# with no sourceinfo, where calling len() on the NaN would raise TypeError.
df['sources_count'] = df['sourceinfo'].str.split(',').str.len()

In [25]:
def extract_source_id(source_info, source_name):
    """Return the id recorded for ``source_name`` in a row's source entries.

    Parameters
    ----------
    source_info : iterable of str, or None/NaN
        Entries of the form ``"<source>:<id>"`` (one row's ``sourceinfo``
        after splitting on commas). A missing value (None or a float NaN)
        is treated as "no sources".
    source_name : str
        Exact source name to look for, e.g. ``"ncbi"``.

    Returns
    -------
    str or None
        Everything after the first ``':'`` of the first entry whose source
        name equals ``source_name``; ``None`` when no entry matches.
    """
    # A missing sourceinfo reaches us as None or float NaN, neither iterable.
    if source_info is None or isinstance(source_info, float):
        return None

    for entry in source_info:
        # partition() splits on the first ':' only, so ids that themselves
        # contain ':' are returned whole.
        name, sep, src_id = entry.partition(':')
        # Compare the whole source name: a plain prefix test would also
        # accept a different source whose name merely starts the same way.
        if sep and name == source_name:
            return src_id
    return None

# Sources for which a dedicated id column is added to df.
source_names = ["ncbi", "silva", "worms", "irmng", "gbif"]

# One list of "<source>:<id>" entries per row.
sources = df['sourceinfo'].str.split(',')

for source_name in source_names:
    # Bind the current name as a default argument so the lambda carries its
    # own copy instead of reading the loop variable.
    source_ids_df = sources.map(
        lambda entries, wanted=source_name: extract_source_id(entries, wanted))
    df[source_name] = source_ids_df

In [26]:
# Preview the per-source id columns (one column per entry of source_names).
df[source_names].head(20)

Unnamed: 0,ncbi,silva,worms,irmng,gbif
0,1220095,,,,
1,1723552,,,,
2,394873,,,,
3,400652,,,,
4,1702258,,,,
5,1114402,,,,
6,710063,,,,
7,1486387,,,,
8,1486265,,,,
9,412885,,,,


In [27]:
def parse(names):
    """Run the JVM-side ``org.globalnames.parser.spark.Parser`` over ``names``.

    ``names`` is converted to its Java counterpart, handed to the parser's
    ``parse`` method, and the result is converted back to a Python object.
    Relies on the kernel-global SparkContext ``sc``.

    NOTE(review): callers in this notebook pass an RDD of name strings and
    treat the result as an RDD of JSON strings -- confirm against the parser.
    """
    # Private pyspark helpers for moving objects across the Py4J bridge.
    from pyspark.mllib.common import _java2py, _py2java

    gn_parser = sc._jvm.org.globalnames.parser.spark.Parser()
    return _java2py(sc, gn_parser.parse(_py2java(sc, names)))

import re
import json

# Trim each name and collapse internal whitespace runs to a single space
# before handing it to the parser.
names_cleaned = df['name'].map(lambda n: re.sub(r'\s+', ' ', n.strip()))

names_par = sc.parallelize(names_cleaned)

# Decode the parser's JSON output. The RDD is cached because it is traversed
# again by later cells; without caching every action re-runs the parser.
names_json = parse(names_par).map(json.loads).cache()

# Collect once and build both columns from the same result list instead of
# launching one Spark job per column.
parsed_names = names_json.collect()
df['verbatim'] = pd.Series([parsed['verbatim'] for parsed in parsed_names])
df['name_string_id'] = pd.Series([parsed['name_string_id'] for parsed in parsed_names])

In [28]:
import uuid
# Namespace for the version-5 UUIDs derived from canonical names (see get_canonical_uuid).
# NOTE(review): presumably the globalnames.org namespace UUID -- confirm against the parser docs.
namespace = uuid.UUID('90181196-fecf-5082-a4c1-411d4f314cda')

def get_canonical(j):
    """Return the canonical form stored in a parsed-name dict, or None.

    ``j`` is one decoded parser result. When its ``'canonical_name'`` entry
    is missing or falsy the name has no canonical form and None is returned;
    otherwise the entry's ``'value'`` is returned.
    """
    canonical_name = j.get('canonical_name')
    if not canonical_name:
        return None
    return canonical_name['value']

def get_canonical_uuid(j):
    """Return the version-5 UUID of a parsed name's canonical form, or None.

    The canonical form (see ``get_canonical``) is UTF-8 encoded and hashed
    within the module-level ``namespace``. Names without a canonical form
    yield None.
    """
    can = get_canonical(j)
    return uuid.uuid5(namespace, can.encode('utf8')) if can else None
    
# Derive both canonical columns from the parsed names; each collect() brings
# one value per parsed name back to the driver, in RDD order.
canonical_names = names_json.map(get_canonical).collect()
canonical_uuids = names_json.map(get_canonical_uuid).collect()

df['canonical'] = pd.Series(canonical_names)
df['canonical_uuid'] = pd.Series(canonical_uuids)

In [40]:
# Whitespace-normalised copy of the verbatim name for export.
df['verbatim_clean'] = df['verbatim'].map(lambda n: re.sub(r'\s+', ' ', n))
df['data_source_transitive_id'] = 177 # tree of life
df['data_source_transitive_url'] = ''

# Target data source id for each entry of source_names, in the same order.
# Defined once so the two loops below cannot drift apart.
target_source_ids = [4, 178, 9, 8, 11]

# Constant column per source holding that source's target data source id.
for (source_id, source_name) in zip(target_source_ids, source_names):
    target_id = 'data_source_target_id-' + source_name
    df[target_id] = source_id

# Only rows listing more than two sources are exported.
df_source = df.loc[df['sources_count'] > 2]

# NOTE(review): 'name_string_id' is deliberately written twice here, as in the
# original column list -- presumably the target table has two id columns; confirm.
df_source.to_csv('crossmap/name_strings.tsv', sep='\t', encoding='utf8',
                 header=False, index=False, mode='w',
                 columns=['name_string_id', 'name_string_id', 'verbatim_clean', 'canonical_uuid', 'canonical'])

for (source_id, source_name) in zip(target_source_ids, source_names):
    # Single-argument print() emits "<id> <name>" under both Python 2 and 3.
    print('{0} {1}'.format(source_id, source_name))

    target_id = 'data_source_target_id-' + source_name

    # Rows that actually carry an id for this source; both files below are
    # written from the same selection.
    df_with_source = df_source.loc[df_source[source_name].notnull()]

    df_with_source.to_csv('crossmap/name_string_indices-{0}.tsv'.format(source_name), sep='\t', encoding='utf8',
                          header=False, index=False, mode='w',
                          columns=['data_source_transitive_id', 'name_string_id', 'data_source_transitive_url'])

    df_with_source.to_csv('crossmap/cross_map-{0}.tsv'.format(source_name), sep='\t', encoding='utf8',
                          header=False, index=False, mode='w',
                          columns=[target_id, 'name_string_id', source_name])

df_source.shape

4 ncbi
178 silva
9 worms
8 irmng
11 gbif


(1231, 25)

In [41]:
# Concatenate the per-source cross_map files into a single TSV.
# Inside a `!` shell line IPython expands `$name` to the Python variable's value.
files_cross_map = " ".join(['crossmap/cross_map-{0}.tsv'.format(x) for x in source_names])
# Optional checks, left disabled: total line count vs. count after `uniq`
# (which only removes adjacent duplicate lines).
# !cat $files_cross_map | wc -l
# !cat $files_cross_map | uniq | wc -l
!cat $files_cross_map > crossmap/cross_map.tsv

# Same concatenation for the per-source name_string_indices files.
files_name_string_indices = " ".join(['crossmap/name_string_indices-{0}.tsv'.format(x) for x in source_names])
# Optional checks, left disabled (see above).
# !cat $files_name_string_indices | wc -l
# !cat $files_name_string_indices | uniq | wc -l
!cat $files_name_string_indices > crossmap/name_string_indices.tsv