## Machine Learning Record Mining

Project to create a pipeline that uses GeoDeepDive's output to find Unaquired Sites for Neotoma.

Using NLP parsed text and a Data Science approach, identify whether a paper is suitable for Neotoma and detect features such as 'Site Name', 'Location', 'Age Span' and 'Site Descriptions'.

In [1]:
# Loading libraries

import numpy as np
import pandas as pd
import csv
import psycopg2
import re

# Load Postgres Server
from src.config import config

In [42]:
# Options for DF display
#pd.set_option('display.max_colwidth', 300)
#pd.set_option('display.max_rows', 10)

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', -1)

## Loading and viewing the Data

### Loading NLP Sentences

In [3]:
# Connect to PostgreSQL server from terminal:
# pg_ctl -D PSQL_Data -l logfile start

try:
    params = config()
    # Connect to the PostgreSQL database
    conn = psycopg2.connect(**params)
    # Create a new cursor
    cur = conn.cursor()
    nlp_sentences = pd.read_sql_query('''SELECT * FROM sentences;''', conn)
    # Close the cursor and connection to so the server can allocate
    # bandwidth to other requests
    cur.close()
    conn.close()
    print('Data obtained from PostgreSQL')
    
# If no SQL db, load from a file
except:
    header_list = ["_gddid", "sentid", "wordidx", "words", "part_of_speech", "special_class", 
               "lemmas", "word_type", "word_modified"]
    nlp_sentences = pd.read_csv("../Do_not_commit_data/sentences_nlp352", sep='\t', names = header_list)
    nlp_sentences = nlp_sentences.replace('"', '', regex = True)\
                                 .replace('\{', '', regex = True)\
                                 .replace('}', '', regex = True)\
                                 .replace(',', ',', regex = True)
    nlp_sentences['wordidx']= nlp_sentences['wordidx'].str.split(",")
    nlp_sentences['words']= nlp_sentences['words'].str.split(",")
    nlp_sentences['poses']= nlp_sentences['poses'].str.split(",")
    nlp_sentences['ners']= nlp_sentences['ners'].str.split(",")
    nlp_sentences['lemmas']= nlp_sentences['lemmas'].str.split(",")
    nlp_sentences['dep_paths']= nlp_sentences['dep_paths'].str.split(",")
    nlp_sentences['dep_parents']= nlp_sentences['dep_parents'].str.split(",")
    print('Data obtained from text file')

Data obtained from PostgreSQL


In [4]:
nlp_sentences.head(10)

Unnamed: 0,_gddid,sentid,wordidx,words,part_of_speech,special_class,lemmas,word_type,word_modified
0,54b43266e138239d8684efed,1,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, ...","[Available, online, at, www.sciencedirect.com, Quaternary, Research, 69, -LRB-, 2008, -RRB-, 263, --, 275, www.elsevier.com/locate/yqres, Development, of, the, mixed, conifer, forest, in, northern, New, Mexico, and, its, relationship, to, Holocene, environmental, change, R., Scott, Anderson, a, ...","[JJ, NN, IN, NNP, NNP, NNP, CD, -LRB-, CD, -RRB-, CD, :, CD, NNS, NN, IN, DT, JJ, NN, NN, IN, JJ, NNP, NNP, CC, PRP$, NN, TO, NNP, JJ, NN, NNP, NNP, NNP, DT, ,, NN, ,, ,, NNP, NNP, NNP, NN, CD, ,, NNP, NNP, NNP, NN, CD, ,, NNP, NNP, NNP, NN, ,, NNP, NNP, NNP, NN, ,, NNP, NNP, NN, ,, NNP, NNP, NN...","[O, O, O, O, O, O, NUMBER, O, DATE, O, NUMBER, O, NUMBER, O, O, O, O, O, O, O, O, O, LOCATION, LOCATION, O, O, O, O, O, O, O, PERSON, PERSON, PERSON, O, O, O, O, O, PERSON, PERSON, PERSON, O, NUMBER, O, PERSON, PERSON, PERSON, O, NUMBER, O, PERSON, PERSON, PERSON, O, O, PERSON, PERSON, PERSON, O...","[available, online, at, www.sciencedirect.com, Quaternary, Research, 69, -lrb-, 2008, -rrb-, 263, --, 275, www.elsevier.com/locate/yqres, development, of, the, mixed, conifer, forest, in, northern, New, Mexico, and, its, relationship, to, Holocene, environmental, change, R., Scott, Anderson, a, ...","[dep, dep, dep, dep, dep, dep, dep, , dep, , dep, , dep, dep, dep, dep, dep, dep, dep, dep, dep, dep, dep, dep, cc, dep, dep, dep, dep, dep, dep, dep, dep, dep, dep, , dep, , , dep, dep, dep, dep, dep, , dep, dep, dep, dep, dep, , dep, dep, dep, dep, , dep, dep, dep, dep, , dep, dep, dep, , dep,...","[218, 218, 218, 218, 218, 218, 218, 0, 218, 0, 218, 0, 218, 218, 218, 218, 218, 218, 218, 218, 218, 218, 218, 218, 218, 218, 218, 218, 218, 218, 218, 218, 218, 218, 218, 0, 218, 0, 0, 218, 218, 218, 218, 218, 0, 218, 218, 218, 218, 218, 0, 218, 218, 218, 218, 0, 218, 218, 218, 218, 0, 218, 218, ..."
1,54b43266e138239d8684efed,2,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]","[The, Chihuahueños, Bog, record, extends, to, over, 15,000, cal, yr, BP, .]","[DT, NNP, NN, NN, VBZ, TO, IN, CD, JJ, NN, NN, .]","[O, O, O, O, O, O, O, NUMBER, O, DURATION, O, O]","[the, Chihuahueños, bog, record, extend, to, over, 15,000, cal, yr, bp, .]","[det, compound, compound, nsubj, , case, amod, nummod, amod, compound, nmod:to, ]","[4, 4, 4, 5, 0, 11, 11, 11, 11, 11, 5, 0]"
2,54b43266e138239d8684efed,3,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26]","[An, Artemisia, steppe, ,, then, an, open, Picea, woodland, grew, around, a, small, pond, until, ca., 11,700, cal, yr, BP, when, Pinus, ponderosa, became, established, .]","[DT, NNP, NN, ,, RB, DT, JJ, NNP, NN, VBD, IN, DT, JJ, NN, IN, FW, CD, JJ, NN, NN, WRB, NNP, NN, VBD, VBN, .]","[O, O, O, O, O, O, O, LOCATION, LOCATION, O, O, O, O, O, O, O, NUMBER, O, DURATION, ORGANIZATION, O, O, O, O, O, O]","[a, Artemisia, steppe, ,, then, a, open, Picea, woodland, grow, around, a, small, pond, until, ca., 11,700, cal, yr, bp, when, Pinus, ponderosa, become, establish, .]","[det, compound, nsubj, , advmod, det, amod, compound, nsubj, , case, det, amod, nmod:around, case, nmod:until, nummod, amod, compound, dobj, advmod, compound, nsubj, acl:relcl, xcomp, ]","[3, 3, 10, 0, 9, 9, 9, 9, 10, 0, 14, 14, 14, 10, 16, 10, 20, 20, 20, 10, 24, 23, 24, 20, 24, 0]"
3,54b43266e138239d8684efed,4,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]","[C/N, ratios, ,, δ13C, and, δ15N, values, indicate, both, terrestrial, and, aquatic, organic, matter, was, incorporated, into, the, sediment, .]","[JJ, NNS, ,, NN, CC, NN, NNS, VBP, CC, JJ, CC, JJ, JJ, NN, VBD, VBN, IN, DT, NN, .]","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]","[c/n, ratio, ,, δ13c, and, δ15n, value, indicate, both, terrestrial, and, aquatic, organic, matter, be, incorporate, into, the, sediment, .]","[amod, compound, , conj:and, cc, conj:and, nsubj, , cc:preconj, nsubjpass, cc, amod, amod, conj:and, auxpass, ccomp, case, det, nmod:into, ]","[7, 7, 0, 2, 2, 2, 8, 0, 10, 16, 10, 14, 14, 10, 16, 8, 19, 19, 16, 0]"
4,54b43266e138239d8684efed,5,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]","[Higher, percentages, of, aquatic, algae, and, elevated, C/N, ratios, indicate, higher, lake, levels, at, the, opening, of, the, Holocene, ,, but, a, wetland, developed, subsequently, as, climate, warmed, .]","[JJR, NNS, IN, JJ, NN, CC, JJ, NN, NNS, VBP, JJR, NN, NNS, IN, DT, NN, IN, DT, NNP, ,, CC, DT, NN, VBD, RB, IN, NN, VBD, .]","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]","[higher, percentage, of, aquatic, alga, and, elevated, c/n, ratio, indicate, higher, lake, level, at, the, opening, of, the, Holocene, ,, but, a, wetland, develop, subsequently, as, climate, warm, .]","[amod, nsubj, case, amod, nmod:of, cc, amod, compound, conj:and, , amod, compound, dobj, case, det, nmod:at, case, det, nmod:of, , cc, det, nsubj, conj:but, advmod, mark, nsubj, advcl, ]","[2, 10, 5, 5, 2, 5, 9, 9, 5, 0, 13, 13, 10, 16, 16, 10, 19, 19, 16, 0, 10, 23, 24, 10, 24, 28, 28, 24, 0]"
5,54b43266e138239d8684efed,294,"[1, 2, 3, 4, 5, 6]","[Anderson, ,, R.S., ,, 1989, .]","[NNP, ,, NNP, ,, CD, .]","[PERSON, O, PERSON, O, DATE, O]","[Anderson, ,, R.S., ,, 1989, .]","[compound, , , , amod, ]","[3, 0, 0, 0, 3, 0]"
6,54b43266e138239d8684efed,6,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24]","[From, ca., 8500, to, 6400, cal, yr, BP, the, pond, desiccated, in, what, must, have, been, the, driest, period, of, the, Holocene, there, .]","[IN, FW, CD, TO, CD, JJ, NN, NN, DT, NN, VBN, IN, WP, MD, VB, VBN, DT, JJS, NN, IN, DT, NNP, RB, .]","[O, O, NUMBER, O, NUMBER, O, DURATION, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]","[from, ca., 8500, to, 6400, cal, yr, bp, the, pond, desiccate, in, what, must, have, be, the, driest, period, of, the, Holocene, there, .]","[case, nmod:from, compound, dep, nummod, amod, compound, nsubj, det, dep, acl, case, nmod:in, aux, aux, cop, det, amod, , case, det, nmod:of, nmod:tmod, ]","[2, 19, 5, 5, 8, 8, 8, 19, 10, 8, 8, 13, 11, 19, 19, 19, 19, 19, 0, 22, 22, 19, 19, 0]"
7,54b43266e138239d8684efed,7,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]","[C/N, ratios, declined, to, their, lowest, Holocene, levels, ,, indicating, intense, decomposition, in, the, sediment, .]","[JJ, NNS, VBD, TO, PRP$, JJS, NN, NNS, ,, VBG, JJ, NN, IN, DT, NN, .]","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]","[c/n, ratio, decline, to, they, lowest, holocene, level, ,, indicate, intense, decomposition, in, the, sediment, .]","[amod, nsubj, , case, nmod:poss, amod, compound, nmod:to, , xcomp, amod, dobj, case, det, nmod:in, ]","[2, 3, 0, 8, 8, 8, 8, 3, 0, 3, 12, 10, 15, 15, 10, 0]"
8,54b43266e138239d8684efed,8,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23]","[Wetter, conditions, returned, after, 6400, cal, yr, BP, ,, with, conversion, of, the, site, to, a, sedge, bog, as, groundwater, levels, rose, .]","[JJ, NNS, VBD, IN, CD, JJ, NN, NN, ,, IN, NN, IN, DT, NN, TO, DT, NN, NN, IN, NN, NNS, VBD, .]","[O, O, O, O, DATE, O, DURATION, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]","[wetter, condition, return, after, 6400, cal, yr, bp, ,, with, conversion, of, the, site, to, a, sedge, bog, as, groundwater, level, rise, .]","[amod, nsubj, , case, nummod, amod, compound, nmod:after, , case, nmod:with, case, det, nmod:of, case, det, compound, nmod:to, mark, compound, nsubj, advcl, ]","[2, 3, 0, 8, 8, 8, 8, 3, 0, 11, 8, 14, 14, 11, 18, 18, 18, 14, 22, 21, 22, 3, 0]"
9,54b43266e138239d8684efed,9,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]","[Higher, charcoal, influx, rates, after, 6400, cal, yr, BP, probably, result, from, greater, biomass, production, rates, .]","[JJR, NN, NN, NNS, IN, CD, JJ, NN, NN, RB, VBP, IN, JJR, NN, NN, NNS, .]","[O, O, O, O, O, DATE, O, DURATION, O, O, O, O, O, O, O, O, O]","[higher, charcoal, influx, rate, after, 6400, cal, yr, bp, probably, result, from, greater, biomass, production, rate, .]","[amod, compound, compound, nsubj, case, nummod, amod, compound, nmod:after, advmod, , case, amod, compound, compound, nmod:from, ]","[4, 4, 4, 11, 9, 9, 9, 9, 4, 9, 0, 16, 16, 16, 16, 11, 0]"


### Loading Bibliography Data

In [5]:
import json
with open('../Do_not_commit_data/bibjson', 'r') as f:
    bib_dict = json.load(f)

In [6]:
# Normalizing data so that we have access to the 'identifier'

# TODO Load into SQL server and connect through SQL
bibliography = pd.io.json.json_normalize(bib_dict,'identifier',['publisher', 'title', ['journal','name', 'name'], ['author'], 'year', 'number', 'volumne', ['link'], '_gddid', 'type', 'pages'], record_prefix='_', errors='ignore')

bibliography['url'] = bibliography['link'].str.split("'type'", expand = True)[0]

In [7]:
# TODO Flatten 'URL' 

bibliography.head(10)

Unnamed: 0,_type,_id,publisher,title,journal.name.name,author,year,number,volumne,link,_gddid,type,pages,url
0,doi,10.1016/S0277-3791(99)00007-4,Elsevier,"Palaeoclimate, chronology and vegetation history of the Weichselian Lateglacial: comparative analysis of data from three cores at Lago Grande di Monticchio, southern Italy",Quaternary Science Reviews,"[{'name': 'Huntley, B.'}, {'name': 'Watts, W.A.'}, {'name': 'Allen, J.R.M.'}, {'name': 'Zolitschka, B.'}]",1999,7,,"[{'url': 'http://www.sciencedirect.com/science/article/pii/S0277379199000074', 'type': 'publisher'}]",550453fde1382326932d85f7,article,945--960,
1,doi,10.1139/b92-002,Canadian Science Publishing,"Holocene history of forest trees on the Bruce Peninsula, southern Ontario",Canadian Journal of Botany,"[{'name': 'Bennett, K. D.'}]",1992,1,,"[{'url': 'http://www.nrcresearchpress.com/doi/abs/10.1139/b92-002', 'type': 'publisher'}]",578b5aabcf58f1587003efba,article,6--18,
2,doi,10.1016/0031-0182(80)90043-7,Elsevier,Glacial sequence and environmental history in the Sierra Nevada del cocuy (Colombia),"Palaeogeography, Palaeoclimatology, Palaeoecology","[{'name': 'Van Der Hammen, T.'}, {'name': 'Barelds, J.'}, {'name': 'De Jong, H.'}, {'name': 'De Veer, A.A.'}]",1980,,,"[{'url': 'http://www.sciencedirect.com/science/article/pii/0031018280900437', 'type': 'publisher'}]",54b43244e138239d868493cd,article,247--340,
3,doi,10.1130/g35541.1,GSA,"A record of sustained prehistoric and historic land use from the Cahokia region, Illinois, USA",Geology,"[{'name': 'Munoz, S. E.'}, {'name': 'Schroeder, S.'}, {'name': 'Fike, D. A.'}, {'name': 'Williams, J. W.'}]",2014,6,,"[{'url': 'http://dx.doi.org/10.1130/g35541.1', 'type': 'publisher'}]",57c5b941cf58f1338eaddb5b,article,499--502,
4,doi,10.1016/0033-5894(81)90128-9,Elsevier,Plant and insect fossils at Norwood in south-central Minnesota: A record of late-glacial succession,Quaternary Research,"[{'name': 'Ashworth, Allan C.'}, {'name': 'Schwert, Donald P.'}, {'name': 'Watts, William A.'}, {'name': 'Wright, H.E.'}]",1981,1,,"[{'url': 'http://www.sciencedirect.com/science/article/pii/0033589481901289', 'type': 'publisher'}]",54b43269e138239d8684f87f,article,66--79,
5,doi,10.1080/00173130903435846,Taylor and Francis,10. Na Bahně (Czech Republic): Vegetation development over the last 2.5 millennia in the Eastern Bohemian lowland,Grana,"[{'name': 'Pokorný, Petr'}, {'name': 'van der Knaap, Willem O.'}]",2010,1,,"[{'url': 'http://www.tandfonline.com/doi/abs/10.1080/00173130903435846', 'type': 'publisher'}]",58d27c30cf58f121bb4ff5db,article,79--81,
6,doi,10.1139/e98-095,Canadian Science Publishing,Age verification of the Lake Gribben forest bed and the Younger Dryas Advance of the Laurentide Ice Sheet,Canadian Journal of Earth Sciences,"[{'name': 'Lowell, Thomas V'}, {'name': 'Larson, Graham J'}, {'name': 'Hughes, John D'}, {'name': 'Denton, George H'}]",1999,3,,"[{'url': 'http://www.nrcresearchpress.com/doi/abs/10.1139/e98-095', 'type': 'publisher'}]",574629c5cf58f15d3f588144,article,383--393,
7,doi,10.1016/j.yqres.2013.03.001,Elsevier,"Holocene vegetation history and fire regimes of Pseudotsuga menziesii forests in the Gulf Islands National Park Reserve, southwestern British Columbia, Canada",Quaternary Research,"[{'name': 'Lucas, Jennifer D.'}, {'name': 'Lacourse, Terri'}]",2013,3,,"[{'url': 'http://www.sciencedirect.com/science/article/pii/S0033589413000239', 'type': 'publisher'}]",54b43265e138239d8684ee17,article,366--376,
8,doi,10.1016/j.yqres.2009.05.001,Elsevier,"Synchronization of late-glacial vegetation changes at Crystal Lake, Illinois, USA with the North Atlantic Event Stratigraphy",Quaternary Research,"[{'name': 'Gonzales, Leila M.'}, {'name': 'Grimm, Eric C.'}]",2009,2,,"[{'url': 'http://www.sciencedirect.com/science/article/pii/S0033589409000532', 'type': 'publisher'}]",54b43265e138239d8684ef96,article,234--245,
9,doi,10.1002/jqs.979,Wiley,Contrasting pollen histories of MIS 5e and the Holocene from Lake Titicaca (Bolivia/Peru),Journal of Quaternary Science,"[{'name': 'Hanselman, Jennifer A.'}, {'name': 'Gosling, William D.'}, {'name': 'Paduano, Gina M.'}, {'name': 'Bush, Mark B.'}]",2005,7-8,,"[{'url': 'http://doi.wiley.com/10.1002/jqs.979', 'type': 'publisher'}]",56f8f62ccf58f1710ac939e0,article,663--670,


## EDA

Reviewing our data includes skimming through some papers online and seeing if the data is consistent with our NLP Sentences dataframe.

From there, we can also visualize what we would like our model to predict: 'Location', 'Site Name', 'Age Span', and 'Site Description' from a "Human perspective".

In [8]:
def order_article(article_id):
    '''
    Function to find an article by its gddid in the NLP sentences and have it displayed in order
    
    Keyword arguments:
    article_id -- gddid
    
    Returns:
    article ordered by sentence index
    '''
    article = nlp_sentences[nlp_sentences['_gddid'] == article_id]
    return article[['sentid', 'words']].sort_values(by = 'sentid')

In [9]:
# Article 01 
order_article('550453fde1382326932d85f7')

Unnamed: 0,sentid,words
94865,1,"[Quaternary, Science, Reviews, 18, -LRB-, 1999, -RRB-, 945, -RCB-, 960, Palaeoclimate, ,, chronology, and, vegetation, history, of, the, Weichselian, Lateglacial, :, comparative, analysis, of, data, from, three, cores, at, Lago, Grande, di, Monticchio, ,, southern, Italy, B., Huntley, *, ,, W.A...."
94866,2,"[Plant, macrofossil, data, from, one, core, provide, complementary, evidence, of, the, palaeovegetation, .]"
94867,3,"[Quantitative, palaeoclimate, reconstructions, are, made, from, the, pollen, data, using, pollen-climate, response, surfaces, .]"
94868,4,"[The, chronological, framework, for, Lateglacial, events, as, well, as, the, palaeoclimate, are, compared, with, other, sources, of, evidence, of, Lateglacial, palaeoenvironments, .]"
94870,5,"[Surface, conditions, in, the, North, Atlantic, appear, to, be, more, directly, linked, to, climatic, conditions, in, Italy, than, is, the, temperature, in, Greenland, .]"
...,...,...
95325,448,"[Vegetation, history, and, climate, of, the, last, 15,000, years, at, Laghi, di, Monticchio, ,, southern, Italy, .]"
95326,449,"[Quaternary, Science, Reviews, ,, 15, ,, 133, -RCB-, 153, .]"
95327,450,"[Zolitschka, ,, B., ,, &, Negendank, ,, J., F., W., -LRB-, 1996, -RRB-, .]"
95329,451,"[Sedimentology, ,, dating, and, palaeoclimatic, interpretation, of, a, 76.3, ka, record, from, Lago, Grande, di, Monticchio, ,, southern, Italy, .]"


Skimmed info: 
 * `Article Name:` Palaeoclimate_chronology_and_vegetation_history_of_the_Weichselian_Lateglacial
 * `Link`: http://www.sciencedirect.com/science/article/pii/S0277379199000074  
 * `Site Name`:  Laghi di Monticchio
 * `Location`:  40° 56' 40" N, 15° 36' 48" E
 * `Age Span`: 
 * `Site Descriptions`: The record from Lago Grande di Monticchio now adds macrofossil evidence of the presence of relatively boreal tree taxa during the Lateglacial interstadial and provides a chronology based upon annually laminated sediments.

In [10]:
# Article 2
order_article('54b43266e138239d8684efed')

Unnamed: 0,sentid,words
0,1,"[Available, online, at, www.sciencedirect.com, Quaternary, Research, 69, -LRB-, 2008, -RRB-, 263, --, 275, www.elsevier.com/locate/yqres, Development, of, the, mixed, conifer, forest, in, northern, New, Mexico, and, its, relationship, to, Holocene, environmental, change, R., Scott, Anderson, a, ..."
1,2,"[The, Chihuahueños, Bog, record, extends, to, over, 15,000, cal, yr, BP, .]"
2,3,"[An, Artemisia, steppe, ,, then, an, open, Picea, woodland, grew, around, a, small, pond, until, ca., 11,700, cal, yr, BP, when, Pinus, ponderosa, became, established, .]"
3,4,"[C/N, ratios, ,, δ13C, and, δ15N, values, indicate, both, terrestrial, and, aquatic, organic, matter, was, incorporated, into, the, sediment, .]"
4,5,"[Higher, percentages, of, aquatic, algae, and, elevated, C/N, ratios, indicate, higher, lake, levels, at, the, opening, of, the, Holocene, ,, but, a, wetland, developed, subsequently, as, climate, warmed, .]"
...,...,...
605,572,"[Is, the, Valles, caldera, entering, a, new, cycle, of, activity, ?]"
144,573,"[Geology, 23, ,, 411, --, 414, .]"
607,574,"[Wright, Jr., ,, H.E., ,, Bent, ,, A.M., ,, Hansen, ,, B.S., ,, Maher, Jr., ,, L.J., ,, 1973, .]"
608,575,"[Present, and, past, vegetation, of, the, Chuska, Mountains, ,, northwestern, New, Mexico, .]"


Skimmed info:  
* `Article:` Development of the Mixed Connifer Forest in Northern New Mexico and its relationship to Holocene environmental change
* `Link`: http://www.sciencedirect.com/science/article/pii/S0033589407001512
* `Site Name`:  Chihuahuenos Bog 
* `Location`:  ??? No Given Coordinates 
* `Age Span`:   
* `Site Descriptions`:  Chihuahueños Bog is located within the mixed conifer forest and has affinities to the flora of the southern Rocky Mountains(Foxx and Tierney, 1985; Allen, 2004). Although the area immediately surrounding the bog has been heavily logged in recent years, nearly every conifer species growing in montaneforests of the Jemez mountains is represented either along theshore of the bog or in the surrounding uplands.  

In [11]:
# Article 3
order_article('57c5b941cf58f1338eaddb5b')

Unnamed: 0,sentid,words
67708,1,"[A, record, of, sustained, prehistoric, and, historic, land, use, from, the, Cahokia, region, ,, Illinois, ,, USA, Samuel, E., Munoz1, ,, Sissel, Schroeder2, ,, David, A., Fike3, ,, and, John, W., Williams1, 1Department, of, Geography, ,, University, of, Wisconsin, --, Madison, ,, 550, North, Pa..."
67709,2,"[Here, we, report, a, high-resolution, and, multiproxy, paleoecological, record, from, Horseshoe, Lake, ,, an, oxbow, lake, in, the, central, Mississippi, River, valley, that, is, adjacent, to, the, Cahokia, site, -LRB-, Illinois, ,, USA, -RRB-, ,, the, largest, prehistoric, settlement, north, o..."
67710,3,"[Palynological, and, carbon, isotope, data, document, pronounced, vegetation, changes, over, the, past, 1700, yr, driven, primarily, by, land, use, ,, including, 900, yr, -LRB-, 450, --, 1350, CE, -RRB-, of, sustained, prehistoric, human, impacts, .]"
67712,4,"[Rapid, forest, clearance, was, followed, closely, by, the, proliferation, of, indigenous, seed, crops, of, the, Eastern, Agricultural, Complex, beginning, ca., 450, CE, ,, centuries, before, the, emergence, of, Cahokia, at, 1050, CE, .]"
67713,5,"[Agricultural, intensiﬁcation, that, included, the, use, of, maize, -LRB-, Zea, mays, subsp, .]"
...,...,...
67845,123,"[Simon, ,, M.L., ,, and, Parker, ,, K.E., ,, 2006, ,, Prehistoric, plant, use, in, the, American, Bottom, :, New, thoughts, and, interpretations, :, Southeastern, Archaeology, ,, v., 25, ,, p., 212, --, 257, .]"
67846,124,"[Smith, ,, B.D., ,, and, Yarnell, ,, R.A., ,, 2009, ,, Initial, formation, of, an, indigenous, crop, complex, in, eastern, North, America, at, 3800, B.P, :, National, Academy, of, Sciences, Proceedings, ,, v., 106, ,, p., 6561, --, 6566, ,, doi, :10.1073, /, pnas, .0901846106, .]"
67848,125,"[Sugita, ,, S., ,, 1993, ,, A, model, of, pollen, source, area, for, an, entire, lake, surface, :, Quaternary, Research, ,, v., 39, ,, p., 239, --, 244, ,, doi, :10.1006, /, qres, .1993.1027, .]"
67849,126,"[Trubitt, ,, M.B.D., ,, 2000, ,, Mound, building, and, prestige, goods, exchange, :, Changing, strategies, in, the, Cahokia, chiefdom, :, American, Antiquity, ,, v., 65, ,, p., 669, --, 690, ,, doi, :10.2307, /, 2694421, .]"


Skimmed info:  
* `Link`: http://dx.doi.org/10.1130/g35541.1  # No Full access to article
* `Site Name`:  Cahokia region, Illinois, USA
* `Location`:  N38°42'00" - N38°42'00", W90°04'60" - W90°04'60" - Not found on text (yet).
* `Age Span`:   
* `Site Descriptions`: Record from Horseshoe Lake, an oxbow lake in the central Mississippi River valley that is adjacent to the Cahokia site (Illinois, USA), the largest prehistoric settlement north of Mexico.   

In [12]:
# Article 4
order_article('58d29193cf58f14928755ba5')

Unnamed: 0,sentid,words
110659,1,"[Grana, ISSN, :, 0017-3134, -LRB-, Print, -RRB-, 1651-2049, -LRB-, Online, -RRB-, Journal, homepage, :, http://www.tandfonline.com/loi/sgra20, 6, .]"
110660,2,"[Peat-bog, Begbunar, -LRB-, Osogovo, Mountains, ,, southwest, Bulgaria, -RRB-, :, Four, millennia, of, vegetation, history, Maria, Lazarova, ,, Spassimir, Tonkov, ,, Ian, Snowball, &, Elena, Marinova, To, cite, this, article, :, Maria, Lazarova, ,, Spassimir, Tonkov, ,, Ian, Snowball, &, Elena, ..."
110661,3,"[Peat-bog, Begbunar, -LRB-, Osogovo, Mountains, ,, south-west, Bulgaria, -RRB-, :, Four, millennia, of, vegetation, history, ,, Grana, ,, 48:2, ,, 147-149, ,, DOI, :, 10.1080, /, 00173130902965157, To, link, to, this, article, :, http://dx.doi.org/10.1080/00173130902965157, Published, online, :,..."
110662,4,"[Submit, your, article, to, this, journal, Article, views, :, 182, View, related, articles, Citing, articles, :, 5, View, citing, articles, Full, Terms, &, Conditions, of, access, and, use, can, be, found, at, http://www.tandfonline.com/action/journalInformation?journalCode=sgra20, Download, by,..."
110665,5,"[Peat-bog, Begbunar, -LRB-, Osogovo, Mountains, ,, south-west, Bulgaria, -RRB-, :, Four, millennia, of, vegetation, history, MARIAEPDB6.Begbunar, -LRB-, Bulgaria, -RRB-, LAZAROVA1, ,, SPASSIMIR, TONKOV2, ,, IAN, SNOWBALL3, &, ELENA, MARINOVA4, 1Institute, of, Botany, ,, Bulgarian, Academy, of, S..."
...,...,...
110740,80,"[Ann, .]"
110741,81,"[Sofia, Univ., .]"
110742,82,"[Fac, .]"
110743,83,"[Geol, .]"


In [13]:
# Try to see the exact location. Wont show last few words.
pd.DataFrame(order_article('58d29193cf58f14928755ba5').iloc[4])

Unnamed: 0,110665
sentid,5
words,"[Peat-bog, Begbunar, -LRB-, Osogovo, Mountains, ,, south-west, Bulgaria, -RRB-, :, Four, millennia, of, vegetation, history, MARIAEPDB6.Begbunar, -LRB-, Bulgaria, -RRB-, LAZAROVA1, ,, SPASSIMIR, TONKOV2, ,, IAN, SNOWBALL3, &, ELENA, MARINOVA4, 1Institute, of, Botany, ,, Bulgarian, Academy, of, S..."


Skimmed info:  
* `Link:`http://www.tandfonline.com/doi/abs/10.1080/00173130902965157
* `Site Name`:  Begbunar
* `Location`:  42°09′ N, 22° 33′ E; 1750 ; in SQL file shows as: `42,°,09,cents,N,,,22,°,33,cents,E`
* `Age Span`:   
* `Site Descriptions`: Located in the central treeless zone of the Osogovo mountains, which are situated at the border between south-western Bulgaria and the north-eastern former Yugoslav Republic of Macedonia.

In [14]:
# Article 5
order_article('57928e07cf58f133d1c26609')

Unnamed: 0,sentid,words
39606,1,"[Timberline, fluctuations, and, late, Quaternary, paleoclimates, in, the, Southern, Rocky, Mountains, ,, Colorado, Patricia, L., Fall, *, Department, of, Geography, ,, Arizona, State, University, ,, Tempe, ,, Arizona, 85287, ABSTRACT, Pollen, and, plant, macrofossils, from, eight, sedimentary, b..."
39607,2,"[By, tracking, climatically, sensitive, forest, boundaries, ,, the, moisturecontrolled, lower, timberline, and, the, temperature-controlled, upper, timberline, ,, paleoclimatic, estimates, can, be, derived, from, modern, temperature, and, precipitation, lapse, rates, .]"
39608,3,"[Pollen, data, suggest, that, prior, to, 11, 000, yr, B.P., ,, a, subalpine, forest, dominated, by, Picea, -LRB-, spruce, -RRB-, and, Pinus, -LRB-, pine, -RRB-, grew, 300Ð700, m, below, its, modern, limit, .]"
39609,4,"[The, inferred, climate, was, 2Ð5, ¡, C, cooler, and, had, 7Ð16, cm, greater, precipitation, than, today, .]"
39611,5,"[Abies, -LRB-, fir, -RRB-, increased, in, abundance, in, the, subalpine, forest, around, 11, 000, yr, B.P., ,, probably, in, response, to, cooler, conditions, with, increased, winter, snow, .]"
...,...,...
40107,448,"[Weber, ,, W., A., ,, 1987, ,, Colorado, flora, :, Western, slope, :, Boulder, ,, Colorado, Associated, University, Press, ,, 530, p., Wells, ,, P., V., ,, 1983, ,, Late-Quaternary, vegetation, of, the, Great, Plains, :, Nebraska, Academy, of, Sciences, Transactions, ,, v., XI, ,, p., 83Ð89, .]"
40108,449,"[Whitlock, ,, C., ,, 1993, ,, Postglacial, vegetation, and, climate, of, Grand, Teton, and, southern, Yellowstone, National, Parks, :, Ecological, Monographs, ,, v., 63, ,, p., 173Ð198, .]"
40110,450,"[Whitlock, ,, C., ,, and, Bartlein, ,, P., J., ,, 1993, ,, Spatial, variation, of, Holocene, climatic, change, in, the, Yellowstone, region, :, Quaternary, Research, ,, v., 39, ,, p., 231Ð238, .]"
40111,451,"[Wright, ,, H., E., ,, Jr., ,, 1983, ,, Late-Quaternary, environments, of, the, United, States, ,, Volume, 2, :, The, Holocene, :, Minneapolis, ,, University, of, Minnesota, Press, .]"


In [15]:
# Print the exact line where locations are.
pd.DataFrame(order_article('57928e07cf58f133d1c26609').iloc[179])

Unnamed: 0,39801
sentid,180
words,"[LOCATION, AND, AGE, OF, FOSSIL, STUDY, SITES, Core, Location, Lat, Long, -LRB-, °, N, -RRB-, -LRB-, °, W, -RRB-, Elevation, Depth, -LRB-, m, -RRB-, -LRB-, cm, -RRB-, Age, Laboratory, -LRB-, yr, B.P., -RRB-, number, Cottonwood, III, 38, °, 49, ′, 50, ′, ′, 106, °, 24, ′, 45, ′, ′, 3670, 40, --, ..."


In [43]:
other_example = nlp_sentences[nlp_sentences['_gddid'] == '57928e07cf58f133d1c26609']
other_example = other_example[other_example['sentid'] == 180]
other_example[['words', 'liststring', 'dms_regex']]

Unnamed: 0,words,liststring,dms_regex
39801,"[LOCATION, AND, AGE, OF, FOSSIL, STUDY, SITES, Core, Location, Lat, Long, -LRB-, °, N, -RRB-, -LRB-, °, W, -RRB-, Elevation, Depth, -LRB-, m, -RRB-, -LRB-, cm, -RRB-, Age, Laboratory, -LRB-, yr, B.P., -RRB-, number, Cottonwood, III, 38, °, 49, ′, 50, ′, ′, 106, °, 24, ′, 45, ′, ′, 3670, 40, --, 50, 80, --, 90, 120, --, 130, 2460, ±, 130, 5790, ±, 180, 8230, ±, 200, A-4481, A-4482, A-4483, Cottonwood, IV, 38, °, 49, ′, 50, ′, ′, 106, °, 24, ′, 45, ′, ′, 3670, 60, --, 80, 5610, ±, 150, A-4487, 148, --, 168, 8790, ...]","LOCATION,AND,AGE,OF,FOSSIL,STUDY,SITES,Core,Location,Lat,Long,-LRB-,°,N,-RRB-,-LRB-,°,W,-RRB-,Elevation,Depth,-LRB-,m,-RRB-,-LRB-,cm,-RRB-,Age,Laboratory,-LRB-,yr,B.P.,-RRB-,number,Cottonwood,III,38,°,49,′,50,′,′,106,°,24,′,45,′,′,3670,40,--,50,80,--,90,120,--,130,2460,±,130,5790,±,180,8230,±,200,A-4481,A-4482,A-4483,Cottonwood,IV,38,°,49,′,50,′,′,106,°,24,′,45,′,′,3670,60,--,80,5610,±,150,A-4487,148,--,168,8790,±,230,A-4488,Red,Lady,I,38,°,52,′,50,′,′,107,°,2,′,30,′,′,3350,75,--,95,4675,±,155,GX-8350,Red,Lady,II,38,°,52,′,50,′,′,107,°,2,′,30,′,′,3350,10,--,20,45,--,55,74,--,84,100,±,70,2520,±,90,4680,±,120,A-4484,A-4485,A-4486,Red,Well,38,°,53,′,40,′,′,107,°,3,′,15,′,′,3290,80,--,100,2805,±,160,GX-8351,Copley,II,38,°,52,′,28,′,′,107,°,05,′,3250,25,--,35,62,--,72,102,--,112,125,--,135,155,--,165,181,--,191,212,--,222,1570,±,150,5690,±,150,6900,±,220,8210,±,355,10325,±,465,12810,±,750,15370,±,880,A-4479,A-4480,GX-7906,GX-12995,GX-12996,GX-12997,GX-7907,Splains,II,38,°,50,′,107,°,4,′,30,′,′,3160,45,--,55,90,--,100,190,--,200,266,--,286,291,--,292,3880,±,170,5040,±,140,8750,±,220,11040,±,240,12020,±,530,A-4408,A-4407,A-4409,A-2535,AA-642,Splains,Gulch,I,38,°,50,′,107,°,4,′,30,′,′,3150,34,--,48,80,--,98,134,--,148,190,--,200,4440,±,120,5620,±,140,7340,±,240,8560,±,600,A-4410,A-2534,A-4411,A-2424,Ironbog,38,°,52,′,107,°,2,′,30,′,′,2920,48,--,50,58,--,66,134,--,136,170,195,--,200,218,--,220,2640,±,90,3105,±,155,6300,±,80,7700,±,70,7100,±,255,8260,±,80,AA-1125,GX-7905,AA-1126,AA-1127,GX-7339,AA-1128,Alkali,Basin,I,38,°,45,′,106,°,50,′,2750,137,3480,±,135,GX-6796,300,7857,±,260,GX-6795,410,12975,±,430,GX-6794,Alkali,Basin,II,38,°,45,′,106,°,50,′,2750,533,--,549,17440,±,1200,A-2421,conifers,,,Picea,and,Abies,,,drop,to,levels,found,above,treeline,in,the,krummholz,zone,today,,,and,Pinus,pollen,percentages,and,accumulation,increase,-LRB-,see,Figs.,6A,and,7A,-RRB-,.","[38,°,49,′,50,′,′,106,°,24,′,45,′,′,, 38,°,49,′,50,′,′,106,°,24,′,45,′,′,, 38,°,52,′,50,′,′,107,°,2,′,30,′,′,, 38,°,52,′,50,′,′,107,°,2,′,30,′,′,, 38,°,53,′,40,′,′,107,°,3,′,15,′,′,, 38,°,52,′,28,′,′,107,°,05,′,, 38,°,50,′,107,°,4,′,30,′,′,, 38,°,50,′,107,°,4,′,30,′,′,, 38,°,52,′,107,°,2,′,30,′,′,, 38,°,45,′,106,°,50,′,, 38,°,45,′,106,°,50,′,]"


Skimmed info:  
* `Link`: http://dx.doi.org/10.1130/0016-7606(1997)109<1306:tfalqp>2.3.co
* `Site Name`:  Cottonwood III, Cottonwood IV, Red Lady, Red Well, Splains
* `Location`:  38°49'50' 106°24'45', 38°49'50' 106°24'45', 38°52'50' 107°2'30', 38°52'50' 107°2'30',38°53'40' 107°3'15', 38°52'28' 107°05', 38°50' 107°4'30', 38°50' 107°4'30', 38°52' 107°2'30', 38°45' 106°50', 38°45' 106°50'
* `Age Span`:   17000 - 400
* `Site Descriptions`:   The lower timberline or lower forest border is the lower elevational limit of a continuous forest belt. In the Rocky Mountains it forms a sharp contrast with shrublands or grasslands in valley bottoms. Trees may extend below the lower timberline in favorable habitats, along streams or on rocky outcrops.  

In [17]:
# Find the article in the bibliography by gddid to retrieve Article Name

bibliography[bibliography['_gddid'] == '57928e07cf58f133d1c26609']

Unnamed: 0,_type,_id,publisher,title,journal.name.name,author,year,number,volumne,link,_gddid,type,pages,url
93,doi,10.1130/0016-7606(1997)109<1306:tfalqp>2.3.co;2,GSA,"Timberline fluctuations and late Quaternary paleoclimates in the Southern Rocky Mountains, Colorado",Geological Society of America Bulletin,"[{'name': 'Fall, Patricia L.'}]",1997,10,,"[{'url': 'http://dx.doi.org/10.1130/0016-7606(1997)109<1306:tfalqp>2.3.co;2', 'type': 'publisher'}]",57928e07cf58f133d1c26609,article,1306--1320,


### Finding Latitude and Longitud via REGEX 

After reviewing some PDF's and viewing the data parsing, I noticed some possible patterns for coordinates.

<img src="figures/img/00_snippet_of_pdf.png" alt="Drawing" style="width: 300px;"/>

Which translates to our pandas dataframe as:

![alt text](figures/img/01_snippet_in_sql.png "PandasVersion")

As a first problem, we can notice that the degrees symbol `°` was translated as a `3`. On some other occassions, it is confused as an `o` or a middle point `◦`. For `'`, this symbol got confused by the word `cents`.

I am still going to use normal REGEX to find coordinates in the most intuitive way. 

A coordinate for longitude is a number ranging from -180° to 180°. It has three components, `°, ',"` and it covers the position in E or W.

Latitude goes from -90° to 90°, has the same three components and goes from S to N.

Using REGEX, and assuming data consistency, the following code should be able to extract coordinates. 

In [38]:
dms_regex = r"([-]?\d{1,3}[°|,|′|\']{0,3}\d{1,2}[,|′|\']{0,3}\d{0,2}[,|′|\']{0,4}[NESWnesw]?[\s|,|\']+?[-]?\d{1,3}[°|,|′|\']+\d{1,2}[,|′|\']+\d{0,2}[,|′|\'][,|′|\']{0,4}[NESWnesw]?)"

dd_regex = "[\\{,][-]?[1]?[0-9]{1,2}\\.[0-9]{1,}[,]?[NESWnesw],"

In [39]:
# TODO Find REGEX of couples of numbers (123, 234)
nlp_sentences['liststring'] = nlp_sentences['words'].apply(lambda x: ','.join(map(str, x)))
nlp_sentences['dms_regex'] = nlp_sentences['liststring'].str.findall(dms_regex)


#nlp_sentences[['dms_regex', 'words']]

nlp_sentences[['liststring', 'words', 'dms_regex']]

Unnamed: 0,liststring,words,dms_regex
0,"Available,online,at,www.sciencedirect.com,Quaternary,Research,69,-LRB-,2008,-RRB-,263,--,275,www.elsevier.com/locate/yqres,Development,of,the,mixed,conifer,forest,in,northern,New,Mexico,and,its,relationship,to,Holocene,environmental,change,R.,Scott,Anderson,a,,,b,,,,,Renata,B.,Jass,b,,1,,,Jaime,...","[Available, online, at, www.sciencedirect.com, Quaternary, Research, 69, -LRB-, 2008, -RRB-, 263, --, 275, www.elsevier.com/locate/yqres, Development, of, the, mixed, conifer, forest, in, northern, New, Mexico, and, its, relationship, to, Holocene, environmental, change, R., Scott, Anderson, a, ...",[]
1,"The,Chihuahueños,Bog,record,extends,to,over,15,000,cal,yr,BP,.","[The, Chihuahueños, Bog, record, extends, to, over, 15,000, cal, yr, BP, .]",[]
2,"An,Artemisia,steppe,,,then,an,open,Picea,woodland,grew,around,a,small,pond,until,ca.,11,700,cal,yr,BP,when,Pinus,ponderosa,became,established,.","[An, Artemisia, steppe, ,, then, an, open, Picea, woodland, grew, around, a, small, pond, until, ca., 11,700, cal, yr, BP, when, Pinus, ponderosa, became, established, .]",[]
3,"C/N,ratios,,,δ13C,and,δ15N,values,indicate,both,terrestrial,and,aquatic,organic,matter,was,incorporated,into,the,sediment,.","[C/N, ratios, ,, δ13C, and, δ15N, values, indicate, both, terrestrial, and, aquatic, organic, matter, was, incorporated, into, the, sediment, .]",[]
4,"Higher,percentages,of,aquatic,algae,and,elevated,C/N,ratios,indicate,higher,lake,levels,at,the,opening,of,the,Holocene,,,but,a,wetland,developed,subsequently,as,climate,warmed,.","[Higher, percentages, of, aquatic, algae, and, elevated, C/N, ratios, indicate, higher, lake, levels, at, the, opening, of, the, Holocene, ,, but, a, wetland, developed, subsequently, as, climate, warmed, .]",[]
...,...,...,...
111960,"Stomatal,numbers,are,sensitive,to,increases,in,CO2,from,pre-industrial,levels,.","[Stomatal, numbers, are, sensitive, to, increases, in, CO2, from, pre-industrial, levels, .]",[]
111961,"Nature,327,:,617,--,618,.","[Nature, 327, :, 617, --, 618, .]",[]
111962,"Copyright,ß,2009,John,Wiley,&,Sons,,,Ltd.,.","[Copyright, ß, 2009, John, Wiley, &, Sons, ,, Ltd., .]",[]
111963,"J.,Quaternary,Sci.,,,Vol,.","[J., Quaternary, Sci., ,, Vol, .]",[]


In [40]:
italy_example = nlp_sentences[nlp_sentences['_gddid'] == '550453fde1382326932d85f7']
italy_example = italy_example[italy_example['sentid'] == 10]
italy_example[['words', 'liststring', 'dms_regex']]

Unnamed: 0,words,liststring,dms_regex
94875,"[Introduction, Laghi, di, Monticchio, are, two, maar, lakes, located, in, the, explosion, crater, that, lies, west, of, the, peak, of, Monte, Vulture, ,, near, Mel, '', in, the, Basilicata, region, of, southern, Italy, -LRB-, 403, 56, 40, N, ,, 153, 36, 48, E, -RRB-, -LRB-, Fig., 1, -RRB-, .]","Introduction,Laghi,di,Monticchio,are,two,maar,lakes,located,in,the,explosion,crater,that,lies,west,of,the,peak,of,Monte,Vulture,,,near,Mel,'',in,the,Basilicata,region,of,southern,Italy,-LRB-,403,56,40,N,,,153,36,48,E,-RRB-,-LRB-,Fig.,1,-RRB-,.","[403,56,40,N,,,153,36,48,E]"


In [275]:
italy_example['words'].astype(str).str.len()

94875    391
Name: words, dtype: int64

### Problems with the REGEX approach

As seen before, extracting information using REGEX is very complicated. We would require a lot of queries that would retrieve each particular case.

A possible alternative could be using NER, Named Entity Recognition.

## Linking Neotoma DB to extract locations

In [276]:
neotoma = pd.read_csv("../Do_not_commit_data/data-1590729612420.csv")
neotoma = neotoma[['siteid', 'sitename', 'longitudeeast', 'latitudenorth', 'longitudewest', 'latitudesouth', 'sitedescription', 'doi']]
#neotoma.drop_duplicates(subset ="sitename", keep = False, inplace = True) 
neotoma.head(3)

Unnamed: 0,siteid,sitename,longitudeeast,latitudenorth,longitudewest,latitudesouth,sitedescription,doi
0,10330,Lac du Sommet,-70.66468,47.71662,-70.66573,47.71382,"The small shallow Lac du Sommet (0.02 km2, 4 m maximum depth, elevation of 830 m a.s.l., 47°43′N, 70°40′W) is located in the boreal forest north of the St Lawrence Estuary (Figure 1). It is of glacial origin and situated on granitic-gneissic bedrock of the Canadian Precambrian Shield in the Laurentian Mountains.",10.1177/0959683611400199
1,10330,Lac du Sommet,-70.66468,47.71662,-70.66573,47.71382,"The small shallow Lac du Sommet (0.02 km2, 4 m maximum depth, elevation of 830 m a.s.l., 47°43′N, 70°40′W) is located in the boreal forest north of the St Lawrence Estuary (Figure 1). It is of glacial origin and situated on granitic-gneissic bedrock of the Canadian Precambrian Shield in the Laurentian Mountains.",10.1177/0959683611400199
2,1729,Myrtle Lake,-93.37853,47.98645,-93.39207,47.97876,"Lake surrounded by peatland. Physiography: Red Lake lowlands. Surrounding vegetation: Sphagnum, Piceto-Chamaedophnetum.",10.1139/b68-190


In [285]:
nlp_sentences.join(bibliography, on = '_gddid')

ValueError: You are trying to merge on object and int64 columns. If you wish to proceed you should use pd.concat

In [286]:
dataTypeDict = dict(nlp_sentences.dtypes)

In [287]:
dataTypeDict

{'_gddid': dtype('O'),
 'sentid': dtype('int64'),
 'wordidx': dtype('O'),
 'words': dtype('O'),
 'part_of_speech': dtype('O'),
 'special_class': dtype('O'),
 'lemmas': dtype('O'),
 'word_type': dtype('O'),
 'word_modified': dtype('O')}

In [288]:
dataTypeDict2 = dict(bibliography.dtypes)
dataTypeDict2

{'_type': dtype('O'),
 '_id': dtype('O'),
 'publisher': dtype('O'),
 'title': dtype('O'),
 'journal.name.name': dtype('O'),
 'author': dtype('O'),
 'year': dtype('O'),
 'number': dtype('O'),
 'volumne': dtype('O'),
 'link': dtype('O'),
 '_gddid': dtype('O'),
 'type': dtype('O'),
 'pages': dtype('O'),
 'url': dtype('float64')}