In [1]:
from datetime import datetime

import pandas as pd
import numpy as np

from src.logs import get_logger
import src.article_relevance as ar

  from .autonotebook import tqdm as notebook_tqdm


# Notebook to Prepare 3 Training PQ Files

## Fixed Neotoma & Pollen files from CrossRef

In [2]:
# Load the two raw input tables. Both are read for their `doi` column by the
# CrossRef query cell that follows.
neotoma_fixed = pd.read_csv('data/raw/neotoma_crossref_fixed.csv')
pollenDF = pd.read_csv('data/raw/pollen_doc_labels.csv')

Extracting DOIs and extracting data from CrossRef

In [3]:
# Query CrossRef once per distinct DOI in each source table.
neotomaDois = neotoma_fixed['doi'].unique().tolist()
neotomaCrossRef = ar.crossRefQuery(neotomaDois)

pollenDois = pollenDF['doi'].unique().tolist()
pollenCrossRef = ar.crossRefQuery(pollenDois)



Concatenate the two dataframes.

In [4]:
# Stack the Neotoma and pollen CrossRef results, then renumber the rows so the
# combined frame has a fresh 0..n-1 index.
df = pd.concat([neotomaCrossRef, pollenCrossRef]).reset_index(drop=True)

In [5]:
# Preview the first two rows of the combined CrossRef metadata.
df.head(2)

Unnamed: 0,DOI,title,subtitle,author,subject,abstract,container-title,language,published,publisher,URL,CrossRefQueryDate
0,10.1139/e98-095,[Age verification of the Lake Gribben forest b...,[],"[{'given': 'Thomas V', 'family': 'Lowell', 'se...",[General Earth and Planetary Sciences],<jats:p> Analysis of nine wood samples from th...,[Canadian Journal of Earth Sciences],en,"{'date-parts': [[1999, 3, 25]]}",Canadian Science Publishing,http://dx.doi.org/10.1139/e98-095,2023-09-11 13:14:41.709430
1,10.1111/j.1502-3885.2000.tb01209.x,"[Postglacial climate and vegetation history, n...",[],"[{'given': 'JEFFREY A.', 'family': 'SNYDER', '...","[Geology, Archeology, Ecology, Evolution, Beha...",,[Boreas],en,"{'date-parts': [[2008, 6, 28]]}",Wiley,http://dx.doi.org/10.1111/j.1502-3885.2000.tb0...,2023-09-11 13:14:42.224392


In [6]:
#df.to_csv('data/raw/data_full.csv',  index=False)

In [7]:
#df = pd.read_csv('data/raw/data_full.csv')
#df.head(2)

In [8]:
# Run the project preprocessing step on the combined CrossRef metadata.
# Per the log recorded under this cell, it imputes missing article languages
# and excludes non-English articles from the prediction pipeline.
preprocessedDF = ar.dataPreprocessing(df)

2023-09-11 13:22:54,365 - dataPreprocessing.py:22 - dataPreprocessing - INFO - Prediction data preprocessing begin.


dup subjects
Index([], dtype='object')
dup journals
Index([], dtype='object')
2023-09-11 13:22:54,798 - dataPreprocessing.py:66 - dataPreprocessing - INFO - Running article language imputation.
2023-09-11 13:22:54,806 - dataPreprocessing.py:82 - dataPreprocessing - INFO - 151 articles require language imputation
2023-09-11 13:22:54,807 - dataPreprocessing.py:84 - dataPreprocessing - INFO - 5 cannot be imputed due to too short text metadata(title, subtitle and abstract less than 5 character).
2023-09-11 13:22:56,108 - dataPreprocessing.py:92 - dataPreprocessing - INFO - Missing language imputation completed
2023-09-11 13:22:56,110 - dataPreprocessing.py:93 - dataPreprocessing - INFO - After imputation, there are 14 non-English articles in total excluded from the prediction pipeline.
dup cols
Index(['Ecology', 'Geology', 'Global and Planetary Change', 'Plant Science'], dtype='object')


In [11]:
# Preview the first two rows of the preprocessed frame.
preprocessedDF.head(2)

Unnamed: 0,DOI,title,subtitle,author,abstract,language,published,publisher,URL,CrossRefQueryDate,...,World Archaeology,ZooKeys,Zootaxa,eBioMedicine,eLife,iScience,Écologie et paléoécologie végétale,Écoscience,validForPrediction,titleSubtitleAbstract
0,10.1016/s0034-6667(02)00249-x,Late-Glacial and Holocene forest dynamics at S...,,"[{'given': 'Leif', 'family': 'Björkman', 'sequ...",,en,"{'date-parts': [[2003, 4]]}",Elsevier BV,http://dx.doi.org/10.1016/s0034-6667(02)00249-x,2023-09-11 12:56:35.106164,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,late-glacial and holocene forest dynamics at s...
1,10.1177/0959683615580201,A multi-proxy peat study of Holocene vegetatio...,,"[{'given': 'Terri', 'family': 'Lacourse', 'seq...",We present a multi-proxy paleoenvironmental s...,en,"{'date-parts': [[2015, 4, 16]]}",SAGE Publications,http://dx.doi.org/10.1177/0959683615580201,2023-09-11 12:56:35.466092,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,a multi-proxy peat study of holocene vegetatio...


Save this as 1 parquet file.

In [9]:
# Sanity check before saving: list any column names that occur more than once.
# An empty Index means every column name is unique.
preprocessedDF.columns[preprocessedDF.columns.duplicated()]


Index([], dtype='object')

In [10]:
# Persist the preprocessed metadata; index=False keeps the row index out of the file.
preprocessedDF.to_parquet('data/parquet/neotomaMetadata.parquet', index=False)

## Annotation Data File

We need a 2nd file that contains the following info:
```python
['DOI', 'annotation', 'annotator', 'annotationDate']
```

In the `pollen.csv` file Dr. Goring has already done some manual annotation of whether a particular article would belong to Neotoma or not. All articles in the `neotoma.csv` file belong to Neotoma.

In a different setting, these annotations would be made using the **Data Review Tool** and stored to complement the parquet file. For now, I am going to build them in this notebook and store them in a second parquet file.

In [None]:
# Column layout shared by the Neotoma and pollen annotation tables built in the
# following cells, and the schema of the saved annotation Parquet file.
annotation_cols = ['DOI', 'annotation', 'annotator', 'annotationDate']

In [None]:
# Label every article in the Neotoma table as belonging to Neotoma.
# The annotation timestamp is formatted once and broadcast to all rows, rather
# than building a datetime column and re-formatting it row by row.
neotoma_fixed['DOI'] = neotoma_fixed['doi']
neotoma_fixed['annotation'] = 'Neotoma'
neotoma_fixed['annotator'] = 'Simon J. Goring'
neotoma_fixed['annotationDate'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
# Keep only the shared annotation columns.
neotomaAnnotation = neotoma_fixed[annotation_cols]

In [None]:
# Copy the pollen label columns onto the shared annotation column names.
# NOTE(review): `Timestamp` is copied as-is with no reformatting — confirm it
# matches the '%Y-%m-%d %H:%M:%S' strings used for the Neotoma annotations.
pollenColumnMap = {
    'DOI': 'doi',
    'annotation': 'Label',
    'annotator': 'Profile',
    'annotationDate': 'Timestamp',
}
for targetCol, sourceCol in pollenColumnMap.items():
    pollenDF[targetCol] = pollenDF[sourceCol]
pollenAnnotation = pollenDF[annotation_cols]

In [None]:
# Stack the Neotoma and pollen annotations into one table.
fullAnnotation = pd.concat([neotomaAnnotation, pollenAnnotation])
# drop=True discards the old per-source row labels; without it they would be
# kept as a stray 'index' column and written into the annotation Parquet file.
fullAnnotation = fullAnnotation.reset_index(drop=True)

In [None]:
# Persist the combined annotations; index=False keeps the row index out of the file.
fullAnnotation.to_parquet('data/parquet/neotomaAnnotation.parquet', index=False)

## Prediction Parquet

The Prediction PQ file will contain the following columns:
```python
['DOI', 'prediction', 'predict_proba', 'model_metadata', 'prediction_date']
```

Embed the data using the provided functions (the embedding model has been previously trained).

In [None]:
# Compute sentence embeddings from the 'titleSubtitleAbstract' text column.
# Slow step: the recorded log shows roughly nine minutes between start and completion.
embeddedData = ar.addEmbeddings(preprocessedDF, 'titleSubtitleAbstract')

2023-09-01 11:55:24,625 - addEmbeddings.py:19 - addEmbeddings - INFO - Sentence embedding start.


No sentence-transformers model found with name /Users/sedv8808/.cache/torch/sentence_transformers/allenai_specter2. Creating a new one with MEAN pooling.


2023-09-01 12:04:33,627 - addEmbeddings.py:37 - addEmbeddings - INFO - Sentence embedding completed.


Do the predictions using the AWS model

In [None]:
# Add a constant-zero 'is-referenced-by-count' column before predicting.
# NOTE(review): presumably the model expects this feature and the CrossRef
# metadata used here does not provide it — confirm against ar.relevancePredict.
embeddedData.loc[:, 'is-referenced-by-count'] = 0
# NOTE(review): AWS = True appears to select the AWS-hosted model — confirm.
predictionsDF = ar.relevancePredict(embeddedData, AWS = True)

2023-09-01 12:04:33,717 - relevancePredict.py:27 - relevancePredict - INFO - Prediction start.
2023-09-01 12:04:34,813 - relevancePredict.py:52 - relevancePredict - INFO - Running prediction for 1223 articles.
2023-09-01 12:04:34,821 - relevancePredict.py:59 - relevancePredict - INFO - 0 articles's input feature contains NaN value.
2023-09-01 12:04:34,894 - relevancePredict.py:90 - relevancePredict - INFO - Prediction completed.


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  validDF['has_abstract'] = 1
A value is trying to be set on a copy of a slice from a DataFra

In [None]:
# Display the embedded frame; in the recorded output the trailing rows have NaN embeddings.
embeddedData

Unnamed: 0,DOI,title,subtitle,author,subject,abstract,container-title,language,published,publisher,...,759,760,761,762,763,764,765,766,767,is-referenced-by-count
0,10.1073/pnas.1222239110,Holocene dynamics of the Florida Everglades wi...,,"[{'given': 'Paul H.', 'family': 'Glaser', 'seq...",Multidisciplinary,Significance Wind-blown dust is seldom consi...,Proceedings of the National Academy of Sciences,en,"{'date-parts': [[2013, 10, 7]]}",Proceedings of the National Academy of Sciences,...,0.107798,0.280265,-0.515223,-0.225157,-1.065129,-0.047976,0.022255,-0.341612,-0.460769,0
1,10.1139/e98-095,Age verification of the Lake Gribben forest be...,,"[{'given': 'Thomas V', 'family': 'Lowell', 'se...",General Earth and Planetary Sciences,Analysis of nine wood samples from the Lake ...,Canadian Journal of Earth Sciences,en,"{'date-parts': [[1999, 3, 25]]}",Canadian Science Publishing,...,-0.165364,-0.144835,-1.041570,-0.354374,-0.547732,-0.443002,0.493709,-0.050640,-0.369954,0
2,10.1046/j.1365-2745.2002.00697.x,Pinus sylvestris treeline development and move...,,"[{'given': 'Bruce R.', 'family': 'Gervais', 's...","Plant Science Ecology Ecology, Evolution, Beha...",,Journal of Ecology,en,"{'date-parts': [[2002, 8]]}",Wiley,...,0.422835,0.234868,-0.711326,-0.488857,-0.911259,-0.679060,0.344870,-0.082448,-0.886954,0
3,10.1016/j.quascirev.2015.03.014,The origin of grasslands in the temperate fore...,,[{'ORCID': 'http://orcid.org/0000-0001-9605-82...,"Geology Archeology Archeology Ecology, Evoluti...",,Quaternary Science Reviews,en,"{'date-parts': [[2015, 5]]}",Elsevier BV,...,0.434599,0.045860,-1.367095,-0.615886,-0.737985,-0.387266,-0.189250,-0.259587,-0.474775,0
4,10.1177/0959683608100575,Holocene climate variability and vegetation dy...,,"[{'given': 'Holger', 'family': 'Niemann', 'seq...",Paleontology Earth-Surface Processes Ecology A...,"Palaeoenvironmental changes, inferred from a ...",The Holocene,en,"{'date-parts': [[2009, 3]]}",SAGE Publications,...,-0.367447,0.426210,-1.023674,-0.030793,-1.377967,-0.299277,0.092376,0.181580,-0.699453,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
709,10.1139/b86-262,History of late- and post-glacial vegetation a...,,"[{'given': 'R. Scott', 'family': 'Anderson', '...",Plant Science,The changing character of vegetation and the...,Canadian Journal of Botany,fr,"{'date-parts': [[1986, 9, 1]]}",Canadian Science Publishing,...,,,,,,,,,,0
721,10.1023/a:1021675225099,,,"[{'given': 'Robert K.', 'family': 'Booth', 'se...",Earth-Surface Processes Aquatic Science,,Journal of Paleolimnology,,{'date-parts': [[2002]]},Springer Science and Business Media LLC,...,,,,,,,,,,0
799,10.29262/ram.v69i1.962,Peculiaridades de la alergia a los alimentos v...,,[{'ORCID': 'http://orcid.org/0000-0002-0398-95...,Immunology and Allergy,La alergia a frutas y verduras es la alergia ...,Revista Alergia México,es,"{'date-parts': [[2023, 1, 31]]}",Colegio Mexicano de Inmunologia Clinica y Aler...,...,,,,,,,,,,0
1086,10.1007/s15006-023-2533-8,Fast kein Tag mehr ohne Pollen,,"[{'given': 'Nicola', 'family': 'Zink', 'sequen...",General Medicine,,MMW - Fortschritte der Medizin,de,"{'date-parts': [[2023, 4]]}",Springer Science and Business Media LLC,...,,,,,,,,,,0


In [None]:
# Preview the first rows of the prediction output.
predictionsDF.head()

Unnamed: 0,index,title,subtitle,abstract,DOI,URL,validForPrediction,predict_proba,prediction,author,...,760,761,762,763,764,765,766,767,is-referenced-by-count,model_metadata
0,0,Holocene dynamics of the Florida Everglades wi...,,Significance Wind-blown dust is seldom consi...,10.1073/pnas.1222239110,http://dx.doi.org/10.1073/pnas.1222239110,1,0.964179,1.0,"[{'given': 'Paul H.', 'family': 'Glaser', 'seq...",...,,,,,,,,,,
1,1,Age verification of the Lake Gribben forest be...,,Analysis of nine wood samples from the Lake ...,10.1139/e98-095,http://dx.doi.org/10.1139/e98-095,1,0.993378,1.0,"[{'given': 'Thomas V', 'family': 'Lowell', 'se...",...,,,,,,,,,,
2,2,Pinus sylvestris treeline development and move...,,,10.1046/j.1365-2745.2002.00697.x,http://dx.doi.org/10.1046/j.1365-2745.2002.006...,1,0.858934,1.0,"[{'given': 'Bruce R.', 'family': 'Gervais', 's...",...,,,,,,,,,,
3,3,The origin of grasslands in the temperate fore...,,,10.1016/j.quascirev.2015.03.014,http://dx.doi.org/10.1016/j.quascirev.2015.03.014,1,0.994306,1.0,[{'ORCID': 'http://orcid.org/0000-0001-9605-82...,...,,,,,,,,,,
4,4,Holocene climate variability and vegetation dy...,,"Palaeoenvironmental changes, inferred from a ...",10.1177/0959683608100575,http://dx.doi.org/10.1177/0959683608100575,1,0.9986,1.0,"[{'given': 'Holger', 'family': 'Niemann', 'seq...",...,,,,,,,,,,


In [None]:
# Stamp every prediction with the time this cell ran. The timestamp is
# formatted once and broadcast to all rows, rather than building a datetime
# column and re-formatting it row by row.
predictionsDF['prediction_date'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
# Force DOIs to plain Python strings before they are written out.
predictionsDF['DOI'] = predictionsDF['DOI'].apply(str)

Save the columns we are interested in.

In [None]:
# Columns that make up the prediction Parquet file, in output order.
predictionCols = [
    'DOI',
    'prediction',
    'predict_proba',
    'model_metadata',
    'prediction_date',
]

In [None]:
# Display the full prediction frame before it is trimmed to the output columns.
predictionsDF

Unnamed: 0,index,title,subtitle,abstract,DOI,URL,validForPrediction,predict_proba,prediction,author,...,760,761,762,763,764,765,766,767,is-referenced-by-count,model_metadata
0,0,Holocene dynamics of the Florida Everglades wi...,,Significance Wind-blown dust is seldom consi...,10.1073/pnas.1222239110,http://dx.doi.org/10.1073/pnas.1222239110,1,0.964179,1.0,"[{'given': 'Paul H.', 'family': 'Glaser', 'seq...",...,,,,,,,,,,
1,1,Age verification of the Lake Gribben forest be...,,Analysis of nine wood samples from the Lake ...,10.1139/e98-095,http://dx.doi.org/10.1139/e98-095,1,0.993378,1.0,"[{'given': 'Thomas V', 'family': 'Lowell', 'se...",...,,,,,,,,,,
2,2,Pinus sylvestris treeline development and move...,,,10.1046/j.1365-2745.2002.00697.x,http://dx.doi.org/10.1046/j.1365-2745.2002.006...,1,0.858934,1.0,"[{'given': 'Bruce R.', 'family': 'Gervais', 's...",...,,,,,,,,,,
3,3,The origin of grasslands in the temperate fore...,,,10.1016/j.quascirev.2015.03.014,http://dx.doi.org/10.1016/j.quascirev.2015.03.014,1,0.994306,1.0,[{'ORCID': 'http://orcid.org/0000-0001-9605-82...,...,,,,,,,,,,
4,4,Holocene climate variability and vegetation dy...,,"Palaeoenvironmental changes, inferred from a ...",10.1177/0959683608100575,http://dx.doi.org/10.1177/0959683608100575,1,0.998600,1.0,"[{'given': 'Holger', 'family': 'Niemann', 'seq...",...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1232,709,History of late- and post-glacial vegetation a...,,The changing character of vegetation and the...,10.1139/b86-262,http://dx.doi.org/10.1139/b86-262,0,,,"[{'given': 'R. Scott', 'family': 'Anderson', '...",...,,,,,,,,,0.0,
1233,721,,,,10.1023/a:1021675225099,http://dx.doi.org/10.1023/a:1021675225099,0,,,"[{'given': 'Robert K.', 'family': 'Booth', 'se...",...,,,,,,,,,0.0,
1234,799,Peculiaridades de la alergia a los alimentos v...,,La alergia a frutas y verduras es la alergia ...,10.29262/ram.v69i1.962,http://dx.doi.org/10.29262/ram.v69i1.962,0,,,[{'ORCID': 'http://orcid.org/0000-0002-0398-95...,...,,,,,,,,,0.0,
1235,1086,Fast kein Tag mehr ohne Pollen,,,10.1007/s15006-023-2533-8,http://dx.doi.org/10.1007/s15006-023-2533-8,0,,,"[{'given': 'Nicola', 'family': 'Zink', 'sequen...",...,,,,,,,,,0.0,


In [None]:
# Keep only the columns listed in predictionCols.
predictionsDF = predictionsDF[predictionCols]

In [None]:
# Count predictions per class. value_counts() skips NaN by default, so rows
# without a prediction are not included in these totals.
predictionsDF['prediction'].value_counts()

prediction
1.0    771
0.0    452
Name: count, dtype: int64

In [None]:
# Keep only rows with a definite 0/1 prediction; rows whose prediction is
# missing (NaN) are dropped.
hasBinaryPrediction = predictionsDF['prediction'].isin([0, 1])
predictionsDF = predictionsDF[hasBinaryPrediction]

In [None]:
# Persist the filtered predictions; index=False keeps the row index out of the file.
predictionsDF.to_parquet('data/parquet/neotomaPredictions.parquet', index=False)