In [1]:
import pandas as pd
import numpy as np
from src.logs import get_logger
import src.article_relevance as ar

  from .autonotebook import tqdm as notebook_tqdm


# Notebook to Prepare 3 Training PQ Files

## Fixed Neotoma & Pollen files from CrossRef

In [2]:
neotoma_fixed = pd.read_csv('data/raw/neotoma_crossref_fixed.csv')
pollenDF = pd.read_csv('data/raw/pollen_doc_labels.csv')

In [3]:
neotoma_fixed.columns

Index(['siteid', 'sitename', 'longitudeeast', 'latitudenorth', 'longitudewest',
       'latitudesouth', 'altitude', 'area', 'sitedescription', 'notes',
       'recdatecreated', 'recdatemodified', 'geog', 'datasetid',
       'collectionunitid', 'datasettypeid', 'datasetname', 'notes-2',
       'recdatecreated-2', 'recdatemodified-2', 'embargoid', 'citation', 'doi',
       'status_code', 'doi_in_crossref'],
      dtype='object')

Extracting DOIs and extracing data from CrossRef

In [4]:
neotomaCrossRef = ar.crossRefQuery(neotoma_fixed['doi'].unique().tolist())
pollenCrossRef =  ar.crossRefQuery(pollenDF['doi'].unique().tolist())



Concatenate the two dataframes.

In [5]:
df = pd.concat([neotomaCrossRef, pollenCrossRef])
df = df.reset_index(drop=True)

In [6]:
df.head(2)

Unnamed: 0,DOI,title,subtitle,author,subject,abstract,container-title,language,published,publisher,URL,CrossRefQueryDate
0,10.1111/j.1438-8677.1972.tb03139.x,[Remarks on the Late‐glacial and Holocene Hist...,[],"[{'given': 'Magdalena', 'family': 'Ralska‐Jasi...","[Plant Science, Ecology, Evolution, Behavior a...",,[Berichte der Deutschen Botanischen Gesellschaft],en,"{'date-parts': [[1972, 5]]}",Wiley,http://dx.doi.org/10.1111/j.1438-8677.1972.tb0...,2023-08-30 16:49:23.814159
1,10.2307/1441188,[A New Species of Geochelone from the Pleistoc...,[],"[{'given': 'Walter', 'family': 'Auffenberg', '...","[Animal Science and Zoology, Aquatic Science, ...",,[Copeia],,"{'date-parts': [[1962, 9, 28]]}",JSTOR,http://dx.doi.org/10.2307/1441188,2023-08-30 16:49:24.198599


In [7]:
preprocessedDF = ar.dataPreprocessing(df)

2023-08-30 16:57:28,206 - dataPreprocessing.py:21 - dataPreprocessing - INFO - Prediction data preprocessing begin.
2023-08-30 16:57:28,265 - dataPreprocessing.py:46 - dataPreprocessing - INFO - Running article language imputation.


2023-08-30 16:57:28,280 - dataPreprocessing.py:62 - dataPreprocessing - INFO - 151 articles require language imputation
2023-08-30 16:57:28,281 - dataPreprocessing.py:64 - dataPreprocessing - INFO - 5 cannot be imputed due to too short text metadata(title, subtitle and abstract less than 5 character).
2023-08-30 16:57:29,938 - dataPreprocessing.py:72 - dataPreprocessing - INFO - Missing language imputation completed
2023-08-30 16:57:29,941 - dataPreprocessing.py:73 - dataPreprocessing - INFO - After imputation, there are 14 non-English articles in total excluded from the prediction pipeline.


In [8]:
preprocessedDF.head(2)

Unnamed: 0,DOI,title,subtitle,author,subject,abstract,container-title,language,published,publisher,URL,CrossRefQueryDate,validForPrediction,titleSubtitleAbstract
0,10.1111/j.1438-8677.1972.tb03139.x,Remarks on the Late‐glacial and Holocene Histo...,,"[[{'given': 'Magdalena', 'family': 'Ralska‐Jas...","Plant Science Ecology, Evolution, Behavior and...",,Berichte der Deutschen Botanischen Gesellschaft,en,"{'date-parts': [[1972, 5]]}",Wiley,http://dx.doi.org/10.1111/j.1438-8677.1972.tb0...,2023-08-30 16:49:23.814159,1,remarks on the late‐glacial and holocene histo...
1,10.2307/1441188,A New Species of Geochelone from the Pleistoce...,,"[[{'given': 'Walter', 'family': 'Auffenberg', ...",Animal Science and Zoology Aquatic Science Eco...,,Copeia,en,"{'date-parts': [[1962, 9, 28]]}",JSTOR,http://dx.doi.org/10.2307/1441188,2023-08-30 16:49:24.198599,1,a new species of geochelone from the pleistoce...


Save this as 1 parquet file.

In [36]:
preprocessedDF.to_parquet('data/parquet/neotomaMetadata.parquet', index=False)

ArrowInvalid: ('cannot mix list and non-list, non-null values', 'Conversion failed for column author with type object')

## Annotation Data File

We need a 2nd file that contains the following info:
```python
['DOI', 'Annotation', 'Annotator', 'Annotation_Date']
```

In the `pollen.csv` file Dr. Goring has done already some manual annotation of whether a particular article would belong to Neotoma or not. All files in the `neotoma.csv` file belong to Neotoma. 

In a different setting, this annotations would be made using the **Data Review Tool** and stored to complement the parquet file, for now, I am going to do a JSON

In [9]:
from datetime import datetime
annotation_cols = ['DOI', 'annotation', 'Annotator', 'Annotation_Date']

In [None]:
neotoma_fixed['annotation'] = 'Neotoma'
neotoma_fixed['annotator'] = 'Simon J. Goring'
neotoma_fixed['annotationDate'] = datetime.now()
neotomaAnnotation = neotoma_fixed[annotation_cols]

In [None]:
pollenDF['annotation'] = pollenDF['Label']
pollenDF['annotator'] = pollenDF['Profile']
pollenDF['annotationDate'] = pollenDF['Timestamp']
pollenAnnotation = pollenDF[annotation_cols]

In [None]:
fullAnnotation = pd.concat([neotomaAnnotation, pollenAnnotation])

In [None]:
fullAnnotation.to_parquet('data/parquet/neotomaAnnotation.parquet', index=False)

## Prediction Parquet

The Prediction PQ file will contain the following columns:
```python
['DOI', 'prediction', 'predict_proba', 'model_metadata', 'prediction_date']
```

Embed the data using the provided functions (this has been previously trained).

In [None]:
embeddedData = ar.addEmbeddings(preprocessedDF, 'titleSubtitleAbstract')

Do the predictions using the AWS model

In [None]:
predictionsDF = ar.relevancePredict(embeddedData, AWS = True)

Save the columns we are interested on.

In [None]:
predictionCols = ['DOI', 'prediction', 'predict_proba', 'model_metadata', 'prediction_date']

In [None]:
predictionsDF = predictionsDF[predictionCols]

In [None]:
predictionsDF.to_parquet('data/parquet/neotomaPredictions.parquet', index=False)