In [1]:
import pandas as pd
import numpy as np
import warnings
from datetime import datetime
import src.article_relevance as ar

  from .autonotebook import tqdm as notebook_tqdm


# Notebook to Prepare Training Parquet Files

## Fixed Neotoma & Pollen files from CrossRef

In [2]:
# Load the three raw label files (Neotoma DOIs, pollen labels, project-2 labels).
neotoma, pollenDF, labeledDF = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/raw/neotoma_crossref_fixed.csv',
        'data/raw/pollen_doc_labels.csv',
        'data/raw/project_2_labeled_data.csv',
    )
)

Lower-case all DOIs, then remove duplicate DOIs (keeping the first occurrence).

In [3]:
# Normalise DOI case first so that case-only variants collapse into one row,
# then keep the first row seen for each DOI.
neotoma = (
    neotoma
    .assign(doi=neotoma['doi'].str.lower())
    .drop_duplicates(subset='doi', keep='first')
)


In [4]:
# Normalise DOI case first so that case-only variants collapse into one row,
# then keep the first row seen for each DOI.
pollenDF = (
    pollenDF
    .assign(doi=pollenDF['doi'].str.lower())
    .drop_duplicates(subset='doi', keep='first')
)

In [5]:
# Normalise DOI case first so that case-only variants collapse into one row,
# then keep the first row seen for each DOI.
labeledDF = (
    labeledDF
    .assign(doi=labeledDF['doi'].str.lower())
    .drop_duplicates(subset='doi', keep='first')
)

Extracting DOIs and extracting data from CrossRef

In [6]:
# Number of unique Neotoma DOIs left after de-duplication.
len(neotoma)

758

In [7]:
# Number of unique pollen DOIs left after de-duplication.
len(pollenDF)

494

In [8]:
# Number of unique labeled DOIs left after de-duplication.
len(labeledDF)

1514

In [9]:
# Slow cell: takes roughly 15-20 minutes (the logged run below took about 19 minutes).
# NOTE: filterwarnings("ignore") installs a process-wide filter, so warnings stay
# silenced for every later cell in this kernel session as well.
warnings.filterwarnings("ignore")
print(datetime.now())
# Query CrossRef once per source file, in the order Neotoma -> pollen -> labeled.
neotomaCrossRef, pollenCrossRef, labeledCrossRef = [
    ar.crossRefQuery(labels['doi'].unique().tolist())
    for labels in (neotoma, pollenDF, labeledDF)
]
print(datetime.now())

2023-10-06 22:41:35.973298
2023-10-06 22:41:35,974 - crossRefQuery.py:25 - crossRefQuery - INFO - 758 DOIs to be queried from CrossRef
2023-10-06 22:46:52,422 - crossRefQuery.py:46 - crossRefQuery - INFO - CrossRef Query Finished.
2023-10-06 22:46:52,428 - crossRefQuery.py:25 - crossRefQuery - INFO - 494 DOIs to be queried from CrossRef
2023-10-06 22:50:17,034 - crossRefQuery.py:46 - crossRefQuery - INFO - CrossRef Query Finished.
2023-10-06 22:50:17,040 - crossRefQuery.py:25 - crossRefQuery - INFO - 1514 DOIs to be queried from CrossRef
2023-10-06 23:00:57,451 - crossRefQuery.py:46 - crossRefQuery - INFO - CrossRef Query Finished.
2023-10-06 23:00:57.456362


Concatenate the three dataframes.

In [10]:
# Stack the three CrossRef result sets and renumber the rows 0..n-1 in one step.
df = pd.concat(
    [neotomaCrossRef, pollenCrossRef, labeledCrossRef],
    ignore_index=True,
)
df.head(3)

Unnamed: 0,DOI,title,subtitle,author,subject,abstract,container-title,language,published,publisher,URL,CrossRefQueryDate
0,10.1016/j.quascirev.2014.04.014,[Postglacial history of the Patagonian forest/...,[],[{'ORCID': 'http://orcid.org/0000-0001-5732-37...,"[Geology, Archeology, Archeology, Ecology, Evo...",,[Quaternary Science Reviews],en,"{'date-parts': [[2014, 6]]}",Elsevier BV,http://dx.doi.org/10.1016/j.quascirev.2014.04.014,2023-10-06 22:41:36.395841
1,10.1002/gea.10020,"[Taphonomic analysis, associational integrity,...",[],"[{'given': 'David A.', 'family': 'Byers', 'seq...","[Earth and Planetary Sciences (miscellaneous),...",,[Geoarchaeology],en,"{'date-parts': [[2002, 4, 5]]}",Wiley,http://dx.doi.org/10.1002/gea.10020,2023-10-06 22:41:37.461996
2,10.1139/e80-122,[Holocene vegetation and climatic history of P...,[],"[{'given': 'T. W.', 'family': 'Anderson', 'seq...",[General Earth and Planetary Sciences],<jats:p> The vegetation and climate of the Hol...,[Canadian Journal of Earth Sciences],en,"{'date-parts': [[1980, 9, 1]]}",Canadian Science Publishing,http://dx.doi.org/10.1139/e80-122,2023-10-06 22:41:37.858999


In [11]:
# Total rows returned by CrossRef across the three sources (before de-duplication).
len(df)

2733

In [12]:
# Rows per DOI in the combined frame; a count above 1 means the same DOI appears
# in more than one of the concatenated result sets. (value_counts skips missing DOIs.)
df['DOI'].value_counts()

DOI
10.1002/etc.5602                   2
10.1513/annalsats.202212-996cme    2
10.1186/s12870-023-04189-9         2
10.1038/s41598-023-33856-y         2
10.1016/j.alit.2023.01.003         2
                                  ..
10.1126/science.204.4394.701       1
10.1016/j.quaint.2008.04.021       1
10.1007/s10021-009-9284-y          1
10.3955/046.083.0107               1
10.1371/journal.pone.0259136       1
Name: count, Length: 2670, dtype: int64

In [13]:
# The combined frame contains repeated DOIs (see the counts above), so normalise
# the DOI case and keep only the first row for each DOI.
df = (
    df
    .assign(DOI=df['DOI'].str.lower())
    .drop_duplicates(subset='DOI', keep='first')
)

In [14]:
# Re-check after de-duplication: every DOI should now have a count of exactly 1.
df['DOI'].value_counts()

DOI
10.1540/jsmr.59.28                              1
10.1016/j.quascirev.2014.04.014                 1
10.1002/gea.10020                               1
10.1139/e80-122                                 1
10.1016/j.quaint.2015.05.009                    1
                                               ..
10.1130/0091-7613(1975)3<344:alqrov>2.0.co;2    1
10.5194/bg-2016-354                             1
10.1080/02724634.2015.1113803                   1
10.1016/s0367-2530(17)30981-7                   1
10.1016/j.quaint.2005.03.004                    1
Name: count, Length: 2670, dtype: int64

In [15]:
# Rows remaining after de-duplication (one per unique DOI).
len(df)

2670

In [16]:
#df.to_csv('data/raw/data_full.csv',  index=False)

In [17]:
#df = pd.read_csv('data/raw/data_full.csv')
#df.head(2)

In [18]:
# Spot-check a single cell: the 'subject' value stored for the row labelled 1.
df.at[1, 'subject']

['Earth and Planetary Sciences (miscellaneous)', 'Archeology', 'Archeology']

In [19]:
# Run the project's preprocessing step on the de-duplicated CrossRef metadata.
# NOTE(review): judging by the log output below, this cleans/parses the fields and
# imputes missing article languages — confirm against ar.dataPreprocessing.
preprocessedDF = ar.dataPreprocessing(df)

2023-10-06 23:04:54,507 - dataPreprocessing.py:26 - dataPreprocessing - INFO - Data cleaning and parsing begins.
2023-10-06 23:04:54,585 - dataPreprocessing.py:45 - dataPreprocessing - INFO - Running article language imputation.
2023-10-06 23:04:54,616 - dataPreprocessing.py:65 - dataPreprocessing - INFO - 251 articles require language imputation
2023-10-06 23:04:54,618 - dataPreprocessing.py:68 - dataPreprocessing - INFO - 5 cannot be imputed due to too short text metadata (title, subtitle and abstract are less than 5 characters).
2023-10-06 23:04:56,314 - dataPreprocessing.py:76 - dataPreprocessing - INFO - Missing language imputation completed
2023-10-06 23:04:56,316 - dataPreprocessing.py:77 - dataPreprocessing - INFO - After imputation, there are 23 non-English articles in total excluded from the prediction pipeline.
2023-10-06 23:04:56,540 - dataPreprocessing.py:93 - dataPreprocessing - INFO - Data Preprocessing Completed. 2670 valid observations.


In [20]:
# Preview the first two preprocessed rows to inspect the columns that come back.
preprocessedDF.head(2)

Unnamed: 0,CrossRefQueryDate,DOI,URL,abstract,author,container-title,language,published,publisher,subject,subtitle,title,titleSubtitleAbstract,validForPrediction
0,2023-10-06 22:41:36.395841,10.1016/j.quascirev.2014.04.014,http://dx.doi.org/10.1016/j.quascirev.2014.04.014,,[{'ORCID': 'http://orcid.org/0000-0001-5732-37...,Quaternary Science Reviews,en,"{'date-parts': [[2014, 6]]}",Elsevier BV,"[Geology, Archeology, Archeology, Ecology, Evo...",,Postglacial history of the Patagonian forest/s...,postglacial history of the patagonian forest/s...,1
1,2023-10-06 22:41:37.461996,10.1002/gea.10020,http://dx.doi.org/10.1002/gea.10020,,"[{'given': 'David A.', 'family': 'Byers', 'seq...",Geoarchaeology,en,"{'date-parts': [[2002, 4, 5]]}",Wiley,"[Earth and Planetary Sciences (miscellaneous),...",,"Taphonomic analysis, associational integrity, ...","taphonomic analysis, associational integrity, ...",1


In [21]:
# Rows per DOI in the preprocessed frame; each DOI is expected to appear once.
preprocessedDF['DOI'].value_counts()

DOI
10.1540/jsmr.59.28                              1
10.1016/j.quascirev.2014.04.014                 1
10.1002/gea.10020                               1
10.1139/e80-122                                 1
10.1016/j.quaint.2015.05.009                    1
                                               ..
10.1130/0091-7613(1975)3<344:alqrov>2.0.co;2    1
10.5194/bg-2016-354                             1
10.1080/02724634.2015.1113803                   1
10.1016/s0367-2530(17)30981-7                   1
10.1016/j.quaint.2005.03.004                    1
Name: count, Length: 2670, dtype: int64

Save this as a single parquet file.

In [22]:
# (rows, columns) of the preprocessed frame before it is written to parquet.
preprocessedDF.shape

(2670, 14)

In [23]:
# Spot-check the 'subject' value stored for the row labelled 1 after preprocessing.
preprocessedDF.at[1, 'subject']

['Earth and Planetary Sciences (miscellaneous)', 'Archeology', 'Archeology']

In [24]:
# Show the first row once more as a final look at what will be saved.
preprocessedDF.head(1)

Unnamed: 0,CrossRefQueryDate,DOI,URL,abstract,author,container-title,language,published,publisher,subject,subtitle,title,titleSubtitleAbstract,validForPrediction
0,2023-10-06 22:41:36.395841,10.1016/j.quascirev.2014.04.014,http://dx.doi.org/10.1016/j.quascirev.2014.04.014,,[{'ORCID': 'http://orcid.org/0000-0001-5732-37...,Quaternary Science Reviews,en,"{'date-parts': [[2014, 6]]}",Elsevier BV,"[Geology, Archeology, Archeology, Ecology, Evo...",,Postglacial history of the Patagonian forest/s...,postglacial history of the patagonian forest/s...,1


In [25]:
# Write the preprocessed metadata to a local parquet file.
# (In trainingDataSetUp the file is saved to AWS.)
preprocessedDF.to_parquet(
    path='data/parquet/publicationMetadataDF.parquet',
    engine='fastparquet',
    compression='snappy',
)

In [26]:
# (rows, columns) of the frame that was just written to parquet.
preprocessedDF.shape

(2670, 14)

## Annotation Data File

We need a 2nd file that contains the following info:
```python
['DOI', 'Annotation', 'Annotator', 'Annotation_Date', 'Verified', 'Verified By', 'Verified Timestamp']
```

In the `pollen.csv` file, Dr. Goring has already manually annotated whether each article belongs to Neotoma or not. All articles in the `neotoma.csv` file belong to Neotoma.

In a different setting, these annotations would be made using the **Data Review Tool** and stored to complement the parquet file; for now, I am going to use a JSON file.

In [27]:

# Column names, in order, shared by every annotation frame built below.
annotation_cols = [
    'DOI',
    'annotation',
    'annotator',
    'annotationDate',
    'verified',
    'verifiedBy',
    'verifiedTimeStamp',
]

In [28]:
# Every article in the Neotoma file belongs to Neotoma, so all rows get the same
# label, annotator and annotation time; none of them has been verified yet.
neotoma['DOI'] = neotoma['doi']
neotoma['annotation'] = 'Neotoma'
neotoma['annotator'] = 'Simon J. Goring'
# One timestamp string shared by the whole batch (formatted once, then broadcast).
neotoma['annotationDate'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
neotoma['verified'] = 'No'
# Use np.nan: the np.NaN alias was removed in NumPy 2.0 and raises AttributeError there.
neotoma['verifiedBy'] = np.nan
neotoma['verifiedTimeStamp'] = np.nan
# Keep only the shared annotation columns, in the shared order.
neotomaAnnotation = neotoma[annotation_cols]

In [29]:
# Copy the source columns of pollenDF into the shared annotation column names.
# 'Timestamp' is carried over unchanged (no strftime reformatting is applied here).
pollenDF = pollenDF.assign(**{
    target: pollenDF[source]
    for target, source in {
        'DOI': 'doi',
        'annotation': 'Label',
        'annotator': 'Profile',
        'annotationDate': 'Timestamp',
        'verified': 'Verified',
        'verifiedBy': 'Verified By',
        'verifiedTimeStamp': 'Verified Timestamp',
    }.items()
})
# Keep only the shared annotation columns, in the shared order.
pollenAnnotation = pollenDF[annotation_cols]

In [30]:
# Copy the source columns of labeledDF into the shared annotation column names.
labeledDF = labeledDF.assign(**{
    target: labeledDF[source]
    for target, source in {
        'DOI': 'doi',
        'annotation': 'Label',
        'annotator': 'Profile',
        'annotationDate': 'Timestamp',
        'verified': 'Verified',
        'verifiedBy': 'Verified By',
        'verifiedTimeStamp': 'Verified Timestamp',
    }.items()
})
# Keep only the shared annotation columns, in the shared order.
labeledAnnotation = labeledDF[annotation_cols]

In [31]:
# Stack the three annotation frames and renumber the rows 0..n-1 in one step.
fullAnnotation = pd.concat(
    [neotomaAnnotation, pollenAnnotation, labeledAnnotation],
    ignore_index=True,
)
fullAnnotation.head(3)

Unnamed: 0,DOI,annotation,annotator,annotationDate,verified,verifiedBy,verifiedTimeStamp
0,10.1016/0033-5894(73)90004-5,Neotoma,Simon J. Goring,2023-10-06 23:05:47,No,,
1,10.1139/e80-122,Neotoma,Simon J. Goring,2023-10-06 23:05:47,No,,
2,10.1017/s0033822200004781,Neotoma,Simon J. Goring,2023-10-06 23:05:47,No,,


In [32]:
# Rows per DOI across all annotation sources; a count above 1 means the DOI was
# annotated in more than one source file. (value_counts skips missing DOIs.)
fullAnnotation['DOI'].value_counts()

DOI
10.3390/plants12020373                           2
10.5114/ada.2022.117978                          2
10.3389/fgeed.2022.1085023                       2
10.3389/fpls.2022.1096804                        2
10.1016/j.alit.2023.01.003                       2
                                                ..
10.2307/1939818                                  1
10.2307/1935709                                  1
10.2307/1942485                                  1
10.1130/0016-7606(1975)86<287:vrftly>2.0.co;2    1
10.1016/j.marpolbul.2020.111757                  1
Name: count, Length: 2699, dtype: int64

In [33]:
# Inspect one of the repeated DOIs to see how its annotation rows differ.
fullAnnotation[fullAnnotation['DOI'].eq('10.3390/plants12020373')]

Unnamed: 0,DOI,annotation,annotator,annotationDate,verified,verifiedBy,verifiedTimeStamp
1243,10.3390/plants12020373,Not Neotoma,SimonGoring,2023-05-11 18:49:49.397021-04:00,No,,NaT
1461,10.3390/plants12020373,Not Neotoma,SimonGoring,2023-05-11 23:18:58.026894-04:00,No,,


In [34]:
# Normalise DOI case, then keep only the first annotation row for each DOI.
# NOTE(review): the DOI counts above list 2699 distinct DOIs while 2700 rows remain
# after this step, which suggests one row has a missing DOI (value_counts skips NaN,
# drop_duplicates keeps one NaN row) — confirm.
fullAnnotation = (
    fullAnnotation
    .assign(DOI=fullAnnotation['DOI'].str.lower())
    .drop_duplicates(subset='DOI', keep='first')
)

In [35]:
# Annotation rows remaining after de-duplicating on DOI.
len(fullAnnotation)

2700

In [36]:
# Metadata rows, for comparison with the annotation row count.
len(preprocessedDF)

2670

In [37]:
# Drop rows that are identical across every column. Rows that match in full also
# share a DOI, so after the DOI de-duplication this is only a safety check.
duplicates_mask = fullAnnotation.duplicated()
fullAnnotation = fullAnnotation.loc[~duplicates_mask]
len(fullAnnotation)

2700

In [38]:
#fullAnnotation.to_parquet('data/parquet/AnnotationDF.parquet', index=False)

## Prediction Parquet

The Prediction PQ file will contain the following columns:
```python
['DOI', 'prediction', 'predict_proba', 'model_metadata', 'prediction_date']
```

Embed the data using the provided functions (this has been previously trained).

### Please Continue on the Training_Model Notebook
#### (you can skip directly to the Prediction Parquet Section)