In [1]:
from collections import Counter

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
#from sklearn.feature_extraction.text import  TfidfVectorizer

from tensorflow.keras.preprocessing.text import Tokenizer # type: ignore
from tensorflow.keras.preprocessing.sequence import pad_sequences # type: ignore

import utils # all datareading and preprocessing functionality
import entrez_utils # all functions for querying NCBI Entrez

## Validation data

Read and process the "gold standard" data.

In [2]:
# Load the curated ("gold standard") paper list from its tab-separated file
# and display it as the cell output.
curated_pesticide_papers = pd.read_csv(
    './data/curated_pesticides.txt',
    sep='\t',
)
curated_pesticide_papers

Unnamed: 0,pmid,label,tenwise_id,pesticide
0,10234476,animal,TWPHI_00010,carbaryl
1,10369463,human,TWPHI_00003,acetamiprid
2,10583056,human,TWPHI_00010,carbaryl
3,15496540,human,TWPHI_00072,pendimethalin
4,16452832,human,TWPHI_00072,pendimethalin
...,...,...,...,...
159,39415959,animal,TWPHI_00003,acetamiprid
160,39415959,animal,TWPHI_00012,clothianidin
161,39415959,animal,TWPHI_00025,imidacloprid
162,39419870,other,TWPHI_00025,imidacloprid


Fetch the abstracts belonging to these PMIDs.

In [None]:
# Entrez credentials must never be written into the notebook: read them from the
# environment instead. The API key that used to be hardcoded on this line has been
# committed in plain text and should be treated as leaked (revoke and regenerate it).
# Kept commented out, as before, because fetching is not required on every run.
#import os
#entrez_utils.init(os.environ['NCBI_API_KEY'], os.environ['NCBI_EMAIL'])

In [3]:
# Collect the PMIDs of the curated papers as a plain Python list,
# report how many there are and show the first ten.
pub_ids = curated_pesticide_papers['pmid'].tolist()
print(len(pub_ids))
pub_ids[:10]

164


[10234476,
 10369463,
 10583056,
 15496540,
 16452832,
 16704049,
 19248625,
 19934164,
 22361216,
 22393406]

In [None]:
# from importlib import reload
# reload(entrez_utils)

In [None]:
# not required to repeat
#entrez_utils.fetch_abstracts(pub_ids=pub_ids, 
#                             output_file='./data/curated_pesticides_abstracts.txt')

In [10]:
# Check the new file: load the fetched abstracts and print the shape as read from disk.
curated_pesticide_abstracts = pd.read_csv(
    './data/curated_pesticides_abstracts.txt',
    sep='\t',
)
print(curated_pesticide_abstracts.shape)

# Every paper in this file gets the same label: numeric 1 and text 'pesticide'.
curated_pesticide_abstracts = curated_pesticide_abstracts.assign(
    label=1,
    text_label='pesticide',
)
curated_pesticide_abstracts

(152, 3)


Unnamed: 0,pmid,title,abstract,label,text_label
0,10234476,Pesticide induced changes of nitric oxide synt...,Organic insecticides are well known neurotoxic...,1,pesticide
1,10369463,Minor structural changes in nicotinoid insecti...,The major nitroimine insecticide imidacloprid ...,1,pesticide
2,10583056,Evidence for double resistance to permethrin a...,A rising prevalence of head lice among school ...,1,pesticide
3,15496540,Pesticides and lung cancer risk in the agricul...,The authors examined the relation between 50 w...,1,pesticide
4,16452832,Pendimethalin exposure and cancer incidence am...,"Pendimethalin, a widely used herbicide, has be...",1,pesticide
...,...,...,...,...,...
147,39399211,Cytotoxicity induced by three commercial neoni...,Neonicotinoid insecticides are used worldwide ...,1,pesticide
148,39402966,Toxicity of pesticide cocktails in amphibian l...,Aquatic communities are increasingly exposed t...,1,pesticide
149,39415959,Neonicotinoid pesticides: evidence of developm...,Neonicotinoids are the most widely used class ...,1,pesticide
150,39419870,An overview on the fate and behavior of imidac...,This review provides an overview on the fate a...,1,pesticide


Strangely enough, 12 papers seem to be lost in this procedure. I will investigate this a bit.

In [11]:
# Check for duplicates: count how often each PMID occurs in the curated list and
# keep the (pmid, count) pairs that occur more than once.
# Counter comes from the notebook's import cell rather than a mid-notebook import.
duplicates = [(pmid, count) for pmid, count in Counter(pub_ids).items() if count > 1]
duplicates


[(26990785, 2),
 (28385489, 2),
 (29151145, 2),
 (29374591, 2),
 (33288284, 2),
 (38470098, 2),
 (38581179, 2),
 (38799264, 2),
 (39162819, 2),
 (39359636, 2),
 (39415959, 3)]

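Apparently, there were 10 duplicates in the collection, as well as one triplicate.  
Together these account for the 12 missing rows (10 × 1 + 1 × 2 = 12): 164 PMIDs, but only 152 unique ones. Otherwise, it seems to be OK. 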

I have already downloaded a baseline set, i.e. all the abstracts that were published on several days:
```
data/abstracts_2022_11_01.csv 
data/abstracts_2023_09_01.csv 
data/abstracts_2024_06_01.csv 
data/abstracts_2025_02_01.csv 
data/abstracts_2025_04_09.csv 
data/abstracts_2025_04_15.csv 
data/abstracts_2025_04_23.csv
```

These were combined into one file using a terminal command:
`(head -1 abstracts_2022_11_01.csv && tail -n +2 -q abstracts*.csv ) > all_daily.txt`

(Not in the git repo!)

Now read these into a dataframe and remove duplicates and papers without abstracts.

In [6]:
### DO NOT RERUN THIS CODE 
# baseline_file = "/Users/michielnoback/Library/CloudStorage/OneDrive-HanzehogeschoolGroningen/projects/pesticides/all_daily.txt"

# baseline_abstracts = pd.read_csv(baseline_file, sep='\t')
# print(baseline_abstracts.shape)
# baseline_abstracts.dropna(subset=['abstract'], inplace=True)
# print(baseline_abstracts.shape)
# baseline_abstracts = baseline_abstracts.drop_duplicates(subset=['pmid'], keep='first')
# # only entries with abstract longer than 100 characters
# baseline_abstracts = baseline_abstracts[baseline_abstracts['abstract'].str.len() > 100]

# baseline_abstracts.reset_index(drop=True, inplace=True)
# print(baseline_abstracts.shape)

In [7]:
### DO NOT RERUN THIS CODE 
# CREATE TWO NON-OVERLAPPING SAMPLES OF 5000 ABSTRACTS EACH

# indices = np.arange(len(baseline_abstracts))
# np.random.seed(42)
# np.random.shuffle(indices)

# first_sample_indices = indices[:5000]
# second_sample_indices = indices[5000:10000]

# baseline_train_test = baseline_abstracts.iloc[first_sample_indices]
# baseline_val = baseline_abstracts.iloc[second_sample_indices]

# ## write to file
# baseline_train_test.to_csv('./data/baseline_train_test.txt', sep='\t', index=False)
# baseline_val.to_csv('./data/baseline_val.txt', sep='\t', index=False)

In [9]:
# read the files
baseline_train_test = pd.read_csv('./data/baseline_train_test.txt', sep='\t')
baseline_val = pd.read_csv('./data/baseline_val.txt', sep='\t')

baseline_val.head()

Unnamed: 0,pmid,title,abstract
0,38314258,COVID-19 and cardiovascular complications: upd...,"Coronavirus disease 2019 (COVID-19), caused by..."
1,36786116,New approaches for mental health of social min...,Mental health of social minorities is a challe...
2,40205354,Effects of Bacillus subtilis N24 combined with...,Recent years have witnessed increasingly exten...
3,40130907,Reorganizing the Pt Surface Water Structure fo...,The hydrogen oxidation reaction (HOR) in alkal...
4,40268982,Performance analysis of aquaponics system for ...,The aim of the present study on the performanc...


# Do a simple scan for pesticide terms
To get an indication of the number of pesticide papers in the baseline.

In [12]:
# Pesticide class names used as search terms. They are matched as substrings, so
# each term also matches its plural (e.g. 'herbicide' matches 'herbicides').
# No term may carry surrounding whitespace: 'acaricide ' (trailing space) would
# miss 'acaricides' and 'acaricide.' / 'acaricide,'.
pesticide_classes = ['pesticide', 'acaricide', 'algicide', 'avicide', 'bactericide', 'fungicide',
                     'herbicide', 'insecticide', 'molluscicide', 'nematicide', 'rodenticide', 'virucide']

In [None]:
## find terms in either title or abstract column of baseline_train_test
def find_terms_in_columns(df, column_names, terms, case=True):
    """
    Select the rows in which at least one of the given columns contains a term.

    Terms are matched literally as substrings (regex metacharacters in a term
    are escaped), so 'herbicide' also matches 'herbicides'.

    Args:
        df (pd.DataFrame): The DataFrame to search.
        column_names (str | list[str]): Name(s) of the text column(s) to search.
        terms (str | list[str]): Term(s) to search for.
        case (bool): If True (default) the match is case sensitive.

    Returns:
        pd.DataFrame: The rows of `df` where any of the columns contains any of
        the terms. Missing values never match. An empty frame (same columns) is
        returned when no terms or no columns are given.

    Raises:
        KeyError: If a name in `column_names` is not a column of `df`.
    """
    import re

    # Accept a single name / single term as well as a list of them.
    if isinstance(column_names, str):
        column_names = [column_names]
    if isinstance(terms, str):
        terms = [terms]

    # Empty strings are dropped: an empty alternative would match every row.
    escaped_terms = [re.escape(term) for term in terms if term]
    if not escaped_terms or len(column_names) == 0:
        return df.iloc[0:0]
    pattern = '|'.join(escaped_terms)

    # OR the per-column hits together; na=False makes missing text count as "no match".
    mask = pd.Series(False, index=df.index)
    for name in column_names:
        mask = mask | df[name].str.contains(pattern, case=case, na=False, regex=True)
    return df[mask]

# Find pesticide classes in the title or the abstract column.
# (The function is called find_terms_in_columns; the former call to
# find_terms_in_column raised a NameError.)
baseline_train_test_pesticides = find_terms_in_columns(
    baseline_train_test, ['title', 'abstract'], pesticide_classes
)

#baseline_train_test_pesticides = baseline_train_test[baseline_train_test['abstract'].str.contains('|'.join(pesticide_classes), na=False)]
baseline_train_test_pesticides

Unnamed: 0,pmid,title,abstract
310,36978273,Pesticidal Effect of Leaves Extract of Differe...,&lt;b&gt;Background and Objective:&lt;/b&gt; T...
584,39055766,Abdominal radiographic features of anticoagula...,Anticoagulant rodenticide toxicity is commonly...
1551,38107400,<i>Botrytis cinerea</i> hypovirulent strain △<...,"Gray mold, caused by <i>Botrytis cinerea</i>, ..."
1686,40022367,The diversity and disparity of mineral element...,Mineral elements in fruits are essential for v...
2417,40205232,"Dissipation kinetics, safety evaluation and de...",A field experiment was conducted to evaluate t...
3352,36540705,Plastome characteristics and species identific...,Wintergreen oil is a folk medicine widely used...
3374,38074996,Recent advances in stimuli-response mechanisms...,Nanotechnology-enabled fertilizers and pestici...
3410,40227020,Emergence of Triazole-Resistant <i>Cryptococcu...,The rapid global emergence and spread of resis...
3426,39036748,Quinofumelin (Pesticides).,Food Safety Commission of Japan (FSCJ) conduct...
4149,40226962,RNA interference-based dsRNA application confe...,Rice production is severely impacted by pathog...


In [None]:
# baseline_abstracts['label'] = 0
# baseline_abstracts['text_label'] = 'not_pesticide'

# baseline_abstracts

In [None]:
# create abstract and title columns lowercase and without punctuation
# NOTE(review): `validation_abstracts` is not assigned in any cell visible above this
# one, so on a fresh kernel this cell raises a NameError unless the frame is created
# elsewhere — TODO add (or restore) the cell that builds it.
# The return value of preprocess_text is discarded, so it presumably modifies the
# frame in place — verify against utils.
utils.preprocess_text(validation_abstracts)
validation_abstracts.head(3)

In [None]:
# Shuffle all rows with a fixed seed to prevent batch effects, then renumber the
# index so it follows the shuffled order.
validation_abstracts = (
    validation_abstracts
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
#validation_abstracts['text_label'][:50]

In [None]:
# Get a picture of the length distribution of the abstracts.
# seaborn (`sns`) comes from the notebook's import cell rather than a mid-notebook import.
# Length = number of whitespace-separated words per abstract.
words_per_abstract = validation_abstracts['abstract'].str.split().str.len()

# Explicit figure/axes objects instead of the implicit pyplot state machine.
fig, ax = plt.subplots(figsize=(5, 3))
sns.histplot(words_per_abstract, bins=50, ax=ax)
ax.set_title('Distribution of abstract lengths')
ax.set_xlabel('Number of words')
ax.set_ylabel('Count')
#fig.savefig('./figures/abstract_lengths.png', dpi=300, bbox_inches='tight')
plt.show()

That is a strangely bi- or multimodal distribution.  I see sharp drops before 100, 150, 200 and 250 words.  
That probably has to do with the specific length requirements imposed by the individual journals to which the papers were submitted.  
However, this is going to be a problem when using the longer abstracts in the context of transformers: they have 
an upper length limit of 512 tokens **after their own tokenization**, and the token count can, so it seems, get much higher than the word count (see notebook `transformers_feature_extraction.ipynb`).

## Train / test data

For model development, I will start with the first collection, load it as always and remove any papers of the curated set from this one, to really have unseen data in the validation dataset.

In [None]:
# Input files for model development.
file1 = './data/abstract_set1.txt'  # passed as positives_path below
file2 = './data/abstract_set2.txt'  # passed as negatives_path below

# Columns used as model input and as target in the cells that follow.
label_selection = 'label' # can be 'label' or 'text_label'
data_selection = 'abstract'

train_test = utils.read_abstract_data(positives_path=file1, negatives_path=file2)
train_test.head()

In [None]:
# create abstract and title columns lowercase and without punctuation
# The return value of preprocess_text is discarded, so it presumably modifies
# `train_test` in place — verify against utils.
# NOTE(review): whether the original 'abstract' column is overwritten or new columns
# are added cannot be seen from here — confirm, since later cells read 'abstract'.
utils.preprocess_text(train_test)
train_test.head(3)

In [None]:
# Shuffle all rows with a fixed seed to prevent batch effects, then renumber the
# index so it follows the shuffled order.
train_test = (
    train_test
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

train_test['text_label'].head(20)

## Split train/test
For now, I will work with the original, non-preprocessed abstracts because they are more suitable for CNNs.

In [None]:
# Split into train and test parts with a fixed seed; no test_size is passed,
# so scikit-learn's default split proportion applies.
x_train, x_test, y_train, y_test = train_test_split(
    train_test[data_selection],   # model input
    train_test[label_selection],  # target
    random_state=123,
)
x_train.head()

## Optimize CNN classifier

In [None]:
# number of classes
# Computed as the largest value in the selected label column plus one.
# NOTE(review): this presumes integer labels numbered from 0 with no gaps; it
# presumably breaks when label_selection is 'text_label' (string labels) — confirm.
K = train_test[label_selection].max() + 1
K

In [None]:
# Convert the texts to sequences of integer word indices.
MAX_VOCAB_SIZE = 3000
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE) # tokenizer from tensorflow.keras.preprocessing.text
tokenizer.fit_on_texts(x_train)  # the tokenizer is fitted on the training texts only
sequences_train, sequences_test = (
    tokenizer.texts_to_sequences(texts)  # returns a simple python list of lists (of numbers)
    for texts in (x_train, x_test)
)
print(f'train sequences: {len(sequences_train)}; test sequences: {len(sequences_test)}')



## Explore Vector "landscape"

In [None]:
#stopwords = utils.get_stopwords(custom = {'wa', 'use', 'using', 'one', 'two', 'three', 'study'}) 
#tokenizer = utils.Tokenizer(stop_words=stopwords, min_length=3)

# Here, I will use TF-IDF out of the box
#vectorizer = TfidfVectorizer(max_features=MAX_VOCAB_SIZE)