In [1]:
from transformers import pipeline

import numpy as np
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt
import textwrap

from sklearn.metrics import roc_auc_score, f1_score, confusion_matrix

In [2]:
classifier = pipeline("zero-shot-classification") # , device=0 # no GPU available

No model was supplied, defaulted to facebook/bart-large-mnli (https://huggingface.co/facebook/bart-large-mnli)


Downloading:   0%|          | 0.00/1.13k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
# the model properties
classifier.model.config

BartConfig {
  "_name_or_path": "facebook/bart-large-mnli",
  "_num_labels": 3,
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_final_layer_norm": false,
  "architectures": [
    "BartForSequenceClassification"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 12,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 12,
  "eos_token_id": 2,
  "forced_eos_token_id": 2,
  "gradient_checkpointing": false,
  "id2label": {
    "0": "contradiction",
    "1": "neutral",
    "2": "entailment"
  },
  "init_std": 0.02,
  "is_encoder_decoder": true,
  "label2id": {
    "contradiction": 0,
    "entailment": 2,
    "neutral": 1
  },
  "max_position_embeddings": 1024,
  "model_type": "bart",
  

In [5]:
classifier("This is a great movie", candidate_labels=["positive", "negative"])

{'sequence': 'This is a great movie',
 'labels': ['positive', 'negative'],
 'scores': [0.9969461560249329, 0.003053834196180105]}

In [6]:
# https://en.wikipedia.org/wiki/AMP-activated_protein_kinase
text = "Due to the presence of isoforms of its components, there are 12 " + \
  "versions of AMPK in mammals, each of which can have different tissue " + \
  "localizations, and different functions under different conditions. " + \
  "AMPK is regulated allosterically and by post-translational " + \
  "modification, which work together."
classifier(text, candidate_labels=["biology", "math", "geology"])

{'sequence': 'Due to the presence of isoforms of its components, there are 12 versions of AMPK in mammals, each of which can have different tissue localizations, and different functions under different conditions. AMPK is regulated allosterically and by post-translational modification, which work together.',
 'labels': ['biology', 'math', 'geology'],
 'scores': [0.8908604383468628, 0.06606560200452805, 0.04307394102215767]}

In [7]:
file1 = './data/abstract_set1.txt'
abstracts = pd.read_csv(file1, sep="\t")
abstracts.head()

Unnamed: 0,pmid,title,abstract
0,29990300,Oxidative stress in triazine pesticide toxicit...,This review article provides a summary of the ...
1,29990732,Toxicity induced by glyphosate and glyphosate-...,Glyphosate is the active component of many com...
2,29999060,Direct kinetics study of CH2OO + methyl vinyl ...,Methyl vinyl ketone (MVK) and methacrolein (MA...
3,30003825,Dihydromyricetin improves vascular hyporespons...,CONTEXT: Dihydromyricetin (DMY) has oxidation ...
4,30015122,Glyphosate and atrazine in rainfall and soils ...,The presence in the atmosphere of glyphosate (...


In [None]:
# my own labels: ['pesticide', 'other']
for i in range(5):
    print(abstracts.iloc[i]['abstract'])
    classification = classifier(abstracts.iloc[i]['abstract'], candidate_labels=["pesticide", "other"])
    print(classification['labels'])
    print(classification['scores'])
    print("\n")

This review article provides a summary of the studies relying on oxidative stress biomarkers (lipid peroxidation and antioxidant enzymes in particular) to investigate the effects of atrazine and terbuthylazine exposure in experimental animals and humans published since 2010. In general, experimental animals showed that atrazine and terbuthylazine exposure mostly affected their antioxidant defences and, to a lesser extent, lipid peroxidation, but the effects varied by the species, sex, age, herbicide concentration, and duration of exposure. Most of the studies involved aquatic organisms as useful and sensitive bio-indicators of environmental pollution and important part of the food chain. In laboratory mice and rats changes in oxidative stress markers were visible only with exposure to high doses of atrazine. Recently, our group reported that low-dose terbuthylazine could also induce oxidative stress in Wistar rats. It is evident that any experimental assessment of pesticide toxic effec

The above sample was manually annotated by me (in progress). This result is amazing: 100% accuracy on this (extremely small) sample.
I will try on a slightly more extensive set.

In [29]:
# abstract_set1_labeled = pd.read_csv('./data/abstract_set1_manually_labeled.txt', sep="\t", encoding='utf-16')
# abstract_set1_labeled.head()

# solved with read_excel (below)

In [51]:
abstract_set1_labeled = pd.read_excel('./data/abstract_set1_manually_labeled.xlsx', sheet_name='Sheet1')
#drop na values
abstract_set1_labeled = abstract_set1_labeled.dropna()
#drop duplicates
abstract_set1_labeled = abstract_set1_labeled.drop_duplicates()
#change column type of 'label' to integer
abstract_set1_labeled['label'] = abstract_set1_labeled['label'].astype(int)
#reset index
abstract_set1_labeled = abstract_set1_labeled.reset_index(drop=True)
abstract_set1_labeled.head()


Unnamed: 0,pmid,title,label,abstract
0,29990300,Oxidative stress in triazine pesticide toxicit...,1,This review article provides a summary of the ...
1,29990732,Toxicity induced by glyphosate and glyphosate-...,1,Glyphosate is the active component of many com...
2,29999060,Direct kinetics study of CH2OO + methyl vinyl ...,0,Methyl vinyl ketone (MVK) and methacrolein (MA...
3,30003825,Dihydromyricetin improves vascular hyporespons...,0,CONTEXT: Dihydromyricetin (DMY) has oxidation ...
4,30015122,Glyphosate and atrazine in rainfall and soils ...,1,The presence in the atmosphere of glyphosate (...


In [50]:
#select first 5 rows
# abstract_set1_labeled_selection = abstract_set1_labeled.iloc[:5]
# abstract_set1_labeled_selection


In [52]:
labels = []
scores = []
pmids = []
for i in range(abstract_set1_labeled.shape[0]):
    #print(sample[i])
    classification = classifier(abstract_set1_labeled.iloc[i]['abstract'], candidate_labels=["pesticide", "other"])
    # classification = classifier(sample[i], candidate_labels=["pesticide", "other"])
    labels.append(classification['labels'])
    scores.append(classification['scores'])
    pmids.append(abstract_set1_labeled.iloc[i]['pmid'])

# create a dataframe with the labels and scores
print(labels)
print(scores)

[['pesticide', 'other'], ['pesticide', 'other'], ['other', 'pesticide'], ['other', 'pesticide'], ['pesticide', 'other'], ['pesticide', 'other'], ['pesticide', 'other'], ['other', 'pesticide'], ['other', 'pesticide'], ['other', 'pesticide'], ['other', 'pesticide'], ['pesticide', 'other'], ['pesticide', 'other'], ['other', 'pesticide'], ['other', 'pesticide'], ['other', 'pesticide'], ['pesticide', 'other'], ['pesticide', 'other'], ['other', 'pesticide'], ['pesticide', 'other'], ['other', 'pesticide'], ['other', 'pesticide'], ['pesticide', 'other'], ['other', 'pesticide'], ['other', 'pesticide'], ['other', 'pesticide'], ['other', 'pesticide'], ['other', 'pesticide'], ['pesticide', 'other'], ['other', 'pesticide'], ['other', 'pesticide'], ['pesticide', 'other'], ['pesticide', 'other'], ['pesticide', 'other'], ['other', 'pesticide'], ['other', 'pesticide'], ['other', 'pesticide'], ['other', 'pesticide'], ['pesticide', 'other'], ['other', 'pesticide'], ['pesticide', 'other'], ['other', 'pest

In [62]:
#combine the labels and scores into a dataframe
final_labels = []
final_scores = []
for i in range(len(labels)):
    if labels[i][0] == 'pesticide':
        final_labels.append(1)
        final_scores.append(scores[i][0])
    else:
        final_labels.append(0)
        final_scores.append(scores[i][1])


In [63]:

# create a dataframe with the labels and scores
df = pd.DataFrame({'pmid': pmids, 
                   'label': final_labels, 
                   'score': final_scores, 
                   'pmid_check': abstract_set1_labeled['pmid'],
                   'human_label': abstract_set1_labeled['label']})
df

Unnamed: 0,pmid,label,score,pmid_check,human_label
0,29990300,1,0.851642,29990300,1
1,29990732,1,0.793375,29990732,1
2,29999060,0,0.034012,29999060,0
3,30003825,0,0.053161,30003825,0
4,30015122,1,0.787344,30015122,1
...,...,...,...,...,...
255,727998,0,0.047857,727998,0
256,729163,0,0.038211,729163,0
257,730644,0,0.053065,730644,0
258,735999,0,0.080932,735999,0


In [64]:
# create confusion matrix
confusion = confusion_matrix(df['label'], df['human_label'])
confusion

array([[123,  56],
       [  0,  81]])

In [77]:
# which are the rows that have the value 1 for columns 'label' and 'human_label'
true_pos = df[(df['label'] == 1) & (df['human_label'] == 1)]
print(true_pos.shape[0])

true_neg = df[(df['label'] == 0) & (df['human_label'] == 0)]
print(true_neg.shape[0])

false_pos = df[(df['label'] == 1) & (df['human_label'] == 0)]
print(false_pos.shape[0])

false_neg = df[(df['label'] == 0) & (df['human_label'] == 1)]
print(false_neg.shape[0])



81
123
0
56
