<a href="https://colab.research.google.com/github/knc6/jarvis-tools-notebooks/blob/master/jarvis-tools-notebooks/ChemNLP_TitleToAbstract.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

[ChemNLP](https://github.com/usnistgov/chemnlp) is a Natural Language Processing (NLP) based library for materials chemistry text data. Reference [here](https://doi.org/10.1021/acs.jpcc.3c03106). ChemNLP can be used for (1) curating open access datasets for materials and chemistry literature, developing and comparing traditional machine learning, transformers and graph neural network models for (2) classifying and clustering texts, (3) named entity recognition for large-scale text-mining, (4) abstractive summarization for generating titles of articles from abstracts, (5) text generation for suggesting abstracts from titles, (6) integration with density functional theory dataset for identifying potential candidate materials such as superconductors, and (7) web-interface development for text and reference query.


Let's install a few packages first.

In [1]:
%%time
!pip install -q transformers datasets evaluate rouge_score jarvis-tools

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/510.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.2/510.5 kB[0m [31m2.6 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━[0m [32m419.8/510.5 kB[0m [31m6.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m975.7/975.7 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [3

In [2]:
%%time
import os
if not os.path.exists('jarvis_leaderboard'):
  !git clone https://github.com/usnistgov/jarvis_leaderboard.git
os.chdir('jarvis_leaderboard')
!pip install -q -e .

Cloning into 'jarvis_leaderboard'...
remote: Enumerating objects: 62679, done.[K
remote: Counting objects: 100% (2691/2691), done.[K
remote: Compressing objects: 100% (434/434), done.[K
remote: Total 62679 (delta 1302), reused 2434 (delta 1067), pack-reused 59988[K
Receiving objects: 100% (62679/62679), 391.73 MiB | 23.20 MiB/s, done.
Resolving deltas: 100% (32724/32724), done.
Updating files: 100% (3656/3656), done.
  Preparing metadata (setup.py) ... [?25l[?25hdone
CPU times: user 368 ms, sys: 46.1 ms, total: 414 ms
Wall time: 42.4 s


In [3]:
%%time
from transformers import Trainer, TrainingArguments
from datasets import load_dataset
import numpy as np
import torch
import math
from tqdm import tqdm
import time
import evaluate
from collections import defaultdict
from jarvis.db.jsonutils import dumpjson
import random
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM

# Seed every random number generator from one constant so that changing
# `random_seed` changes all of them together.
random_seed = 123
torch.manual_seed(random_seed)
random.seed(random_seed)
np.random.seed(random_seed)
torch.backends.cudnn.deterministic = True

# ROUGE metric object loaded through the `evaluate` package.
rouge_score = evaluate.load("rouge")

# Register tqdm's progress-bar integration with pandas.
tqdm.pandas()

# `device` is always a torch.device: CUDA when available, otherwise CPU.
# To force CPU execution, replace this line with `device = torch.device("cpu")`.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

CPU times: user 8.98 s, sys: 1.2 s, total: 10.2 s
Wall time: 18.1 s


Load the OpenAI GPT-2 (medium) model from [Hugging Face](https://huggingface.co/openai-community/gpt2-medium).

In [4]:
%%time
# prompt="Nonuniform superconductivity and Josephson effect in conical ferromagnet can be described as"
model_checkpoint = "gpt2-medium"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(model_checkpoint).to(device)
def generate_text(prompt="What is a superconductor?",max_new_tokens=250,model_checkpoint = "gpt2-medium"):
    """Sample a continuation of ``prompt`` from the already-loaded language model.

    Relies on the notebook-level ``tokenizer``, ``model`` and ``device`` objects.

    Parameters
    ----------
    prompt : str
        Text the model should continue.
    max_new_tokens : int
        Upper bound on the number of tokens generated after the prompt.
    model_checkpoint : str
        Unused: the globally loaded model is always used. The parameter is
        kept only so existing calls that pass it keep working.

    Returns
    -------
    str
        The first decoded sequence (it starts with the prompt itself), with
        newline characters replaced by spaces.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    outputs = model.generate(
        **inputs,
        do_sample=True,
        max_new_tokens=max_new_tokens,
        # generate() falls back to the EOS token for padding anyway; stating it
        # explicitly avoids the "Setting `pad_token_id` ..." message on every call.
        pad_token_id=tokenizer.eos_token_id,
    )
    decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return decoded[0].replace('\n', ' ')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

CPU times: user 3.27 s, sys: 4.69 s, total: 7.96 s
Wall time: 22.6 s


In [5]:
%%time
# Try the generator on a short question; the result is kept in `p`.
p = generate_text(prompt="What is a superconductor?")
#print(p)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


CPU times: user 6.51 s, sys: 112 ms, total: 6.62 s
Wall time: 7.46 s


In [6]:
# Echo the generated text as the cell output.
p

'What is a superconductor?  A superconnecting membrane is a device that contains a layer of conductive material that creates electrical impedance (voltage) to conduct electricity between two electrodes. The electrons from two electrodes are turned into electromagnetic waves, called photons, which pass through the membrane and are converted and stored as electrical currents by the electronics.  Supercapacitors are made of a high-strength chemical-metallic alloy, which has a high capacity to store electrical energy of 10 megawatts. The supercapacitors store electricity much faster compared to today\'s high-end commercial products, which typically store it for years, even decades.  What are your superconventional and supercapacitors?  The top level of supercapacitors are called "high-voltage, high-frequency capacitors," and are capable of using up to 12 megawatts of power. But for safety reasons, such capacitors are usually installed at the bottom of the tower, and therefore do not produc

In [7]:
import locale
# Make `locale.getpreferredencoding` always report UTF-8 for the rest of the session.
# NOTE(review): presumably a Colab workaround for locale errors in later `!` shell commands; confirm.
# The replacement accepts the same optional `do_setlocale` argument as the real function,
# so callers that use `locale.getpreferredencoding(False)` do not raise a TypeError.
locale.getpreferredencoding = lambda do_setlocale=True: "UTF-8"

In [8]:
# Confirm that the zipped benchmark file is present, relative to the current working directory.
!ls jarvis_leaderboard/benchmarks/AI/TextGen/arxiv_gen_text.json.zip

jarvis_leaderboard/benchmarks/AI/TextGen/arxiv_gen_text.json.zip


Let's generate abstracts from article titles for a few condensed-matter physics (superconductor) articles.

In [9]:
import json,zipfile
# Zipped JSON describing the benchmark split; the 'test' entry is what gets iterated later.
fname = 'jarvis_leaderboard/benchmarks/AI/TextGen/arxiv_gen_text.json.zip'
temp = 'arxiv_gen_text.json'
# The context manager closes the archive handle as soon as the member has been read.
with zipfile.ZipFile(fname) as zp:
    train_test = json.loads(zp.read(temp))

In [10]:
from jarvis.db.figshare import data
import pandas as pd
# Fetch the 'arxiv_summary' records and tabulate them, discarding rows that are exact duplicates.
arxiv_summary = data('arxiv_summary')
df = pd.DataFrame(arxiv_summary).drop_duplicates()

Obtaining arxiv summary cond.mat dataset 137927...
Reference:https://github.com/usnistgov/chemnlp


100%|██████████| 48.9M/48.9M [00:03<00:00, 13.1MiB/s]


Loading the zipfile...
Loading completed.


We will use the prompt "`$title` can be described as" and let the model continue it; the continuation is taken as the generated `$abstract`.

In [11]:
%%time

from tqdm import tqdm
info = {}
for i,j in tqdm(train_test['test'].items()):
  title=df[df['id']==i]['text'].values[0]
  actual_abstract =df[df['id']==i]['ctext'].values[0]
  prompt = title+' can be described as '
  #prompt = 'Write an abstract on the title '+ prompt
  pred = generate_text(prompt)
  info[i] = pred
  #break

  0%|          | 0/98 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  1%|          | 1/98 [00:05<09:10,  5.68s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  2%|▏         | 2/98 [00:10<08:24,  5.25s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  3%|▎         | 3/98 [00:15<08:11,  5.18s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  4%|▍         | 4/98 [00:21<08:17,  5.29s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  5%|▌         | 5/98 [00:26<07:59,  5.16s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  6%|▌         | 6/98 [00:31<08:04,  5.27s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  7%|▋         | 7/98 [00:36<07:50,  5.17s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  8%|▊         | 8/98 [00:42<08:07,  5.42s/it]Setting `pad_token

CPU times: user 8min 36s, sys: 1.16 s, total: 8min 37s
Wall time: 8min 53s





In [14]:
from jarvis.db.jsonutils import dumpjson
# Write the id -> generated-text mapping to textgen.json.
dumpjson(data=info,filename='textgen.json')

In [18]:
# One record per generated sample: its id, the generated text and the reference abstract.
# The lookup is built once; keeping the first row per id matches the earlier `.values[0]` selection.
abstract_by_id = df.drop_duplicates('id').set_index('id')['ctext']

mem=[]
for i,j in info.items():
  # Strip only the leading 'summarize:' marker. With maxsplit=1 the rest of the abstract
  # is kept even if the marker occurs again inside it, and taking the last piece falls
  # back to the whole text when the marker is absent instead of raising an IndexError.
  target = abstract_by_id[i].split('summarize:', 1)[-1]
  mem.append({'id': str(i), 'prediction': j, 'target': target})


In [19]:
import pandas as pd
# Tabulate the per-sample records (columns: id, prediction, target) and display them.
dff=pd.DataFrame(mem)
dff

Unnamed: 0,id,prediction,target
0,1812.09136,Nonuniform superconductivity and Josephson eff...,"Using the Gorkov equations, we provide an e..."
1,cond-mat/0408037,Further detailing of the Bose-Einstein negativ...,The cause of the sharp and universal optimi...
2,cond-mat/0506765,Tunable critical current for a vortex pinned b...,\\A simple model for a superconductor with ...
3,cond-mat/0105402,"MgB_2 under pressure: phonon calculations, Ram...",The effect of pressure on optical phonon fr...
4,cond-mat/0401637,Dissipative Currents in Superfluid 3He Weak Li...,We calculate the current-pressure relation ...
...,...,...,...
93,cond-mat/0306416,Doping dependent time-reversal symmetric nonli...,We have measured the temperature dependent ...
94,1510.08177,Electronic structure of a superconducting topo...,Using high-resolution angle-resolved photoe...
95,2008.07814,Theory of Supercurrent in Superconductors can ...,In the standard theory of superconductivity...
96,1205.5540,Evidence of strong correlations at the van Hov...,We present realistic multiband calculations...


In [20]:
# Write the id/prediction/target table to CSV without the positional index column.
dff.to_csv('AI-TextGen-text-arxiv_gen-test-rouge.csv',index=False)



In [21]:
# Compress the CSV written above into a .zip archive next to it.
!zip AI-TextGen-text-arxiv_gen-test-rouge.csv.zip AI-TextGen-text-arxiv_gen-test-rouge.csv

  adding: AI-TextGen-text-arxiv_gen-test-rouge.csv (deflated 64%)


In [25]:
from rouge import Rouge

# Score the generated texts against the reference abstracts, averaged over all samples.
rouge = Rouge()
hypothesis, reference = dff["prediction"], dff["target"]
averaged = rouge.get_scores(hypothesis, reference, avg=True)

# Only the ROUGE-1 recall component of the averaged result is reported.
scores = averaged["rouge-1"]["r"]
print("rouge scores", scores)

rouge scores 0.21204219423627227


See benchmark here: https://pages.nist.gov/jarvis_leaderboard/AI/TextGen/