## 0. Setup

### 0.1 Install libraries

In [46]:
%pip install transformers
%pip install scikit-multilearn
%pip install joblib


Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


### 0.2 Load data

In [47]:
import pandas as pd

papers = pd.read_csv("../../data/processed/papers.csv")
paper_to_classification_code = pd.read_csv("../../data/processed/paper_to_classification_code.csv")
classification_codes = pd.read_csv("../../data/processed/classification_codes.csv")

In [48]:
papers.dropna(subset=["id", "title", "abstract"], inplace=True)
paper_to_classification_code.dropna(inplace=True)
classification_codes.dropna(inplace=True)

In [49]:
joined_data = (
    papers
    .merge(paper_to_classification_code, left_on="id", right_on="paper_id")
    .merge(classification_codes, on="code")
    .groupby("id")
    .agg({"abbreviation": lambda x: list(x), "title": "first", "abstract": "first"})
)
joined_data.head()

Unnamed: 0_level_0,abbreviation,title,abstract
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2-s2.0-84907768931,[SOCI],Changing patterns of civil-military relations ...,"© 1998 Taylor & Francis, except for Chapter 2,..."
2-s2.0-84946771902,[ECON],THE great transition: Implications from enviro...,© 2018 World Scientific Publishing Company.We ...
2-s2.0-84946882107,"[ARTS, SOCI, SOCI]",A teacher's perspective on autonomy and self-a...,"© 2015 Informa UK Limited, trading as Taylor &..."
2-s2.0-84957541768,"[COMP, ENGI, COMP]",A multi-objective car sequencing problem on tw...,"© 2016, Springer Science+Business Media New Yo..."
2-s2.0-84963739401,"[MEDI, MEDI, MEDI, MEDI]",Cumulative radiation exposure and estimated li...,"© 2016, Springer-Verlag Berlin Heidelberg.Purp..."


## 1. Tokenization

(From https://huggingface.co/docs/tokenizers/en/quicktour)

In [50]:
from tokenizers import Tokenizer
from tokenizers.models import BPE

# Initialize a tokenizer
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))

In [51]:
from tokenizers.trainers import BpeTrainer

# Initialize a trainer
trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"], vocab_size=30000)

In [52]:
# Normalize the input
# The NFD normalizer decomposes combined characters into their base characters
# The Lowercase normalizer lowercases the input
# Example: "Hellö, wòrld!" -> "hellö, wòrld!"

from tokenizers.normalizers import Lowercase, NFD, Sequence
tokenizer.normalizer = Sequence([NFD(), Lowercase()])

# Test the normalizer
tokenizer.normalizer.normalize_str("Hellö, wòrld!")

'hellö, wòrld!'

In [53]:
# Split the input on whitespace and punctuation
# Example: "hello, world!" -> ["hello", ",", "world", "!"]

from tokenizers.pre_tokenizers import Whitespace, Punctuation, Sequence
tokenizer.pre_tokenizer = Sequence([Whitespace(), Punctuation()])

# Test the pre-tokenizer
tokenizer.pre_tokenizer.pre_tokenize_str("hello, world!")

[('hello', (0, 5)), (',', (5, 6)), ('world', (7, 12)), ('!', (12, 13))]

In [54]:
# Train the tokenizer with our title and abstract data
tokenizer.train_from_iterator((*joined_data["title"], *joined_data["abstract"]), trainer=trainer)

In [55]:
# Test our tokenizer
text = "Bacillus sp. strain M10 as a potential biocontrol agent protecting chili pepper and tomato fruits from anthracnose disease caused by Colletotrichum capsici,BioControl"
output = tokenizer.encode(text)

print(output.ids)
print(output.tokens)
print(tokenizer.decode(output.ids))

[3984, 699, 18, 1446, 55, 941, 626, 43, 1176, 17169, 3003, 12139, 19050, 18559, 624, 18667, 8640, 732, 16689, 8981, 1279, 2624, 709, 27806, 13038, 1043, 16, 17169]
['bacillus', 'sp', '.', 'strain', 'm', '10', 'as', 'a', 'potential', 'biocontrol', 'agent', 'protecting', 'chili', 'pepper', 'and', 'tomato', 'fruits', 'from', 'anthrac', 'nose', 'disease', 'caused', 'by', 'colletotrichum', 'caps', 'ici', ',', 'biocontrol']
bacillus sp . strain m 10 as a potential biocontrol agent protecting chili pepper and tomato fruits from anthrac nose disease caused by colletotrichum caps ici , biocontrol


In [56]:
# Save our tokenizer
tokenizer.save("../../models/tokenizer.json")

## 2. Model Training

### 2.0 Setup

In [57]:
from tokenizers import Tokenizer
tokenizer = Tokenizer.from_file("../../models/tokenizer.json")

### 2.1 TF-IDF Vectorization

TF-IDF (Term Frequency-Inverse Document Frequency) Vectorization is a way to represent words numerically based on how frequent each word appears in a document relative to other documents.

See more:

- https://en.wikipedia.org/wiki/Tf%E2%80%93idf
- https://scikit-learn.org/1.5/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html

In [58]:
from sklearn.feature_extraction.text import TfidfVectorizer

# We wrap the tokenizer in a class to avoid pickling issues
class SerializableTokenizer:
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def __call__(self, text):
        return self.tokenizer.encode(text).tokens

tfidf = TfidfVectorizer(tokenizer=SerializableTokenizer(tokenizer))
X = tfidf.fit_transform(joined_data["title"] + " " + joined_data["abstract"])



In [59]:
joblib.dump(tfidf, "../../models/tfidf_vectorizer.pkl")

['../../models/tfidf_vectorizer.pkl']

### 2.2 Multi-Label Encoding

Since each paper can have more than one subjects, we will use multi-label encoding to encode our subjects

See more:

- https://scikit-learn.org/1.6/modules/generated/sklearn.preprocessing.MultiLabelBinarizer.html

In [60]:
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
y = mlb.fit_transform(joined_data["abbreviation"])

In [61]:
mlb.classes_

array(['AGRI', 'ARTS', 'BIOC', 'BUSI', 'CENG', 'CHEM', 'COMP', 'DECI',
       'DENT', 'EART', 'ECON', 'ENER', 'ENGI', 'ENVI', 'HEAL', 'IMMU',
       'MATE', 'MATH', 'MEDI', 'MULT', 'NEUR', 'NURS', 'PHAR', 'PHYS',
       'PSYC', 'SOCI', 'VETE'], dtype=object)

In [62]:
y

array([[0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 1, 0],
       ...,
       [0, 0, 1, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 0]])

In [63]:
joblib.dump(mlb, "../../models/multilabel_binarizer.pkl")

['../../models/multilabel_binarizer.pkl']

### 2.3 Splitting our data into train and test datasets

Since Scikit-learn doesn't support multi-label stratification, we will use `iterative_train_test_split` from `scikit-multilearn` library

See more:

- https://stackoverflow.com/questions/53378970/how-to-perform-multilabel-stratified-sampling
- http://scikit.ml/api/skmultilearn.model_selection.iterative_stratification.html
- https://github.com/scikit-multilearn/scikit-multilearn/issues/147 (How to set seed for `iterative_train_test_split`)

In [64]:
from skmultilearn.model_selection import iterative_train_test_split
import numpy as np

In [65]:
X.shape, y.shape

((19550, 28825), (19550, 27))

In [66]:
np.random.seed(56164)
X_train, y_train, X_test, y_test = iterative_train_test_split(X, y, test_size=0.2)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((15706, 28825), (15706, 27), (3844, 28825), (3844, 27))

### 2.4 Training the model

We will use multi-label logistic regression from Scikit-learn to estimate the probability of each subject.

Grid search is usedfor hyperparameter tuning.

See more:

- https://scikit-learn.org/dev/modules/generated/sklearn.model_selection.GridSearchCV.html

In [67]:
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [68]:
model = MultiOutputClassifier(
    LogisticRegression(class_weight="balanced", max_iter=1000)
)

In [69]:
grid_search = GridSearchCV(
    estimator=model,
    param_grid={
        "estimator__C": [0.1, 1, 10, 100],
    },
    n_jobs=1,
    verbose=10,
)

In [None]:
grid_search.fit(X_train, y_train)

model = grid_search.best_estimator_

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV 1/5; 1/4] START estimator__C=0.1............................................
[CV 1/5; 1/4] END .............estimator__C=0.1;, score=0.119 total time=  12.2s
[CV 2/5; 1/4] START estimator__C=0.1............................................
[CV 2/5; 1/4] END .............estimator__C=0.1;, score=0.107 total time=  13.6s
[CV 3/5; 1/4] START estimator__C=0.1............................................
[CV 3/5; 1/4] END .............estimator__C=0.1;, score=0.092 total time=   8.9s
[CV 4/5; 1/4] START estimator__C=0.1............................................
[CV 4/5; 1/4] END .............estimator__C=0.1;, score=0.103 total time=  14.0s
[CV 5/5; 1/4] START estimator__C=0.1............................................
[CV 5/5; 1/4] END .............estimator__C=0.1;, score=0.091 total time=  13.2s
[CV 1/5; 2/4] START estimator__C=1..............................................
[CV 1/5; 2/4] END ...............estimator__C=1;,

In [73]:
y_pred = model.predict(X_test)

In [74]:
tfidf.inverse_transform(X_test)

[array(['of', '-', 'and', 'thailand', '’', 's', '©', 'taylor', '&',
        'francis', ',', 'for', 'which', '.', 'the', 'with', 'from', 'in',
        'institutions', 'have', 'this', 'how', 'it', 'on', ':', 'we', 'an',
        'a', "'", 'show', 'development', 'are', 'used', 'as', 'to',
        'teacher', 'perspective', 'autonomy', 'self', 'access', 'theory',
        'perception', 'practice', '2015', 'informa', 'uk', 'limited',
        'trading', 'group', 'considerable', 'resources', 'exp', 'ended',
        'learner', '(', 'la', ')', 'particular', 'provision', 'facilities',
        'one', 'most', 'common', 'ways', 'tried', 'foster', 'whether',
        'intended', 'outcomes', 'achieved', 'depends', 'large', 'part',
        'teachers', 'agreement', 'understanding', 'rationale', 'behind',
        'these', 'efforts', 'mismatch', 'between', 'institutional',
        'objective', 'classroom', 'implementation', 'may', 'example',
        'negatively', 'impact', 'student', 'experience', 'project',

In [75]:
mlb.inverse_transform(y_test)

[('ARTS', 'SOCI'),
 ('COMP', 'ENGI'),
 ('ENGI', 'ENVI', 'SOCI'),
 ('MEDI', 'NEUR'),
 ('CENG', 'CHEM', 'MATE'),
 ('MEDI', 'PSYC'),
 ('DENT', 'SOCI'),
 ('ENER', 'ENGI', 'PHYS'),
 ('EART', 'ENVI'),
 ('MEDI',),
 ('NEUR',),
 ('AGRI', 'EART', 'ENVI'),
 ('COMP', 'MATH', 'SOCI'),
 ('BUSI', 'CENG', 'CHEM', 'MATE'),
 ('ENVI',),
 ('ENGI',),
 ('MEDI',),
 ('BIOC', 'MEDI'),
 ('ENGI', 'MEDI', 'SOCI'),
 ('IMMU', 'VETE'),
 ('AGRI', 'EART'),
 ('ENGI', 'MATH'),
 ('CHEM', 'MATE', 'PHYS'),
 ('HEAL', 'PHAR'),
 ('BIOC', 'IMMU'),
 ('MEDI', 'NEUR'),
 ('PSYC',),
 ('ENVI',),
 ('MEDI',),
 ('NEUR', 'PHAR'),
 ('CHEM', 'ENGI', 'MATE'),
 ('COMP',),
 ('DENT',),
 ('NEUR',),
 ('BIOC', 'ENGI', 'MEDI', 'PHAR'),
 ('HEAL', 'MEDI'),
 ('CENG', 'CHEM', 'MATE'),
 ('ENGI', 'ENVI'),
 ('ARTS', 'MEDI', 'SOCI'),
 ('EART', 'ENGI', 'SOCI'),
 ('BIOC',),
 ('COMP', 'ENGI'),
 ('COMP', 'ENGI'),
 ('BIOC', 'DENT', 'MEDI'),
 ('MEDI',),
 ('EART',),
 ('NEUR', 'PHAR'),
 ('MEDI',),
 ('AGRI', 'BIOC', 'CHEM'),
 ('NEUR',),
 ('MEDI',),
 ('ENGI',),
 (

In [76]:
mlb.inverse_transform(y_pred)

[('ARTS', 'SOCI'),
 ('DECI', 'ENGI'),
 ('ENVI',),
 ('PHAR',),
 ('CHEM', 'ENGI'),
 ('MEDI',),
 ('DENT', 'SOCI'),
 ('AGRI', 'ENER', 'ENGI'),
 ('CHEM', 'SOCI'),
 ('BIOC', 'MEDI'),
 ('NEUR', 'PHAR'),
 ('EART', 'ENGI', 'MATE'),
 ('COMP', 'SOCI'),
 ('CHEM', 'MATE'),
 (),
 (),
 ('MEDI',),
 ('MEDI',),
 ('MEDI',),
 ('IMMU', 'MEDI'),
 ('AGRI',),
 ('ENGI',),
 ('ENGI', 'MATE'),
 ('HEAL', 'MEDI', 'PHAR'),
 ('IMMU',),
 ('BIOC', 'MEDI', 'NEUR', 'PHAR'),
 ('MEDI', 'PSYC'),
 ('CHEM', 'ENGI', 'MATE'),
 ('MEDI',),
 (),
 ('MATE', 'PHYS'),
 ('ENGI',),
 ('DENT',),
 ('MEDI', 'NEUR'),
 ('AGRI',),
 ('MEDI',),
 ('BIOC', 'CHEM', 'MATE'),
 ('ENER',),
 ('MEDI',),
 ('EART', 'ENGI'),
 ('MEDI', 'PHAR'),
 ('COMP', 'MATH'),
 ('COMP', 'ENGI'),
 ('DENT', 'MEDI'),
 ('IMMU', 'MEDI'),
 ('ENVI',),
 ('MEDI', 'NEUR', 'PSYC'),
 ('MEDI',),
 ('AGRI', 'BIOC'),
 ('NEUR', 'PHAR'),
 ('MEDI',),
 ('EART',),
 ('COMP', 'MATH'),
 ('BUSI',),
 ('MATE', 'PHYS'),
 ('MATH',),
 ('MATH',),
 ('AGRI',),
 ('BIOC', 'CHEM', 'PHAR'),
 ('BIOC',),
 ('BU

In [None]:
import pickle

with open("../../models/multilabel_classification_model.pkl", "wb") as file:
    pickle.dump(model, file)

In [43]:
import joblib

joblib.dump({
    "tfidf": tfidf,
    "mlb": mlb,
    "model": model,
}, "../../models/pipeline/pipeline.pkl")

['../../models/pipeline/pipeline.pkl']

## 3. Model Evaluation

### 3.1 Load additional data

In [94]:
subjects = pd.read_csv("../../data/processed/subjects.csv")

In [95]:
subjects.head()

Unnamed: 0,code_prefix,subject_area,supergroup,abbreviation
0,10,Multidisciplinary,Multidisciplinary,MULT
1,11,Agricultural and Biological Sciences,Life Sciences,AGRI
2,12,Arts and Humanities,Social Sciences,ARTS
3,13,"Biochemistry, Genetics and Molecular Biology",Life Sciences,BIOC
4,14,"Business, Management, and Accounting",Social Sciences,BUSI


### 3.2 Evaluation by subject area

In [None]:
# Convert mlb.classes_ (abbreviation) to full subject names
abbreviation_to_subject = subjects.set_index("abbreviation")["subject_area"].to_dict()
full_subject_names = [abbreviation_to_subject[abbreviation] for abbreviation in mlb.classes_]
full_subject_names

['Agricultural and Biological Sciences',
 'Arts and Humanities',
 'Biochemistry, Genetics and Molecular Biology',
 'Business, Management, and Accounting',
 'Chemical Engineering',
 'Chemistry',
 'Computer Science',
 'Decision Sciences',
 'Dentistry',
 'Earth and Planetary Sciences',
 'Economics, Econometrics and Finance',
 'Energy',
 'Engineering',
 'Environmental Science',
 'Health Professions',
 'Immunology and Microbiology',
 'Materials Science',
 'Mathematics',
 'Medicine',
 'Multidisciplinary',
 'Neuroscience',
 'Nursing',
 'Pharmacology, Toxicology, and Pharmaceutics',
 'Physics and Astronomy',
 'Psychology',
 'Social Sciences',
 'Veterinary']

In [91]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred, target_names=full_subject_names))

                                              precision    recall  f1-score   support

        Agricultural and Biological Sciences       0.57      0.55      0.56       373
                         Arts and Humanities       0.67      0.48      0.56        93
Biochemistry, Genetics and Molecular Biology       0.42      0.38      0.40       480
        Business, Management, and Accounting       0.56      0.53      0.55       118
                        Chemical Engineering       0.49      0.46      0.47       330
                                   Chemistry       0.55      0.57      0.56       430
                            Computer Science       0.68      0.64      0.66       360
                           Decision Sciences       0.32      0.36      0.34        70
                                   Dentistry       0.88      0.82      0.85        88
                Earth and Planetary Sciences       0.65      0.55      0.60       139
         Economics, Econometrics and Finance       0.

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


From the confusion matrix, our weighted F1 score is 0.56. This means that our model is decent at predicting the subjects of the papers.

We can see that the model is decent at predicting subject areas such as `Dentistry`, `Medicine`, `Veterinary`, `Computer Science`, and `Physics and Astronomy`.

However, the model is not good at predicting subject areas such as `Psychology`, `Pharmacology, Toxicology, and Pharmaceutics`, `Decision Sciences`, and `Health Professions`.

### 3.3 Evaluation by supergroups

For each supergroup, if there are any subject areas present in a paper, then the supergroup is considered present in the paper.

In [164]:
supergroups = (
    subjects
    .set_index("abbreviation")
    .loc[mlb.classes_]
    .groupby("supergroup")
    .apply(lambda x: x.index.tolist())
    .to_dict()
)
supergroups

  .apply(lambda x: x.index.tolist())


{'Health Sciences': ['DENT', 'HEAL', 'MEDI', 'NURS', 'VETE'],
 'Life Sciences': ['AGRI', 'BIOC', 'IMMU', 'NEUR', 'PHAR'],
 'Multidisciplinary': ['MULT'],
 'Physical Sciences': ['CENG',
  'CHEM',
  'COMP',
  'EART',
  'ENER',
  'ENGI',
  'ENVI',
  'MATE',
  'MATH',
  'PHYS'],
 'Social Sciences': ['ARTS', 'BUSI', 'DECI', 'ECON', 'PSYC', 'SOCI']}

In [167]:
# Convert y_test and y_pred to subject supergroups and calculate the classification report
def convert_to_supergroups(y):
    y = pd.DataFrame(y, columns=mlb.classes_)
    y_converted = pd.DataFrame(0, index=y.index, columns=supergroups.keys())
    for supergroup, subjects in supergroups.items():
        for subject in subjects:
            y_converted[supergroup] |= y[subject]
    return y_converted

y_test_converted = convert_to_supergroups(y_test)
y_pred_converted = convert_to_supergroups(y_pred)

print(classification_report(y_test_converted, y_pred_converted, target_names=supergroups.keys()))

                   precision    recall  f1-score   support

  Health Sciences       0.85      0.80      0.82      1264
    Life Sciences       0.65      0.58      0.61      1073
Multidisciplinary       0.68      0.33      0.44       217
Physical Sciences       0.89      0.85      0.87      1842
  Social Sciences       0.71      0.63      0.67       484

        micro avg       0.80      0.73      0.77      4880
        macro avg       0.76      0.64      0.68      4880
     weighted avg       0.80      0.73      0.76      4880
      samples avg       0.78      0.78      0.76      4880



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


For supergroups, our weighted F1 score is 0.76, which means that our model is good at predicting the supergroups.

The model is good at predicting most supergroups, but not good at predicting `Multidisciplinary` and `Social Sciences`.

## 4. Model Usage

In [70]:
import joblib

pipeline = joblib.load("../../models/pipeline/pipeline.pkl")
tfidf, mlb, model = pipeline["tfidf"], pipeline["mlb"], pipeline["model"]

In [155]:
predict_title = """Ni-Co Double Hydroxide Grown on Graphene Oxide for Enhancing Lithium Ion Storage
© It still remains a major challenge to maintain the excellent lithium ion-storage performance of
layered double hydroxide (LDH) for lithium ion battery anodes at high current density and long
cycle times. Herein, a hybrid material was developed by anchoring Co/Ni/ LDH (CoNi-LDH) on the
surface of graphene oxide (GO) to obtain good structural stability. During the growth process, the
β phase with a low value (4.6 Å) of interlayer spacing was converted to the α phase with a high value
(8.1 Å) of interlayer spacing and the sheet became thinner than that of CoNi-LDH alone. X-ray
photoelectron spectroscopy (XPS) results demonstrate the formation of the CO chemical bond between
the ultra thin CoNi-LDH and GO, which increased the structural stability and effectively reduced the
charge-transfer impedance. As expected, the long-life cycling stability with a specific capacity of
440 mAh/g for 500 cycles was achieved at a high current density of 5 A/g, indicating that the
structural degradation of CoNi-LDH was successfully suppressed by the introduction of GO.
"""

X_predict = tfidf.transform([predict_title])
y_predict = model.predict(X_predict)
mlb.inverse_transform(y_predict)

[('CENG', 'CHEM', 'ENER', 'MATE')]

In [163]:
probabilities = model.predict_proba(X_predict)
{
    subject_area: float(probability[0][1])
    for subject_area, probability in zip(full_subject_names, probabilities)
}

{'Agricultural and Biological Sciences': 0.0028050570641667306,
 'Arts and Humanities': 4.116358407808053e-05,
 'Biochemistry, Genetics and Molecular Biology': 0.0019116982194806026,
 'Business, Management, and Accounting': 0.0006206657092079128,
 'Chemical Engineering': 0.7109534880361282,
 'Chemistry': 0.9988303944636827,
 'Computer Science': 0.00018565409264138062,
 'Decision Sciences': 2.987030427985418e-06,
 'Dentistry': 6.38205655958963e-05,
 'Earth and Planetary Sciences': 0.0035297231252291723,
 'Economics, Econometrics and Finance': 3.8870461896453145e-05,
 'Energy': 0.9951090569422418,
 'Engineering': 0.026897189669864898,
 'Environmental Science': 0.015459700395244634,
 'Health Professions': 3.251734231140753e-05,
 'Immunology and Microbiology': 0.001037387075794663,
 'Materials Science': 0.8725897048945477,
 'Mathematics': 4.89537277031425e-06,
 'Medicine': 0.003166992889501145,
 'Multidisciplinary': 0.007705735810783339,
 'Neuroscience': 3.1656192802864e-05,
 'Nursing': 1.