In [48]:
# import libraries
import pandas as pd
import re
import string
import matplotlib.pyplot as plt
import pickle
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score, roc_curve, accuracy_score

from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
from datasets import Dataset
import torch

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jredi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Import csv files
import os
os.chdir('..')
df = pd.read_csv(r'data/processed/train-test.csv', dtype = str)
df.head()

Unnamed: 0,Author,Year,Title,Journal Name,Volume,Issue,Pages,Abstract,categories,TitleAbstract
0,"Aldridge, C. A., and E. C. Boone",2022,Simple models to quickly estimate the probable...,River Research and Applications,38,6.0,1154-1166,Species distribution models provide biologists...,1,simpl model quick estim probabl rang datalimit...
1,"Banan, A., A. Nasiri, and A. Taheri-Garavand",2020,Deep learning-based appearance features extrac...,Aquacultural Engineering,89,,,Fish species identification is vital for aquac...,1,deep learningbas appear featur extract autom c...
2,"Barnes, M. A., W. L. Chadderton, C. L. Jerde, ...",2021,Environmental conditions influence edna partic...,Environmental DNA,3,3.0,643-653,Knowledge about the size of environmental DNA ...,1,environment condit influenc edna particl size ...
3,"Behera, B. K., A. K. Bera, P. Paria, A. Das, P...",2018,Identification and pathogenicity of plesiomona...,Aquaculture,493,,314-318,Plesiomonas shigelloides was isolated from dis...,1,identif pathogen plesiomona shigelloid silver ...
4,"Borland, L. K., C. J. Mulcahy, B. A. Bennie, D...",2020,Using markov chains to quantitatively assess m...,Natural Resource Modeling,33,4.0,,Natural resource managers use barriers to dete...,1,use markov chain quantit assess movement patte...


In [3]:
# Create balanced datasets for article selection
balanced_dfs = []

# Count the number of rows in each category
category_counts = df['categories'].value_counts()

# Find the minority category
minority_category = category_counts.idxmin()

# Get the size of the minority category
minority_category_size = category_counts[minority_category]

# Sample rows from the majority category to match the size of the minority category
majority_category_rows = df[df['categories'] != minority_category]
balanced_majority_category_rows = majority_category_rows.sample(n=minority_category_size, random_state=42)

# Get the minority category rows
minority_category_rows = df[df['categories'] == minority_category]

# Concatenate the minority and balanced majority category rows
balanced_df = pd.concat([minority_category_rows, balanced_majority_category_rows])

balanced_dfs.append(balanced_df)

In [4]:
balanced_df["categories"].value_counts()

categories
1    233
0    233
Name: count, dtype: int64

In [87]:
# Train test split
X_train, X_test, y_train, y_test = train_test_split(balanced_df['TitleAbstract'], balanced_df['categories'], test_size = 0.2, random_state = 0)

In [88]:
# Load the logistic regression model
with open('models/ml_model.pkl', 'rb') as f:
    lr_model, vectorizer = pickle.load(f)

In [89]:
# Vectorize
# vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [90]:
# Analyze output
y_pred = lr_model.predict(X_test_vec)
y_pred = y_pred.astype(str)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average ='weighted', zero_division = 0)
recall = recall_score(y_test, y_pred, average ='weighted')
f1 = f1_score(y_test, y_pred, average ='weighted')
lr_metrics = ['LR', accuracy, precision, recall, f1]

In [65]:
# Prepare for DistilBERT
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
X_train_token = tokenizer(X_train.tolist(), padding='max_length', truncation=True, max_length=512, return_tensors='pt')
X_test_token = tokenizer(X_test.tolist(), padding='max_length', truncation=True, max_length=512, return_tensors='pt')



loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at C:\Users\jredi/.cache\huggingface\transformers\23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.17.0",
  "vocab_size": 30522
}

loading file https://huggingface.co/distilbert-base-uncased/resolve/main/vocab.txt from cache at C:\Users\jredi/.cache\huggingface\transformers\0e1bbfda7f6

In [68]:
# Convert labels to integers and then to lists
y_train = y_train.astype(int).tolist()
y_test = y_test.astype(int).tolist()

In [69]:
# Convert tokenized data to Dataset format
train_dataset = Dataset.from_dict({
    'input_ids': X_train_token['input_ids'],
    'attention_mask': X_train_token['attention_mask'],
    'labels': y_train
})

test_dataset = Dataset.from_dict({
    'input_ids': X_test_token['input_ids'],
    'attention_mask': X_test_token['attention_mask'],
    'labels': y_test
})

In [53]:
# Encode abstracts
bert_model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels = 2)

loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at C:\Users\jredi/.cache\huggingface\transformers\23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.17.0",
  "vocab_size": 30522
}

loading weights file https://huggingface.co/distilbert-base-uncased/resolve/main/pytorch_model.bin from cache at C:\Users\jredi/.cache\huggingface\transfor

In [54]:
# Prepare data collator for padding sequences
data_collator = DataCollatorWithPadding(tokenizer = tokenizer)

In [70]:
# Define training arguments
training_args = TrainingArguments(
    output_dir = "./results",
    learning_rate = 2e-4,
    per_device_train_batch_size = 8,
    per_device_eval_batch_size = 8,
    num_train_epochs = 5,
    weight_decay = 0.01,
    evaluation_strategy = "epoch"
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [71]:
# Define Trainer object for training the model
trainer = Trainer(
    model = bert_model,
    args = training_args,
    train_dataset = train_dataset,
    eval_dataset = test_dataset,
    tokenizer = tokenizer,
    data_collator = data_collator,
)

In [91]:
# Train and evaluate the model
trainer.train()
eval_results = trainer.evaluate()

***** Running training *****
  Num examples = 372
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 235


KeyboardInterrupt: 

In [77]:
eval_results = trainer.evaluate()

***** Running Evaluation *****
  Num examples = 94
  Batch size = 8





[A[A[A[A[A




[A[A[A[A[A




[A[A[A[A[A




[A[A[A[A[A




[A[A[A[A[A




[A[A[A[A[A




[A[A[A[A[A




[A[A[A[A[A




[A[A[A[A[A




[A[A[A[A[A




[A[A[A[A[A




19it [03:33, 11.23s/it]


In [78]:
eval_results


{'eval_loss': 0.6950927376747131,
 'eval_runtime': 55.6613,
 'eval_samples_per_second': 1.689,
 'eval_steps_per_second': 0.216,
 'epoch': 5.0}

In [81]:
# Evaluate the best models on the test set
predictions = trainer.predict(test_dataset)
y_pred = predictions.predictions.argmax(axis=1)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average ='weighted', zero_division = 0)
recall = recall_score(y_test, y_pred, average ='weighted')
f1 = f1_score(y_test, y_pred, average ='weighted')
bert_metrics = ['distilBERT', accuracy, precision, recall, f1]


***** Running Prediction *****
  Num examples = 94
  Batch size = 8
24it [02:04,  4.33s/it]                        

In [92]:
column_names = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1']
result_df = pd.DataFrame([lr_metrics, bert_metrics], columns = column_names)
print(result_df)

        Model  Accuracy  Precision    Recall        F1
0          LR  0.755319   0.757025  0.755319  0.755402
1  distilBERT  0.478723   0.229176  0.478723  0.309965
