# Article Selection Model Comparison: Logistic Regression vs DistilBERT

In [1]:
# import libraries
import pandas as pd
import re
import string
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, roc_auc_score, roc_curve, accuracy_score

from transformers import DistilBertTokenizer, Trainer, TrainingArguments, DataCollatorWithPadding
from datasets import Dataset


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\redinger\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [2]:
# Import csv files
file_path = 'hypopthalmichthys_selected_articles.csv'
carp_file = pd.read_csv(file_path, dtype = str)

# Create new encoding for category column
label_encoder = LabelEncoder()
carp_file['encoding'] = label_encoder.fit_transform(carp_file['categories'])


#### Data Preparation & Text Processing

In [6]:
# Create copy
carp_clean = carp_file.copy()

# Create new encoding for category column
label_encoder = LabelEncoder()
carp_clean['encoding'] = label_encoder.fit_transform(carp_clean['categories'])

# Balance encoding categories
def balance_df(df):

    # Count the number of rows in each category
    category_counts = df['encoding'].value_counts()

    # Find the minority category
    minority_category = category_counts.idxmin()

    # Get the size of the minority category
    minority_category_size = category_counts[minority_category]

    # Sample rows from the majority category to match the size of the minority category
    majority_category_rows = df[df['encoding'] != minority_category]
    balanced_majority_category_rows = majority_category_rows.sample(n=minority_category_size, random_state=42)

    # Get the minority category rows
    minority_category_rows = df[df['encoding'] == minority_category]

    # Concatenate the minority and balanced majority category rows
    balanced_df = pd.concat([minority_category_rows, balanced_majority_category_rows])

    return(balanced_df)
    

# Apply function
carp_clean = balance_df(carp_clean)

# Concatenate Title and Abstract
carp_clean["TitleAbstract"] = carp_clean["Title"] + ' ' + carp_clean["Abstract"]


In [7]:
# Get the set of English stopwords
stop_words = set(stopwords.words('english'))

# Create lemmenizer
lemmatizer = WordNetLemmatizer()

# Function to remove stopwords from text
def remove_stopwords(text):
    # Tokenize the text
    tokens = nltk.word_tokenize(text)
    # Filter out stopwords
    filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
    # Reconstruct the text without stopwords
    text_without_stopwords = ' '.join(filtered_tokens)
    return text_without_stopwords

    
# Function to clean text
def clean_text(text):
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r"\s+", " ", text)
    text = text.strip()
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'[^\w\s]', '', text)
    text = remove_stopwords(text)
    text = " ".join([lemmatizer.lemmatize(word) for word in text.split()])
    # text = re.sub(r'\d+', '', text)
    return text
    
carp_clean['Title'] = carp_clean['Title'].apply(clean_text)
carp_clean['Abstract'] = carp_clean['Abstract'].apply(clean_text)
carp_clean['TitleAbstract'] = carp_clean['TitleAbstract'].apply(clean_text)


#### Train-Test Splitting

In [8]:
# Split the data into training and test sets
X_train_title, X_test_title, y_train, y_test = train_test_split(carp_clean['Title'], carp_clean['encoding'], test_size=0.2, random_state=42)
X_train_abstract, X_test_abstract, _, _ = train_test_split(carp_clean['Abstract'], carp_clean['encoding'], test_size=0.2, random_state=42)
X_train_combined, X_test_combined, _, _ = train_test_split(carp_clean['TitleAbstract'], carp_clean['encoding'], test_size=0.2, random_state=42)


#### Tokenization for DistilBert

In [9]:
# Load tokenizer
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [13]:
X_train_title

510    Extraction property acidsoluble collagen pepsi...
22     Histological study sex differentiation bighead...
571    facilitation native bluegill sunfish invasive ...
46     Filterfeeding fish Hypophthalmichthys molitrix...
153    Demographic rate variability bighead silver ca...
                             ...                        
106    Nonnative silver carp fail generalize behavior...
547    Acquirement hrp conjunct igg antiigms widely c...
531    Novel colorimetric film based starchpolyvinyl ...
416    Lengthweight relationship five native fish spe...
102    Identification micrornas silver carp Hypophtha...
Name: Title, Length: 372, dtype: object

In [11]:
X_train_title = X_train_title.to_frame(name='text')  # Replace 'text' with the appropriate column name
X_test_title = X_test_title.to_frame(name='text')

pandas.core.series.Series

In [12]:
# Convert datasets to tokenized format
train_title_dataset = Dataset.from_pandas(X_train_title.to_frame)
test_title_dataset = Dataset.from_pandas(X_test_title.to_frame)

AttributeError: 'function' object has no attribute 'columns'

In [None]:
def tokenize_data(examples):
    return tokenizer(examples["cleaned_text"], truncation=True)

tokenized_title_train = train_title_dataset.map(tokenize_data, batched=True)
tokenized_title_test = test_title_dataset.map(tokenize_data, batched=True)

#### Training DistilBert Model

In [None]:
# Load pre-trained DistilBERT model for sequence classification
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

# Prepare data collator for padding sequences
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Define training arguments
training_args = TrainingArguments(
    output_dir = "./results",
    learning_rate = 2e-4,
    per_device_train_batch_size = 8,
    per_device_eval_batch_size = 8,
    num_train_epochs = 5,
    weight_decay = 0.01,
    evaluation_strategy = "epoch",
    logging_strategy = "epoch"
)

# Define Trainer object for training the model
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = tokenized_title_train,
    eval_dataset = tokenized_title_test,
    tokenizer = tokenizer,
    data_collator = data_collator,
)

# Train the model
trainer.train()

#### Evaluating Model

In [None]:
# Evaluate the model
eval_results = trainer.evaluate()

# Print the evaluation results
print(eval_results)

In [None]:
# Make predictions
predictions = trainer.predict(test_dataset)

# Extract predictions and true labels
preds = predictions.predictions.argmax(-1)
labels = predictions.label_ids

# Calculate detailed metrics
accuracy = accuracy_score(labels, preds)
precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')

# Print detailed metrics
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1-score: {f1}')