In [30]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load & Preprocess Data

In [2]:
df = pd.read_csv("/kaggle/input/arxiv-scientific-research-papers-dataset/arXiv_scientific dataset.csv")
df.head()

Unnamed: 0,id,title,category,category_code,published_date,updated_date,authors,first_author,summary,summary_word_count
0,cs-9308101v1,Dynamic Backtracking,Artificial Intelligence,cs.AI,8/1/93,8/1/93,['M. L. Ginsberg'],'M. L. Ginsberg',Because of their occasional need to return to ...,79
1,cs-9308102v1,A Market-Oriented Programming Environment and ...,Artificial Intelligence,cs.AI,8/1/93,8/1/93,['M. P. Wellman'],'M. P. Wellman',Market price systems constitute a well-underst...,119
2,cs-9309101v1,An Empirical Analysis of Search in GSAT,Artificial Intelligence,cs.AI,9/1/93,9/1/93,"['I. P. Gent', 'T. Walsh']",'I. P. Gent',We describe an extensive study of search in GS...,167
3,cs-9311101v1,The Difficulties of Learning Logic Programs wi...,Artificial Intelligence,cs.AI,11/1/93,11/1/93,"['F. Bergadano', 'D. Gunetti', 'U. Trinchero']",'F. Bergadano',As real logic programmers normally use cut (!)...,174
4,cs-9311102v1,Software Agents: Completing Patterns and Const...,Artificial Intelligence,cs.AI,11/1/93,11/1/93,"['J. C. Schlimmer', 'L. A. Hermens']",'J. C. Schlimmer',To support the goal of allowing users to recor...,187


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 136238 entries, 0 to 136237
Data columns (total 10 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   id                  136238 non-null  object
 1   title               136238 non-null  object
 2   category            136238 non-null  object
 3   category_code       136238 non-null  object
 4   published_date      136238 non-null  object
 5   updated_date        136238 non-null  object
 6   authors             136238 non-null  object
 7   first_author        136238 non-null  object
 8   summary             136238 non-null  object
 9   summary_word_count  136238 non-null  int64 
dtypes: int64(1), object(9)
memory usage: 10.4+ MB


In [4]:
df.isnull().sum()

id                    0
title                 0
category              0
category_code         0
published_date        0
updated_date          0
authors               0
first_author          0
summary               0
summary_word_count    0
dtype: int64

In [5]:
df.columns

Index(['id', 'title', 'category', 'category_code', 'published_date',
       'updated_date', 'authors', 'first_author', 'summary',
       'summary_word_count'],
      dtype='object')

In [6]:
#Drop irrelevant features
df = df.drop(['id', 'title','published_date','category_code',
       'updated_date', 'authors', 'first_author','summary_word_count'], axis=1)
df.head()

Unnamed: 0,category,summary
0,Artificial Intelligence,Because of their occasional need to return to ...
1,Artificial Intelligence,Market price systems constitute a well-underst...
2,Artificial Intelligence,We describe an extensive study of search in GS...
3,Artificial Intelligence,As real logic programmers normally use cut (!)...
4,Artificial Intelligence,To support the goal of allowing users to recor...


In [7]:
len(df["category"].value_counts())

138

In [8]:
import tensorflow as tf
import pandas as pd
import numpy as np
from transformers import BertTokenizer, TFBertForSequenceClassification
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [9]:
# Encode labels
label_encoder = LabelEncoder()
df["category_encoded"] = label_encoder.fit_transform(df["category"])
num_labels = len(label_encoder.classes_)

In [10]:
# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenize abstracts
def tokenize_function(texts):
    return tokenizer(texts, padding="max_length", truncation=True, max_length=128, return_tensors="tf")

# Convert dataset to tokenized form
X = tokenize_function(df["summary"].tolist())
y = df["category_encoded"].values

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [11]:
# Convert input tensors to NumPy arrays
X_input_ids = np.array(X["input_ids"])
X_attention_mask = np.array(X["attention_mask"])

In [12]:
# Split dataset
X_train_ids, X_test_ids, X_train_mask, X_test_mask, y_train, y_test = train_test_split(
    X_input_ids, X_attention_mask, y, test_size=0.2, random_state=42
)

In [13]:
# Convert to TensorFlow tensors
train_dataset = tf.data.Dataset.from_tensor_slices(
    ({"input_ids": X_train_ids, "attention_mask": X_train_mask}, y_train)
).batch(64)

test_dataset = tf.data.Dataset.from_tensor_slices(
    ({"input_ids": X_test_ids, "attention_mask": X_test_mask}, y_test)
).batch(64)

#  Load Pre-trained BERT Model

In [14]:
model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Compile & Train Model

In [15]:
# Compile the model
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=optimizer, loss=loss, metrics=["accuracy"])
model.summary()

Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  109482240 
                                                                 
 dropout_37 (Dropout)        multiple                  0 (unused)
                                                                 
 classifier (Dense)          multiple                  106122    
                                                                 
Total params: 109588362 (418.05 MB)
Trainable params: 109588362 (418.05 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [16]:
# Train the model
history = model.fit(train_dataset, validation_data=test_dataset, epochs= 3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [24]:
# Get raw model predictions (logits)
y_pred_logits = model.predict(test_dataset).logits



In [25]:
# Convert logits to class predictions
y_pred = tf.argmax(y_pred_logits, axis=1).numpy()  # Picks the class with the highest score

In [27]:
from sklearn.metrics import confusion_matrix, classification_report

In [32]:
# Print classification report
print("Classification Report:\n", classification_report(y_true, y_pred))

Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.00      0.00      0.00         7
           2       0.00      0.00      0.00         1
           4       0.00      0.00      0.00         2
           5       0.00      0.00      0.00        54
           6       0.00      0.00      0.00         2
           7       0.58      0.68      0.63      2594
           8       0.00      0.00      0.00         2
          10       0.00      0.00      0.00         4
          11       0.50      0.03      0.05        37
          13       0.00      0.00      0.00        13
          17       0.00      0.00      0.00         1
          18       0.00      0.00      0.00        13
          20       0.00      0.00      0.00         3
          22       0.00      0.00      0.00        51
          23       0.60      0.65      0.62       145
          24       0.85      0.93      0.89      5042
   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
