<a href="https://colab.research.google.com/github/NLP4/Investigating-the-Versatility-of-SPECTER/blob/main/Temporal%20weights%20classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#connection to Kaggle API
!pip install -q kaggle
!mkdir ~/.kaggle
from google.colab import files
files.upload()

In [None]:
# Copy the uploaded kaggle.json from the working directory into ~/.kaggle/ and
# restrict it to owner read/write (mode 600).
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
#download the arxiv dataset
!kaggle datasets download -d Cornell-University/arxiv
!chmod 600 ~/arxiv-metadata-oai-snapshot.json
!unzip arxiv.zip

In [None]:
import pandas as pd
import numpy as np
import json

In [None]:
#Using `yield` to load the JSON file in a loop to prevent Python memory issues if JSON is loaded directly
data_file = '/content/arxiv-metadata-oai-snapshot.json'

def get_metadata():
    """Lazily yield the raw lines of ``data_file``, one per iteration.

    The file is opened only when iteration starts and is closed when the
    generator is exhausted or garbage-collected. Each yielded string keeps its
    trailing newline; callers are expected to parse it themselves (the cells
    below use ``json.loads``).

    Yields:
        str: one line of the snapshot file.
    """
    # Decode explicitly as UTF-8 so reading does not depend on the platform's
    # default locale encoding.
    with open(data_file, 'r', encoding='utf-8') as f:
        yield from f

metadata = get_metadata()


In [None]:
#extracting the year from the 'journal-ref' column and creating separate list for each column
# A record is skipped (never fatal) when its 'journal-ref' is missing or not a string,
# does not end in a 4-digit year, or when one of the required fields is absent.
metadata = get_metadata()
ids = []
titles = []
abstracts = []
categories = []
years = []
for paper in metadata:
    metaDict = json.loads(paper)
    try:
        journal_ref = metaDict['journal-ref']
        try:
            year = int(journal_ref[-4:])    ### Example Format: "Phys.Rev.D76:013009,2007"
        except ValueError:
            year = int(journal_ref[-5:-1])    ### Example Format: "Phys.Rev.D76:013009,(2007)"
        # Read every field before appending anything, so the five lists can never end
        # up with different lengths when a key is missing half-way through.
        record = (metaDict['id'], metaDict['title'], metaDict['abstract'], metaDict['categories'])
    except (KeyError, TypeError, ValueError):
        # KeyError: a field is absent; TypeError: journal-ref is not sliceable
        # (e.g. null in the JSON); ValueError: no parsable trailing year.
        continue

    paper_id, title, abstract, category = record
    ids.append(paper_id)
    titles.append(title)
    abstracts.append(abstract)
    categories.append(category)
    years.append(year)

In [None]:
#creating a data frame from the lists with the corresponding columns and the year included
# Column order follows the insertion order of this mapping.
columns = {
    'id': ids,
    'Title': titles,
    'Abstract': abstracts,
    'Year': years,
    'Categories': categories,
}
df = pd.DataFrame(columns)
df.head()

In [None]:
# Print the earliest and the latest parsed year of the full data frame.
year_column = df['Year']
min_year, max_year = year_column.min(), year_column.max()
print(min_year)
print(max_year)

In [None]:
#selecting only papers published strictly after 1990 and strictly before 2022 (i.e. 1991-2021)
# .copy() makes df1 an independent frame: the relabelling cell writes into it with .loc,
# which would otherwise act on a slice of df and trigger SettingWithCopyWarning.
df1 = df[(df['Year'] > 1990) & (df['Year'] < 2022)].copy()

In [None]:
#renaming categories with more representable names
# Every rule is (readable label, regex searched in the raw arXiv category string).
# Rules are tried in order and the first one that matches a paper wins.
phys = ['ph', 'mat', 'qc', 'hep', 'nlin', 'nucl', 'physics', 'quant']
pattern = '|'.join(phys)
category_rules = [
    ('Maths', 'math'),
    ('Physics', pattern),
    ('Biology', 'bio'),
    ('Finance', 'fin'),
    ('Statistics', 'stat'),
    ('Computer Science', 'cs'),
    ('Economics', 'econ'),
    ('Electrical Engineering and Systems Science', 'eess'),
]

# Match against a snapshot of the ORIGINAL strings. Matching against the column while
# it is being rewritten lets later patterns hit the new labels: 'cs' is a substring of
# both 'Physics' and 'Statistics', so those rows were all turned into 'Computer Science'.
raw_categories = df1['Categories'].copy()
unlabelled = pd.Series(True, index=df1.index)
for label, regex in category_rules:
    mask = raw_categories.str.contains(regex) & unlabelled
    df1.loc[mask, 'Categories'] = label
    # Rows labelled by this rule are excluded from all later rules.
    unlabelled &= ~mask

In [None]:
# Count how many rows of df1 carry each category label and show the counts.
papers_by_category = df1.groupby('Categories')
group_sizes = papers_by_category.size()
group_sizes

In [None]:
# Show df1 as the cell output to inspect the year-filtered frame.
df1

In [None]:
#new dataset with equal number of papers from each category
n_samples = 1000

def _sample_category(group):
    """Return up to ``n_samples`` rows of one category group (all rows if it is smaller)."""
    return group.sample(n=min(n_samples, len(group)), random_state=42)

# Sample per category, then shuffle the combined rows and renumber the index.
grouped = df1.groupby('Categories', group_keys=False).apply(_sample_category)
df2 = grouped.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
#randomly sampling papers
# Seeded like the other sampling steps so the subset (and every result derived from it)
# is reproducible; the size is capped so a df2 with fewer than 3000 rows does not raise.
df3 = df2.sample(n=min(3000, len(df2)), random_state=42)


In [None]:
# Show df3 (the random subset drawn from df2) as the cell output.
df3

In [None]:
#representation of the categories
# drop() returns a new frame, so df3 keeps its 'id' column and this cell can be re-run.
# (Assigning df4 = df3 and dropping in place removed 'id' from df3 as well and raised
# KeyError on a second run.)
df4 = df3.drop(columns='id')
# First row of every category, as one example paper per label.
groups = df4.groupby('Categories').first()
groups

In [None]:
# Count how many sampled papers fall into each category and show the counts.
sampled_by_category = df3.groupby('Categories')
group_sizes = sampled_by_category.size()
group_sizes

In [None]:
# Print the earliest and the latest year present in the sampled subset.
sampled_years = df3['Year']
min_year, max_year = sampled_years.min(), sampled_years.max()
print(min_year)
print(max_year)

In [None]:
#distribution of number of papers in different years
import matplotlib.pyplot as plt
# value_counts() orders by frequency; sort_index() puts the bars in chronological order
# so the x-axis reads as a timeline.
year_counts = df3['Year'].value_counts().sort_index()

year_counts.plot(kind='bar')
plt.xlabel('Year')
plt.ylabel('Count')
plt.title('Distribution of papers in different years')
# Keep the 0-400 range, but extend it when a year has more papers so no bar is clipped.
plt.ylim(0, max(400, year_counts.max()))
plt.show()

In [None]:
%%capture
# Install extra packages; %%capture hides pip's output for this cell.
# NOTE(review): of these, only `transformers` is imported in the cells visible here —
# confirm that `datasets` and `rouge_score` are still needed.
!pip install datasets==1.2.1
!pip install transformers
!pip install rouge_score

In [None]:
# NOTE: everything between the triple quotes below is a plain string literal, so the
# imports, the NLTK downloads and preprocess() inside it are never executed or defined.
"""
#preprocessing function that removes removes URLs and email addresses, removes non-alphanumeric characters and converts the text to lowercase, tokenizes the text into words, removes stop words, and re-joins the words into a single string
import re
import nltk
nltk.download('punkt')
nltk.download('stopwords')

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Preprocess the text data
def preprocess(text):
    # Remove any URLs or email addresses
    text = re.sub(r'\S+@\S+', '', text)
    text = re.sub(r'http\S+', '', text)

    # Remove any non-alphanumeric characters and convert to lowercase
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text).lower()

    # Tokenize the text into words
    words = word_tokenize(text)

    # Remove any stop words
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if not word in stop_words]

    # Rejoin the words into a single string
    text = ' '.join(words)

    return text
"""

In [None]:
#using SPECTER to compute the embeddings
import torch
from transformers import AutoTokenizer, AutoModel


# Fetch the pretrained SPECTER checkpoint together with its tokenizer
model_name = "allenai/specter"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Papers are encoded in consecutive slices of `batch_size` rows
batch_size = 12
num_batches = (len(df3) + batch_size - 1) // batch_size

embeddings = []
for start_idx in range(0, len(df3), batch_size):
    # iloc clips at the end of the frame, so the last slice may be shorter
    batch = df3.iloc[start_idx:start_idx + batch_size]

    # One input string per paper: title, the tokenizer's separator token, abstract
    inputs = [
        f"{title} {tokenizer.sep_token} {abstract}"
        for title, abstract in zip(batch['Title'], batch['Abstract'])
    ]

    # Pad to the longest item of the batch, truncate at 512 tokens, and move the
    # tensors to the device the model lives on
    encoded_inputs = tokenizer(inputs, padding=True, truncation=True, max_length=512, return_tensors='pt')
    padded_inputs = {name: tensor.to(model.device) for name, tensor in encoded_inputs.items()}

    # No gradients are needed; keep the hidden state at position 0 of every sequence
    with torch.no_grad():
        first_token_states = model(**padded_inputs).last_hidden_state[:, 0, :]

    embeddings.append(first_token_states.cpu().numpy())

# Stack the per-batch arrays into one (papers x hidden size) matrix
embeddings = np.concatenate(embeddings, axis=0)

**No Weights In The Classification Process**

In [None]:
#splitting the data
from sklearn.model_selection import train_test_split

# Hold out 20% of the papers for testing; the fixed seed makes the split repeatable.
splits_NW = train_test_split(embeddings, df3['Categories'], test_size=0.2, random_state=42)
X_trainNW, X_testNW, y_trainNW, y_testNW = splits_NW

In [None]:
from sklearn.svm import LinearSVC

# Fit a linear SVM on the unweighted training embeddings; the trailing expression
# shows the fitted estimator as the cell output.
clfNW = LinearSVC().fit(X_trainNW, y_trainNW)
clfNW

In [None]:
from sklearn.metrics import classification_report

# Predict the held-out papers and print the per-category metrics.
y_predNW = clfNW.predict(X_testNW)
report_NW = classification_report(y_testNW, y_predNW)
print(report_NW)

**Harmonic Weights**

In [None]:
#weights with harmonic function
# weight = 1 / (1 + |year - centre|), where centre is the midpoint between the earliest
# and latest year in df3: papers at the centre get 1, the weight falls off on both sides.
min_year, max_year = df3['Year'].min(), df3['Year'].max()
center_year = (min_year + max_year) / 2
distance_from_center = (df3['Year'] - center_year).abs()
weightsH = (1 / (1 + distance_from_center)).to_numpy()



In [None]:
#splitting the data
from sklearn.model_selection import train_test_split

# Split embeddings, labels and harmonic weights together so the rows stay aligned.
splits_H = train_test_split(embeddings, df3['Categories'], weightsH, test_size=0.2, random_state=42)
X_trainH, X_testH, y_trainH, y_testH, weights_trainH, weights_testH = splits_H

In [None]:
#the weights array had to be reshaped to be able to multiply it with the training data
# As a column vector the weights broadcast across the feature axis, so every embedding
# row is scaled by its own weight.
# NOTE(review): this rescales the features themselves rather than weighting samples
# during fitting — confirm that is the intended experiment.
weights_trainH = np.reshape(weights_trainH, (-1, 1))
X_train_weightedH = np.multiply(X_trainH, weights_trainH)

In [None]:
from sklearn.svm import LinearSVC

# Fit a fresh linear SVM on the harmonically scaled training embeddings; the trailing
# expression shows the fitted estimator as the cell output.
clf = LinearSVC().fit(X_train_weightedH, y_trainH)
clf

In [None]:
from sklearn.metrics import classification_report

# Scale the test embeddings with their own harmonic weights, predict, and print the
# per-category metrics.
weights_test_columnH = np.reshape(weights_testH, (-1, 1))
X_test_weightedH = np.multiply(X_testH, weights_test_columnH)
y_predH = clf.predict(X_test_weightedH)

report_H = classification_report(y_testH, y_predH)
print(report_H)

**Linear Weights** 


In [None]:
#weights with linear function
# weight = (max_year - year + 1) / (max_year - min_year + 1): the earliest year in df3
# gets 1 and the weight decreases linearly towards the latest year.
min_year, max_year = df3['Year'].min(), df3['Year'].max()
year_span = max_year - min_year + 1
weightsL = ((max_year - df3['Year'] + 1) / year_span).to_numpy()

In [None]:
#splitting the data
from sklearn.model_selection import train_test_split

# Split embeddings, labels and linear weights together so the rows stay aligned.
splits_L = train_test_split(embeddings, df3['Categories'], weightsL, test_size=0.2, random_state=42)
X_trainL, X_testL, y_trainL, y_testL, weights_trainL, weights_testL = splits_L

In [None]:
#the weights array had to be reshaped to be able to multiply it with the training data
# As a column vector the weights broadcast across the feature axis, so every embedding
# row is scaled by its own weight.
weights_trainL = np.reshape(weights_trainL, (-1, 1))
X_train_weightedL = np.multiply(X_trainL, weights_trainL)

In [None]:
from sklearn.svm import LinearSVC

# Fit a fresh linear SVM on the linearly scaled training embeddings; the trailing
# expression shows the fitted estimator as the cell output.
clf = LinearSVC().fit(X_train_weightedL, y_trainL)
clf

In [None]:
# Scale the test embeddings with their own linear weights before predicting.
X_test_weightedL = X_testL*weights_testL.reshape((-1, 1))
# Store the predictions under the name the report reads: they used to be assigned to
# `y_pred`, so `y_predL` was undefined and the report line raised NameError.
y_predL = clf.predict(X_test_weightedL)

from sklearn.metrics import classification_report

print(classification_report(y_testL, y_predL))

**Exponential Weights**

In [None]:
# Rate constant of the exponential decay: the weighting cell below multiplies it by a
# paper's age in years inside np.exp.
decay_rate = 0.1

In [None]:
#weights with exponential function
# weight = exp(-decay_rate * age), where age is measured back from the latest year in
# df3: the newest papers get 1 and older papers decay exponentially.
current_year = df3['Year'].max()
weightsE = np.fromiter(
    (np.exp(-decay_rate * (current_year - year)) for year in df3['Year']),
    dtype=float,
    count=len(df3),
)

In [None]:
#splitting the data
from sklearn.model_selection import train_test_split

# Split embeddings, labels and exponential weights together so the rows stay aligned.
splits_E = train_test_split(embeddings, df3['Categories'], weightsE, test_size=0.2, random_state=42)
X_trainE, X_testE, y_trainE, y_testE, weights_trainE, weights_testE = splits_E

In [None]:
#the weights array had to be reshaped to be able to multiply it with the training data
# As a column vector the weights broadcast across the feature axis, so every embedding
# row is scaled by its own weight.
weights_trainE = np.reshape(weights_trainE, (-1, 1))
X_train_weightedE = np.multiply(X_trainE, weights_trainE)

In [None]:
from sklearn.svm import LinearSVC

# Fit a fresh linear SVM on the exponentially scaled training embeddings; the trailing
# expression shows the fitted estimator as the cell output.
clf = LinearSVC().fit(X_train_weightedE, y_trainE)
clf

In [None]:
from sklearn.metrics import classification_report

# Scale the test embeddings with their own exponential weights, predict, and print the
# per-category metrics.
weights_test_columnE = np.reshape(weights_testE, (-1, 1))
X_test_weightedE = np.multiply(X_testE, weights_test_columnE)
y_predE = clf.predict(X_test_weightedE)

report_E = classification_report(y_testE, y_predE)
print(report_E)