In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from transformers import AutoTokenizer
from adapters import AutoAdapterModel
import torch

# Load the pre-cleaned library export
zot_df = pd.read_csv('./data/zot_clean.csv')

# Parse the date columns; values that cannot be parsed become NaT instead of raising
date_columns = ["Date", "Date Added", "Date Modified"]
for col in date_columns:
    zot_df[col] = pd.to_datetime(zot_df[col], errors='coerce')

# Nullable integer dtype keeps missing values as <NA> instead of forcing floats
zot_df["Publication Year"] = zot_df["Publication Year"].astype("Int64")
zot_df["Hearts"] = zot_df["Hearts"].astype("Int64")

# Turn the ";"-separated tag string into a list of trimmed tags.
# Blank pieces are dropped so an untagged row yields [] rather than [''],
# and a stray leading/trailing ";" does not create an empty tag.
zot_df["Manual Tags"] = (
    zot_df["Manual Tags"]
    .fillna("")
    .str.split(";")
    .apply(lambda tags: [tag.strip() for tag in tags if tag.strip()])
)
zot_df["Abstract Note"] = zot_df["Abstract Note"].fillna("")

# Drop rows without a title. Rows without an abstract are kept: their
# abstract was replaced by "" just above.
# The index is rebuilt so labels stay equal to row positions even when rows
# are dropped; the embeddings are later written to CSV by position only.
zot_df = zot_df.dropna(subset=['Title']).reset_index(drop=True)


# embedding model: tokenizer and weights both come from the same base checkpoint
base_checkpoint = 'allenai/specter2_base'
tokenizer = AutoTokenizer.from_pretrained(base_checkpoint)
model = AutoAdapterModel.from_pretrained(base_checkpoint)

# Load the adapter for the required task from the hub, register it under the
# identifier given in load_as, and activate it right away.
# Kept as the last expression so the cell displays the adapter name.
model.load_adapter(
    "allenai/specter2",
    source="hf",
    load_as="proximity",
    set_active=True,
)

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

'proximity'

In [3]:
# Show column dtypes and non-null counts of the prepared DataFrame
zot_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 924 entries, 0 to 923
Data columns (total 18 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   Item Type          924 non-null    object        
 1   Publication Year   882 non-null    Int64         
 2   Author             822 non-null    object        
 3   Title              924 non-null    object        
 4   Publication Title  343 non-null    object        
 5   DOI                304 non-null    object        
 6   Url                836 non-null    object        
 7   Abstract Note      924 non-null    object        
 8   Date               75 non-null     datetime64[ns]
 9   Date Added         924 non-null    datetime64[ns]
 10  Date Modified      924 non-null    datetime64[ns]
 11  Volume             246 non-null    float64       
 12  Publisher          408 non-null    object        
 13  Language           723 non-null    object        
 14  Library Ca

In [5]:
# Runtime: about 23 minutes for ~930 items
import pandas as pd
from transformers import AutoTokenizer, AutoModel
import torch
from tqdm import tqdm  # Import tqdm for progress bar

# Uses `zot_df`, `tokenizer` and `model` created in the first cell of the notebook.
# That `model` is the 'allenai/specter2_base' checkpoint with the "proximity"
# adapter loaded and activated. Re-creating it here with plain
# `AutoModel.from_pretrained('allenai/specter2_base')` would rebind `model` to
# the bare base network, and the embeddings below would silently be computed
# without the adapter. The tokenizer is the same checkpoint, so it is reused too.
model.eval()  # inference only: make sure dropout layers are disabled

# Function to process a single batch
def process_batch(batch):
    """Embed one slice of rows and return one vector per row.

    Each row's "Title" and "Abstract Note" are joined with the tokenizer's
    separator token, tokenized together (padded, truncated, at most 512
    tokens) and passed through the module-level ``model`` with gradient
    tracking disabled.

    Args:
        batch: Slice of the DataFrame providing the "Title" and
            "Abstract Note" columns.

    Returns:
        The model's ``last_hidden_state`` taken at the first token position
        of every sequence, i.e. one row of features per input row.
    """
    joined = batch["Title"] + tokenizer.sep_token + batch["Abstract Note"]

    encoded = tokenizer(
        list(joined),
        max_length=512,
        truncation=True,
        padding=True,
        return_token_type_ids=False,
        return_tensors="pt",
    )

    # Inference only, so no autograd graph is needed
    with torch.no_grad():
        result = model(**encoded)

    # Position 0 of every sequence is used as the embedding
    first_token_states = result.last_hidden_state[:, 0, :]
    return first_token_states

# Embed the DataFrame in small batches so only a few rows are tokenized and
# pushed through the model at a time.
batch_size = 10  # rows per forward pass (adjust based on memory availability)

# Ceil division: exactly as many batches as are needed to cover every row.
# `len(zot_df) // batch_size + 1` scheduled one extra, empty batch whenever the
# row count was an exact multiple of batch_size.
amount_batches = (len(zot_df) + batch_size - 1) // batch_size
embeddings_list = []

# Iterate over the DataFrame in chunks with a progress bar
for start in tqdm(range(0, len(zot_df), batch_size), total=amount_batches):
    # A slice running past the end is clipped by iloc, so the final batch
    # may simply be shorter than batch_size.
    end = start + batch_size
    batch = zot_df.iloc[start:end]
    embeddings = process_batch(batch)
    embeddings_list.append(embeddings)

# Stack the per-batch results into one tensor, preserving row order
all_embeddings = torch.cat(embeddings_list, dim=0)

# Every input row must have produced exactly one embedding row before saving
assert all_embeddings.shape[0] == len(zot_df), "embedding/row count mismatch"

# Rows are written in DataFrame order without an index column, so row i of
# the CSV belongs to the i-th row (by position) of zot_df.
embeddings_df = pd.DataFrame(all_embeddings.numpy())
embeddings_df.to_csv('data/zot_embeddings.csv', index=False)

100%|██████████| 93/93 [22:13<00:00, 14.34s/it]
