this is run in a google colab notebook

In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from transformers import AutoTokenizer
from adapters import AutoAdapterModel
import torch

zot_df = pd.read_csv('zot_clean.csv')

# Convert the date columns to datetime objects
date_columns = ["Date", "Date Added", "Date Modified"]
for col in date_columns:
    zot_df[col] = pd.to_datetime(zot_df[col], errors='coerce')

zot_df["Publication Year"] = zot_df["Publication Year"].astype("Int64")
zot_df["Hearts"] = zot_df["Hearts"].astype("Int64")

zot_df["Manual Tags"] = zot_df["Manual Tags"].fillna("").str.split(";").apply(lambda tags: [tag.strip() for tag in tags])
zot_df["Abstract Note"] = zot_df["Abstract Note"].fillna("")

#dropping rows without title or abstract
zot_df = zot_df.dropna(subset=['Title'])


# embedding model
tokenizer = AutoTokenizer.from_pretrained('allenai/specter2_base')
model = AutoAdapterModel.from_pretrained('allenai/specter2_base')

#load the adapter(s) as per the required task, provide an identifier for the adapter in load_as argument and activate it
model.load_adapter("allenai/specter2", source="hf", load_as="proximity", set_active=True)

Fetching 4 files: 100%|██████████| 4/4 [00:00<?, ?it/s]
  state_dict = torch.load(weights_file, map_location="cpu")


'proximity'

In [62]:
zot_df["Hearts"].fillna(0).astype(str).value_counts()


Hearts
0    491
1    110
2     48
3     43
4     17
5     10
Name: count, dtype: int64

In [16]:
zot_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 719 entries, 0 to 718
Data columns (total 18 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   Item Type          719 non-null    object        
 1   Publication Year   646 non-null    Int64         
 2   Author             654 non-null    object        
 3   Title              719 non-null    object        
 4   Publication Title  279 non-null    object        
 5   DOI                215 non-null    object        
 6   Url                651 non-null    object        
 7   Abstract Note      670 non-null    object        
 8   Date               53 non-null     datetime64[ns]
 9   Date Added         719 non-null    datetime64[ns]
 10  Date Modified      719 non-null    datetime64[ns]
 11  Volume             205 non-null    float64       
 12  Publisher          325 non-null    object        
 13  Language           584 non-null    object        
 14  Library Ca

In [14]:
zot_df.columns

Index(['Item Type', 'Publication Year', 'Author', 'Title', 'Publication Title',
       'DOI', 'Url', 'Abstract Note', 'Date', 'Date Added', 'Date Modified',
       'Volume', 'Publisher', 'Language', 'Library Catalog', 'Notes',
       'Manual Tags', 'Hearts'],
      dtype='object')

In [10]:
#1 hour for 720 items
import pandas as pd
from transformers import AutoTokenizer, AutoModel
import torch
from tqdm import tqdm  # Import tqdm for progress bar

# Assuming zot_df is your DataFrame
# Initialize tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('allenai/specter2_base')
model = AutoModel.from_pretrained('allenai/specter2_base')

# Function to process a single batch
def process_batch(batch):
	# Concatenate title and abstract with the tokenizer's separator token
	text_batch = batch["Title"] + tokenizer.sep_token + batch["Abstract Note"]
	text_batch = list(text_batch)
	
	# Tokenize the text batch
	inputs = tokenizer(text_batch, padding=True, truncation=True,
					   return_tensors="pt", return_token_type_ids=False, max_length=512)
	
	# Perform inference without gradient calculation
	with torch.no_grad():
		output = model(**inputs)
	
	# Extract embeddings from the output
	embeddings = output.last_hidden_state[:, 0, :]
	return embeddings

# Process the DataFrame in chunks
batch_size = 10  # Set batch size (adjust based on memory availability)
amount_batches = 8
amount_batches = len(zot_df) // batch_size + 1
embeddings_list = []

# Iterate over the DataFrame in chunks with progress bar
for start in tqdm(range(0, batch_size*amount_batches, batch_size), total=amount_batches):
	end = min(start + batch_size, len(zot_df))
	batch = zot_df.iloc[start:end]
	embeddings = process_batch(batch)
	embeddings_list.append(embeddings)

# Concatenate all embeddings
all_embeddings = torch.cat(embeddings_list, dim=0)

all_embeddings = torch.cat(embeddings_list, dim=0)
embeddings_df = pd.DataFrame(all_embeddings.numpy())
embeddings_df.to_csv('zot_embeddings.csv', index=False)

100%|██████████| 72/72 [54:43<00:00, 45.60s/it]


In [11]:
import pacmap


torch.Size([719, 768])


In [16]:
pac5 = pacmap.PaCMAP(
    n_components=2,
    n_neighbors=5,
    MN_ratio=0.5,
    FP_ratio=2.0,
    distance="angular",
    random_state=3)

pac7 = pacmap.PaCMAP(
    n_components=2,
    n_neighbors=7,
    MN_ratio=0.5,
    FP_ratio=2.0,
    distance="angular",
    random_state=3)



In [17]:
# 10 sec for 720
zot_pac5 = pac5.fit_transform(all_embeddings.numpy())
zot_pac7 = pac7.fit_transform(all_embeddings.numpy())

In [72]:
import thisnotthat as tnt
import panel as pn
import networkx

In [88]:
print(zot_df.columns)

Index(['ItemType', 'Publication Year', 'Author', 'Title', 'Publication Title',
       'DOI', 'Url', 'AbstractNote', 'Date', 'AccessDate', 'Date Modified',
       'Volume', 'Publisher', 'Language', 'Library Catalog', 'Notes',
       'ManualTags', 'Hearts'],
      dtype='object')


In [87]:
df = zot_df
word_map = zot_pac7
df["ManualTags"] = df["ManualTags"].apply(lambda x: ";".join(x) )

In [73]:
basic_plot1 = tnt.BokehPlotPane(
    zot_pac7,
    hover_text=zot_df["Title"] + ", like:" + zot_df["Hearts"].fillna(0).astype(str),
    marker_size= zot_df["Hearts"].fillna(1)/30,
    show_legend=True,
    legend_location="top_right",
    sizing_mode='stretch_both',
    min_point_size=0.001,
    max_point_size=0.05,
    )

basic_plot2 = tnt.BokehPlotPane(
    zot_pac5,
    hover_text=zot_df["Title"] + ", like:"+ zot_df["Hearts"].fillna(0).astype(str),
    marker_size= zot_df["Hearts"].fillna(1)/30,
    show_legend=True,
    legend_location="top_right",
    sizing_mode='stretch_both',
    min_point_size=0.001,
    max_point_size=0.05,
    )


data_view = tnt.SimpleDataPane(
    zot_df,
    sizing_mode="stretch_both", max_rows=400, max_cols=50)

basic_plot2.link(
    basic_plot1,
    selected="selected",
    bidirectional=True
)

data_view.link(
    basic_plot1,
    selected="selected",
    bidirectional=True
)



column2 = pn.Column(basic_plot1, name="pac7")
column3 = pn.Column(basic_plot2, name="pac5")
column4 = pn.Column(data_view, name="data")

search = tnt.SearchWidget(zot_df)
search.link_to_plot(basic_plot1)

app = pn.Tabs(
    column2,
    column3,
    column4,
    search)

simplesearch = tnt.SimpleSearchWidget(basic_plot1, raw_dataframe=df)





In [89]:
label_layers = tnt.MetadataLabelLayers(
    np.array(embeddings),
    np.array(pac5),
    zot_df["ManualTags"],
    hdbscan_min_cluster_size=5,
    hdbscan_min_samples=5,
    contamination=1e-6,
    min_clusters_in_layer=3,
    vector_metric="cosine",
    cluster_distance_threshold=0.0,
    random_state=0,
)

  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")
  warn(


AttributeError: 'Series' object has no attribute 'select_dtypes'

In [90]:

label_layers = tnt.MetadataLabelLayers(
    np.array(embeddings),
    np.array(pac5),
    df["ManualTags"],
    hdbscan_min_cluster_size=2,
    hdbscan_min_samples=2,
    contamination=1e-6,
    min_clusters_in_layer=5,
    vector_metric="cosine",
    cluster_distance_threshold=0.0,
    random_state=0,
    items_per_label=2
)

basic_plot2.add_cluster_labels(
    label_layers, text_size_scale=100, text_layer_scale_factor=2.0)



  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")
  warn(


IndexError: too many indices for array: array is 0-dimensional, but 1 were indexed

In [68]:
app = pn.Column(simplesearch, app)
pn.serve(app)

Launching server at http://localhost:60472


<panel.io.server.Server at 0x1e9520d5b20>