In [1]:
# Import required libraries
import pandas as pd
from sentence_transformers import SentenceTransformer
import re
import swifter

# Read the cleaned dataset from the CSV file into a DataFrame
# NOTE(review): the displayed frame carries an "Unnamed: 0" column, presumably an
# index that was written out when the CSV was saved — confirm whether it is wanted
df = pd.read_csv('cleaned_dataset_10k.csv')

# Show the loaded frame as the cell output
df

Unnamed: 0,id,title,doi,categories,unique_primary_category,full_text
0,hep-ph/0610334,Weak interaction corrections to hadronic top q...,10.1103/PhysRevD.74.113005,hep-ph,hep-ph,arXiv:hep-ph/0610334v2 30 Nov 2006\nPITHA 06/...
1,2104.06416,Next-to-leading non-global logarithms in QCD,,hep-ph,hep-ph,"Prepared for submission to JHEP\nOUTP-21-08P, ..."
2,hep-ph/9606269,$K_L \to \pi^o \nu \overline{\nu}$ in Extended...,10.1103/PhysRevD.54.4393,hep-ph,hep-ph,arXiv:hep-ph/9606269v3 27 Jun 1996\nWM-96-105...
3,hep-ph/9811382,A critical phenomenological study of inclusive...,10.1007/s100529900018,hep-ph,hep-ph,arXiv:hep-ph/9811382v1 18 Nov 1998\nA CRITICA...
4,1304.2781,Progress in the NNPDF global analysis,,hep-ph,hep-ph,arXiv:1304.2781v1 [hep-ph] 9 Apr 2013\nEdinb...
...,...,...,...,...,...,...
9495,2308.09211,On the Value of Information Structures in Stoc...,,econ.TH,econ,arXiv:2308.09211v1 [econ.TH] 17 Aug 2023\nOn...
9496,2212.03704,Semiparametric Distribution Regression with In...,,econ.EM,econ,Semiparametric Distribution Regression with\nI...
9497,2309.09299,Bounds on Average Effects in Discrete Choice P...,,econ.EM,econ,Bounds on Average Effects in\nDiscrete Choice ...
9498,1910.11154,Necessary and sufficient condition for equilib...,,econ.TH,econ,arXiv:1910.11154v2 [econ.TH] 28 Oct 2019\nNE...


In [2]:

# Collapse every run of whitespace (spaces, tabs, newlines) into a single space
# and trim both ends. The pattern is a raw string so that `\s` is not treated as
# an (invalid) string escape; `\s+` already matches newlines, so no separate
# newline replacement is needed.
# NOTE(review): str(x) turns a missing value (NaN) into the literal text 'nan' —
# confirm the CSV has no empty full_text entries.
df['full_text'] = df['full_text'].apply(lambda x: re.sub(r'\s+', ' ', str(x)).strip())



def chunk_text_by_sentence(text, max_chunk_size=100):
    """Group the sentences of ``text`` into chunks of bounded word count.

    Sentences are detected by splitting on whitespace that follows ``.``,
    ``!`` or ``?``. Consecutive sentences are packed into a chunk until adding
    the next one would push the chunk past ``max_chunk_size`` words, at which
    point a new chunk is started. Sentences are never split, so a single
    sentence longer than ``max_chunk_size`` becomes a chunk of its own.

    Args:
        text: The text to split.
        max_chunk_size: Maximum number of space-separated words per chunk.

    Returns:
        A list of chunk strings, each made of whole sentences joined by a
        single space. Empty input yields an empty list; no empty chunk is
        ever produced.
    """
    # Raw string: `\s` must reach the regex engine, not the string parser
    sentences = re.split(r'(?<=[.!?])\s+', text)

    chunks = []
    current_chunk = []
    current_chunk_size = 0

    for sentence in sentences:
        # re.split yields '' for empty input or trailing whitespace; skip it
        if not sentence:
            continue

        sentence_size = len(sentence.split(' '))

        # Flush only a non-empty chunk: otherwise an over-long first sentence
        # would push an empty string into the result
        if current_chunk and current_chunk_size + sentence_size > max_chunk_size:
            chunks.append(' '.join(current_chunk))
            current_chunk = []
            current_chunk_size = 0

        current_chunk.append(sentence)
        current_chunk_size += sentence_size

    # Flush whatever is left after the last sentence
    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks

# Replace each document's text with its list of sentence-based chunks
# (default max_chunk_size), applied through swifter's apply accessor.
# Note: after this line 'full_text' holds lists instead of strings, so this cell
# is not idempotent — re-running it requires reloading df first.
df['full_text'] = df['full_text'].swifter.apply(chunk_text_by_sentence)

Pandas Apply:   0%|          | 0/9500 [00:00<?, ?it/s]

In [3]:
def count_items(lst):
    """Return how many items ``lst`` contains."""
    n_items = len(lst)
    return n_items

# Minimum number of chunks a document must have to be kept. Defined once so the
# filter and the printed message cannot drift apart (the message previously said
# "4 or more" while the filter required 5).
# NOTE(review): presumably chosen so every document has enough chunks for the
# 4-cluster step later in the notebook — confirm the intended threshold.
MIN_CHUNKS = 5

# Keep only the documents whose chunk list has at least MIN_CHUNKS entries
filtered_df = df[df['full_text'].apply(lambda x: count_items(x) >= MIN_CHUNKS)]
count = len(filtered_df)

print(f"Number of rows with {MIN_CHUNKS} or more items in the list: {count}")



Number of rows with 4 or more items in the list: 9424


In [4]:
import pandas as pd
import torch
from tqdm import tqdm



# Pick the device BEFORE loading the model so the notebook also runs on machines
# without a GPU (hard-coding device="cuda" in the constructor fails there and
# makes the fallback below unreachable).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load the sentence-embedding model directly onto the selected device; no
# separate .to(device) call is needed afterwards.
model = SentenceTransformer('all-MiniLM-L6-v2', device=str(device))

# One record per document: its id and the embeddings of all of its chunks
new_data = []

# Iterate over the two columns that are actually used, with a progress bar
for doc_id, document in tqdm(
    zip(filtered_df['id'], filtered_df['full_text']),
    total=len(filtered_df),
    desc="Processing rows",
):
    # `document` is the list of text chunks of one paper; encode() embeds each
    # chunk, giving one embedding vector per chunk
    chunk_embeddings = model.encode(document)

    new_data.append({'id': doc_id, 'embedding': chunk_embeddings})

# Build the DataFrame in one go from the collected records
new_df = pd.DataFrame(new_data)

# Save the embeddings as a Pickle file for the clustering step
new_df.to_pickle('arxiv_with_embeddings_df_10k.pkl')


Using device: cuda


Processing rows: 100%|██████████| 9424/9424 [08:00<00:00, 19.60it/s]


Clustering the created embeddings

In [5]:
import pandas as pd
import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import pairwise_distances
import umap.umap_ as umap
from tqdm import tqdm
import dask.dataframe as ddf

def find_medoid(cluster_points):
    """Return the row index of the medoid of ``cluster_points``.

    The medoid is taken to be the point whose summed Euclidean distance to all
    points in ``cluster_points`` is smallest. The index (position within
    ``cluster_points``) is returned rather than the point itself.
    """
    pairwise = pairwise_distances(cluster_points, metric='euclidean')
    total_distance = pairwise.sum(axis=0)
    return np.argmin(total_distance)

from sklearn.decomposition import PCA

def process_row(row, n_clusters=4, n_components=3):
    """Cluster one document's chunk embeddings and pick a medoid per cluster.

    The embeddings in ``row['embedding']`` are projected to ``n_components``
    dimensions with PCA, grouped into ``n_clusters`` clusters with
    agglomerative clustering, and for every cluster the medoid is located in
    the reduced space. The medoid is then reported as the corresponding
    ORIGINAL (un-reduced) embedding.

    Args:
        row: A mapping/Series with an ``'embedding'`` entry holding one
            embedding per chunk (converted with ``np.array``).
        n_clusters: Number of clusters to form (default 4, the previously
            hard-coded value).
        n_components: Number of PCA components (default 3, the previously
            hard-coded value).

    Returns:
        A ``pd.Series`` with ``'medoids'`` (array with one original embedding
        per cluster) and ``'cluster_sizes'`` (list with the number of chunks
        in each cluster, in the same order).

    Raises:
        ValueError: If the row has fewer embeddings than ``n_clusters`` or
            ``n_components`` requires.
    """
    embeddings = np.array(row['embedding'])

    # Fail early with a readable message instead of an error from deep inside
    # scikit-learn when a document has too few chunks
    required = max(n_clusters, n_components)
    if embeddings.shape[0] < required:
        raise ValueError(
            f"process_row needs at least {required} embeddings, got {embeddings.shape[0]}"
        )

    # Reduce dimensionality before clustering
    embeddings_reduced = PCA(n_components=n_components).fit_transform(embeddings)

    clustering_model = AgglomerativeClustering(n_clusters=n_clusters)
    clustering_model.fit(embeddings_reduced)
    labels = clustering_model.labels_

    medoids = []
    cluster_sizes = []
    for cluster_id in range(n_clusters):
        # Boolean mask computed once and reused for both representations
        in_cluster = labels == cluster_id
        cluster_points = embeddings_reduced[in_cluster]

        # The medoid is located in the reduced space, but the original
        # embedding of that same chunk is what gets stored
        medoid_index = find_medoid(cluster_points)
        medoids.append(embeddings[in_cluster][medoid_index])
        cluster_sizes.append(len(cluster_points))

    return pd.Series({'medoids': np.array(medoids), 'cluster_sizes': cluster_sizes})

# Read the embeddings DataFrame back from the Pickle file written by the
# embedding cell. Unpickling can execute arbitrary code, so only load pickle
# files that this notebook (or another trusted source) produced.
docs_df = pd.read_pickle('arxiv_with_embeddings_df_10k.pkl')
# Convert the Pandas DataFrame to a Dask DataFrame split into 20 partitions so
# the per-row clustering can be run partition by partition
dask_dataframe = ddf.from_pandas(docs_df, npartitions=20)



In [6]:
from dask.distributed import Client

# Start a Dask distributed client with two workers; later Dask computations in
# this notebook run on it
client = Client(n_workers=2)
# Display the client summary (dashboard link, workers, threads, memory)
client

0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Dashboard: http://127.0.0.1:8787/status,Workers: 2
Total threads: 32,Total memory: 63.76 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:51448,Workers: 2
Dashboard: http://127.0.0.1:8787/status,Total threads: 32
Started: Just now,Total memory: 63.76 GiB

0,1
Comm: tcp://127.0.0.1:51459,Total threads: 16
Dashboard: http://127.0.0.1:51461/status,Memory: 31.88 GiB
Nanny: tcp://127.0.0.1:51451,
Local directory: C:\Users\chris\AppData\Local\Temp\dask-scratch-space\worker-wj42fgkv,Local directory: C:\Users\chris\AppData\Local\Temp\dask-scratch-space\worker-wj42fgkv

0,1
Comm: tcp://127.0.0.1:51460,Total threads: 16
Dashboard: http://127.0.0.1:51462/status,Memory: 31.88 GiB
Nanny: tcp://127.0.0.1:51452,
Local directory: C:\Users\chris\AppData\Local\Temp\dask-scratch-space\worker-2ij9irf8,Local directory: C:\Users\chris\AppData\Local\Temp\dask-scratch-space\worker-2ij9irf8


In [7]:

# Run process_row over every row, one partition at a time. `meta` tells Dask the
# names and dtypes of the two columns each row produces.
result_dask_df = dask_dataframe.map_partitions(
    lambda partition: partition.apply(process_row, axis=1),
    meta={'medoids': 'object', 'cluster_sizes': 'object'},
)

# Materialize the lazy Dask result as a regular Pandas DataFrame
result_pd_df = result_dask_df.compute()

# Copy both result columns onto the original DataFrame
for column in ('medoids', 'cluster_sizes'):
    docs_df[column] = result_pd_df[column]

# Persist the augmented DataFrame as a Pickle file
docs_df.to_pickle("augmented_data_10k.pkl")


This may cause some slowdown.
Consider scattering data ahead of time and using futures.
