In [3]:
# Import required libraries
import pandas as pd
from sentence_transformers import SentenceTransformer
import re
import swifter

# Read the DataFrame from the Pickle file.
# NOTE: unpickling can execute arbitrary code, so only load pickle files that
# come from a trusted source.
df = pd.read_pickle('wikipedia_df.pkl')

# Collapse every run of whitespace (spaces, tabs, newlines) into a single space
# and trim both ends. The pattern is a raw string because '\s' in a normal
# string literal is an invalid escape sequence; '\s+' already matches newlines,
# so no separate newline replacement is needed.
df['text'] = df['text'].apply(lambda x: re.sub(r'\s+', ' ', str(x)).strip())



def chunk_text_by_sentence(text, max_chunk_size=100):
    """Group the sentences of ``text`` into chunks of roughly ``max_chunk_size`` words.

    Sentences are found by splitting on whitespace that follows ``.``, ``!``
    or ``?``. Sentences are never cut: whole sentences are appended to the
    current chunk until the next one would push it past ``max_chunk_size``
    words, at which point a new chunk is started. A single sentence that is
    longer than ``max_chunk_size`` therefore becomes a chunk of its own.

    Args:
        text: The string to split.
        max_chunk_size: Maximum number of whitespace-separated words per chunk
            (exceeded only by a single over-long sentence).

    Returns:
        A list of chunk strings, each made of one or more sentences joined by
        a single space. Empty (or sentence-less) input yields an empty list;
        no empty chunks are ever produced.
    """
    # Raw string: '\s' in a normal literal is an invalid escape sequence.
    sentences = re.split(r'(?<=[.!?])\s+', text)

    chunks = []
    current_chunk = []
    current_chunk_size = 0

    for sentence in sentences:
        # re.split yields '' for empty input or trailing whitespace; skip it so
        # it neither counts as a word nor ends up inside a chunk.
        if not sentence:
            continue

        sentence_size = len(sentence.split())

        # Start a new chunk when this sentence would overflow the current one.
        # The `current_chunk` guard matters: without it, an over-long sentence
        # at the start of a chunk would flush an empty chunk ('') first.
        if current_chunk and current_chunk_size + sentence_size > max_chunk_size:
            chunks.append(' '.join(current_chunk))
            current_chunk = []
            current_chunk_size = 0

        # Add the sentence to the current chunk and update the chunk size
        current_chunk.append(sentence)
        current_chunk_size += sentence_size

    # Don't forget the last chunk
    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks

# Replace each article's normalised text with the list of chunks produced by
# chunk_text_by_sentence (default max_chunk_size), using swifter's apply.
# After this line the 'text' column holds lists rather than strings, so it must
# not be re-run on its own without reloading `df` first.
df['text'] = df['text'].swifter.apply(chunk_text_by_sentence)
# Bare last expression so the notebook renders the resulting frame.
df

Pandas Apply:   0%|          | 0/20701 [00:00<?, ?it/s]

Unnamed: 0,id,url,title,text
0,1,https://simple.wikipedia.org/wiki/April,April,[April is the fourth month of the year in the ...
1,2,https://simple.wikipedia.org/wiki/August,August,[August (Aug.) is the eighth month of the year...
2,6,https://simple.wikipedia.org/wiki/Art,Art,[Art is a creative activity that expresses ima...
3,8,https://simple.wikipedia.org/wiki/A,A,[A or a is the first letter of the English alp...
4,9,https://simple.wikipedia.org/wiki/Air,Air,[Air refers to the Earth's atmosphere. Air is ...
...,...,...,...,...
205270,910075,https://simple.wikipedia.org/wiki/Timeline%20o...,Timeline of the 2022 Russian invasion of Ukraine,[This timeline is a list and may not ever be c...
205272,910092,https://simple.wikipedia.org/wiki/Saigrace%20P...,Saigrace Pokhrel,[Saigrace Pokharel (born 07 November 1987) is ...
205276,910132,https://simple.wikipedia.org/wiki/Even%20people,Even people,"[The Even (Even: эвэн, Russian: эвены) people ..."
205325,910294,https://simple.wikipedia.org/wiki/Repdigit,Repdigit,"[In recreational math, a repdigit or a monodig..."


In [4]:
# Define a function to count items in a list
def count_items(lst):
    """Return how many elements ``lst`` contains."""
    item_total = len(lst)
    return item_total

# Minimum number of chunks a document must have to be kept. Defined once so the
# filter condition and the printed message below cannot drift apart.
MIN_CHUNKS = 5

# Keep only the rows whose 'text' list has at least MIN_CHUNKS items
filtered_df = df[df['text'].apply(lambda x: count_items(x) >= MIN_CHUNKS)]

# Count the rows that meet the condition
count = len(filtered_df)

# Print the count
print(f"Number of rows with {MIN_CHUNKS} or more items in the list: {count}")



Number of rows with 4 or more items in the list: 15747


In [5]:
import pandas as pd
import torch
from tqdm import tqdm



# Pick the device BEFORE loading the model. Hard-coding device="cuda" in the
# constructor would fail on a machine without a usable GPU, which defeats the
# CPU fallback below.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load the model directly onto the selected device; no separate .to(device)
# call is needed afterwards.
model = SentenceTransformer('all-MiniLM-L6-v2', device=str(device))

# Create an empty list to store the results
new_data = []

# Walk the documents with tqdm for progress tracking. Only the 'id' and 'text'
# columns are needed, so iterate over those two columns directly.
for doc_id, document in tqdm(
    zip(filtered_df['id'], filtered_df['text']),
    total=len(filtered_df),
    desc="Processing rows",
):
    # `document` is the list of text chunks for one article; encode() is given
    # the whole list and its result is stored as that article's embedding.
    chunk_embeddings = model.encode(document)

    # Store the result into the new_data list
    new_data.append({'id': doc_id, 'embedding': chunk_embeddings})

# Create a new DataFrame with the embeddings
new_df = pd.DataFrame(new_data)

# Save the new DataFrame as a Pickle file
new_df.to_pickle('wikipedia_with_embeddings_df.pkl')


Using device: cuda


Processing rows: 100%|██████████| 15747/15747 [01:24<00:00, 187.07it/s]


Clustering the created embeddings

In [6]:
import pandas as pd
import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import pairwise_distances
import umap.umap_ as umap
from tqdm import tqdm
import dask.dataframe as ddf

# Function to find medoid
def find_medoid(cluster_points):
    """Return the index of the medoid of ``cluster_points``.

    The medoid is the point whose summed Euclidean distance to every point in
    the cluster is smallest. The position within ``cluster_points`` is
    returned, not the point itself.
    """
    pairwise = pairwise_distances(cluster_points, metric='euclidean')
    total_distance_per_point = pairwise.sum(axis=0)
    return np.argmin(total_distance_per_point)

from sklearn.decomposition import PCA

# Function to process each row of the DataFrame
def process_row(row, n_clusters=4, n_components=3):
    """Cluster one document's chunk embeddings and pick a medoid per cluster.

    The embeddings are first projected with PCA, clustered with agglomerative
    clustering in that reduced space, and the medoid of every cluster is
    located there. The medoids that are returned are the corresponding
    ORIGINAL (un-reduced) embeddings.

    Args:
        row: A mapping/Series whose ``'embedding'`` entry holds the embeddings
            of one document (one embedding per chunk).
        n_clusters: Number of clusters to form (default 4, the value that was
            previously hard-coded).
        n_components: Number of PCA components used for clustering (default 3,
            the value that was previously hard-coded).

    Returns:
        A ``pd.Series`` with ``'medoids'`` (array of ``n_clusters`` original
        embeddings) and ``'cluster_sizes'`` (list with the number of chunks in
        each cluster, in the same order).

    Raises:
        ValueError: If the document has fewer embeddings than
            ``max(n_clusters, n_components)``; PCA / clustering cannot run on it.
    """
    embeddings = np.array(row['embedding'])

    # Fail early with an explicit message instead of a less specific error
    # from inside scikit-learn.
    n_samples = embeddings.shape[0] if embeddings.ndim >= 1 else 0
    required = max(n_clusters, n_components)
    if n_samples < required:
        raise ValueError(
            f"process_row needs at least {required} embeddings "
            f"(n_clusters={n_clusters}, n_components={n_components}), got {n_samples}"
        )

    pca = PCA(n_components=n_components)
    embeddings3d = pca.fit_transform(embeddings)

    clustering_model = AgglomerativeClustering(n_clusters=n_clusters)
    clustering_model.fit(embeddings3d)
    labels = clustering_model.labels_

    medoids = []
    cluster_sizes = []
    for i in range(n_clusters):
        # Compute the membership mask once and reuse it for both spaces.
        in_cluster = labels == i
        cluster_points = embeddings3d[in_cluster]

        # The medoid is found in the reduced space, then mapped back to the
        # original embedding at the same position within the cluster.
        medoid_index = find_medoid(cluster_points)
        original_medoid = embeddings[in_cluster][medoid_index]

        medoids.append(original_medoid)
        cluster_sizes.append(len(cluster_points))

    return pd.Series({'medoids': np.array(medoids), 'cluster_sizes': cluster_sizes})

# Read the DataFrame of per-document embeddings back from the Pickle file
# written by the embedding cell ('id' and 'embedding' columns).
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
docs_df = pd.read_pickle('wikipedia_with_embeddings_df.pkl')
# Convert the Pandas DataFrame to a Dask DataFrame split into 20 partitions so
# the rows can be processed partition by partition.
dask_dataframe = ddf.from_pandas(docs_df, npartitions=20)



In [7]:
from dask.distributed import Client

# Start a Dask distributed client with two workers; with no address given the
# recorded output reports a distributed.LocalCluster using processes.
client = Client(n_workers=2)
# Bare last expression so the notebook renders the client/cluster summary.
client

0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Dashboard: http://127.0.0.1:8787/status,Workers: 2
Total threads: 32,Total memory: 63.76 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:61456,Workers: 2
Dashboard: http://127.0.0.1:8787/status,Total threads: 32
Started: Just now,Total memory: 63.76 GiB

0,1
Comm: tcp://127.0.0.1:61467,Total threads: 16
Dashboard: http://127.0.0.1:61468/status,Memory: 31.88 GiB
Nanny: tcp://127.0.0.1:61459,
Local directory: C:\Users\chris\AppData\Local\Temp\dask-scratch-space\worker-sfrlu81d,Local directory: C:\Users\chris\AppData\Local\Temp\dask-scratch-space\worker-sfrlu81d

0,1
Comm: tcp://127.0.0.1:61470,Total threads: 16
Dashboard: http://127.0.0.1:61471/status,Memory: 31.88 GiB
Nanny: tcp://127.0.0.1:61460,
Local directory: C:\Users\chris\AppData\Local\Temp\dask-scratch-space\worker-2lprpybb,Local directory: C:\Users\chris\AppData\Local\Temp\dask-scratch-space\worker-2lprpybb


In [8]:

# Run process_row over every document, one Dask partition at a time: each
# partition is handed to the lambda, which applies process_row row by row.
# `meta` declares the two object-typed columns the result will contain.
result_dask_df = dask_dataframe.map_partitions(
    lambda partition: partition.apply(process_row, axis=1),
    meta={'medoids': 'object', 'cluster_sizes': 'object'},
)

# Execute the lazy computation and collect the result as a Pandas DataFrame
result_pd_df = result_dask_df.compute()

# Copy the two computed columns onto the original DataFrame
docs_df['medoids'] = result_pd_df['medoids']
docs_df['cluster_sizes'] = result_pd_df['cluster_sizes']

# Persist the augmented DataFrame as a Pickle file
docs_df.to_pickle("augmented_data.pkl")


This may cause some slowdown.
Consider scattering data ahead of time and using futures.
