In [5]:
# Shell escape: print the name of the machine this kernel is running on.
!hostname

frodo


In [6]:
# Install the extra packages through the shell's `pip`.
# NOTE(review): nothing in the visible cells imports memory-profiler, and
# neither package is version-pinned — confirm both are still needed and
# consider pinning them so the environment is reproducible.
!pip install memory-profiler
!pip install tqdm

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [7]:
import numpy as np
from numba import jit, prange
from sklearn.cluster import MiniBatchKMeans
import psutil
from tqdm import tqdm
import gc
import time
import pandas as pd
from scipy.sparse import csr_matrix
import os

@jit(nopython=True, parallel=True, fastmath=True)
def compute_cpu_similarity(features1, features2, cluster_weights1, cluster_weights2):
    """Blend feature-level and cluster-level similarity for every row pair.

    For each pair (i, j) two ratios are computed, each being the sum of
    element-wise minima divided by the sum of element-wise maxima (a
    weighted Jaccard ratio): one over the feature rows and one over the
    cluster-weight rows. A ratio whose denominator is not positive is
    taken to be 0.0. The result is the plain average of the two ratios.

    Parameters
    ----------
    features1, features2 : 2-D arrays
        Row-wise feature vectors; both must have the same number of columns.
    cluster_weights1, cluster_weights2 : 2-D arrays
        Row-wise cluster weights aligned with ``features1`` / ``features2``;
        both must have the same number of columns.

    Returns
    -------
    np.ndarray of float32, shape (features1.shape[0], features2.shape[0])
    """
    n, m = features1.shape[0], features2.shape[0]
    n_features = features1.shape[1]
    n_clusters = cluster_weights1.shape[1]
    result = np.empty((n, m), dtype=np.float32)

    # Only the outer loop is a parallel loop. Numba executes a prange nested
    # inside another prange serially, so the inner loop is an ordinary range.
    for i in prange(n):
        f1 = features1[i]
        c1 = cluster_weights1[i]
        for j in range(m):
            f2 = features2[j]
            c2 = cluster_weights2[j]

            # Min/max ratio over the original features, accumulated as
            # scalars so no temporary arrays are allocated per pair.
            intersection_orig = 0.0
            union_orig = 0.0
            for k in range(n_features):
                intersection_orig += min(f1[k], f2[k])
                union_orig += max(f1[k], f2[k])

            orig_sim = intersection_orig / union_orig if union_orig > 0 else 0.0

            # Same ratio over the cluster-weight vectors.
            intersection_clust = 0.0
            union_clust = 0.0
            for k in range(n_clusters):
                intersection_clust += min(c1[k], c2[k])
                union_clust += max(c1[k], c2[k])

            clust_sim = intersection_clust / union_clust if union_clust > 0 else 0.0
            result[i, j] = 0.5 * (orig_sim + clust_sim)

    return result

class MemoryEfficientSimilarityComputer:
    """Pairwise similarity over a random row sample of a chunked dataset.

    The computer samples a fraction of the rows, clusters the sample with
    MiniBatchKMeans, converts centre distances to normalised cluster
    weights, and fills a dense (q, q) float32 similarity matrix block by
    block with ``compute_cpu_similarity``.

    Attributes
    ----------
    max_chunk_size : upper bound on the number of rows per similarity block.
    B : number of mini-batches; sets both the k-means batch size
        (``q // B``) and ``max_iter``.
    sample_indices_ : sorted source-row indices used by the most recent
        ``compute_similarity_matrix`` call (``None`` before the first call).
        Row ``r`` of the returned matrix corresponds to source row
        ``sample_indices_[r]``.
    """

    def __init__(self, max_chunk_size=100, B=100):
        self.max_chunk_size = max_chunk_size
        self.B = B
        self.sample_indices_ = None

    def estimate_memory_usage(self, n_samples, n_features):
        """Return a rough size in MiB for ``n_samples`` float32 rows.

        The factor 3 is a fixed multiplier applied to the raw array size.
        """
        bytes_per_float = 4
        return (n_samples * n_features * bytes_per_float * 3) / (1024 ** 2)

    def get_optimal_chunk_size(self, n_samples, n_features):
        """Pick how many rows to process per similarity block.

        The per-row cost covers the feature row estimate plus the float32
        output row of length ``n_samples`` that each block row produces.
        At most half of the currently available memory is budgeted, and the
        result is clamped to ``[1, max_chunk_size]``.
        """
        available_memory = psutil.virtual_memory().available / (1024 ** 2)
        output_row_mb = (n_samples * 4) / (1024 ** 2)
        memory_per_sample = self.estimate_memory_usage(1, n_features) + output_row_mb
        if memory_per_sample <= 0:
            # Degenerate shapes cost nothing per row; fall back to the cap.
            return max(1, int(self.max_chunk_size))
        optimal_chunk_size = int(min(
            self.max_chunk_size,
            (available_memory * 0.5) / memory_per_sample
        ))
        return max(1, optimal_chunk_size)

    @staticmethod
    def _cluster_weights(distances, temperature=0.1):
        """Turn centre distances into per-row weights that sum to 1.

        Each row is shifted by its own minimum before exponentiating. The
        shift cancels in the normalisation, but guarantees the nearest
        cluster contributes exp(0) == 1, so the row sum can never underflow
        to zero and produce NaN weights.
        """
        shifted = distances - distances.min(axis=1, keepdims=True)
        weights = np.exp(-shifted / temperature)
        weights /= weights.sum(axis=1, keepdims=True)
        return weights

    @staticmethod
    def _load_sampled_rows(data_iterator, indices, n_features):
        """Collect the rows at sorted positions ``indices`` from the chunks.

        ``data_iterator`` must yield objects with ``to_numpy`` (e.g. pandas
        DataFrames) in source order. Raises ``ValueError`` when the
        iterator ends before every requested row has been seen, instead of
        leaving the missing rows as all-zero vectors.
        """
        n_wanted = len(indices)
        sampled = np.zeros((n_wanted, n_features), dtype=np.float32)
        row_offset = 0   # source index of the first row of the current chunk
        n_loaded = 0     # number of sampled rows copied so far

        for chunk in data_iterator:
            chunk_array = chunk.to_numpy(dtype=np.float32)
            chunk_end = row_offset + len(chunk_array)

            # indices is sorted, so every pending index below chunk_end
            # lies inside the current chunk.
            while n_loaded < n_wanted and indices[n_loaded] < chunk_end:
                sampled[n_loaded] = chunk_array[indices[n_loaded] - row_offset]
                n_loaded += 1

            row_offset = chunk_end
            if n_loaded >= n_wanted:
                break

        if n_loaded < n_wanted:
            raise ValueError(
                f"data_iterator yielded only {row_offset} rows; "
                f"{n_wanted - n_loaded} sampled rows could not be loaded"
            )
        return sampled

    def compute_similarity_matrix(self, data_iterator, total_samples, n_features,
                                  m=0.8, k=3, random_state=None):
        """Sample rows, cluster them, and return their similarity matrix.

        Parameters
        ----------
        data_iterator : iterable of chunks exposing ``to_numpy``.
        total_samples : total number of rows the iterator will yield.
        n_features : number of columns in every chunk.
        m : fraction of rows to sample (``q = int(total_samples * m)``).
        k : number of k-means clusters.
        random_state : optional seed for the row sampling. ``None`` keeps
            the previous behaviour of drawing from NumPy's global RNG.

        Returns
        -------
        np.ndarray of float32, shape (q, q).

        Raises
        ------
        ValueError
            If the sample would be empty or smaller than ``k``, or if the
            iterator runs out before all sampled rows are loaded.
        """
        q = int(total_samples * m)
        if q < 1 or q < k:
            raise ValueError(
                f"sample size q={q} (total_samples={total_samples}, m={m}) "
                f"must be at least max(1, k={k})"
            )

        # Batch size derived from the B parameter.
        batch_size = max(1, q // self.B)

        # Draw the sample without replacement and keep it in source order so
        # the chunks can be consumed in a single forward pass.
        if random_state is None:
            indices = np.random.choice(total_samples, q, replace=False)
        else:
            indices = np.random.default_rng(random_state).choice(
                total_samples, q, replace=False
            )
        indices.sort()
        self.sample_indices_ = indices

        print("Loading sampled data...")
        subsampled_data = self._load_sampled_rows(data_iterator, indices, n_features)

        gc.collect()

        print(f"Performing MiniBatch k-means clustering with {self.B} batches...")
        kmeans = MiniBatchKMeans(
            n_clusters=k,
            batch_size=batch_size,
            n_init='auto',
            random_state=42,
            max_iter=self.B
        )
        # Only the distances to the centres are needed; the hard labels
        # were never used, so the predict pass is skipped.
        kmeans.fit(subsampled_data)
        distances = kmeans.transform(subsampled_data)

        cluster_weights = self._cluster_weights(distances)

        del distances
        gc.collect()

        similarity_matrix = np.zeros((q, q), dtype=np.float32)

        # Fill the matrix one horizontal block of rows at a time.
        chunk_size = self.get_optimal_chunk_size(q, n_features)
        n_chunks = (q + chunk_size - 1) // chunk_size
        print(f"Processing similarities in {n_chunks} chunks...")

        for i in tqdm(range(0, q, chunk_size)):
            end_idx = min(i + chunk_size, q)
            chunk_data = subsampled_data[i:end_idx]
            chunk_weights = cluster_weights[i:end_idx]

            similarity_matrix[i:end_idx] = compute_cpu_similarity(
                chunk_data,
                subsampled_data,
                chunk_weights,
                cluster_weights
            )

            # Release the temporary block before computing the next one.
            gc.collect()

        return similarity_matrix

def get_binary_csv_iterator(file_path, chunk_size=1000, sample_rows=100):
    """Create a chunked reader over the binary-valued columns of a CSV.

    A column counts as binary when every value in the first ``sample_rows``
    rows is 0 or 1. Rows beyond that sample are not inspected, so a column
    that only turns non-binary later in the file is still selected.

    Parameters
    ----------
    file_path : path of the CSV file.
    chunk_size : number of rows per yielded DataFrame.
    sample_rows : number of leading rows used to detect binary columns.

    Returns
    -------
    The chunked reader returned by ``pd.read_csv(..., chunksize=...)``,
    yielding float32 DataFrames restricted to the detected columns.

    Raises
    ------
    ValueError
        If no column of the sample is binary.
    """
    # Load a small sample to detect binary columns.
    sample = pd.read_csv(file_path, nrows=sample_rows)
    is_binary = sample.isin([0, 1]).all()
    binary_columns = sample.columns[is_binary].tolist()

    if not binary_columns:
        # Reading with an empty column selection would only yield frames
        # with zero features; fail early with a clear message instead.
        raise ValueError(
            f"no binary (0/1) columns found in the first {sample_rows} rows of {file_path}"
        )

    return pd.read_csv(
        file_path,
        usecols=binary_columns,
        chunksize=chunk_size,
        dtype=np.float32
    )

def compute_similarity_from_csv(csv_file, kmax=3, B=100, m=0.2):
    """Compute the sampled similarity matrix for the binary columns of a CSV.

    Parameters
    ----------
    csv_file : path of the input CSV (first line is the header).
    kmax : number of k-means clusters passed on as ``k``.
    B : number of mini-batches for the similarity computer.
    m : fraction of rows to sample.

    Returns
    -------
    The matrix returned by
    ``MemoryEfficientSimilarityComputer.compute_similarity_matrix``.

    Raises
    ------
    ValueError
        If the file is empty or contains no data rows.
    """
    print(f"Processing {csv_file}...")
    print(f"Parameters: kmax={kmax}, B={B}, m={m}")

    file_size = os.path.getsize(csv_file) / (1024 ** 2)
    if file_size <= 0:
        raise ValueError(f"{csv_file} is empty")
    # Smaller files get larger chunks; the result is clamped to [100, 1000].
    chunk_size = min(1000, max(100, int(50 * 1024 / file_size)))

    # Count data rows. Blank lines are ignored because pandas skips them by
    # default, and the header line is subtracted. The file is closed
    # deterministically by the context manager.
    with open(csv_file) as handle:
        total_rows = sum(1 for line in handle if line.strip()) - 1
    if total_rows < 1:
        raise ValueError(f"{csv_file} contains no data rows")

    # Peek at one chunk to learn the feature count, then close the probe
    # reader so its file handle is not left open.
    probe = get_binary_csv_iterator(csv_file, chunk_size)
    try:
        first_chunk = next(probe, None)
    finally:
        probe.close()
    if first_chunk is None:
        raise ValueError(f"{csv_file} contains no data rows")
    n_features = first_chunk.shape[1]

    # Fresh reader positioned at the first data row for the real pass.
    csv_iterator = get_binary_csv_iterator(csv_file, chunk_size)

    computer = MemoryEfficientSimilarityComputer(max_chunk_size=chunk_size, B=B)

    start_memory = psutil.Process().memory_info().rss / (1024 ** 2)
    # perf_counter is monotonic, so the duration cannot be skewed by
    # system clock adjustments.
    start_time = time.perf_counter()

    similarity_matrix = computer.compute_similarity_matrix(
        csv_iterator,
        total_rows,
        n_features,
        m=m,
        k=kmax
    )

    end_time = time.perf_counter()
    end_memory = psutil.Process().memory_info().rss / (1024 ** 2)

    print("\nPerformance Metrics:")
    print(f"Execution time: {end_time - start_time:.2f} seconds")
    # NOTE(review): this figure is the RSS difference between the end and the
    # start of the run, not a true peak; the label is kept for compatibility.
    print(f"Peak memory usage: {end_memory - start_memory:.2f} MB")
    print(f"Matrix shape: {similarity_matrix.shape}")

    return similarity_matrix

if __name__ == "__main__":
    # Input CSV, resolved relative to the kernel's working directory.
    csv_file_path = "lingBinary.csv"

    try:
        similarity_matrix = compute_similarity_from_csv(csv_file_path, kmax=10, B=100, m=0.8)

        # Summary statistics taken over every entry of the matrix.
        print("\nSimilarity Matrix Statistics:")
        print(f"Min similarity: {similarity_matrix.min():.3f}")
        print(f"Max similarity: {similarity_matrix.max():.3f}")
        print(f"Mean similarity: {similarity_matrix.mean():.3f}")
    except Exception as exc:
        # Report the failure with its full traceback rather than aborting
        # the cell with a bare exception.
        import traceback

        print(f"An error occurred: {exc}")
        traceback.print_exc()
    finally:
        # Always give memory back, whether or not the run succeeded.
        gc.collect()


Processing lingBinary.csv...
Parameters: kmax=10, B=100, m=0.8
Loading sampled data...
Performing MiniBatch k-means clustering with 100 batches...
Processing similarities in 37 chunks...


100%|██████████| 37/37 [02:51<00:00,  4.63s/it]



Performance Metrics:
Execution time: 172.62 seconds
Peak memory usage: 5007.20 MB
Matrix shape: (36121, 36121)

Similarity Matrix Statistics:
Min similarity: 0.000
Max similarity: 1.000
Mean similarity: 0.211


In [8]:
# Show only the top-left 5x5 corner; the full matrix is far too large to print.
print(
    "\nSample of the Similarity Matrix (first 5x5):",
    similarity_matrix[:5, :5],
    sep="\n",
)


Sample of the Similarity Matrix (first 5x5):
[[1.         0.16053256 0.22335605 0.17087835 0.14970274]
 [0.16053256 1.         0.43323073 0.49403    0.3304244 ]
 [0.22335605 0.43323073 1.         0.39416608 0.2961134 ]
 [0.17087835 0.49403    0.39416608 1.         0.19232179]
 [0.14970274 0.3304244  0.2961134  0.19232179 1.        ]]


In [None]:
import numpy as np
import pandas as pd

def save_similarity_matrix_to_csv(matrix, filename="similarity_matrix.csv"):
    """Write ``matrix`` to ``filename`` as CSV and print a confirmation.

    The matrix is wrapped in a DataFrame, so the file carries the default
    integer column labels as its header row and the row index as its first
    column.
    """
    pd.DataFrame(matrix).to_csv(filename, index=True)
    print(f"Similarity matrix saved to '{filename}'")

if __name__ == "__main__":
    # `similarity_matrix` must already exist in the kernel namespace from the
    # cell that computes it; the former self-assignment was a no-op and has
    # been dropped.
    # Save the matrix to a CSV file
    save_similarity_matrix_to_csv(similarity_matrix)