# In [None]:
import pandas as pd
import numpy as np
from rapidfuzz import process, fuzz
from sentence_transformers import SentenceTransformer
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_score
from sklearn.neighbors import NearestNeighbors
import re
import time
import warnings
warnings.filterwarnings('ignore')

# Patterns used by _clean_text, compiled once instead of on every row.
_NON_ALNUM_RE = re.compile(r'[^a-z0-9\s]')
_WHITESPACE_RE = re.compile(r'\s+')

# Columns the pipeline reads from the input and writes back to the output.
_REQUIRED_COLUMNS = ('material_number', 'material_type', 'material_description')

# Cluster counts tried during silhouette-based K selection.
_K_CANDIDATES = (50, 100, 150, 200, 250, 300)


def _clean_text(text):
    """Lower-case *text*, turn non-alphanumerics into spaces, collapse whitespace."""
    lowered = _NON_ALNUM_RE.sub(' ', str(text).lower())
    return _WHITESPACE_RE.sub(' ', lowered).strip()


def _embed_descriptions(model, texts, batch_size=2000):
    """Encode *texts* with *model* batch by batch and stack the results row-wise."""
    total = len(texts)
    chunks = []
    for start in range(0, total, batch_size):
        chunks.append(
            model.encode(texts[start:start + batch_size], show_progress_bar=False)
        )
        # Report every fifth batch (every 10,000 rows at the default batch size).
        if (start // batch_size) % 5 == 0:
            print(f"Processed {min(start + batch_size, total)}/{total}")
    return np.vstack(chunks)


def _density_based_k(sample):
    """Estimate a cluster count (50..300) from nearest-neighbour distances in *sample*."""
    n_points = len(sample)
    # n_neighbors may not exceed the number of fitted points.
    neighbours = NearestNeighbors(n_neighbors=min(10, n_points)).fit(sample)
    distances, _ = neighbours.kneighbors(sample)
    density_estimate = np.percentile(distances.mean(axis=1), 75)
    if density_estimate <= 0:
        # All neighbours coincide; the formula below diverges, so use its cap.
        return 300
    return max(50, min(300, int(n_points / (density_estimate * 100))))


def _choose_cluster_count(embeddings, rng, random_state):
    """Pick K by silhouette score on a random sample, never exceeding the row count."""
    n_rows = len(embeddings)
    sample_size = min(5000, n_rows)
    sample = embeddings[rng.choice(n_rows, sample_size, replace=False)]

    # Silhouette is scored on a subset of the sample to save time.
    sil_size = min(1000, sample_size)
    best_k = None
    best_silhouette = -1.0

    for k in _K_CANDIDATES:
        if k < 2 or k >= sample_size:
            continue
        labels = MiniBatchKMeans(
            n_clusters=k, random_state=random_state, batch_size=500, n_init=1
        ).fit_predict(sample)
        subset = rng.choice(sample_size, sil_size, replace=False)
        try:
            score = silhouette_score(sample[subset], labels[subset])
        except ValueError:
            # Raised when the subset does not contain a usable number of labels.
            continue
        if score > best_silhouette:
            best_silhouette, best_k = score, k

    if best_k is None:
        # No candidate could be scored (e.g. too few rows): use the density estimate.
        best_k = _density_based_k(sample)

    # K-means cannot form more clusters than there are rows.
    return max(1, min(best_k, n_rows))


def _merge_small_clusters(embeddings, labels, centers, max_size=2, max_distance=0.4):
    """Return a copy of *labels* with tiny clusters folded into a close neighbour.

    A cluster of at most *max_size* members is reassigned to the nearest OTHER
    non-empty cluster when that cluster's centre lies within *max_distance* of
    the small cluster's mean embedding.
    """
    merged = labels.copy()
    sizes = np.bincount(merged, minlength=len(centers))
    initially_small = np.flatnonzero((sizes > 0) & (sizes <= max_size))

    for cluster_id in initially_small:
        # Skip clusters that were emptied or have grown through earlier merges.
        if not 0 < sizes[cluster_id] <= max_size:
            continue
        members = np.flatnonzero(merged == cluster_id)
        centroid = embeddings[members].mean(axis=0)
        distances = np.linalg.norm(centers - centroid, axis=1)
        # A cluster's own centre is almost always the nearest one; without this
        # exclusion the merge would just reassign the cluster to itself.
        distances[cluster_id] = np.inf
        distances[sizes == 0] = np.inf
        target = int(np.argmin(distances))

        # Only merge if very close (conservative threshold).
        if distances[target] < max_distance:
            merged[members] = target
            sizes[target] += members.size
            sizes[cluster_id] = 0

    return merged


def _name_clusters(clean_descs, labels):
    """Map each cluster id present in *labels* to a name built from its top terms."""
    names = {}
    grouped = pd.Series(list(clean_descs)).groupby(np.asarray(labels))
    for cluster_id, descs in grouped:
        cluster_id = int(cluster_id)
        # Very short tokens are ignored when ranking terms.
        terms = [term for desc in descs for term in desc.split() if len(term) > 3]
        if terms:
            top_terms = pd.Series(terms).value_counts().head(3).index
            name = "_".join(top_terms)
        else:
            # Fall back to the first few words of the first description.
            name = "_".join(descs.iloc[0].split()[:3])
        names[cluster_id] = name or f"cluster_{cluster_id}"
    return names


def efficient_clustering_pipeline(input_file, material_type='FIN',
                                  output_file='output.csv', random_state=42):
    """
    Cluster part descriptions of one material type and write the result to CSV.

    Steps: load the CSV, keep rows whose ``material_type`` equals
    *material_type*, clean and embed ``material_description``, choose a cluster
    count by silhouette score on a sample, run MiniBatchKMeans, fold tiny
    clusters into a close neighbour, and name each cluster after its most
    frequent terms.

    Args:
        input_file: Path or file-like object accepted by ``pandas.read_csv``.
            Must provide the columns ``material_number``, ``material_type``
            and ``material_description``.
        material_type: Value of ``material_type`` to keep.
        output_file: Destination passed to ``DataFrame.to_csv``.
        random_state: Seed used for sampling and for K-means.

    Returns:
        The filtered DataFrame with the added columns ``clean_desc``,
        ``proposedkey`` (cluster name) and ``cluster`` (cluster id).

    Raises:
        KeyError: If a required column is missing from the input.
        ValueError: If no row matches *material_type*.
    """
    start_time = time.time()

    # Load and filter data
    print("Loading data...")
    df = pd.read_csv(input_file)
    missing = [col for col in _REQUIRED_COLUMNS if col not in df.columns]
    if missing:
        # Fail before the expensive embedding step rather than at save time.
        raise KeyError(f"Input is missing required column(s): {missing}")

    df_fin = df[df['material_type'] == material_type].copy().reset_index(drop=True)
    print(f"Found {len(df_fin)} {material_type} records")
    if df_fin.empty:
        raise ValueError(
            f"No records with material_type == {material_type!r} in input"
        )

    df_fin['clean_desc'] = df_fin['material_description'].apply(_clean_text)

    # Load embedding model (using a balanced model for speed/accuracy)
    print("Loading embedding model...")
    model = SentenceTransformer('all-MiniLM-L6-v2')

    print("Generating embeddings...")
    embeddings = _embed_descriptions(model, df_fin['clean_desc'].tolist())

    # One seeded generator keeps the sampling reproducible across runs.
    rng = np.random.default_rng(random_state)

    print("Determining optimal cluster count...")
    best_k = _choose_cluster_count(embeddings, rng, random_state)
    print(f"Selected K: {best_k}")

    print("Clustering...")
    kmeans = MiniBatchKMeans(n_clusters=best_k, random_state=random_state,
                             batch_size=1000, n_init=3, max_iter=100)
    cluster_labels = kmeans.fit_predict(embeddings)

    print("Post-processing clusters...")
    cluster_labels = _merge_small_clusters(
        embeddings, cluster_labels, kmeans.cluster_centers_
    )

    print("Generating cluster names...")
    cluster_names = _name_clusters(df_fin['clean_desc'], cluster_labels)

    # Add results to dataframe
    df_fin['proposedkey'] = [cluster_names[int(label)] for label in cluster_labels]
    df_fin['cluster'] = cluster_labels

    # Save output
    output_cols = ['material_number', 'material_type', 'material_description',
                   'proposedkey', 'cluster']
    df_fin[output_cols].to_csv(output_file, index=False)

    total_time = time.time() - start_time
    print(f"Processing completed in {total_time/60:.2f} minutes")
    # Report the clusters that actually hold records after merging.
    print(f"Created {len(cluster_names)} clusters")
    print(f"Output saved to {output_file}")

    return df_fin

# Script entry point: run the pipeline on input.csv for 'FIN' records
# when this file is executed directly (not when it is imported).
if __name__ == "__main__":
    df_result = efficient_clustering_pipeline(
        input_file='input.csv',
        material_type='FIN',
    )