# In [None]:
import pandas as pd
import numpy as np
from rapidfuzz import process, fuzz
from sentence_transformers import SentenceTransformer
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics.pairwise import cosine_similarity
import re
import time

def _clean_description(text):
    """Lower-case *text* and reduce it to single-space separated ``a-z0-9`` tokens."""
    # Missing descriptions arrive as NaN; str(NaN) would inject a bogus "nan" token.
    if pd.isna(text):
        return ''
    text = re.sub(r'[^a-z0-9\s]', ' ', str(text).lower())  # Replace special chars with space
    return re.sub(r'\s+', ' ', text).strip()


def _build_token_mapping(descriptions, sample_size=10000):
    """Map each token of the sampled descriptions to a shorter fuzzy-matched variant."""
    vocabulary = set()
    for desc in descriptions[:sample_size]:
        vocabulary.update(desc.split())
    # Sorted so that tie-breaking among equally scored candidates does not
    # depend on the per-process string hash seed.
    vocabulary = sorted(vocabulary)

    mapping = {}
    for token in vocabulary:
        best_match = token
        # Tokens of length <= 4 are deliberately left untouched: under the
        # Indel-based ratio a *different* token scores at most
        # 2*4/(4+5) ~= 88.9 against them, so the former ">90" cut-off could
        # only ever select the token itself.  Skipping the scan is free.
        if len(token) > 4:
            matches = process.extract(token, vocabulary, scorer=fuzz.token_sort_ratio, limit=5)
            for candidate, score, _ in matches:
                if score > 85 and len(candidate) < len(best_match):
                    best_match = candidate
        mapping[token] = best_match
    return mapping


def _embed_descriptions(texts, batch_size=2000):
    """Encode *texts* with the sentence-transformer model, batch by batch, into one 2-D array."""
    model = SentenceTransformer('all-MiniLM-L6-v2')
    total = len(texts)
    batches = []
    for batch_no, start in enumerate(range(0, total, batch_size)):
        batch = texts[start:start + batch_size]
        batches.append(np.asarray(model.encode(batch, show_progress_bar=False)))
        if batch_no % 10 == 0:
            print(f"Processed {min(start + batch_size, total)}/{total}")
    return np.vstack(batches)


def _merge_similar_clusters(labels, centroids, similarity_threshold):
    """Union clusters whose centroid cosine similarity exceeds the threshold; return gap-free labels."""
    # Only centroids that actually own samples take part, so an empty
    # cluster cannot act as a bridge between two populated ones.
    used_ids, compact = np.unique(labels, return_inverse=True)
    similarity = cosine_similarity(centroids[used_ids])
    n_used = len(used_ids)

    parent = list(range(n_used))

    def find(node):
        # Path-halving lookup of the representative of *node*.
        while parent[node] != node:
            parent[node] = parent[parent[node]]
            node = parent[node]
        return node

    # Explicit upper-triangle mask (rather than np.triu on the values) so a
    # negative threshold cannot accidentally match the zeroed lower half.
    upper = np.triu(np.ones((n_used, n_used), dtype=bool), k=1)
    for a, b in np.argwhere(upper & (similarity > similarity_threshold)):
        root_a, root_b = find(int(a)), find(int(b))
        if root_a != root_b:
            # The smaller id becomes the representative: deterministic output.
            parent[max(root_a, root_b)] = min(root_a, root_b)

    roots = np.fromiter((find(c) for c in range(n_used)), dtype=np.intp, count=n_used)
    _, merged = np.unique(roots[compact], return_inverse=True)
    return merged


def _absorb_small_clusters(embeddings, labels, small_cluster_size, max_merge_distance):
    """Move clusters of at most *small_cluster_size* members into the nearest larger cluster."""
    n_groups = int(labels.max()) + 1
    sizes = np.bincount(labels, minlength=n_groups)
    # Centres are recomputed from the members of the *merged* clusters so that
    # row i really describes label i (the KMeans centroids are indexed by the
    # pre-merge ids and must not be used here).
    sums = np.zeros((n_groups, embeddings.shape[1]), dtype=np.float64)
    np.add.at(sums, labels, embeddings)
    centers = sums / np.maximum(sizes, 1)[:, None]

    large_ids = np.flatnonzero(sizes > small_cluster_size)
    if large_ids.size == 0:
        return labels

    result = labels.copy()
    small_ids = np.flatnonzero((sizes > 0) & (sizes <= small_cluster_size))
    for small_id in small_ids:
        distances = np.linalg.norm(centers[large_ids] - centers[small_id], axis=1)
        nearest = int(np.argmin(distances))
        if distances[nearest] < max_merge_distance:
            result[labels == small_id] = large_ids[nearest]
    return result


def _name_clusters(clean_descriptions, labels):
    """Return ``{cluster_id: name}`` built from each cluster's three most frequent long terms."""
    names = {}
    for cluster_id, group in clean_descriptions.groupby(labels):
        terms = [term for desc in group for term in desc.split() if len(term) > 3]  # Filter short terms
        if terms:
            top_terms = pd.Series(terms).value_counts().head(3).index.tolist()
            names[cluster_id] = "_".join(top_terms)
            continue
        # No long terms at all: fall back to the first description, and to a
        # generic id-based name when even that is empty.
        fallback = "_".join(group.iloc[0].split()[:3])
        names[cluster_id] = fallback or f"cluster_{cluster_id}"
    return names


def cluster_material_descriptions(input_file, material_type='FIN', n_clusters=1000, similarity_threshold=0.8,
                                  *, small_cluster_size=2, max_merge_distance=0.5):
    """
    Cluster material descriptions with aggressive post-merging.

    Pipeline: clean the descriptions, normalise tokens through a fuzzy
    synonym mapping, embed them, run MiniBatchKMeans with a high K, merge
    clusters whose centroids are similar, fold tiny clusters into their
    nearest larger neighbour, then name every cluster from its most frequent
    terms.  The result is written to ``output_k_{n_clusters}_merged.csv`` in
    the current working directory and progress is printed to stdout.

    Parameters:
    input_file: path to input CSV file; must contain the columns
        'material_number', 'material_type' and 'material_description'
    material_type: material type to filter (default: 'FIN')
    n_clusters: initial number of clusters (high value for fine-grained);
        reduced to the number of selected records when that is smaller
    similarity_threshold: threshold for merging clusters (higher means less aggressive)
    small_cluster_size: clusters with at most this many records are
        candidates for absorption into a larger cluster (keyword-only)
    max_merge_distance: a small cluster is absorbed only if the Euclidean
        distance between its mean embedding and the nearest larger cluster's
        mean embedding is below this value (keyword-only)

    Returns:
    The filtered DataFrame with the added columns 'clean_desc',
    'normalized_desc', 'proposedkey' and 'cluster' (gap-free integer ids).

    Raises:
    KeyError: if a required column is missing from the input file
    ValueError: if no record matches material_type
    """
    start_time = time.time()
    print(f"Starting clustering for {material_type} with initial K={n_clusters}")

    # Load and filter data
    print("Loading data...")
    df = pd.read_csv(input_file)
    output_cols = ['material_number', 'material_type', 'material_description', 'proposedkey', 'cluster']
    # Fail fast: 'material_number' is otherwise first needed when saving,
    # i.e. only after all the expensive work has been done.
    missing = [col for col in output_cols[:3] if col not in df.columns]
    if missing:
        raise KeyError(f"Input file is missing required columns: {missing}")

    df_fin = df[df['material_type'] == material_type].copy().reset_index(drop=True)
    print(f"Found {len(df_fin)} {material_type} records")
    if df_fin.empty:
        raise ValueError(f"No records found for material type {material_type!r}")

    df_fin['clean_desc'] = df_fin['material_description'].apply(_clean_description)

    # Get unique tokens for synonym mapping (sample for efficiency)
    print("Creating synonym mapping...")
    token_mapping = _build_token_mapping(df_fin['clean_desc'].tolist())

    # Apply synonym normalization
    print("Applying synonym normalization...")
    df_fin['normalized_desc'] = df_fin['clean_desc'].apply(
        lambda desc: ' '.join(token_mapping.get(token, token) for token in desc.split())
    )

    # Generate embeddings
    print("Generating embeddings...")
    embeddings = _embed_descriptions(df_fin['normalized_desc'].tolist())

    # Perform clustering with high K (KMeans rejects K > number of samples)
    effective_k = min(n_clusters, len(df_fin))
    if effective_k < n_clusters:
        print(f"Only {len(df_fin)} records available; reducing K from {n_clusters} to {effective_k}")
    print(f"Clustering with K={effective_k}...")
    kmeans = MiniBatchKMeans(n_clusters=effective_k, random_state=42, batch_size=1000, n_init=3, max_iter=100)
    cluster_labels = kmeans.fit_predict(embeddings)

    # Aggressive post-merging based on centroid similarity
    print("Aggressive merging of similar clusters...")
    cluster_labels = _merge_similar_clusters(cluster_labels, kmeans.cluster_centers_, similarity_threshold)
    n_clusters_after_merge = int(cluster_labels.max()) + 1
    print(f"Merged from {effective_k} to {n_clusters_after_merge} clusters")

    # Handle small clusters by merging them with the nearest large cluster
    print("Handling small clusters...")
    cluster_labels = _absorb_small_clusters(embeddings, cluster_labels, small_cluster_size, max_merge_distance)
    # Absorption empties some ids; renumber once more so the output has no gaps.
    _, cluster_labels = np.unique(cluster_labels, return_inverse=True)

    # Generate cluster names
    print("Generating cluster names...")
    cluster_names = _name_clusters(df_fin['clean_desc'], cluster_labels)

    # Add results to dataframe
    df_fin['proposedkey'] = [cluster_names[label] for label in cluster_labels]
    df_fin['cluster'] = cluster_labels

    # Save output (file name keeps the requested K so reruns stay comparable)
    output_file = f'output_k_{n_clusters}_merged.csv'
    df_fin[output_cols].to_csv(output_file, index=False)

    # Print statistics
    total_time = time.time() - start_time
    cluster_sizes = df_fin['cluster'].value_counts()
    print(f"\nCluster Statistics after merging:")
    print(f"Total clusters: {len(cluster_sizes)}")
    print(f"Clusters with > 10 items: {len(cluster_sizes[cluster_sizes > 10])}")
    print(f"Clusters with > 5 items: {len(cluster_sizes[cluster_sizes > 5])}")
    print(f"Clusters with 1-2 items: {len(cluster_sizes[cluster_sizes <= 2])}")
    print(f"Processing time: {total_time/60:.2f} minutes")
    print(f"Output saved to {output_file}")

    return df_fin

if __name__ == "__main__":
    # Run settings -- edit these values before executing the script.
    input_file = 'input.csv'
    material_type = 'FIN'
    # A deliberately high K so the first pass produces fine-grained clusters.
    initial_k = 1000
    # Centroid-similarity cut-off for merging; lowering it (0.7-0.9) merges
    # more aggressively.
    similarity_threshold = 0.8

    result = cluster_material_descriptions(
        input_file,
        material_type,
        n_clusters=initial_k,
        similarity_threshold=similarity_threshold,
    )