# In [None]:
import pandas as pd
import numpy as np
from rapidfuzz import process, fuzz
from sentence_transformers import SentenceTransformer
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics.pairwise import cosine_similarity
import re
import time

_NON_ALNUM_RE = re.compile(r'[^a-z0-9\s]')
_WHITESPACE_RE = re.compile(r'\s+')


def _clean_text(text):
    """Lowercase *text*, replace non-alphanumerics with spaces, collapse whitespace."""
    lowered = str(text).lower()
    spaced = _NON_ALNUM_RE.sub(' ', lowered)
    return _WHITESPACE_RE.sub(' ', spaced).strip()


def _build_token_mapping(descriptions, sample_size=5000, score_threshold=85):
    """Map each sampled token to the shortest fuzzy-similar token (or itself)."""
    vocabulary = set()
    for desc in descriptions[:sample_size]:
        vocabulary.update(desc.split())

    # Tokens containing digits (sizes, part codes) are left out of the
    # synonym search: e.g. "1000" vs "100" scores above the threshold, and
    # merging them would conflate genuinely different materials.
    # Sorting makes tie-breaking between equally scored matches reproducible
    # (plain set iteration order varies between interpreter runs).
    candidates = sorted(
        token for token in vocabulary
        if not any(ch.isdigit() for ch in token)
    )

    mapping = {}
    for token in candidates:
        matches = process.extract(
            token,
            candidates,
            scorer=fuzz.token_sort_ratio,
            limit=5,
            score_cutoff=score_threshold,  # lets rapidfuzz discard weak matches early
        )
        best_match = token
        for match, score, _ in matches:
            # Prefer shorter forms among sufficiently similar tokens
            if score > score_threshold and len(match) < len(best_match):
                best_match = match
        mapping[token] = best_match
    return mapping


def _merge_small_clusters(labels, embeddings, centers, max_small_size=2, max_distance=0.4):
    """Reassign tiny clusters to the nearest large cluster when it is close enough."""
    merged = labels.copy()
    sizes = np.bincount(labels, minlength=len(centers))

    # "Large" is decided from the initial K-means sizes, before any merging.
    large_ids = np.flatnonzero(sizes > max_small_size)
    if large_ids.size == 0:
        return merged
    large_centers = centers[large_ids]

    small_ids = np.flatnonzero((sizes > 0) & (sizes <= max_small_size))
    for small_id in small_ids:
        member_idx = np.flatnonzero(labels == small_id)
        centroid = embeddings[member_idx].mean(axis=0)
        distances = np.linalg.norm(large_centers - centroid, axis=1)
        nearest = int(np.argmin(distances))
        # Only merge if very close (conservative threshold)
        if distances[nearest] < max_distance:
            merged[member_idx] = large_ids[nearest]
    return merged


def _name_clusters(raw_descriptions, labels, n_clusters):
    """Build a name per cluster id from its three most frequent terms (> 3 chars)."""
    names = {}
    for cluster_id in range(n_clusters):
        members = raw_descriptions[labels == cluster_id].tolist()

        if not members:
            names[cluster_id] = f"cluster_{cluster_id}"
            continue

        terms = [
            term
            for desc in members
            for term in _clean_text(desc).split()
            if len(term) > 3  # filter out very short terms
        ]

        if not terms:
            # Fallback to first few words of first description
            names[cluster_id] = "_".join(_clean_text(members[0]).split()[:3])
            continue

        top_terms = pd.Series(terms).value_counts().head(3).index.tolist()
        names[cluster_id] = "_".join(top_terms)
    return names


def cluster_material_descriptions(input_file, material_type='FIN', n_clusters=300):
    """
    Cluster material descriptions with adjustable K value

    Pipeline: filter rows by ``material_type``, clean and synonym-normalise
    the descriptions, embed them with a sentence-transformer model, run
    MiniBatchKMeans, fold clusters of one or two items into a nearby large
    cluster, and derive a name for every cluster from its most frequent
    terms.

    Parameters:
    input_file: path to input CSV file; the code reads the columns
        'material_number', 'material_type' and 'material_description'
    material_type: material type to filter (default: 'FIN')
    n_clusters: number of clusters to create (adjustable parameter); if it
        exceeds the number of matching rows, the row count is used instead

    Returns:
    The filtered DataFrame with the added columns 'clean_desc',
    'normalized_desc', 'proposedkey' and 'cluster'. When no row matches
    ``material_type`` the returned frame is empty (with those columns) and
    no output file is written.

    Side effects:
    Prints progress and statistics, and writes the result to
    'output_k_<n_clusters>.csv' in the current working directory.

    Raises:
    ValueError: if ``n_clusters`` is smaller than 1
    """
    if n_clusters < 1:
        raise ValueError(f"n_clusters must be >= 1, got {n_clusters}")

    # Start timer
    start_time = time.time()
    print(f"Starting clustering for {material_type} with K={n_clusters}")

    # Load and filter data
    print("Loading data...")
    df = pd.read_csv(input_file)
    df_fin = df[df['material_type'] == material_type].copy().reset_index(drop=True)
    print(f"Found {len(df_fin)} {material_type} records")

    if df_fin.empty:
        # Nothing to embed or cluster; K-means cannot be fitted on zero rows.
        print(f"No {material_type} records to cluster; skipping.")
        for column in ('clean_desc', 'normalized_desc', 'proposedkey'):
            df_fin[column] = pd.Series(dtype=object)
        df_fin['cluster'] = pd.Series(dtype=int)
        return df_fin

    # Clean descriptions
    df_fin['clean_desc'] = df_fin['material_description'].apply(_clean_text)
    descriptions = df_fin['clean_desc'].tolist()

    # Synonym mapping from a sample of the vocabulary (sample for efficiency)
    print("Creating synonym mapping...")
    token_mapping = _build_token_mapping(descriptions)

    # Apply synonym normalization; unseen tokens map to themselves
    print("Applying synonym normalization...")
    normalized = [
        ' '.join(token_mapping.get(token, token) for token in desc.split())
        for desc in descriptions
    ]
    df_fin['normalized_desc'] = normalized

    # Generate embeddings
    print("Generating embeddings...")
    model = SentenceTransformer('all-MiniLM-L6-v2')

    batch_size = 2000
    total = len(normalized)
    chunks = []
    for start in range(0, total, batch_size):
        stop = min(start + batch_size, total)
        chunks.append(
            np.asarray(model.encode(normalized[start:stop], show_progress_bar=False))
        )
        if (start // batch_size) % 5 == 0:  # Print progress every 5 batches
            print(f"Processed {stop}/{total}")
    embeddings = np.vstack(chunks)

    # K-means needs at least as many samples as clusters
    effective_k = min(n_clusters, total)
    if effective_k < n_clusters:
        print(f"Only {total} records available; reducing K from {n_clusters} to {effective_k}")

    # Perform clustering with specified K
    print(f"Clustering with K={effective_k}...")
    kmeans = MiniBatchKMeans(n_clusters=effective_k, random_state=42,
                             batch_size=1000, n_init=3, max_iter=100)
    cluster_labels = kmeans.fit_predict(embeddings)

    # Post-processing: Handle small clusters
    print("Post-processing clusters...")
    cluster_labels = _merge_small_clusters(
        cluster_labels, embeddings, kmeans.cluster_centers_
    )

    # Generate cluster names
    print("Generating cluster names...")
    cluster_names = _name_clusters(
        df_fin['material_description'], cluster_labels, effective_k
    )

    # Add results to dataframe
    df_fin['proposedkey'] = [cluster_names[label] for label in cluster_labels]
    df_fin['cluster'] = cluster_labels

    # Save output (file name keeps the requested K so callers can locate it)
    output_cols = ['material_number', 'material_type', 'material_description', 'proposedkey', 'cluster']
    output_file = f'output_k_{n_clusters}.csv'
    df_fin[output_cols].to_csv(output_file, index=False)

    total_time = time.time() - start_time

    # Print cluster statistics
    cluster_sizes = df_fin['cluster'].value_counts()
    print(f"\nCluster Statistics for K={n_clusters}:")
    print(f"Total clusters: {len(cluster_sizes)}")
    print(f"Clusters with > 10 items: {len(cluster_sizes[cluster_sizes > 10])}")
    print(f"Clusters with > 5 items: {len(cluster_sizes[cluster_sizes > 5])}")
    print(f"Clusters with 1-2 items: {len(cluster_sizes[cluster_sizes <= 2])}")
    print(f"Largest cluster size: {cluster_sizes.max()}")
    print(f"Average cluster size: {cluster_sizes.mean():.2f}")

    print(f"\nProcessing completed in {total_time/60:.2f} minutes")
    print(f"Output saved to {output_file}")

    return df_fin

# Script entry point: perform one clustering pass for a single K.
if __name__ == "__main__":
    # Requested number of clusters; edit this value to experiment.
    K_VALUE = 1000

    # Banner so the run is easy to spot in console output.
    print("=" * 60)
    print(f"RUNNING CLUSTERING WITH K = {K_VALUE}")
    print("=" * 60)

    result = cluster_material_descriptions(
        'input.csv',
        'FIN',
        n_clusters=K_VALUE,
    )