# In [None]:
import pandas as pd
import numpy as np
from rapidfuzz import process, fuzz
from sentence_transformers import SentenceTransformer
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import silhouette_score
import re
import time
import json

# Characters that are neither word characters, whitespace nor the listed
# accented letters; shared by description cleaning and cluster naming.
_NON_WORD_RE = re.compile(r'[^\w\säöüßàâçéèêëîïôùûüÿæøå]', flags=re.IGNORECASE)


def _clean_text(text):
    """Replace symbols with spaces, collapse whitespace and lowercase one description."""
    text = _NON_WORD_RE.sub(' ', str(text))
    return re.sub(r'\s+', ' ', text).strip().lower()


def _top_term_indices(tfidf_matrix, top_n=50):
    """Return the column indices of the ``top_n`` terms with the largest summed TF-IDF weight."""
    term_importance = np.asarray(tfidf_matrix.sum(axis=0)).ravel()
    top_indices = np.argsort(term_importance)[-top_n:]
    return [int(i) for i in top_indices if term_importance[i] > 0]


def _encode_in_batches(model, texts, batch_size=1000):
    """Encode ``texts`` with ``model.encode`` batch by batch and stack the results row-wise."""
    total = len(texts)
    batches = []
    for start in range(0, total, batch_size):
        batch = model.encode(texts[start:start + batch_size], show_progress_bar=False)
        batches.append(np.asarray(batch))
        # Report progress on every tenth batch only.
        if (start // batch_size) % 10 == 0:
            print(f"Processed {min(start + batch_size, total)}/{total}")
    return np.vstack(batches)


def _merge_similar_centroids(labels, centroids, threshold):
    """Fold each cluster into the first higher-numbered cluster whose centroid is similar enough."""
    similarity = cosine_similarity(centroids)
    merged = labels.copy()
    # Targets are always higher-numbered than sources, so walking the sources
    # in ascending order carries previously merged members along a chain.
    for source in range(len(similarity) - 1):
        hits = np.flatnonzero(similarity[source, source + 1:] > threshold)
        if hits.size:
            merged[merged == source] = source + 1 + hits[0]
    return merged


def _merge_by_shared_keywords(labels, tfidf_dense, feature_names, top_k=5, min_shared=3):
    """Merge clusters whose ``top_k`` TF-IDF keywords share at least ``min_shared`` terms."""
    keywords = {}
    for cluster_id in np.unique(labels):
        weights = tfidf_dense[labels == cluster_id].sum(axis=0)
        top = np.argsort(weights)[-top_k:]
        keywords[cluster_id] = {feature_names[i] for i in top if weights[i] > 0}

    # absorbed_by maps a merged-away cluster to the surviving cluster. Pairs are
    # visited with ascending "first", so every stored target is already final.
    absorbed_by = {}
    cluster_ids = list(keywords)
    for position, first in enumerate(cluster_ids):
        for second in cluster_ids[position + 1:]:
            if second in absorbed_by:
                continue  # already merged elsewhere; nothing left to move
            if len(keywords[first] & keywords[second]) >= min_shared:
                # Redirect to the survivor of "first" so an already absorbed
                # cluster label is never brought back to life.
                absorbed_by[second] = absorbed_by.get(first, first)

    merged = labels.copy()
    for source, target in absorbed_by.items():
        merged[labels == source] = target
    return merged


def _name_clusters(raw_descriptions, cluster_labels, n_clusters):
    """Build a name per cluster id from the three most frequent terms longer than three characters."""
    names = {}
    for cluster_id in range(n_clusters):
        # str() keeps NaN / numeric descriptions from breaking the regex and split calls.
        members = [str(desc) for desc in raw_descriptions[cluster_labels == cluster_id].tolist()]
        if not members:
            names[cluster_id] = f"cluster_{cluster_id}"
            continue

        terms = [
            term
            for desc in members
            for term in _NON_WORD_RE.sub(' ', desc).lower().split()
            if len(term) > 3
        ]
        if terms:
            top_terms = pd.Series(terms).value_counts().head(3).index.tolist()
            names[cluster_id] = "_".join(top_terms)
        else:
            # No usable term: fall back to the first words of the first description,
            # or to a generic name when that description is blank.
            names[cluster_id] = "_".join(members[0].split()[:3]) or f"cluster_{cluster_id}"
    return names


def _json_default(value):
    """Convert NumPy scalars and arrays to built-in types for ``json.dump``."""
    if isinstance(value, np.generic):
        return value.item()
    if isinstance(value, np.ndarray):
        return value.tolist()
    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")


def advanced_clustering(input_file, material_type='FIN', n_clusters=800, similarity_threshold=0.8):
    """
    Cluster material descriptions and write the result plus a validation report.

    Rows of ``input_file`` whose ``material_type`` column equals
    ``material_type`` are cleaned, embedded with a multilingual sentence
    model, combined with boosted TF-IDF features, clustered with
    MiniBatchKMeans, and then merged in two stages (centroid similarity,
    shared keywords). Each final cluster gets a name built from its most
    frequent terms.

    Args:
        input_file: CSV path; must provide the columns ``material_number``,
            ``material_type`` and ``material_description``.
        material_type: Value of ``material_type`` to keep.
        n_clusters: Requested number of KMeans clusters. It is reduced to the
            number of selected records when larger. The output file names
            always use the requested value.
        similarity_threshold: Centroid cosine similarity above which two
            clusters are merged in the first merge stage.

    Returns:
        Tuple ``(df_fin, validation_report)``: the filtered dataframe with the
        added ``clean_desc``, ``proposedkey`` and ``cluster`` columns, and the
        dict returned by ``validate_clusters``.

    Raises:
        ValueError: If no row matches ``material_type``.

    Side effects:
        Writes ``advanced_output_k_{n_clusters}.csv`` and
        ``validation_report_k_{n_clusters}.json`` to the working directory and
        prints progress information.
    """
    start_time = time.time()
    print(f"Starting advanced clustering for {material_type}")

    # Load data
    df = pd.read_csv(input_file)
    df_fin = df[df['material_type'] == material_type].copy().reset_index(drop=True)
    print(f"Found {len(df_fin)} {material_type} records")
    if df_fin.empty:
        raise ValueError(f"No records with material_type == {material_type!r} in {input_file}")

    # Text cleaning with language preservation
    df_fin['clean_desc'] = df_fin['material_description'].apply(_clean_text)

    # One TF-IDF fit serves both term discovery and the keyword features
    # (the same vectorizer configuration was previously fitted twice).
    print("Discovering important terms from dataset...")
    vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(df_fin['clean_desc'])
    feature_names = vectorizer.get_feature_names_out()
    industrial_indices = _top_term_indices(tfidf_matrix)
    industrial_terms = [str(feature_names[i]) for i in industrial_indices]
    print(f"Discovered {len(industrial_terms)} important terms: {industrial_terms[:10]}...")

    # Use multilingual embedding model
    print("Loading multilingual embedding model...")
    model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

    print("Generating multilingual embeddings...")
    embeddings = _encode_in_batches(model, df_fin['clean_desc'].tolist())

    # Boost the discovered terms on the dense copy; this avoids costly
    # column assignment on the sparse matrix.
    print("Creating domain-aware features...")
    tfidf_dense = tfidf_matrix.toarray()
    if industrial_indices:
        tfidf_dense[:, industrial_indices] *= 2
    combined_features = np.hstack([embeddings, tfidf_dense])

    # KMeans cannot build more clusters than there are samples.
    effective_clusters = min(n_clusters, len(df_fin))
    if effective_clusters < n_clusters:
        print(f"Only {len(df_fin)} records available; reducing cluster count to {effective_clusters}")
    print(f"Clustering with {effective_clusters} clusters...")
    kmeans = MiniBatchKMeans(n_clusters=effective_clusters, random_state=42,
                             batch_size=1000, n_init=3, max_iter=100)
    cluster_labels = kmeans.fit_predict(combined_features)

    # Multi-stage cluster merging
    print("Performing multi-stage cluster merging...")
    # Stage 1: semantic similarity, using only the embedding part of the centroids.
    embedding_centroids = kmeans.cluster_centers_[:, :embeddings.shape[1]]
    new_labels = _merge_similar_centroids(cluster_labels, embedding_centroids, similarity_threshold)
    # Stage 2: shared top keywords (TF-IDF part).
    new_labels = _merge_by_shared_keywords(new_labels, tfidf_dense, feature_names)

    # Renumber clusters to 0..n_final_clusters-1
    unique_clusters, cluster_labels = np.unique(new_labels, return_inverse=True)
    n_final_clusters = len(unique_clusters)
    print(f"Final cluster count: {n_final_clusters}")

    # Generate cluster names with multi-language support
    print("Generating cluster names...")
    cluster_names = _name_clusters(df_fin['material_description'], cluster_labels, n_final_clusters)

    # Add results to dataframe
    df_fin['proposedkey'] = [cluster_names[label] for label in cluster_labels]
    df_fin['cluster'] = cluster_labels

    # Automated validation and quality reporting
    print("Performing automated validation...")
    validation_report = validate_clusters(df_fin, embeddings, cluster_labels, cluster_names)

    # Save output and validation report
    output_cols = ['material_number', 'material_type', 'material_description', 'proposedkey', 'cluster']
    output_file = f'advanced_output_k_{n_clusters}.csv'
    df_fin[output_cols].to_csv(output_file, index=False)

    report_file = f'validation_report_k_{n_clusters}.json'
    with open(report_file, 'w') as f:
        # NumPy scalars are not JSON serialisable by default; convert them on the fly.
        json.dump(validation_report, f, indent=2, default=_json_default)

    # Print statistics
    total_time = time.time() - start_time
    cluster_sizes = df_fin['cluster'].value_counts()

    print(f"\nCluster Statistics:")
    print(f"Total clusters: {len(cluster_sizes)}")
    print(f"Clusters with > 10 items: {len(cluster_sizes[cluster_sizes > 10])}")
    print(f"Clusters with > 5 items: {len(cluster_sizes[cluster_sizes > 5])}")
    print(f"Clusters with 1-2 items: {len(cluster_sizes[cluster_sizes <= 2])}")
    print(f"Processing time: {total_time/60:.2f} minutes")
    print(f"Output saved to {output_file}")
    print(f"Validation report saved to {report_file}")

    return df_fin, validation_report

def _mean_cosine_to_centroid(vectors):
    """Return the mean cosine similarity between each row of ``vectors`` and their centroid.

    Zero-length rows (or a zero centroid) contribute a similarity of 0, which
    matches the zero-norm handling of scikit-learn's ``cosine_similarity``.
    """
    centroid = vectors.mean(axis=0)
    row_norms = np.linalg.norm(vectors, axis=1)
    row_norms[row_norms == 0] = 1.0
    centroid_norm = np.linalg.norm(centroid)
    if centroid_norm == 0:
        centroid_norm = 1.0
    return float(np.mean((vectors @ centroid) / (row_norms * centroid_norm)))


def validate_clusters(df, embeddings, cluster_labels, cluster_names,
                      sample_size=5000, cohesion_threshold=0.5, random_state=None):
    """
    Automated cluster validation using multiple metrics.

    Args:
        df: Clustered records; only its length is used, to check that it
            lines up with ``cluster_labels``.
        embeddings: 2-D array-like with one embedding row per record.
        cluster_labels: 1-D array-like with one cluster id per record.
        cluster_names: Mapping from cluster id to a display name; missing ids
            are reported as ``"unknown"``.
        sample_size: Maximum number of records used for the silhouette score.
        cohesion_threshold: Clusters with more than one member and a cohesion
            below this value are flagged as ``"low_cohesion"``.
        random_state: Seed for the silhouette sample. ``None`` draws from
            NumPy's global random state.

    Returns:
        A dict with the keys ``overall_metrics``, ``cluster_quality`` (keyed
        by the cluster id as a string) and ``potential_issues``. All values
        are built-in Python types, so the report can be passed to
        ``json.dump`` directly.

    Raises:
        ValueError: If ``df``, ``embeddings`` and ``cluster_labels`` do not
            have the same length.
    """
    embeddings = np.asarray(embeddings)
    cluster_labels = np.asarray(cluster_labels)
    n_samples = len(cluster_labels)
    if len(df) != n_samples or len(embeddings) != n_samples:
        raise ValueError(
            f"Length mismatch: {len(df)} rows, {len(embeddings)} embeddings, "
            f"{n_samples} cluster labels"
        )

    report = {
        "overall_metrics": {},
        "cluster_quality": {},
        "potential_issues": []
    }
    overall = report["overall_metrics"]

    # Silhouette score (on a sample for large datasets). This metric is
    # best-effort: e.g. it is undefined when only one cluster is present.
    try:
        rng = np.random if random_state is None else np.random.RandomState(random_state)
        sample_indices = rng.choice(n_samples, min(sample_size, n_samples), replace=False)
        silhouette_avg = silhouette_score(embeddings[sample_indices], cluster_labels[sample_indices])
        overall["silhouette_score"] = float(silhouette_avg)
    except Exception:
        overall["silhouette_score"] = "Calculation failed"

    # Group record positions by cluster id with a single stable sort.
    order = np.argsort(cluster_labels, kind="stable")
    unique_clusters, starts = np.unique(cluster_labels[order], return_index=True)
    groups = np.split(order, starts[1:])

    intra_cluster_similarities = []
    for cluster_id, cluster_indices in zip(unique_clusters, groups):
        cluster_id = int(cluster_id)  # built-in int keeps the report JSON-serialisable
        cluster_size = len(cluster_indices)

        # Cohesion is computed once and reused for the overall average.
        if cluster_size > 1:
            cohesion = _mean_cosine_to_centroid(embeddings[cluster_indices])
            intra_cluster_similarities.append(cohesion)
        else:
            cohesion = 1.0  # Single-item cluster

        # Check for potential issues
        issues = []
        if cluster_size == 1:
            issues.append("singleton_cluster")
        elif cohesion < cohesion_threshold:
            issues.append("low_cohesion")

        report["cluster_quality"][str(cluster_id)] = {
            "size": cluster_size,
            "cohesion": cohesion,
            "name": cluster_names.get(cluster_id, "unknown"),
            "issues": issues
        }

        if issues:
            report["potential_issues"].append({
                "cluster_id": cluster_id,
                "issues": issues,
                "size": cluster_size,
                "cohesion": cohesion
            })

    overall["avg_intra_cluster_similarity"] = (
        float(np.mean(intra_cluster_similarities)) if intra_cluster_similarities else 0
    )

    # Cluster size distribution (all zero when there are no clusters).
    cluster_sizes = [info["size"] for info in report["cluster_quality"].values()]
    overall["avg_cluster_size"] = float(np.mean(cluster_sizes)) if cluster_sizes else 0
    overall["max_cluster_size"] = max(cluster_sizes) if cluster_sizes else 0
    overall["min_cluster_size"] = min(cluster_sizes) if cluster_sizes else 0

    return report

if __name__ == "__main__":
    # Run the advanced clustering on the default input file.
    result, report = advanced_clustering(
        input_file='input.csv',
        material_type='FIN',
        n_clusters=1000,
        similarity_threshold=0.8
    )
    overall_metrics = report['overall_metrics']

    # The silhouette entry may be a number or a failure message, so it is
    # printed without a numeric format.
    print("\nValidation Summary:")
    print(f"Silhouette Score: {overall_metrics.get('silhouette_score', 'N/A')}")

    # Apply the numeric format only when the value supports it; formatting
    # the 'N/A' fallback with ':.3f' would raise ValueError.
    avg_similarity = overall_metrics.get('avg_intra_cluster_similarity', 'N/A')
    try:
        avg_similarity_text = f"{avg_similarity:.3f}"
    except (TypeError, ValueError):
        avg_similarity_text = str(avg_similarity)
    print(f"Average Intra-Cluster Similarity: {avg_similarity_text}")
    print(f"Potential Issues Found: {len(report['potential_issues'])}")

    # Show top 5 largest clusters
    largest_clusters = sorted(
        report['cluster_quality'].items(),
        key=lambda item: item[1]['size'],
        reverse=True,
    )[:5]
    print("\nTop 5 Largest Clusters:")
    for cluster_id, info in largest_clusters:
        print(f"Cluster {cluster_id} ({info['name']}): {info['size']} items, cohesion: {info['cohesion']:.3f}")