# In [None]:
import re
import time
from collections import Counter

import numpy as np
import pandas as pd
from rapidfuzz import process, fuzz
from sentence_transformers import SentenceTransformer
from sklearn.cluster import MiniBatchKMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def enhanced_clustering(input_file, material_type='FIN', n_clusters=1000, similarity_threshold=0.8):
    """
    Enhanced clustering with better multilingual support using existing models.

    Steps, in order: load the CSV and keep rows of ``material_type``; clean the
    descriptions; normalize tokens with a small multilingual synonym dictionary
    plus fuzzy matching; embed the normalized text and append character n-gram
    TF-IDF features; run MiniBatchKMeans; merge clusters whose centroids are
    similar; name each cluster after its most frequent terms; run
    ``validate_multilingual_clusters``; write the result to a CSV file.

    Args:
        input_file: Path of the input CSV. The columns 'material_type',
            'material_description' and 'material_number' are read.
        material_type: Value of the 'material_type' column to keep.
        n_clusters: Requested number of KMeans clusters. It is capped at the
            number of matching rows, because KMeans cannot fit more clusters
            than samples.
        similarity_threshold: Two clusters are merged when the cosine
            similarity of their centroids is strictly greater than this value.

    Returns:
        The filtered DataFrame with the added columns 'clean_desc',
        'normalized_desc', 'proposedkey' and 'cluster'.

    Raises:
        ValueError: If no row of the input has the requested material type.

    Side effects:
        Prints progress to stdout and writes
        ``enhanced_output_k_{n_clusters}.csv`` to the working directory.
    """
    start_time = time.time()
    print(f"Starting enhanced clustering for {material_type}")

    # Load data
    df = pd.read_csv(input_file)
    df_fin = df[df['material_type'] == material_type].copy().reset_index(drop=True)
    print(f"Found {len(df_fin)} {material_type} records")

    # Fail early with a clear message instead of an obscure error further down.
    if df_fin.empty:
        raise ValueError(
            f"No records with material_type == {material_type!r} found in {input_file!r}"
        )

    # Compiled once; the same pattern cleans every description.
    non_word_re = re.compile(r'[^\w\säöüßàâçéèêëîïôùûüÿæøå]', flags=re.IGNORECASE)
    whitespace_re = re.compile(r'\s+')

    # Enhanced text cleaning that preserves multilingual characters
    def clean_text(text):
        # A missing description becomes empty text rather than the word "nan",
        # which would otherwise leak into the features and the cluster names.
        text = '' if pd.isna(text) else str(text)
        # Keep alphanumeric, spaces, and common multilingual characters
        text = non_word_re.sub(' ', text)
        text = whitespace_re.sub(' ', text).strip()
        return text.lower()

    df_fin['clean_desc'] = df_fin['material_description'].apply(clean_text)
    descriptions = df_fin['clean_desc'].tolist()

    # Create a manual multilingual synonym dictionary
    print("Creating enhanced synonym mapping...")

    # Common multilingual industrial term mappings
    multilingual_mappings = {
        # English to German/French common industrial terms
        'sensor': ['sensor', 'fühler', 'geber', 'capteur'],
        'unit': ['unit', 'einheit', 'unité', 'gerät'],
        'cable': ['cable', 'kabel', 'câble', 'leitung'],
        'connector': ['connector', 'stecker', 'connecteur', 'anschluss'],
        'module': ['module', 'modul', 'module', 'baugruppe'],
        'adapter': ['adapter', 'adapter', 'adaptateur', 'zwischenstück'],
        'assembly': ['assembly', 'baugruppe', 'assemblage', 'montage'],
        'assy': ['assy', 'baugr', 'ens', 'mont'],
    }

    # Reverse mapping variant -> English term. A variant listed under several
    # terms resolves to the last one, in dictionary order.
    expanded_mapping = {
        variant: english_term
        for english_term, variants in multilingual_mappings.items()
        for variant in variants
    }

    # Get unique tokens from a sample of the data. They are sorted so that the
    # fuzzy-matching candidates (and therefore tie-breaking) are the same on
    # every run; iterating a raw set of strings is not order-stable.
    all_tokens = set()
    sample_size = min(10000, len(descriptions))
    for desc in descriptions[:sample_size]:
        all_tokens.update(desc.split())
    all_tokens = sorted(all_tokens)

    # Enhanced token mapping with multilingual support. Seeding with the
    # dictionary means its synonyms are applied to every row, not only to
    # tokens that happen to occur in the sampled descriptions.
    token_mapping = dict(expanded_mapping)
    for token in all_tokens:
        # Dictionary terms are already mapped
        if token in expanded_mapping:
            continue

        # For short tokens (abbreviations), use higher threshold
        if len(token) <= 4:
            # NOTE(review): the token itself is among the candidates and scores
            # 100, so this branch appears to resolve to the token itself in
            # practice -- confirm whether that is the intent.
            matches = process.extract(token, all_tokens, scorer=fuzz.token_sort_ratio, limit=10)
            best_match = token
            for match, score, _ in matches:
                if score > 90:  # High threshold for short words
                    best_match = match
                    break
        else:
            # Prefer the shortest sufficiently similar token as the canonical form
            matches = process.extract(token, all_tokens, scorer=fuzz.token_sort_ratio, limit=5)
            best_match = token
            for match, score, _ in matches:
                if score > 85 and len(match) < len(best_match):
                    best_match = match
        token_mapping[token] = best_match

    # Apply synonym normalization
    print("Applying enhanced normalization...")

    def normalize_tokens(desc):
        return ' '.join(token_mapping.get(token, token) for token in desc.split())

    df_fin['normalized_desc'] = df_fin['clean_desc'].apply(normalize_tokens)

    # Generate embeddings
    print("Generating embeddings...")
    model = SentenceTransformer('all-MiniLM-L6-v2')
    batch_size = 2000
    embeddings = []
    for i in range(0, len(df_fin), batch_size):
        batch_descs = df_fin['normalized_desc'].iloc[i:i+batch_size].tolist()
        batch_embeddings = model.encode(batch_descs, show_progress_bar=False)
        embeddings.extend(batch_embeddings)
        if (i // batch_size) % 10 == 0:
            print(f"Processed {min(i+batch_size, len(df_fin))}/{len(df_fin)}")
    embeddings = np.array(embeddings)

    # Add TF-IDF features to help with multilingual matching
    print("Adding multilingual features...")
    vectorizer = TfidfVectorizer(
        analyzer='char',
        ngram_range=(2, 4),  # Character n-grams work across languages
        max_features=500
    )
    tfidf_features = vectorizer.fit_transform(df_fin['normalized_desc']).toarray()

    # Combine embeddings with TF-IDF features
    combined_features = np.hstack([embeddings, tfidf_features])

    # Perform clustering. KMeans needs at least as many samples as clusters,
    # so the requested K is capped for small inputs.
    effective_k = min(n_clusters, len(df_fin))
    if effective_k != n_clusters:
        print(f"Requested K={n_clusters} exceeds {len(df_fin)} records; using K={effective_k}")
    print(f"Clustering with K={effective_k}...")
    kmeans = MiniBatchKMeans(n_clusters=effective_k, random_state=42,
                            batch_size=1000, n_init=3, max_iter=100)
    cluster_labels = kmeans.fit_predict(combined_features)

    # Enhanced merging with multilingual consideration
    print("Multilingual-aware cluster merging...")
    centroids = kmeans.cluster_centers_
    similarity_matrix = cosine_similarity(centroids)

    # Each cluster i is merged into the first later cluster j > i whose
    # centroid is similar enough; stop scanning once that target is found.
    to_merge = {}
    for i in range(effective_k):
        for j in range(i + 1, effective_k):
            if similarity_matrix[i, j] > similarity_threshold:
                to_merge[i] = j
                break

    # Apply merging. Targets always have a larger id than their source and the
    # dict is filled in ascending order, so chains (a -> b, b -> c) resolve
    # fully in this single pass.
    new_labels = cluster_labels.copy()
    for old_cluster, new_cluster in to_merge.items():
        new_labels[new_labels == old_cluster] = new_cluster

    # Renumber clusters to 0..n_final_clusters-1, preserving their order
    unique_clusters, cluster_labels = np.unique(new_labels, return_inverse=True)
    n_final_clusters = len(unique_clusters)
    print(f"Merged to {n_final_clusters} clusters")

    # Enhanced cluster naming with multilingual support
    print("Generating multilingual cluster names...")
    cluster_names = {}
    # The cleaned descriptions already hold exactly the lowercase tokens that
    # naming needs, and they are safe for missing or non-string values.
    clean_descs = df_fin['clean_desc'].to_numpy()
    raw_descs = df_fin['material_description'].to_numpy()

    for cluster_id in range(n_final_clusters):
        member_mask = cluster_labels == cluster_id

        # Terms longer than two characters from every member description
        all_terms = [
            term
            for desc in clean_descs[member_mask]
            for term in desc.split()
            if len(term) > 2
        ]

        if all_terms:
            # Get most frequent terms
            top_terms = pd.Series(all_terms).value_counts().head(3).index.tolist()
            name = "_".join(top_terms)
        else:
            # No usable term: fall back to the first words of the first
            # raw description in the cluster.
            first_raw = raw_descs[member_mask][0]
            first_text = '' if pd.isna(first_raw) else str(first_raw)
            name = "_".join(first_text.split()[:3])

        # Never emit an empty key
        cluster_names[cluster_id] = name or f"cluster_{cluster_id}"

    # Add results to dataframe
    df_fin['proposedkey'] = [cluster_names[label] for label in cluster_labels]
    df_fin['cluster'] = cluster_labels

    # Automated validation for multilingual clusters
    print("Running automated multilingual validation...")
    validation_results = validate_multilingual_clusters(df_fin, cluster_labels)

    # Save output with validation info. The file name keeps the requested K so
    # the output path does not depend on the size of the input.
    output_cols = ['material_number', 'material_type', 'material_description', 'proposedkey', 'cluster']
    output_file = f'enhanced_output_k_{n_clusters}.csv'
    df_fin[output_cols].to_csv(output_file, index=False)

    # Print validation results
    print(f"\nMultilingual Validation Results:")
    print(f"Clusters with mixed languages: {validation_results['mixed_language_clusters']}")
    print(f"Average cluster cohesion: {validation_results['avg_cohesion']:.3f}")
    print(f"Potential issues: {len(validation_results['potential_issues'])}")

    # Print statistics
    end_time = time.time()
    total_time = end_time - start_time
    cluster_sizes = df_fin['cluster'].value_counts()

    print(f"\nCluster Statistics:")
    print(f"Total clusters: {len(cluster_sizes)}")
    print(f"Processing time: {total_time/60:.2f} minutes")
    print(f"Output saved to {output_file}")

    return df_fin

def validate_multilingual_clusters(df, cluster_labels):
    """
    Automated validation for multilingual clustering.

    Every description is tagged with the languages whose substring patterns it
    contains ('german', 'french', 'english', or 'unknown' when none match).
    This is a rough heuristic, not real language detection: a description can
    carry several tags at once.

    Args:
        df: DataFrame with a 'material_description' column. Missing values are
            treated as empty text; other non-string values are converted with
            ``str``.
        cluster_labels: One cluster label per row of ``df``, in row order
            (array, list or Series).

    Returns:
        dict with the keys
        - 'mixed_language_clusters': number of clusters (with at least two
          rows) in which more than one language tag occurs,
        - 'avg_cohesion': mean, over clusters with at least two rows, of the
          share of rows carrying the cluster's most common tag combination
          (1.0 means every row looks the same to the heuristic); 0 when there
          is no such cluster,
        - 'potential_issues': one dict per mixed cluster with 'cluster_id',
          'issue' and 'size'.

    Raises:
        ValueError: If the number of labels differs from the number of rows.
    """
    results = {
        'mixed_language_clusters': 0,
        'avg_cohesion': 0,
        'potential_issues': []
    }

    # Simple language detection patterns
    language_patterns = (
        ('german', ('ä', 'ö', 'ü', 'ß', 'sch', 'ch', 'gen', 'ung')),
        ('french', ('é', 'è', 'ê', 'à', 'ç', 'tion', 'ement', 'que')),
        ('english', ('ing', 'tion', 'ment', 'able', 'ize')),
    )

    # Accept any sequence type for the labels, not only numpy arrays.
    labels = np.asarray(cluster_labels).tolist()
    descriptions = df['material_description'].tolist()
    if len(labels) != len(descriptions):
        raise ValueError(
            f"cluster_labels has {len(labels)} entries but df has {len(descriptions)} rows"
        )

    def detect_languages(desc):
        # Missing or non-string descriptions must not crash the validation.
        text = '' if pd.isna(desc) else str(desc).lower()
        found = tuple(
            name
            for name, patterns in language_patterns
            if any(pattern in text for pattern in patterns)
        )
        return found or ('unknown',)

    # Group the per-row language signatures by cluster in a single pass
    # instead of building one boolean mask per cluster.
    signatures_by_cluster = {}
    for label, desc in zip(labels, descriptions):
        signatures_by_cluster.setdefault(label, []).append(detect_languages(desc))

    cohesion_scores = []

    for cluster_id in sorted(signatures_by_cluster):
        signatures = signatures_by_cluster[cluster_id]
        # Single-row clusters cannot be mixed and say nothing about cohesion.
        if len(signatures) < 2:
            continue

        # Share of rows that carry the cluster's dominant signature
        dominant_count = Counter(signatures).most_common(1)[0][1]
        cohesion_scores.append(dominant_count / len(signatures))

        # Check for mixed languages
        languages = set().union(*signatures)
        if len(languages) > 1:
            # Sorted so the message is identical from run to run.
            listed = ', '.join(repr(name) for name in sorted(languages))
            results['mixed_language_clusters'] += 1
            results['potential_issues'].append({
                'cluster_id': cluster_id,
                'issue': f'mixed_languages: {{{listed}}}',
                'size': len(signatures)
            })

    if cohesion_scores:
        results['avg_cohesion'] = sum(cohesion_scores) / len(cohesion_scores)

    return results

if __name__ == "__main__":
    # Script entry point: cluster the 'FIN' materials from input.csv using
    # the settings below and keep the resulting DataFrame in `result`.
    run_settings = dict(
        input_file='input.csv',
        material_type='FIN',
        n_clusters=1000,
        similarity_threshold=0.8,
    )
    result = enhanced_clustering(**run_settings)