# In [None]:
import pandas as pd
import numpy as np
from rapidfuzz import process, fuzz
from sentence_transformers import SentenceTransformer
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics.pairwise import cosine_similarity
from kneed import KneeLocator
import re
import warnings
warnings.filterwarnings('ignore')

# Sentence-embedding model; reused below for both token-level and
# description-level embeddings
model = SentenceTransformer('all-MiniLM-L6-v2')

# Load the raw input table
df = pd.read_csv('input.csv')
# Keep only the rows whose material_type is 'FIN'; work on a copy so the
# columns added at the end do not write into a view of `df`
is_fin = df['material_type'].eq('FIN')
df_fin = df.loc[is_fin].copy()
# Descriptions as a plain list of strings
# NOTE(review): astype(str) presumably turns missing descriptions into the
# literal text 'nan' — confirm that is acceptable for clustering
description_column = df_fin['material_description']
descriptions = description_column.astype(str).tolist()

# Step 1: Preprocessing function for text cleaning
def clean_text(text):
    """Normalize a description string for token-level comparison.

    Lower-cases the text, deletes every character that is not an ASCII
    lowercase letter, a digit or whitespace (deleted characters are not
    replaced by a space, so "steel-bolt" becomes "steelbolt"), collapses
    each run of whitespace to one space and trims both ends.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    lowered = text.lower()
    alnum_only = re.sub(r'[^a-z0-9\s]', '', lowered)
    single_spaced = re.sub(r'\s+', ' ', alnum_only)
    return single_spaced.strip()

# Run every raw description through the cleaner, keeping the original order
cleaned_descriptions = list(map(clean_text, descriptions))

# Step 2: Automated synonym discovery
def get_unique_tokens(descriptions):
    """Return the distinct whitespace-separated tokens of *descriptions*.

    Args:
        descriptions: Iterable of strings.

    Returns:
        A list of the unique tokens in ascending (lexicographic) order.
        The list is sorted because iterating a set of strings yields a
        different order on each interpreter run (hash randomization), which
        would make any order-dependent step that consumes this vocabulary
        non-reproducible.
    """
    tokens = set()
    for desc in descriptions:
        tokens.update(desc.split())
    return sorted(tokens)

# Vocabulary of the cleaned descriptions; every synonym step below works on it
unique_tokens = get_unique_tokens(cleaned_descriptions)

# Fuzzy matching for tokens
def find_fuzzy_matches(tokens, score_threshold=90):
    """Map each token to the other tokens that are near-identical in spelling.

    Every token is compared against the whole vocabulary with RapidFuzz's
    ``token_sort_ratio``; at most the 10 best candidates per token are
    considered (the token itself usually occupies one of those slots).

    Args:
        tokens: Sequence of token strings.
        score_threshold: Minimum similarity score (0-100) for two tokens to
            be treated as synonyms.

    Returns:
        Dict mapping a token to the set of its fuzzy matches. Tokens with no
        match are absent from the dict.
    """
    synonym_map = {}
    for token in tokens:
        # score_cutoff makes RapidFuzz drop candidates below the threshold
        # itself, so nothing weaker than the threshold is returned and no
        # second filtering pass is needed here.
        matches = process.extract(
            token,
            tokens,
            scorer=fuzz.token_sort_ratio,
            limit=10,
            score_cutoff=score_threshold,
        )
        similar = {match for match, _score, _index in matches if match != token}
        if similar:
            synonym_map[token] = similar
    return synonym_map

# Spelling-based synonym candidates for the whole vocabulary (score >= 90)
fuzzy_synonym_map = find_fuzzy_matches(unique_tokens, score_threshold=90)

# Embedding-based similarity for tokens
def find_embedding_similarities(tokens, model, similarity_threshold=0.7):
    """Map each token to the other tokens whose embeddings are similar.

    Args:
        tokens: Sequence of token strings.
        model: Object with an ``encode(tokens)`` method returning one
            embedding vector per token.
        similarity_threshold: Minimum cosine similarity for two tokens to
            be treated as synonyms.

    Returns:
        Dict mapping a token to the set of tokens at other positions whose
        cosine similarity reaches the threshold. Tokens with no such
        neighbour are absent. An empty *tokens* yields an empty dict.
    """
    # Nothing to encode or compare; also avoids calling cosine_similarity
    # on an empty array.
    if len(tokens) == 0:
        return {}

    token_embeddings = model.encode(tokens)
    similarity_matrix = cosine_similarity(token_embeddings)

    # Find all qualifying (i, j) pairs in one vectorized pass instead of a
    # Python-level double loop. np.nonzero walks the matrix row by row, so
    # the dict is filled in the same order as a nested loop would fill it.
    rows, cols = np.nonzero(similarity_matrix >= similarity_threshold)
    synonym_map = {}
    for i, j in zip(rows.tolist(), cols.tolist()):
        if i != j:  # skip each token's similarity with itself
            synonym_map.setdefault(tokens[i], set()).add(tokens[j])
    return synonym_map

# Embedding-based synonym candidates for the whole vocabulary (cosine >= 0.7)
embedding_synonym_map = find_embedding_similarities(unique_tokens, model, similarity_threshold=0.7)

# Combine fuzzy and embedding synonym maps: a token's synonyms are the union
# of what either method found; tokens with no synonym at all are left out
combined_synonym_map = {}
for token in unique_tokens:
    # `|` builds a fresh set, so the combined map never shares set objects
    # with the two source maps
    merged = fuzzy_synonym_map.get(token, set()) | embedding_synonym_map.get(token, set())
    if merged:
        combined_synonym_map[token] = merged

# Create a mapping from each token to a representative token
# (shortest wins; ties go to the most frequent, then alphabetical)
def get_representative_token(token, synonym_map, token_freq):
    """Pick the canonical spelling for *token* among its synonyms.

    Args:
        token: The token to normalize.
        synonym_map: Dict mapping a token to the set of its synonyms. It is
            only read, never modified.
        token_freq: Dict mapping a token to its occurrence count; used to
            break ties between equally short candidates. Missing tokens
            count as 0; ``None`` is treated as an empty dict.

    Returns:
        *token* itself when it has no entry in *synonym_map*; otherwise the
        shortest string among the token and its synonyms. Equal lengths are
        resolved by higher frequency and then by alphabetical order, so the
        result does not depend on set iteration order.
    """
    if token not in synonym_map:
        return token
    freq = token_freq or {}
    # Build a new set instead of adding to synonym_map[token]: the caller's
    # map must not change as a side effect of a lookup.
    candidates = set(synonym_map[token]) | {token}
    return min(candidates, key=lambda cand: (len(cand), -freq.get(cand, 0), cand))

# Count how often each token occurs across all cleaned descriptions
# (handed to get_representative_token when choosing representatives)
token_freq = {}
for cleaned in cleaned_descriptions:
    for word in cleaned.split():
        token_freq.setdefault(word, 0)
        token_freq[word] += 1

# Final lookup table: every vocabulary token -> its representative token
token_to_rep = {
    vocab_token: get_representative_token(vocab_token, combined_synonym_map, token_freq)
    for vocab_token in unique_tokens
}

# Apply synonym normalization to descriptions
def normalize_description(desc, token_map):
    """Replace each token of *desc* by its entry in *token_map*.

    Args:
        desc: Description string; it is split on whitespace.
        token_map: Dict mapping a token to its replacement. Tokens without
            an entry are kept unchanged.

    Returns:
        The rewritten tokens joined by single spaces.
    """
    return ' '.join(token_map.get(word, word) for word in desc.split())

# Rewrite every cleaned description so synonyms share one representative token
normalized_descriptions = [normalize_description(desc, token_to_rep) for desc in cleaned_descriptions]

# Step 3: Vectorize normalized descriptions
# (one embedding per description, from the same model used for the tokens)
embeddings = model.encode(normalized_descriptions)

# Step 4: Determine optimal K using elbow method
def find_optimal_k(embeddings, max_k=50):
    """Estimate the number of clusters from the elbow of the inertia curve.

    Fits MiniBatchKMeans for k = 2 .. min(max_k, number of samples), records
    the inertia of each fit and asks KneeLocator for the knee of that
    convex, decreasing curve.

    Args:
        embeddings: Sequence or 2-D array with one row per sample.
        max_k: Largest k to try.

    Returns:
        The knee as a plain ``int``, or ``None`` when no knee is found or
        when fewer than three candidate k values are available (a knee
        cannot be located on fewer than three points).
    """
    # k-means cannot fit more clusters than there are samples, so never try
    # a k above the sample count.
    n_samples = len(embeddings)
    k_values = list(range(2, min(max_k, n_samples) + 1))
    if len(k_values) < 3:
        return None

    distortions = []
    for k in k_values:
        kmeans = MiniBatchKMeans(n_clusters=k, random_state=42, batch_size=1000)
        kmeans.fit(embeddings)
        distortions.append(kmeans.inertia_)

    knee = KneeLocator(k_values, distortions, curve='convex', direction='decreasing').knee
    # Convert to a built-in int so callers get the same type on every path.
    return None if knee is None else int(knee)

optimal_k = find_optimal_k(embeddings, max_k=50)
if optimal_k is None:
    # Default if knee not found. Capped at the sample count because
    # k-means cannot fit more clusters than there are samples.
    optimal_k = min(20, len(embeddings))

# Step 5: Apply MiniBatchKMeans clustering
kmeans = MiniBatchKMeans(n_clusters=optimal_k, random_state=42, batch_size=1000)
cluster_labels = kmeans.fit_predict(embeddings)  # one cluster label per description
centroids = kmeans.cluster_centers_  # row c is the centre of cluster c

# Step 6: Post-processing for singletons
# A "singleton" cluster here is any cluster with at most 2 members.
cluster_sizes = pd.Series(cluster_labels).value_counts()
singleton_clusters = cluster_sizes[cluster_sizes <= 2].index.tolist()
# Merge targets are the clusters that hold more than 2 points. They are taken
# from cluster_sizes rather than range(optimal_k) so that a label no point
# was assigned to can never become a target. Sorted so ties in similarity
# resolve to the lowest cluster id.
large_clusters = sorted(cluster_sizes[cluster_sizes > 2].index.tolist())

# For each member of a singleton cluster, check similarity to the nearest
# large cluster
new_labels = cluster_labels.copy()
if singleton_clusters and large_clusters:
    large_centroids = centroids[large_clusters]
    # Row indices of every point that sits in a singleton cluster
    member_indices = np.flatnonzero(np.isin(cluster_labels, singleton_clusters))
    # Each member is judged by its own embedding, so a 2-member cluster is
    # not moved wholesale on the strength of its first member alone.
    similarities = cosine_similarity(embeddings[member_indices], large_centroids)
    nearest_positions = similarities.argmax(axis=1)
    max_similarities = similarities.max(axis=1)
    # High threshold to avoid irrelevant merges; members below it keep
    # their original (singleton) label
    should_merge = max_similarities >= 0.9
    new_labels[member_indices[should_merge]] = np.asarray(large_clusters)[
        nearest_positions[should_merge]
    ]

# Update cluster labels after post-processing
cluster_labels = new_labels

# Step 7: Generate cluster names from each cluster's top TF-IDF terms (TF-IDF is fitted separately per cluster; this is not class-based c-TF-IDF)
from sklearn.feature_extraction.text import TfidfVectorizer

def generate_cluster_names(descriptions, cluster_labels):
    """Derive a short name for every cluster from its members' descriptions.

    For each distinct label, a TfidfVectorizer (English stop words, at most
    5 features) is fitted on that cluster's descriptions only, and the up to
    3 terms with the highest summed TF-IDF score are joined with "_".

    Args:
        descriptions: Sequence of description strings.
        cluster_labels: Sequence of labels, aligned with *descriptions*.

    Returns:
        Dict mapping each distinct label to its name. A label with no
        description gets "Unknown". When the vectorizer cannot build a
        vocabulary for a cluster, the name is the first three
        whitespace-separated words of the cluster's first description,
        joined by spaces. Different clusters may receive the same name.
    """
    # Group descriptions by label in one pass instead of rescanning the
    # whole list for every cluster.
    members_by_cluster = {}
    for description, label in zip(descriptions, cluster_labels):
        members_by_cluster.setdefault(label, []).append(description)

    cluster_names = {}
    for cluster in np.unique(cluster_labels):
        cluster_descriptions = members_by_cluster.get(cluster, [])
        if not cluster_descriptions:
            cluster_names[cluster] = "Unknown"
            continue
        vectorizer = TfidfVectorizer(stop_words='english', max_features=5)
        try:
            tfidf_matrix = vectorizer.fit_transform(cluster_descriptions)
        except ValueError:
            # Raised when no usable term is left (e.g. only stop words).
            # Only this case falls back; any other error is a real bug and
            # must surface instead of silently producing fallback names.
            cluster_name = " ".join(cluster_descriptions[0].split()[:3])
        else:
            feature_names = vectorizer.get_feature_names_out()
            # Flatten via np.asarray so this works whether the column sum
            # comes back as a matrix or as a plain array.
            scores = np.asarray(tfidf_matrix.sum(axis=0)).ravel()
            top_positions = np.argsort(scores)[-3:][::-1]  # best term first
            cluster_name = "_".join(feature_names[i] for i in top_positions)
        cluster_names[cluster] = cluster_name
    return cluster_names

# Name every cluster from the raw (uncleaned) descriptions of its members
cluster_names = generate_cluster_names(descriptions, cluster_labels)

# Step 8: Assign proposedkey and cluster to dataframe
# (cluster_labels holds one label per row of df_fin, in row order)
df_fin['proposedkey'] = [cluster_names[label] for label in cluster_labels]
df_fin['cluster'] = cluster_labels

# Step 9: Save to output CSV
# index=False: the DataFrame index is not written as a column
df_fin.to_csv('output.csv', index=False)

print("Processing complete. Output saved to output.csv")