In [None]:
!pip install pandas scikit-learn sentence-transformers ipywidgets tqdm


In [4]:
# STEP 1: Imports
import pandas as pd
import re
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.preprocessing import normalize
from tqdm import tqdm

# STEP 2: Read CSV
df = pd.read_csv("sample_data.csv")  # change path as needed

# STEP 3: Clean and Standardize closure_summary_alt
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\w\s]', '', text)  # remove punctuation
    return text.strip()

df['standardized_summary'] = df['closure_summary_alt'].fillna("").apply(clean_text)

# STEP 4: Generate Embeddings
model = SentenceTransformer("all-MiniLM-L6-v2")

print("Generating embeddings...")
embeddings = model.encode(df['standardized_summary'].tolist(), show_progress_bar=True)
embeddings = normalize(embeddings)

# STEP 5: Cluster Embeddings (KMeans)
n_clusters = 8
print(f"Clustering into {n_clusters} groups...")
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
df['semantic_cluster'] = kmeans.fit_predict(embeddings)

# STEP 6: Show Sample Summaries from Each Cluster
for cluster in range(n_clusters):
    print(f"\nCluster {cluster}")
    samples = df[df['semantic_cluster'] == cluster]['standardized_summary'].head(3)
    for i, summary in enumerate(samples):
        print(f"  {i+1}: {summary}")



Generating embeddings...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Clustering into 8 groups...

Cluster 0
  1: 4x 100ah battery replacement complete
  2: 4x 100ah battery replacement complete
  3: 4x 100ah battery replacement complete

Cluster 1
  1: rcd was tripped upon arrival and after resetting the csd was still off  upon further investigation the fmps was faulty a storm recently hit the area  fmps was replaced with mrn supplied parts  services confirmed restored  noc was also advised to send out sdp electrician to replace the rcd as the âtest trip buttonâ does not trip the circuit  ashish kosta  ashish kumar from noc advised and approved offsite
  2: found that the fmps still had 238v but it was making a ticking a sound the output voltage was 913vdc i believe the fmps has failed and will need replacing
  3: on arrival found power to node 2475vac also found chattering fmps relay attempted to reset relay multiple times without success faulty fmps unit requires replacement

Cluster 2
  1: 10040 cable damaged by lightning activityhaul new 10 pr tempo

In [5]:
# Check how your semantic clusters align with manual categories
import pandas as pd
pd.crosstab(df['semantic_cluster'], df['Categorisation'])


Categorisation,AC Mains Issues (Electrician),CB Reset,DPU And Additional Equipment Replaced,DPU Replaced,DPU Replaced,FMPS Replaced,Faulty FMPS units causing power issues,Faulty FMPS units causing power issues (MRN Required),Rectifier Issue (Resolved),Replaced Battery,Tech Not Skilled,Temp Fix,Third Party Power
semantic_cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,0,0,0,0,0,0,0,0,0,11,0,0,0
1,0,0,0,0,0,3,1,4,0,0,0,0,1
2,0,0,1,0,0,0,0,0,0,0,0,2,0
3,6,1,0,0,0,0,0,0,1,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,1,3,0
5,0,0,0,1,1,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,0,3,0
7,0,0,0,0,0,0,0,0,0,6,0,0,0


In [6]:
label_vs_cluster = pd.crosstab(df['Categorisation'], df['semantic_cluster'])
print(label_vs_cluster)


semantic_cluster                                     0  1  2  3  4  5  6  7
Categorisation                                                             
AC Mains Issues (Electrician)                        0  0  0  6  0  0  0  0
CB Reset                                             0  0  0  1  0  0  0  0
DPU And Additional Equipment Replaced                0  0  1  0  0  0  0  0
DPU Replaced                                         0  0  0  0  0  1  0  0
DPU Replaced                                         0  0  0  0  0  1  0  0
FMPS Replaced                                        0  3  0  0  0  0  0  0
Faulty FMPS units causing power issues               0  1  0  0  0  0  0  0
Faulty FMPS units causing power issues (MRN Req...   0  4  0  0  0  0  0  0
Rectifier Issue (Resolved)                           0  0  0  1  0  0  0  0
Replaced Battery                                    11  0  0  0  0  0  0  6
Tech Not Skilled                                     0  0  0  0  1  0  0  0
Temp Fix    

In [7]:
def cluster_purity(df, label_col='Categorisation', cluster_col='semantic_cluster'):
    purity_report = []
    for label in df[label_col].dropna().unique():
        subset = df[df[label_col] == label]
        counts = subset[cluster_col].value_counts()
        majority_cluster = counts.idxmax()
        purity = counts.max() / counts.sum()
        purity_report.append({
            'Label': label,
            'Total Samples': counts.sum(),
            'Majority Cluster': majority_cluster,
            'Purity': round(purity, 2)
        })
    return pd.DataFrame(purity_report).sort_values(by='Purity', ascending=False)

# Run it
purity_df = cluster_purity(df)
print(purity_df)


                                                Label  Total Samples  \
0                                      FMPS Replaced               3   
1   Faulty FMPS units causing power issues (MRN Re...              4   
3                                        DPU Replaced              1   
4              Faulty FMPS units causing power issues              1   
5                          Rectifier Issue (Resolved)              1   
12                                  Tech Not Skilled               1   
7                                       DPU Replaced               1   
8               DPU And Additional Equipment Replaced              1   
9                       AC Mains Issues (Electrician)              6   
10                                           CB Reset              1   
6                                    Replaced Battery             17   
11                                 Third Party Power               2   
2                                           Temp Fix            