In [2]:
from servicerequestslib.preprocessing import preprocessingSR

In [3]:
from sentence_transformers import SentenceTransformer
from sklearn.cluster import AgglomerativeClustering
import numpy as np
import pandas as pd

In [4]:
# Read the service-request fixture into a DataFrame.
df = pd.read_csv('tests/fixtures/miniSRdf.csv')
# Subset holding the case number plus the description / solution /
# expert-assessment / first-reply columns.
df_s = df[[
    'u_case_number',
    'u_problem_description',
    'u_solution_description',
    'u_expert_assessment_descriptio',
    'u_first_reply_solution',
]]

In [5]:
# Sanity check: (rows, columns) of the loaded frame.
df.shape

(500, 190)

In [6]:
# Keyword arguments for preprocessingSR, each naming a column of `df`.
columns_dict = dict(
    C_CASE_NUMBER='u_case_number',
    C_DESCRIPTION='u_problem_description',
    C_SOLUTION='u_solution_description',
    C_EAD='u_expert_assessment_descriptio',
    C_FR='u_first_reply_solution',
)
transformed_df = preprocessingSR(df, **columns_dict)

In [7]:
# Index the preprocessed frame by case number.
# The guard keeps this cell idempotent: once 'u_case_number' has been moved into
# the index it is no longer a column, so re-running set_index would raise KeyError.
if transformed_df.index.name != 'u_case_number':
    transformed_df = transformed_df.set_index('u_case_number', drop=True)

In [8]:
import time

# Instantiate the sentence embedder from the 'all-MiniLM-L6-v2' model identifier.
embedder = SentenceTransformer('all-MiniLM-L6-v2')


In [9]:
# Encode every '_TEXT_LONG' entry and report the wall-clock time in seconds.
# The timer also covers the conversion of the column to a list.
start = time.time()
sentences = transformed_df['_TEXT_LONG'].tolist()
embeddings = embedder.encode(sentences)
elapsed_seconds = time.time() - start
print(elapsed_seconds)

4.362789869308472


In [10]:
# Number of texts that were encoded.
len(sentences)

500

## Clustering Technique 1: Agglomerative Clustering

In [11]:
# Scale each embedding row to unit L2 length.
# keepdims=True keeps the norms as a column so the division broadcasts per row.
row_norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
normalized_embeddings = embeddings / row_norms


In [26]:
# Agglomerative clustering without a preset cluster count: the number of
# clusters is determined by the distance threshold instead.
clustering_model_agg = AgglomerativeClustering(
    n_clusters=None,
    distance_threshold=1.5,
).fit(normalized_embeddings)
cluster_assignment_agg = clustering_model_agg.labels_

In [30]:
# Distances recorded by the fitted model; print their smallest and largest
# values to help judge the chosen threshold.
distances_agg = clustering_model_agg.distances_
print(distances_agg.min(), distances_agg.max())

0.4821590502056571 4.501731220736382


In [27]:
# Plain-list copy of the agglomerative labels, used by max() and Counter below.
cluster_list_agg = list(cluster_assignment_agg)

In [28]:
# Largest cluster label assigned by the agglomerative model.
max(cluster_list_agg)

57

In [29]:
from collections import Counter
# Number of sentences per agglomerative cluster label.
Counter(cluster_list_agg)

Counter({26: 5,
         4: 8,
         21: 8,
         15: 19,
         8: 12,
         7: 13,
         0: 13,
         56: 7,
         3: 14,
         41: 9,
         6: 13,
         39: 8,
         28: 9,
         44: 8,
         2: 13,
         19: 15,
         5: 11,
         43: 11,
         12: 12,
         11: 11,
         34: 7,
         1: 21,
         55: 6,
         23: 12,
         22: 9,
         37: 7,
         24: 7,
         40: 6,
         38: 6,
         29: 8,
         14: 8,
         20: 7,
         42: 5,
         16: 10,
         35: 4,
         18: 9,
         9: 11,
         45: 4,
         46: 9,
         52: 7,
         54: 9,
         32: 12,
         10: 11,
         51: 7,
         48: 4,
         36: 4,
         30: 11,
         27: 7,
         17: 6,
         31: 6,
         57: 5,
         25: 11,
         50: 4,
         13: 7,
         47: 6,
         33: 3,
         53: 2,
         49: 3})

In [16]:
# Group the sentences by their agglomerative cluster label.
# This section must read `cluster_assignment_agg`; `cluster_assignment` is only
# created later by the K-Means section, so using it here fails on a fresh
# kernel (or silently picks up stale labels from an earlier run).
clustered_sentences = {}
for sentence, cluster_id in zip(sentences, cluster_assignment_agg):
    # setdefault creates the bucket the first time a label is seen.
    clustered_sentences.setdefault(cluster_id, []).append(sentence)
        

In [17]:
# Display the full label -> sentences mapping.
# NOTE(review): this dumps every sentence into the output; consider showing a sample.
clustered_sentences

{26: ['TK2 TRIP LOG UPDATE ISSUE. The reason behind the error is because of the number of signals added in the Capture Block, which is actually exceeding the controller memory limit. Normally based on the controller memory and some tests performed with Capture block we could add approx. 260 signals (4 digital signals = 1 analog signal). [This number is not constant and may vary depending on the memory and affects the processing time for controller]. . Now to avoid the error, it is required to reduce the number of signals configured in the Capture block. We have observed that in TK2 some more Capture Block is added (in AUX and LOAD module) (compared to TK1) which contains other process signals as well.We have to select the critical signals among all and configure same. The reason behind the error is because of the number of signals added in the Capture Block, which is actually exceeding the controller memory limit. Normally based on the controller memory and some tests performed with Ca

In [18]:
# Print each cluster's sentences under a 1-based cluster number.
for cluster_id in clustered_sentences:
    members = clustered_sentences[cluster_id]
    print("cluster", cluster_id + 1)
    print(members)
    print('***********************************')


cluster 27
['TK2 TRIP LOG UPDATE ISSUE. The reason behind the error is because of the number of signals added in the Capture Block, which is actually exceeding the controller memory limit. Normally based on the controller memory and some tests performed with Capture block we could add approx. 260 signals (4 digital signals = 1 analog signal). [This number is not constant and may vary depending on the memory and affects the processing time for controller]. . Now to avoid the error, it is required to reduce the number of signals configured in the Capture block. We have observed that in TK2 some more Capture Block is added (in AUX and LOAD module) (compared to TK1) which contains other process signals as well.We have to select the critical signals among all and configure same. The reason behind the error is because of the number of signals added in the Capture Block, which is actually exceeding the controller memory limit. Normally based on the controller memory and some tests performed w

## Clustering Technique 2: K-Means

In [19]:
from sklearn.cluster import KMeans

In [20]:
num_clusters = 50
# K-Means initialisation is random; a fixed seed makes the labels reproducible
# across kernel restarts.
clustering_model = KMeans(n_clusters=num_clusters, random_state=42)
clustering_model.fit(normalized_embeddings)
cluster_assignment = clustering_model.labels_

In [21]:
from collections import Counter

# Largest K-Means label, followed by the number of sentences per label.
cluster_list = [*cluster_assignment]
print(max(cluster_list))

Counter(cluster_list)

49


Counter({35: 12,
         27: 6,
         18: 20,
         36: 12,
         15: 32,
         30: 14,
         14: 17,
         49: 12,
         4: 22,
         31: 12,
         1: 12,
         11: 6,
         9: 28,
         17: 6,
         16: 14,
         23: 6,
         6: 11,
         29: 16,
         41: 10,
         26: 12,
         10: 7,
         45: 10,
         5: 9,
         24: 12,
         8: 9,
         7: 10,
         28: 9,
         3: 3,
         47: 9,
         25: 8,
         12: 4,
         44: 5,
         32: 8,
         13: 7,
         33: 12,
         39: 10,
         20: 2,
         0: 8,
         19: 13,
         46: 4,
         43: 6,
         48: 13,
         34: 7,
         22: 7,
         38: 10,
         21: 6,
         37: 5,
         40: 3,
         2: 3,
         42: 1})

In [23]:
# One bucket per K-Means cluster; the list position is the cluster label.
clustered_sentences = [[] for _ in range(num_clusters)]
for position, label in enumerate(cluster_assignment):
    bucket = clustered_sentences[label]
    bucket.append(sentences[position])

# Print every bucket under a 1-based cluster number.
for number, members in enumerate(clustered_sentences, start=1):
    print("Cluster ", number)
    print(members)
    print("")

Cluster  1
["During loop check site team found that the valve 30MAE10AA160-Y was found not operating .. Masoneilan 27-18162MS ( RVO6835081. The positioner is energized , the indication led's are on showing normal operating .. The autocalibration activated by the push button is accepted but the valve don't move . after few minutes the error led turn on and autocalibration failed .. . SR to be closed, issue solved, mail attached", 'A new axial compressor antisurge valve AV-1 has been installed on TC2, valve pn RVO44755 received with box GS3036584.. Due to different type of hytork actuator (old type 175-81; new type 1127) has been necessary to modify the pneumatic hook-up as per attached sketch. After the installation the valve has been tested with instrument air and is fully functional. Attached some pictures of the old and new actuator, as you can see the new actuator doesnt have the 1/2-inch npt ports on both sides.. It is necessary to restore the design condition as indicated in the P