In [1]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans

# Load the BERT model for sentence embeddings
model = SentenceTransformer('bert-base-nli-mean-tokens')

file_path = 'clean_df.csv'

database = pd.read_csv(file_path)

database['Abstract'] = database['Abstract'].fillna("").astype(str)
database['Title'] = database['Title'].fillna("").astype(str)

database = database[database['Abstract'] != ""]
database = database[database['Title'] != ""]

database['Combined'] = database['Title'] + ". " + database['Abstract']

# Function to get the embeddings for a list of texts
def get_embeddings(texts):
    return np.array(model.encode(texts, show_progress_bar=True))

combined_embeddings = get_embeddings(database['Combined'].tolist())

num_clusters = 10

kmeans = KMeans(n_clusters=num_clusters, random_state=42).fit(combined_embeddings)

cluster_labels = kmeans.labels_

database['Cluster'] = cluster_labels

  database = pd.read_csv(file_path)


Batches:   0%|          | 0/1313 [00:00<?, ?it/s]



In [2]:
database.to_csv('clustered_data.csv', index=False)

In [3]:
# Test text

new_text = "Understanding the Workload of Remote Truck Operators with Discrete Event Simulation"
new_text_embedding = get_embeddings([new_text])

# Determine the most suitable cluster
cluster_centers = kmeans.cluster_centers_
similarities = cosine_similarity(new_text_embedding, cluster_centers)
most_similar_cluster = np.argmax(similarities)

print(f"The most close cluster is: {most_similar_cluster}")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

The most close cluster is: 8


In [4]:
# Calculate similarity to all embeddings in the database
similarities_to_all = cosine_similarity(new_text_embedding, combined_embeddings).flatten()

# Get the five indexes with the highest similarity
top_five_indices = np.argsort(similarities_to_all)[-5:]

top_five_titles = database.iloc[top_five_indices]['Title']
top_five_clusters = database.iloc[top_five_indices]['Cluster']

print(f"The titles of the most similar contents to the given text, their titles, and clusters are:")
for title, cluster in zip(top_five_titles, top_five_clusters):
    print(f"- {title} (Cluster {cluster})")

The titles of the most similar contents to the given text, their titles, and clusters are:
- Automation and the situation awareness of drivers in agricultural semi-autonomous vehicles (Cluster 1)
- A case study - Characteristics of work organization in lean production and sociotechnical systems (Cluster 8)
- An Adaptive Work Study Method for Identifying the Human Factors that Influence the Performance of a Human-Machine System (Cluster 8)
- WORK MOTION STUDY OF PIVOT TYPE TRAILER OPERATION ON TWO WHEEL TRACTORS (Cluster 6)
- Human-Machine Interface System for Simulation-based Automatic Platooning of Trucks (Cluster 8)


In [5]:
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score

# Silhouette Score
silhouette_avg = silhouette_score(combined_embeddings, cluster_labels)
print("Silhouette Score: ", silhouette_avg)

# Calinski-Harabasz Index
calinski_harabasz = calinski_harabasz_score(combined_embeddings, cluster_labels)
print("Calinski-Harabasz Index: ", calinski_harabasz)

# Davies-Bouldin Index
davies_bouldin = davies_bouldin_score(combined_embeddings, cluster_labels)
print("Davies-Bouldin Index: ", davies_bouldin)

Silhouette Score:  0.039617468
Calinski-Harabasz Index:  1143.9726541427974
Davies-Bouldin Index:  3.42047431603511


In [7]:
database

Unnamed: 0.1,Unnamed: 0,Title,Authors,Abstract,year,month,Journal,Volume,Issue,Pages,Accession Number,DOI,Ref,Covidence #,Study,paper,Combined,Cluster
0,1,Development and validation of a discretised mu...,Huynh KT; Gibson I; Jagdish BN; Lu WF,This paper presents a discretised musculoskele...,2015.0,,Comput Methods Biomech Biomed Engin,18,2,175-84,,10.1080/10255842.2013.786049,23621475.0,#1,Huynh 2015,1,Development and validation of a discretised mu...,6
1,2,Gender differences in prevalence of musculoske...,Das B,BACKGROUND: Musculoskeletal disorder is one of...,2015.0,Jan,Work,50,2,229-40,,10.3233/WOR-131694,24004755.0,#2,Das 2015,2,Gender differences in prevalence of musculoske...,1
2,3,The impact of ergonomics intervention on trunk...,Afshari D; Motamedzade M; Salehi R; Soltanian AR,BACKGROUND: Work-related musculoskeletal disor...,2015.0,Jan,Work,50,2,241-8,,10.3233/WOR-131701,24004757.0,#3,Afshari 2015,3,The impact of ergonomics intervention on trunk...,4
3,4,An ergonomic approach for designing indian tra...,Dhara PC; De S; Sengupta P; Maity P; Pal A,BACKGROUND: In India varieties of hand tools h...,2015.0,Jan,Work,50,2,177-86,,10.3233/WOR-131721,24004779.0,#4,Dhara 2015,4,An ergonomic approach for designing indian tra...,9
4,5,Effects of posture-related auditory cueing (PA...,Yoo WG; Park SY,BACKGROUND: The etiology of the neck and back ...,2015.0,Jan,Work,50,2,187-91,,10.3233/WOR-131738,24004794.0,#5,Yoo 2015,5,Effects of posture-related auditory cueing (PA...,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41981,41982,ANTHROPOMETRY OF THE ELDERLY - STATUS AND RECO...,"KELLY, PL; KROEMER, KHE",Available anthropometric information on the el...,1990.0,,HUMAN FACTORS,32,5,571-595,WOS:A1990ER80000006,10.1177/001872089003200506,,#60350,KELLY 1990,47831,ANTHROPOMETRY OF THE ELDERLY - STATUS AND RECO...,7
41982,41983,A COMPUTERIZED CONTROL ROOM TO IMPROVE NUCLEAR...,"REYNES, L; BELTRANDA, G","After the Three Mile Island accident, Electric...",1990.0,,NUCLEAR SAFETY,31,4,504-513,WOS:A1990FQ93700007,,,#60352,REYNES 1990,47833,A COMPUTERIZED CONTROL ROOM TO IMPROVE NUCLEAR...,9
41983,41984,SUPERCHARGING OF HIGH-SPEED DIESEL-ENGINES - R...,"SKIFIC, N",This paper presents a research into optimum su...,1990.0,,STROJARSTVO,32,5,331-339,WOS:A1990ER39200003,,,#60353,SKIFIC 1990,47834,SUPERCHARGING OF HIGH-SPEED DIESEL-ENGINES - R...,6
41984,41985,A MATHEMATICAL-MODEL OF OPTIMAL CRUTCH WEIGHT ...,"PRADHAN, AK; ROY, AB; SAHA, SC; THAKUR, S; KAR...",From clinical Symptomatology it appears that l...,1990.0,,MATHEMATICAL AND COMPUTER MODELLING,14,,274-278,WOS:A1990EQ46900052,10.1016/0895-7177(90)90190-X,,#60488,PRADHAN 1990,47969,A MATHEMATICAL-MODEL OF OPTIMAL CRUTCH WEIGHT ...,0
