In [None]:
# Absolute location of the raw text corpus that the notebook reads further down.
FILE_PATH = "/content/Data_Ai_and_py.txt"

In [None]:
! pip install transformers
! pip install sentence_transformers

### Load libraries

In [None]:
import numpy as np
import pandas as pd
import plotly.express as px
from sentence_transformers import SentenceTransformer, util
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

### Read the data

In [None]:
# Pull the whole corpus file into memory as raw bytes; decoding to text is
# done in a separate step.
with open(FILE_PATH, "rb") as corpus_file:
    Data = corpus_file.read()

### Convert the raw bytes to a string, then split the string into a list

In [None]:
# Decode the raw bytes read from the corpus file into text.
data_str = Data.decode('utf-8')

# Split the text on commas, trim the whitespace/newlines around each fragment,
# and drop fragments that end up empty (e.g. from a trailing comma or a blank
# line) so that meaningless empty strings are not embedded and clustered as if
# they were real data points.
data_list = [fragment.strip() for fragment in data_str.split(',') if fragment.strip()]


### Load the embedding model from Hugging Face

In [None]:
# Identifier of the pretrained checkpoint handed to SentenceTransformer.
MODEL_NAME = 'all-MiniLM-L6-v2'
embedder = SentenceTransformer(MODEL_NAME)

### Convert the data to embedding vectors and normalize the embeddings

In [None]:
# Turn every entry of data_list into an embedding vector (one row per entry).
raw_embeddings = embedder.encode(data_list)

# Scale each row by its own L2 norm so that every embedding has unit length.
row_norms = np.linalg.norm(raw_embeddings, axis=1, keepdims=True)
corpus_embeddings = raw_embeddings / row_norms

### Apply the K-Means model

In [None]:
# Fix the seed so the clustering -- and, crucially, which group receives label 0
# versus label 1 -- is the same on every run. A later cell translates these
# numeric labels into names, so a label swap between runs would silently swap
# the names as well.
RANDOM_STATE = 42

# n_init is given explicitly so the number of restarts does not depend on the
# default of whichever scikit-learn version happens to be installed.
clustering_model = KMeans(n_clusters=2, n_init=10, random_state=RANDOM_STATE)
clustering_model.fit(corpus_embeddings)

# One integer label (0 or 1) per row of corpus_embeddings.
cluster_assignment = clustering_model.labels_
print(cluster_assignment)



[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 0 1 0 0
 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1
 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 0 0 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 0 1 1 1 1 0 0 1 0
 0 1 1 1 1 1 1 0 0 0 0 0 

### Drawing the clusters

In [None]:
# Project the embeddings onto their first two principal components so the
# clusters can be drawn on a 2-D scatter plot.
pca = PCA(n_components=2)
embeddings_2d = pca.fit_transform(corpus_embeddings)

# The two axes are principal components, not per-topic scores, so they are
# labelled as such. The frame gets its own name so it is not confused with the
# sentence-level `df` that is built later in the notebook.
pca_df = pd.DataFrame(embeddings_2d, columns=['PC1', 'PC2'])

# Store the labels as strings so plotly colours them as discrete categories
# instead of drawing a continuous colour bar between 0 and 1.
pca_df['Cluster'] = cluster_assignment.astype(str)

# Plotting the clusters
fig = px.scatter(pca_df, x='PC1', y='PC2', color='Cluster',
                 title='KMeans Clustering', opacity=0.7, labels={'Cluster': 'Cluster'})
fig.show()

### Convert the results to a dictionary

In [None]:
# Group the sentences by the cluster label assigned to them, giving
# {cluster label: [sentences carrying that label]}.
clustered_sentences = {}
for label, sentence in zip(cluster_assignment, data_list):
    clustered_sentences.setdefault(label, []).append(sentence)

### Flatten the clustered_sentences dictionary and create a DataFrame from the flattened list

In [None]:

# Unroll {cluster: [sentences]} into one (cluster, sentence) record per sentence.
flattened_sentences = []
for cluster_id, sentences in clustered_sentences.items():
    for sentence in sentences:
        flattened_sentences.append((cluster_id, sentence))

# One row per sentence, with its cluster label in the first column.
df = pd.DataFrame(flattened_sentences, columns=['Cluster_id', 'Sentence'])


In [None]:
# Spot-check: draw a single random row and display it as a Series.
random_row = df.sample(n=1)
random_row.iloc[0]

Cluster_id                                                    0
Sentence      Pandas is an open-sourced library used in data...
Name: 671, dtype: object

### Convert numerical cluster IDs to category names

In [None]:
# Human-readable name for each numeric cluster label.
# NOTE(review): which label belongs to which topic is assumed here -- confirm
# against the sentences in each cluster before relying on these names.
mapping = {0: "python developer", 1: "AI"}

# Translate the numeric labels into their names; a label that has no entry in
# the mapping keeps its original value.
original_ids = df["Cluster_id"]
df["Cluster_id"] = original_ids.map(mapping).fillna(original_ids)

In [None]:
# Show the last five rows to check the renamed cluster labels.
df.tail(5)

In [None]:
# Save the labelled sentences; the file is created relative to the current
# working directory.
OUTPUT_CSV = "Clustering.csv"
df.to_csv(OUTPUT_CSV)