# In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.manifold import TSNE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import plotly.express as px
from sklearn.preprocessing import LabelEncoder

# Absolute, machine-specific location of the semicolon-separated CSV file.
caminho_arquivo = r'C:\Users\Miguel\PycharmProjects\lpiproject2024\DataSet.csv'

# header=1 makes pandas take the file's second line as the column names and
# discard the line that precedes it.
df = pd.read_csv(caminho_arquivo, sep=';', header=1)

# Pull the two free-text columns out by name as NumPy arrays.
strings_descricao_prob, strings_como_reproduzir = (
    df[column_name].values for column_name in ('DescricaoProb', 'ComoReproduzir')
)

# Load the sentence-embedding model and encode the "DescricaoProb" texts.
# The "ComoReproduzir" texts are extracted above but are not encoded here.
model = SentenceTransformer('sentence-transformers/distiluse-base-multilingual-cased-v1')
embeddings_descricao_prob = model.encode(strings_descricao_prob)


# Inclusive bounds of the cluster counts (k) to evaluate.
min_clusters = 3
max_clusters = 5

# Collected (k, silhouette score) pairs, one per evaluated k.
silhouette_scores = []

for k in range(min_clusters, max_clusters + 1):
    # Fit a seeded KMeans with k clusters and take the label given to each embedding.
    kmeans = KMeans(n_clusters=k, random_state=42)
    cluster_labels = kmeans.fit_predict(embeddings_descricao_prob)

    # Score this partition and record it next to its k.
    silhouette = silhouette_score(embeddings_descricao_prob, cluster_labels)
    silhouette_scores.append((k, silhouette))

# Line plot of silhouette score against k.
evaluated_ks, scores = zip(*silhouette_scores)
plt.plot(evaluated_ks, scores)
plt.title('Silhouette scores for K-means clustering')
plt.xlabel('Number of clusters (k)')
plt.ylabel('Silhouette score')
plt.show()

# Keep the k whose silhouette score is largest (the first such k on a tie).
best_k, _ = max(silhouette_scores, key=lambda pair: pair[1])
print("Best number of clusters:", best_k)

# Fit a seeded KMeans with the selected number of clusters and keep the
# label assigned to each embedding.
best_kmeans = KMeans(n_clusters=best_k, random_state=42)
cluster_labels = best_kmeans.fit_predict(embeddings_descricao_prob)

# Reduce the embeddings to two dimensions with t-SNE so they can be plotted.
# perplexity=5 is a hand-picked value; adjust it to suit the dataset.
tsne = TSNE(n_components=2, perplexity=5, random_state=42)
tsne_embeddings = tsne.fit_transform(embeddings_descricao_prob)

# Scatter plot of the 2-D points, coloured by KMeans cluster label.
component_1 = tsne_embeddings[:, 0]
component_2 = tsne_embeddings[:, 1]
plt.figure(figsize=(10, 8))
plt.scatter(component_1, component_2, c=cluster_labels, cmap='viridis')
plt.xlabel('t-SNE Component 1')
plt.ylabel('t-SNE Component 2')
plt.title('t-SNE embeddings for clustering')
plt.colorbar(label='Cluster')
plt.show()