RETO 2 APAU - GRUPO 8

# Importación Librerias

In [94]:
import pandas as pd
import csv
import re
from string import punctuation
from nltk.corpus import stopwords
import unicodedata
from collections import Counter
import emoji

# Libreria para vectorizar texto
from sklearn.feature_extraction.text import TfidfVectorizer

# Librerias para el clustering

from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.metrics import silhouette_samples, silhouette_score
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

# Librerias para reduccion de dimensionalidad
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.neighbors import NearestNeighbors
import numpy as np
from collections import Counter

from sklearn.cluster import DBSCAN



# Funciones Utilizadas

In [29]:
# Convertimos a minusculas, sacamos enlaces etc...
def preprocess_text(text):
    """Normalize a raw tweet into lowercase, letters-only, single-spaced text.

    The text is lowercased and then, in this order, the following are removed:
    http(s) links, standalone digit runs (with any whitespace after them),
    @mentions, every character that is not a letter listed in the pattern or
    whitespace, and ASCII punctuation. Finally all whitespace runs are
    collapsed to a single space and the ends are trimmed.
    """
    removal_patterns = (
        r"https?://\S+",                # links
        r"\b[0-9]+\b\s*",               # standalone numbers
        r"@[\w]+",                      # user mentions
        r"[^a-zA-Záéíóúüñ\s]",          # anything that is not alphabetic
        f"[{re.escape(punctuation)}]",  # punctuation
    )

    cleaned = text.lower()
    for pattern in removal_patterns:
        cleaned = re.sub(pattern, "", cleaned)

    # split() + join collapses spaces, tabs and newlines into single spaces
    return " ".join(cleaned.split())

# Función para eliminar stopwords
def _strip_accents(text):
    """Return *text* with diacritics (and any other non-ASCII characters) removed."""
    return unicodedata.normalize('NFD', text).encode('ascii', 'ignore').decode('utf-8')


def remove_stopwords(text):
    """Strip accents from *text*, lowercase it and drop every stopword.

    The module-level ``STOPWORDS`` collection is read at call time, so
    reassigning it between calls changes the result.

    Stopwords are matched on their accent-free form as well as their original
    form. The text itself is accent-stripped before the comparison, so without
    this an accented stopword such as "más" could never match its stripped
    token "mas" and would survive the filter.

    Parameters:
        text: the text to filter (converted with ``str`` first).

    Returns:
        str: the remaining words joined by single spaces.
    """
    plain_text = _strip_accents(str(text)).lower()
    blocked = set(STOPWORDS) | {_strip_accents(word) for word in STOPWORDS}
    return " ".join(word for word in plain_text.split() if word not in blocked)

def convert_emoji_to_text(emoji_text):
    """Return *emoji_text* with each emoji replaced by its ``:alias:`` name (via ``emoji.demojize``)."""
    return emoji.demojize(emoji_text, delimiters=(":", ":"))



def Plot2D (samples_2D, title, axes = None):

  """
  Scatter-plot a set of 2D samples without cluster labels.

  samples_2D (numpy.ndarray): embedding-like array with two columns,
        one per coordinate, and one row per sample

  title (string): title of the figure

  axes (dict): column / axis names under the keys 'x' and 'y'.
        Defaults to {'x': 'x', 'y': 'y'}.
  """

  # A fresh dict per call instead of a shared mutable default argument
  if axes is None:
    axes = {'x': 'x', 'y': 'y'}
  x_col, y_col = axes['x'], axes['y']

  df_samples_2D = pd.DataFrame(data=samples_2D, columns=[x_col, y_col])

  sns.set(font_scale=3)
  sns.set(rc={'figure.figsize':(10,10)})
  # No hue variable is mapped here, so the palette/legend arguments the
  # original passed had nothing to act on and are omitted.
  sns.relplot(data=df_samples_2D, x=x_col, y=y_col, height=10)

  set_size_letters(title, x_col, y_col, active_legend = False)
  plt.axis('equal')

def set_size_letters(title, x_name, y_name, title_size = 20, x_size = 18, y_size = 18, active_legend = True, legend_size = 14):

  """
  Set the title, axis labels and (optionally) the legend of the current
  matplotlib figure, each with its own font size.

  Parameters:
    title (string): title of the plot

    x_name (string): label of the x axis

    y_name (string): label of the y axis

    title_size, x_size, y_size, legend_size: font sizes of the title,
        the x label, the y label and the legend

    active_legend (bool): whether to draw the legend. Defaults to True

  """

  plt.title(title, fontsize=title_size)
  plt.xlabel(x_name, fontsize=x_size)
  plt.ylabel(y_name, fontsize=y_size)
  # Plain truthiness instead of comparing against True
  if active_legend:
    plt.legend(fontsize=legend_size)

def Plot3D_WithLabels (samples_3D, labels, title, axes = {'x': 'x', 'y': 'y', 'z': 'z'}, centroids_3D=None):

  """
  Show an interactive plotly 3D scatter of labelled samples, optionally
  together with their cluster centroids.

  samples_3D (numpy.ndarray): embedding-like array with three columns,
        one per coordinate, and one row per sample

  labels (array): label of each sample; must provide ``.tolist()``

  title (string): title of the figure

  axes (dict): column / axis names under the keys 'x', 'y' and 'z'

  centroids_3D: optional array of centroid coordinates (three columns).
        When given, centroid i is drawn larger, labelled 'Ci' and coloured
        with label i.

  Nothing is returned; the figure is displayed with ``fig.show()``.
  """

  # First we create the dataframe
  df_samples_3D_labeled = pd.DataFrame(data=samples_3D, columns=[axes['x'], axes['y'], axes['z']])

  # Then we add the labels column
  df_samples_3D_labeled['label'] = labels.tolist()
  # Labels go through str here and are converted back with pd.to_numeric in both branches below
  df_samples_3D_labeled['label'] = df_samples_3D_labeled["label"].astype(str)

  if centroids_3D is not None:
    # Same applies for centroids when these are provided as an argument
    labels_centroid_aux = np.arange(len(centroids_3D), dtype=int)
    df_centroids_3D_labeled = pd.DataFrame(data=centroids_3D, columns=[axes['x'], axes['y'], axes['z']])
    df_centroids_3D_labeled['label'] = labels_centroid_aux.tolist()

    # We create an additional column with the dot size used for each type of sample
    size_no_centroid = np.ones(len(samples_3D)) * 10 # for regular samples
    size_centroid = np.ones(len(centroids_3D)) * 50 # for centroids
    size_col = np.append(size_no_centroid, size_centroid) # new col to be added to the dataframe

    # We also create another additional column with the labels for each type of sample
    no_es_centroide_aux = [' '] * len(samples_3D) # empty label for regular samples
    es_centroide_aux = []
    for i in range(len(centroids_3D)):
      es_centroide_aux.append('C%d' % i) # Ci label for centroid i
    centroid_col = no_es_centroide_aux + es_centroide_aux # new col to be added to the dataframe

    # Next we concatenate both dataframes: first, regular samples, then, centroids
    # (size_col and centroid_col above were built in this same order, so rows line up)
    df_samples_and_centroids = pd.concat([df_samples_3D_labeled, df_centroids_3D_labeled], ignore_index=True)


    # We add the new column with the labels distinguishing regular samples from centroids
    df_samples_and_centroids['centroid'] = centroid_col

    # New column is re-casted as a string column
    df_samples_and_centroids['centroid'] = df_samples_and_centroids['centroid'].astype(str)

    # We add the new column with the corresponding size for both regular samples and centroids
    df_samples_and_centroids['size'] = size_col

    # We ensure that the 'label' column is numeric since we will sort the dataframe upon this one
    df_samples_and_centroids['label'] = pd.to_numeric(df_samples_and_centroids['label'])

    # We finally sort the dataframe by the 'label' column in ascending order
    df_samples_and_centroids_sorted = df_samples_and_centroids.sort_values(by=['label'], ascending=True)

    # And plot both the samples and their corresponding centroids
    fig = px.scatter_3d(df_samples_and_centroids_sorted, x=axes['x'], y=axes['y'], z=axes['z'], text='centroid', size='size', color='label')
  else:
    # We ensure that the 'label' column is numeric since we will sort the dataframe upon this one
    df_samples_3D_labeled['label'] = pd.to_numeric(df_samples_3D_labeled['label'])

    # We finally sort the dataframe by the 'label' column in ascending order
    df_samples_and_centroids_sorted = df_samples_3D_labeled.sort_values(by=['label'], ascending=True)

    # Every sample gets the same marker size (an array of ones)
    fig = px.scatter_3d(df_samples_and_centroids_sorted, x=axes['x'], y=axes['y'], z=axes['z'], color='label', size=np.ones(len(samples_3D))) #, color_continuous_scale='delta')

  fig.update_traces(textposition='top center')
  fig.update_layout(scene_aspectmode='data')
  fig.update_layout(uniformtext_minsize=60)

  fig.update_layout(title_font_size=20,
                    title={
                    'text': title,
                    'y': 0.9,
                    'x': 0.05,
                    'xanchor': 'left',
                    'yanchor': 'top'})
  # tight layout
  fig.update_layout(autosize = True, margin = dict(l=50, r=0, b=10, t=30))
  fig.show()


# Función para obtener los términos más representativos por clúster
def get_top_terms_by_cluster(clustered_tweets, top_n=5):
    """Return the most frequent words of each cluster.

    Parameters:
        clustered_tweets: DataFrame with a 'tweet' column (text) and a
            'label' column (cluster of each tweet).
        top_n: how many words to keep per cluster.

    Returns:
        dict mapping each distinct label, in order of first appearance, to a
        list of (word, count) pairs as produced by ``Counter.most_common``.
    """
    def _top_words(cluster_id):
        # Join the cluster's tweets into one string and split on whitespace
        belongs = clustered_tweets['label'] == cluster_id
        tokens = " ".join(clustered_tweets[belongs]['tweet']).split()
        return Counter(tokens).most_common(top_n)

    return {
        cluster_id: _top_words(cluster_id)
        for cluster_id in clustered_tweets['label'].unique()
    }

def plot_2D_centroid_labels(df_centroids, ax):
    """Write a 'C<label>' text tag on *ax* at the position of every centroid.

    Parameters:
        df_centroids: DataFrame whose first three columns are, by position,
            the x coordinate, the y coordinate and the centroid label.
        ax: object with a matplotlib-style ``text(x, y, s, **kwargs)`` method.
    """
    # itertuples reads the columns by position without the integer-key lookups
    # on a label-indexed row Series (row[0], row[1], row[2]) that pandas has
    # deprecated, and it keeps the label column's own dtype.
    for x_coord, y_coord, cluster_id, *_ in df_centroids.itertuples(index=False, name=None):
        ax.text(x_coord, y_coord, 'C' + str(int(cluster_id)),
                fontsize=20, color='black', weight='semibold')
        

# Función para obtener los términos más representativos por clúster
def get_top_terms_by_cluster(clustered_tweets, top_n=5):
    """Map every cluster label to its ``top_n`` most frequent words.

    ``clustered_tweets`` needs a 'tweet' column with the text and a 'label'
    column with the cluster of each row. Each value of the returned dict is
    the list of (word, count) pairs given by ``Counter.most_common``.
    """
    label_column = clustered_tweets['label']
    top_terms = {}
    for cluster_id in label_column.unique():
        # Texts that belong to this cluster only
        texts = clustered_tweets[label_column == cluster_id]['tweet']
        # Whitespace tokenization over the concatenated texts
        frequencies = Counter(" ".join(texts).split())
        top_terms[cluster_id] = frequencies.most_common(top_n)
    return top_terms


def Plot2D_WithLabels (samples_2D, labels, title, axes = None, palette="bright", centroids_2D=None):

  """
  Scatter-plot 2D samples coloured by label, optionally with centroids.

  samples_2D (numpy.ndarray): embedding-like array with two columns,
        one per coordinate, and one row per sample

  labels (array-like): label of each sample

  title (string): title of the figure

  axes (dict): column / axis names under the keys 'x' and 'y'.
        Defaults to {'x': 'x', 'y': 'y'}.

  palette: seaborn palette used for both the samples and the centroids

  centroids_2D: optional array of centroid coordinates (two columns).
        Centroid i is drawn with label i and tagged 'Ci'.
  """

  # A fresh dict per call instead of a shared mutable default argument
  if axes is None:
    axes = {'x': 'x', 'y': 'y'}
  x_col, y_col = axes['x'], axes['y']

  df_samples_2D_labeled = pd.DataFrame(data=samples_2D, columns=[x_col, y_col])
  # np.asarray lets callers pass a plain list or Series as well as an ndarray
  df_samples_2D_labeled['label'] = np.asarray(labels).tolist()

  sns.set(font_scale=3)
  sns.set(rc={'figure.figsize':(10,10)})
  sns.relplot(data=df_samples_2D_labeled,
              x=x_col,
              y=y_col,
              hue="label", height=10, legend="full", palette=palette)

  if centroids_2D is not None:
    df_centroids_2D_labeled = pd.DataFrame(data=centroids_2D, columns=[x_col, y_col])
    df_centroids_2D_labeled['label'] = list(range(len(centroids_2D)))

    # Centroids are drawn on top of the samples, then tagged with their labels
    sns.scatterplot(data=df_centroids_2D_labeled,
              x=x_col,
              y=y_col,
              hue="label",
              legend=False, palette=palette, s=100)
    plot_2D_centroid_labels(df_centroids_2D_labeled, plt.gca())

  set_size_letters(title, x_col, y_col, active_legend = False)
  plt.axis('equal')
    
    

def ApplyDBScanToData (samples, epsilon, min_samples = 30):

  """
  Run DBSCAN on *samples*, print a short report and return the labels.

  Parameters:

    samples (array-like): feature matrix with one row per example

    epsilon (float): value passed to DBSCAN as ``eps`` (the neighbourhood
        radius)

    min_samples (int): value passed to DBSCAN as ``min_samples``

  Return:

    labels: array with the cluster assigned to each row of samples.
        Cluster -1 means noise.

  The printed silhouette is computed over all labels, so noise points (-1)
  are scored as if they formed one more cluster.
  """

  labels = DBSCAN(eps=epsilon, min_samples=min_samples).fit(samples).labels_

  # Number of clusters in labels, ignoring noise if present.
  distinct_labels = set(labels)
  n_clusters_ = len(distinct_labels) - (1 if -1 in distinct_labels else 0)
  n_noise_ = list(labels).count(-1)

  print(f"\nTest for epsilon = {epsilon}")
  print('Estimated number of clusters: %d' % n_clusters_)
  print('Estimated number of noise points: %d' % n_noise_)

  # silhouette_score raises ValueError unless the number of distinct labels is
  # between 2 and n_samples - 1. Guarding here keeps an epsilon sweep running
  # when one epsilon yields a single cluster or only noise.
  if 1 < len(distinct_labels) < len(labels):
    print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(samples, labels))
  else:
    print("Silhouette Coefficient: undefined (needs between 2 and n_samples - 1 distinct labels)")

  return labels


def PlotDistancesToKnearestNeighbor(data_vector, K):

  """
  Plot, in ascending order, the distance from every point to the K-th
  neighbour returned by a K-nearest-neighbours query of the data against
  itself.

  Parameters:
    data_vector (numpy.ndarray): embedding-like array with one row per example

    K: position of the neighbour whose distance is plotted

  """

  neighbor_model = NearestNeighbors(n_neighbors=K)
  neighbor_model.fit(data_vector)
  all_distances, _ = neighbor_model.kneighbors(data_vector)

  # Only the K-th column is plotted, so sorting that column alone is enough
  kth_distances = np.sort(all_distances[:, K-1])

  plt.figure(figsize=(10,8))
  set_size_letters(f"Distancias al K-vecino más cercano (K={K})",
                   f"Points sorted according to distance of the {K}-th nearest neighbor",
                   f"{K}-th nearest neighbor distance")
  plt.plot(kth_distances)


# 1. Archivo APAUtweets.txt

## 1.1 Carga de Datos

In [None]:
# Load the tab-separated tweets file (no header row, quotes taken literally),
# index it by its first column and name the second column "text".
dataset_tweet = (
    pd.read_csv("APAUtweets.txt", header=None, delimiter="\t", encoding='utf8', quoting=csv.QUOTE_NONE)
    .set_index(0)
    .rename(columns={1: "text"})
)
dataset_tweet

## 1.2 Limpiar y procesar dataset

In [35]:
dataset_tweet["text_lower"] = dataset_tweet["text"].map(preprocess_text) # Se aplica la función preprocess_text a la columna text

## 1.3 Borramos palabras tipicas

In [None]:
# Lista de stopwords en español
STOPWORDS = set(stopwords.words('spanish'))
# Aplicar la función a la columna del dataset
dataset_tweet["text_stopW"] = dataset_tweet["text_lower"].apply(remove_stopwords)

dataset_tweet


## 1.4 Contamos palabras que más se repiten

In [None]:
# Count the most common words in the tweets. This gives an idea of the most
# frequent topics and shows extra words worth adding to the stopword list.
cnt = Counter(
    word
    for text in dataset_tweet["text_stopW"].values
    for word in text.split()
)

cnt.most_common(20)

In [38]:
#Cargar stopwords en español y agregar palabras personalizadas
STOPWORDS = set(stopwords.words('spanish'))
STOPWORDS.update(["user","mas", "hoy", "si", "mejor", "dia", "mundo"])  # Agrega palabras específicas

dataset_tweet["text_stopW"] = dataset_tweet["text_lower"].apply(remove_stopwords)


In [None]:
# Recount word frequencies after extending the stopword list
cnt = Counter()
for text in dataset_tweet["text_stopW"].values:
    cnt.update(text.split())

# Show the 20 most common words
cnt.most_common(20)

## 1.5 Convertimos los emojis a texto

In [None]:
# Replace emojis in the stopword-filtered text with their textual aliases and
# store the result in 'text_rare'.
# NOTE(review): preprocess_text removes every non-alphabetic character and
# remove_stopwords discards non-ASCII characters, so 'text_stopW' presumably
# holds no emojis by this point and this step likely leaves the text
# unchanged -- confirm whether emojis were meant to be converted earlier.
dataset_tweet['text_rare'] = dataset_tweet['text_stopW'].apply(convert_emoji_to_text)

dataset_tweet

In [None]:
#Se eliminan los emojis de los tweets


#def erase_emoji(emoji_text):
#    texto_sin_emojis = emoji.replace_emoji(emoji_text, replace='')
#    return texto_sin_emojis



# Passing both functions to 'text_rare'
#dataset_tweet['text_rare'] = dataset_tweet['text_stopW'].apply(erase_emoji)


## 1.6 Word Embedding con TfidfVectorizer

In [None]:
lista_descriptiva = dataset_tweet["text_rare"]
vectorizer = TfidfVectorizer(use_idf=False, min_df=.0005)
matrix = vectorizer.fit_transform(lista_descriptiva)
matrix_text = pd.DataFrame(matrix.toarray(), columns=vectorizer.get_feature_names_out())
samples = matrix_text
samples

## 1.7 KMEANS sin PCA

In [None]:
#calculo kmeans del texto sin aplicar PCA para demostrar que si no aplicamos PCA para reducir dimensiones, el silhouette score es muy bajo debido a la maldición de la dimensionalidad.
for i in range(2,20):
    km = KMeans(n_clusters=i, init='random', max_iter=200, random_state=0).fit(samples)
    sample_km = km.labels_
    print("cantidad de cluster:", i)
    print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(samples, sample_km))

## 1.8 Reducción por PCA


In [None]:
#Aplicamos PCA a sam para reducir dimensiones a 2D
pca2D = PCA(2)
pca2D.fit(samples)
#Transform the data
sample_PCA_2D = pca2D.transform(samples)

df_samples_PCA_2D = pd.DataFrame(data=sample_PCA_2D, columns=["pca0", "pca1"])

title = 'Original data after 2D PCA transform'
axes_PCA_2D = {'x': 'pca0', 'y': 'pca1'}
Plot2D (sample_PCA_2D, title, axes_PCA_2D)

### 1.8.1 Modelación de Vecinos cercanos

In [None]:
PlotDistancesToKnearestNeighbor(sample_PCA_2D, 10)

### 1.8.2 Aplicamos DBSCAN a los datos

#### 1.8.2.1 Buscamos el epsilon optimo

In [None]:
#Buscamos el Epsilon óptimo para el DBSCAN y aplicamos el algoritmo
min_samples = 10
epsilon_values = [0.007, 0.0075, 0.0076, 0.0077, 0.0078, 0.0079, 0.008, 0.009, 0.021, 0.023, 0.024, 0.025,0.026]
# con user: epsilon_values = [0.020, 0.022, 0.024, 0.026, 0.028, 0.030, 0.032, 0.034, 0.036, 0.038, 0.040]
aux_labels = {}
for epsilon in epsilon_values:
  aux_labels[epsilon] = ApplyDBScanToData (sample_PCA_2D, epsilon, min_samples=min_samples)

In [None]:
# con user: epsilon = 0.034
epsilon = 0.0078 #602
labels_PCA_2D = aux_labels[epsilon]

# Crear un DataFrame con los tweets y las etiquetas de clúster
tweets_with_labels = pd.DataFrame({
    'tweet': lista_descriptiva,  # Cambia esto por tu columna de tweets procesados
    'label': labels_PCA_2D
})

# Filtrar para ignorar los puntos de ruido
clustered_tweets = tweets_with_labels[tweets_with_labels['label'] != -1]
clustered_tweets

#### 1.8.2.2 Representación del Resultado

In [None]:
title = f'DBScan(eps={epsilon}, MinPts={min_samples}) over 2D PCA transformed data'
Plot2D_WithLabels (sample_PCA_2D, labels_PCA_2D, title, axes_PCA_2D)

#### 1.8.2.3 Obtener terminos más representativos de c/Cluster para verificar

In [None]:
# Obtener los términos más representativos
top_terms_per_cluster = get_top_terms_by_cluster(clustered_tweets)
top_terms_per_cluster

### 1.8.3 Aplicamos K-Means a los datos

#### 1.8.3.1 Buscamos la cantidad de clusters optima y aplicamos KMEANS

In [None]:
# K-means over the 2D PCA projection: score every candidate number of clusters
clusters = list(range(2, 21))
silhouette_avg = []


for i in clusters:
    km = KMeans(n_clusters=i, init='random', max_iter=200, random_state=0).fit(sample_PCA_2D)
    sample_km_pca_2d = km.labels_

    # The silhouette is quadratic in the number of samples: compute it once
    # and reuse the value for both the record and the printout.
    score = silhouette_score(sample_PCA_2D, sample_km_pca_2d)
    silhouette_avg.append(score)

    print("cantidad de cluster:", i)
    print("Silhouette Coefficient: %0.3f" % score)

#3 clusters - 0.722

#### 1.8.3.2 Grafico KMEANS

In [None]:
n_clusters = clusters[pd.Series(silhouette_avg).idxmax()]
# Refit with the same hyper-parameters and seed used while scoring the
# candidates, so the plotted clustering is the one that obtained the best score.
kmeans = KMeans(n_clusters=n_clusters, init='random', max_iter=200, random_state=0)
label = kmeans.fit_predict(sample_PCA_2D)


# Pair every processed tweet with its cluster label
tweets_with_labels = pd.DataFrame({
    'tweet': lista_descriptiva,
    'label': label
})

# K-means assigns every sample to a cluster, so no row carries the noise
# label -1; the filter only keeps this cell parallel to the DBSCAN ones.
clustered_tweets = tweets_with_labels[tweets_with_labels['label'] != -1]
clustered_tweets

In [None]:
axes_PCA = {'x': 'pca0', 'y': 'pca1'}
title = f'K-means(K={n_clusters}) over 2D PCA transformed data'
plot = Plot2D_WithLabels(sample_PCA_2D, label, title, axes_PCA , centroids_2D=kmeans.cluster_centers_)

#### 1.8.3.3 Obtener terminos más representativos de c/Cluster para verificar

In [None]:
# Obtener los términos más representativos
top_terms_per_cluster = get_top_terms_by_cluster(clustered_tweets)
top_terms_per_cluster

## 1.9 Reducción por t-SNE 2D

### 1.9.1 Creación del modelo

In [62]:
# t-SNE is stochastic: without a fixed seed every run yields a different
# layout, and the DBSCAN eps values hand-picked for this projection further
# down would not be reproducible. Re-tune those eps values if the seed changes.
tsne2D = TSNE(n_components=2, random_state=0)
samples_TSNE_2D = tsne2D.fit_transform(samples)
df_samples_TSNE_2D = pd.DataFrame(data=samples_TSNE_2D, columns=["tsne0", "tsne1"])

### 1.9.2 Representación Visual del Modelo

In [None]:
title = 'Original data after 2D t-SNE transform'
axes_TSNE_2D = {'x': 'tsne0', 'y': 'tsne1'}
Plot2D (samples_TSNE_2D, title, axes_TSNE_2D)

### 1.9.3 Aplicamos K-Means t-SNE 2D a los datos

#### 1.9.3.1 Buscamos la cantidad de clusters optima y aplicamos KMEANS

In [None]:
# K-means over the 2D t-SNE projection: score every candidate number of clusters
clusters = list(range(2, 21))
silhouette_avg = []


for i in clusters:
    km = KMeans(n_clusters=i, init='random', max_iter=200, random_state=0).fit(samples_TSNE_2D)
    sample_km_tsne_2d = km.labels_

    # The silhouette is quadratic in the number of samples: compute it once
    # and reuse the value for both the record and the printout.
    score = silhouette_score(samples_TSNE_2D, sample_km_tsne_2d)
    silhouette_avg.append(score)

    print("cantidad de cluster:", i)
    print("Silhouette Coefficient: %0.3f" % score)



#### 1.9.3.2 Graficamos KMeans

In [None]:
n_clusters = clusters[pd.Series(silhouette_avg).idxmax()]
# Refit with the same hyper-parameters and seed used while scoring the
# candidates, so the plotted clustering is the one that obtained the best score.
kmeans = KMeans(n_clusters=n_clusters, init='random', max_iter=200, random_state=0)
label = kmeans.fit_predict(samples_TSNE_2D)

# Pair every processed tweet with its cluster label
tweets_with_labels = pd.DataFrame({
    'tweet': lista_descriptiva,
    'label': label
})

# K-means assigns every sample to a cluster, so no row carries the noise
# label -1; the filter only keeps this cell parallel to the DBSCAN ones.
clustered_tweets = tweets_with_labels[tweets_with_labels['label'] != -1]
clustered_tweets

In [None]:

axes_tSNE = {'x': 'tsne0', 'y': 'tsne1'}
title = f'K-means(K={n_clusters}) over 2D T-SNE transformed data'
plot = Plot2D_WithLabels(samples_TSNE_2D, kmeans.labels_, title, axes_tSNE, centroids_2D=kmeans.cluster_centers_)

#### 1.9.3.3 Obtener terminos más representativos de c/Cluster para verificar

In [None]:
# get_top_terms_by_cluster (and its Counter import) already live in the helper
# section at the top of the notebook, so they are not redefined here.
# Most representative terms of each K-means cluster:
top_terms_per_cluster = get_top_terms_by_cluster(clustered_tweets)
top_terms_per_cluster

### 1.9.4 Modelación KnearestNeighbor TSNE 2D

In [None]:
PlotDistancesToKnearestNeighbor(samples_TSNE_2D, 15)

### 1.9.5 Aplicar DBScan

#### 1.9.5.1 Encontrar Epsilon óptimo

In [None]:
min_samples = 20
epsilon_values = [3, 3.2, 3.4, 3.6, 3.8, 3.9, 4, 4.05, 4.055,4.1, 4.3, 4.5, 4.7, 4.9, 5.0, 5.1]
aux_labels = {}

for epsilon in epsilon_values:
  aux_labels[epsilon] = ApplyDBScanToData (samples_TSNE_2D, epsilon, min_samples=min_samples)

In [None]:
#epsilon = 3.4
epsilon = 4.9
labels_TSNE_2D = aux_labels[epsilon]


# Crear un DataFrame con los tweets y las etiquetas de clúster
tweets_with_labels = pd.DataFrame({
    'tweet': lista_descriptiva,  # Cambia esto por tu columna de tweets procesados
    'label': labels_TSNE_2D
})

# Filtrar para ignorar los puntos de ruido
clustered_tweets = tweets_with_labels[tweets_with_labels['label'] != -1]
clustered_tweets

#### 1.9.5.2 Representación grafica del resultado

In [None]:
title = f'DBScan(eps={epsilon}, MinPts={min_samples}) over 2D t-SNE transformed data'
Plot2D_WithLabels (samples_TSNE_2D, labels_TSNE_2D, title, axes_TSNE_2D)

#### 1.9.5.3 Obtener terminos más representativos de c/Cluster para verificar

In [None]:
# get_top_terms_by_cluster (and its Counter import) already live in the helper
# section at the top of the notebook, so they are not redefined here.
# Most representative terms of each DBSCAN cluster (noise already filtered out):
top_terms_per_cluster = get_top_terms_by_cluster(clustered_tweets)
top_terms_per_cluster

# 2. Embedding BETO

## 2.1 Carga de Datos

In [73]:
embeddings_beto = np.load("EmoEvalEs-embeddings-BETO.npy")



## 2.2 Sin reducción de dimensionalidad

### 2.2.1 Nearest Neighbor

In [None]:
PlotDistancesToKnearestNeighbor(embeddings_beto, 10)

### 2.2.2 DBScan sin Reducción de Dimensionalidad

In [None]:
min_samples = 10
epsilon_values = [4,4.1,4.2,4.3,4.4,4.5,4.6,4.7,4.8,5,6]
aux_labels = {}
for epsilon in epsilon_values:
  aux_labels[epsilon] = ApplyDBScanToData(embeddings_beto, epsilon, min_samples=min_samples)

### 2.2.3 K-Means sin Reducción de Dimensionalidad

In [None]:
for i in range(2,20):
    km = KMeans(n_clusters=i, init='random', max_iter=200, random_state=0).fit(embeddings_beto)
    sample_km = km.labels_
    score = metrics.silhouette_score(embeddings_beto, sample_km)
    print("cantidad de cluster:", i)
    print("Silhouette Coefficient: %0.3f" % score)

## 2.3 Con reducción de dimensionalidad

### 2.3.1 K-Means PCA 2D

#### 2.3.1.1 Búsqueda N° óptimo Clusters

In [None]:
# Project the BETO embeddings onto two principal components
pca2D = PCA(2)
df_2D = pca2D.fit_transform(embeddings_beto)

df_samples_PCA_2D = pd.DataFrame(data=df_2D, columns=["pca0", "pca1"])

clusters = [5,6,7,8,9,10,15]
silhouette_avg = []

for k in clusters:
    # Fixed seed so the scores, and therefore the selected K, are reproducible
    kmeans = KMeans(n_clusters=k, random_state=0)
    label = kmeans.fit_predict(df_samples_PCA_2D)

    # One scatter series per cluster. The inner loop variable has its own
    # name instead of reusing (and overwriting) the outer loop variable.
    for cluster_id in np.unique(label):
        members = label == cluster_id
        plt.scatter(df_2D[members, 0], df_2D[members, 1], label=cluster_id)
    plt.legend()
    plt.show()

    silhouette_avg.append(silhouette_score(df_samples_PCA_2D, label))

#### 2.3.1.2 Modelacion N° clusters - Score Silhouette

In [None]:
plt.plot(clusters,silhouette_avg)
plt.xlabel("Values of K") 
plt.ylabel("Silhouette score") 
plt.title("Silhouette analysis For Optimal k")
plt.show()
print("N° Clusters con mejor Score: ",clusters[pd.Series(silhouette_avg).idxmax()],"\nScore: ",max(silhouette_avg))

#### 2.3.1.3 Modelado K-Means con N° Óptimo Clusters 2D

In [88]:
n_clusters = clusters[pd.Series(silhouette_avg).idxmax()]
# Seeded so the final model does not change from one run to the next
kmeans = KMeans(n_clusters=n_clusters, random_state=0)
label = kmeans.fit_predict(df_samples_PCA_2D)

In [None]:

axes_PCA = {'x': 'pca0', 'y': 'pca1'}
title = f'K-means(K={n_clusters}) over 2D PCA transformed data'
plot = Plot2D_WithLabels(df_samples_PCA_2D, kmeans.labels_, title, axes_PCA, centroids_2D=kmeans.cluster_centers_)

### 2.3.2 K-Means PCA 3D

#### 2.3.2.1 Búsqueda N° óptimo Clusters

In [None]:
# Project the BETO embeddings onto three principal components
pca3D = PCA(n_components=3)
df_3D = pca3D.fit_transform(embeddings_beto)
df_samples_PCA_3D = pd.DataFrame(data=df_3D, columns=["pca0", "pca1", "pca2"])

clusters = [5,6,7,8,9,10,15]
silhouette_avg = []

for k in clusters:
    # Fixed seed so the scores, and therefore the selected K, are reproducible
    kmeans = KMeans(n_clusters=k, random_state=0)
    label = kmeans.fit_predict(df_samples_PCA_3D)

    # Quick 2D preview using only the first two components. The inner loop
    # variable has its own name instead of reusing the outer loop variable.
    for cluster_id in np.unique(label):
        members = label == cluster_id
        plt.scatter(df_3D[members, 0], df_3D[members, 1], label=cluster_id)
    plt.legend()
    plt.show()

    silhouette_avg.append(silhouette_score(df_samples_PCA_3D, label))

#### 2.3.2.2 Modelacion N° clusters - Score Silhouette

In [None]:
plt.plot(clusters,silhouette_avg)
plt.xlabel("Values of K") 
plt.ylabel("Silhouette score") 
plt.title("Silhouette analysis For Optimal k")
plt.show()
print("N° Clusters con mejor Score: ",clusters[pd.Series(silhouette_avg).idxmax()],"\nScore: ",max(silhouette_avg))

#### 2.3.2.3 Modelado K-Means con N° Óptimo Clusters 3D

In [None]:
n_clusters = clusters[pd.Series(silhouette_avg).idxmax()]
# Seeded so the final model does not change from one run to the next
kmeans = KMeans(n_clusters=n_clusters, random_state=0)
label = kmeans.fit_predict(df_samples_PCA_3D)
axes_PCA_3D = {'x': 'pca0', 'y': 'pca1', 'z': 'pca2'}

title = f'K-means(K={n_clusters}) over 3D PCA transformed data'
plot = Plot3D_WithLabels (df_samples_PCA_3D, kmeans.labels_, title, axes_PCA_3D, centroids_3D=kmeans.cluster_centers_)

### 2.3.3 DBScan t-SNE 2D

#### 2.3.3.1 Reducción de Dimensión

In [96]:
# t-SNE is stochastic: a fixed seed keeps the layout stable, so the DBSCAN
# eps values tuned on it stay meaningful. Re-tune them if the seed changes.
tsne2D = TSNE(n_components=2, random_state=0)
samples_TSNE_2D = tsne2D.fit_transform(embeddings_beto)
df_samples_TSNE_2D = pd.DataFrame(data=samples_TSNE_2D, columns=["tsne0", "tsne1"])

#### 2.3.3.2 Modelación Nearest Neighbor

In [None]:
PlotDistancesToKnearestNeighbor(samples_TSNE_2D, 50) # notese que el parametro de entrada NO es samples

#### 2.3.3.3 Epsilons Score Silhouette 

In [None]:
min_samples = 50
epsilon_values = [4, 4.05, 4.1, 4.2, 4.3, 4.35, 4.4, 4.5, 4.6, 4.7, 4.9, 5, 5.2, 5.4, 5.6,6,6.2]
aux_labels = {}
for epsilon in epsilon_values:
  aux_labels[epsilon] = ApplyDBScanToData (samples_TSNE_2D, epsilon, min_samples=min_samples)

In [122]:
epsilon = 4
labels_TSNE_2D = aux_labels[epsilon]

#### 2.3.3.4 Visualización DBScan t-SNE 2D

In [None]:
title = f'DBScan(eps={epsilon}, MinPts={min_samples}) over 2D t-SNE transformed data'
Plot2D_WithLabels (samples_TSNE_2D, labels_TSNE_2D, title, axes_TSNE_2D)

### 2.3.4 DBScan t-SNE 3D

#### 2.3.4.1 Reducción de Dimensión

In [107]:
# t-SNE is stochastic: a fixed seed keeps the layout stable, so the DBSCAN
# eps values tuned on it stay meaningful. Re-tune them if the seed changes.
tsne3D = TSNE(n_components=3, random_state=0)
samples_TSNE_3D = tsne3D.fit_transform(embeddings_beto)
df_samples_TSNE_3D = pd.DataFrame(data=samples_TSNE_3D, columns=["tsne0", "tsne1", "tsne2"])

#### 2.3.4.2 Modelación Nearest Neighbor

In [None]:
PlotDistancesToKnearestNeighbor(samples_TSNE_3D, 100)

#### 2.3.4.3 DBScan t-SNE 3D

##### 2.3.4.3.1 Epsilons Score Silhouette

In [None]:
min_samples = 100
epsilon_values = [5.1,5.2,5.25,5.3, 5.35, 5.4, 5.5, 5.6]
aux_labels = {}
for epsilon in epsilon_values:
  aux_labels[epsilon] = ApplyDBScanToData (samples_TSNE_3D, epsilon, min_samples=min_samples)

In [110]:
epsilon = 5.4
labels_TSNE_3D = aux_labels[epsilon]

##### 2.3.4.3.2 Visualización DBScan t-SNE 3D

In [None]:
title = f'DBScan(eps={epsilon}, MinPts={min_samples}) over 3D t-SNE transformed data'
axes_TSNE_3D = {'x': 'tsne0', 'y': 'tsne1' , 'z': 'tsne2'}
Plot3D_WithLabels (samples_TSNE_3D, labels_TSNE_3D, title, axes_TSNE_3D)

### 2.3.5 DBScan - PCA 2D

In [None]:
n_samples = 40
PlotDistancesToKnearestNeighbor(df_samples_PCA_2D, n_samples)

In [None]:
epsis = [0.3,0.31,0.32,0.33,0.34,0.35,0.37,0.39,0.4,0.42,0.44,0.46,0.47,0.48,0.49,0.5]
# Use the neighbourhood size inspected in the k-distance plot of this section.
# Without this assignment min_samples silently keeps whatever value an
# earlier section left behind.
min_samples = n_samples
aux_labels = {}
for epsilon in epsis:
  aux_labels[epsilon] = ApplyDBScanToData (df_samples_PCA_2D, epsilon, min_samples=min_samples)

In [None]:
epsilon = 0.48
labels_PCA_2D = aux_labels[epsilon]
# These labels come from DBSCAN, so the title reports its parameters rather
# than the K of an earlier K-means run.
title = f'DBScan(eps={epsilon}, MinPts={min_samples}) over 2D PCA transformed data'
Plot2D_WithLabels (df_samples_PCA_2D, labels_PCA_2D, title, axes_PCA_2D, palette="Paired")

### 2.3.6 DBScan - PCA 3D

In [None]:
n_samples = 20
PlotDistancesToKnearestNeighbor(df_samples_PCA_3D, n_samples)

In [None]:
# Run DBSCAN on the 3D PCA projection once per candidate eps, keeping every
# result keyed by eps. min_samples follows the k of the k-distance plot.
min_samples = n_samples
data_epsis = []
epsis = [
    0.2, 0.28, 0.29, 0.3, 0.31, 0.32, 0.33, 0.34, 0.35, 0.37,
    0.39, 0.4, 0.41, 0.42, 0.43, 0.44, 0.45, 0.46, 0.47, 0.48,
]

aux_labels = {}
for epsilon in epsis:
    aux_labels[epsilon] = ApplyDBScanToData(
        df_samples_PCA_3D, epsilon, min_samples=min_samples
    )

In [None]:
# Pick one eps from the sweep and plot the 3D PCA projection with its labels.
epsilon = 0.42
labels_PCA_3D = aux_labels[epsilon]
# DBSCAN has no K; report eps and MinPts instead of the stale K-means
# n_clusters that the title used to show.
title = f'DBScan(eps={epsilon}, MinPts={min_samples}) over 3D PCA transformed data'
Plot3D_WithLabels(df_samples_PCA_3D, labels_PCA_3D, title, axes_PCA_3D)

### 2.3.7 K-Means t-SNE 3D

#### 2.3.7.1 Búsqueda N° óptimo Clusters

In [None]:
# Fit a fresh 3D t-SNE embedding of the BETO embeddings.
tsne3D = TSNE(n_components=3)
df_3D = tsne3D.fit_transform(embeddings_beto)
df_samples_tsne_3D = pd.DataFrame(data=df_3D, columns=["tsne0", "tsne1", "tsne2"])

# Candidate numbers of clusters and the silhouette score obtained for each.
clusters = [5, 6, 7, 8, 9, 10, 15]
silhouette_avg = []

for k in clusters:
    # Fit K-means with k clusters and get one label per sample.
    kmeans = KMeans(n_clusters=k)
    label = kmeans.fit_predict(df_samples_tsne_3D)
    u_labels = np.unique(label)

    # Quick 2D look at the clustering: only components 0 and 1 are drawn.
    for cluster_id in u_labels:
        mask = label == cluster_id
        plt.scatter(df_3D[mask, 0], df_3D[mask, 1], label=cluster_id)
    plt.legend()
    plt.show()

    # Mean silhouette over all samples for this k.
    score = silhouette_score(df_samples_tsne_3D, kmeans.labels_)
    silhouette_avg.append(score)

#### 2.3.7.2 Modelación N° clusters - Score Silhouette

In [None]:
# Silhouette score as a function of K, followed by the best K and its score.
plt.plot(clusters, silhouette_avg)
plt.title("Silhouette analysis For Optimal k")
plt.xlabel("Values of K")
plt.ylabel("Silhouette score")
plt.show()

best_idx = pd.Series(silhouette_avg).idxmax()
print("N° Clusters con mejor Score: ", clusters[best_idx], "\nScore: ", max(silhouette_avg))

#### 2.3.7.3 Modelado K-Means con N° Óptimo Clusters 3D

In [None]:
# Refit K-means with the K that achieved the highest silhouette score and
# plot the 3D t-SNE samples with the resulting labels and centroids.
n_clusters = clusters[pd.Series(silhouette_avg).idxmax()]
title = f'K-means(K={n_clusters}) over 3D t-SNE transformed data'
axes_tsne_3D = dict(x='tsne0', y='tsne1', z='tsne2')

kmeans = KMeans(n_clusters=n_clusters)
label = kmeans.fit_predict(df_samples_tsne_3D)

plot = Plot3D_WithLabels(
    df_samples_tsne_3D,
    kmeans.labels_,
    title,
    axes_tsne_3D,
    centroids_3D=kmeans.cluster_centers_,
)

### 2.3.8 K-Means t-SNE 2D

#### 2.3.8.1 Búsqueda N° óptimo Clusters

In [None]:
# Fit a fresh 2D t-SNE embedding of the BETO embeddings.
tsne2D = TSNE(n_components=2)
df_2D = tsne2D.fit_transform(embeddings_beto)
# Build the DataFrame from the embedding computed just above. It used to be
# built from samples_TSNE_2D (an earlier, different t-SNE run), so K-means
# was fitted on one embedding while the scatter below drew the coordinates
# of another one.
df_samples_TSNE_2D = pd.DataFrame(data=df_2D, columns=["tsne0", "tsne1"])

# Candidate numbers of clusters and the silhouette score obtained for each.
clusters = [5, 6, 7, 8, 9, 10, 15]
silhouette_avg = []

for k in clusters:
    # Fit K-means with k clusters and get one label per sample.
    kmeans = KMeans(n_clusters=k)
    label = kmeans.fit_predict(df_samples_TSNE_2D)
    u_labels = np.unique(label)

    # One scatter series per cluster (distinct loop variable: the inner loop
    # used to reuse the outer loop's name).
    for cluster_id in u_labels:
        mask = label == cluster_id
        plt.scatter(df_2D[mask, 0], df_2D[mask, 1], label=cluster_id)
    plt.legend()
    plt.show()

    # Mean silhouette over all samples for this k.
    score = silhouette_score(df_samples_TSNE_2D, label)
    silhouette_avg.append(score)

#### 2.3.8.2 Modelación N° clusters - Score Silhouette

In [None]:
# Silhouette score as a function of K, followed by the best K and its score.
plt.plot(clusters, silhouette_avg)
plt.title("Silhouette analysis For Optimal k")
plt.xlabel("Values of K")
plt.ylabel("Silhouette score")
plt.show()

best_idx = pd.Series(silhouette_avg).idxmax()
print("N° Clusters con mejor Score: ", clusters[best_idx], "\nScore: ", max(silhouette_avg))

#### 2.3.8.3 Modelado K-Means con N° Óptimo Clusters 2D

In [None]:
# Refit K-means with the K that achieved the highest silhouette score and
# plot the 2D t-SNE samples with the resulting labels and centroids.
n_clusters = clusters[pd.Series(silhouette_avg).idxmax()]
title = f'K-means(K={n_clusters}) over 2D tSNE transformed data'
# NOTE: despite its name, axes_PCA holds the t-SNE column names here.
axes_PCA = dict(x='tsne0', y='tsne1')

kmeans = KMeans(n_clusters=n_clusters)
label = kmeans.fit_predict(df_samples_TSNE_2D)

plot = Plot2D_WithLabels(
    df_samples_TSNE_2D,
    kmeans.labels_,
    title,
    axes_PCA,
    centroids_2D=kmeans.cluster_centers_,
)

# 3 Embedding MARIA

## 3.1 Carga de Datos

In [137]:
# Load the precomputed MARIA embeddings from a .npy file; the path is
# relative to the notebook's working directory.
embeddings_maria = np.load("EmoEvalEs-embeddings-MARIA.npy")


## 3.2 Sin reducción de dimensionalidad

### 3.2.1 Nearest Neighbor

In [None]:
# k-distance plot over the raw (non-reduced) MARIA embeddings with k=10, the
# same value the DBSCAN sweep that follows uses as min_samples.
PlotDistancesToKnearestNeighbor(embeddings_maria, 10)

### 3.2.2 DBScan sin Reducción de Dimensionalidad

In [None]:
# Run DBSCAN on the raw MARIA embeddings once per candidate eps, keeping
# every result keyed by eps.
min_samples = 10
epsilon_values = [0.8, 1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8]
aux_labels = {}
for epsilon in epsilon_values:
    aux_labels[epsilon] = ApplyDBScanToData(
        embeddings_maria, epsilon, min_samples=min_samples
    )

In [None]:
# Pick one eps from the sweep and keep the result stored for it.
epsilon = 1.8
labels = aux_labels[epsilon]

### 3.2.3 K-Means sin PCA

In [None]:
# K-means on the raw MARIA embeddings for K = 2..19, printing the silhouette
# coefficient obtained for each K (random_state fixed for reproducibility).
for n_k in range(2, 20):
    km = KMeans(n_clusters=n_k, init='random', max_iter=200, random_state=0)
    km.fit(embeddings_maria)
    sample_km = km.labels_
    score = metrics.silhouette_score(embeddings_maria, sample_km)
    print("cantidad de cluster:", n_k)
    print("Silhouette Coefficient: %0.3f" % score)

## 3.3 Con reducción de dimensionalidad

### 3.3.1 K-Means PCA 2D

#### 3.3.1.1 Búsqueda N° óptimo Clusters

In [None]:
# Raw Data
# Project the MARIA embeddings onto 2 principal components.
pca2D = PCA(2)
df_2D = pca2D.fit_transform(embeddings_maria)
df_samples_PCA_2D = pd.DataFrame(data=df_2D, columns=["pca0", "pca1"])

# Candidate numbers of clusters and the silhouette score obtained for each.
clusters = [5, 6, 7, 8, 9, 10, 15]
silhouette_avg = []

for k in clusters:
    # Fit K-means with k clusters and get one label per sample.
    kmeans = KMeans(n_clusters=k)
    label = kmeans.fit_predict(df_samples_PCA_2D)
    u_labels = np.unique(label)

    # One scatter series per cluster.
    for cluster_id in u_labels:
        mask = label == cluster_id
        plt.scatter(df_2D[mask, 0], df_2D[mask, 1], label=cluster_id)
    plt.legend()
    plt.show()

    # Mean silhouette over all samples for this k.
    score = silhouette_score(df_samples_PCA_2D, kmeans.labels_)
    silhouette_avg.append(score)

#### 3.3.1.2 Modelación N° clusters - Score Silhouette

In [None]:
# Silhouette score as a function of K, followed by the best K and its score.
plt.plot(clusters, silhouette_avg)
plt.title("Silhouette analysis For Optimal k")
plt.xlabel("Values of K")
plt.ylabel("Silhouette score")
plt.show()

best_idx = pd.Series(silhouette_avg).idxmax()
print("N° Clusters con mejor Score: ", clusters[best_idx], "\nScore: ", max(silhouette_avg))

#### 3.3.1.3 Modelado K-Means con N° Óptimo Clusters 2D

In [None]:
# Refit K-means with the K that achieved the highest silhouette score and
# plot the 2D PCA projection with the resulting labels and centroids.
n_clusters = clusters[pd.Series(silhouette_avg).idxmax()]
title = f'K-means(K={n_clusters}) over 2D PCA transformed data'
axes_PCA = dict(x='pca0', y='pca1')

kmeans = KMeans(n_clusters=n_clusters)
label = kmeans.fit_predict(df_samples_PCA_2D)

plot = Plot2D_WithLabels(
    df_samples_PCA_2D,
    kmeans.labels_,
    title,
    axes_PCA,
    centroids_2D=kmeans.cluster_centers_,
)

### 3.3.2 K-Means PCA 3D

#### 3.3.2.1 Búsqueda N° óptimo Clusters

In [None]:
# Project the MARIA embeddings onto 3 principal components.
pca3D = PCA(n_components=3)
df_3D = pca3D.fit_transform(embeddings_maria)
df_samples_PCA_3D = pd.DataFrame(data=df_3D, columns=["pca0", "pca1", "pca2"])

# Candidate numbers of clusters and the silhouette score obtained for each.
clusters = [5, 6, 7, 8, 9, 10, 15]
silhouette_avg = []

for k in clusters:
    # Fit K-means with k clusters and get one label per sample.
    kmeans = KMeans(n_clusters=k)
    label = kmeans.fit_predict(df_samples_PCA_3D)
    u_labels = np.unique(label)

    # Quick 2D look at the clustering: only components 0 and 1 are drawn.
    for cluster_id in u_labels:
        mask = label == cluster_id
        plt.scatter(df_3D[mask, 0], df_3D[mask, 1], label=cluster_id)
    plt.legend()
    plt.show()

    # Mean silhouette over all samples for this k.
    score = silhouette_score(df_samples_PCA_3D, kmeans.labels_)
    silhouette_avg.append(score)

#### 3.3.2.2 Modelación N° clusters - Score Silhouette

In [None]:
# Silhouette score as a function of K, followed by the best K and its score.
plt.plot(clusters, silhouette_avg)
plt.title("Silhouette analysis For Optimal k")
plt.xlabel("Values of K")
plt.ylabel("Silhouette score")
plt.show()

best_idx = pd.Series(silhouette_avg).idxmax()
print("N° Clusters con mejor Score: ", clusters[best_idx], "\nScore: ", max(silhouette_avg))

#### 3.3.2.3 Modelado K-Means con N° Óptimo Clusters 3D

In [None]:
# Refit K-means with the K that achieved the highest silhouette score and
# plot the 3D PCA projection with the resulting labels and centroids.
n_clusters = clusters[pd.Series(silhouette_avg).idxmax()]
title = f'K-means(K={n_clusters}) over 3D PCA transformed data'
axes_PCA_3D = dict(x='pca0', y='pca1', z='pca2')

kmeans = KMeans(n_clusters=n_clusters)
label = kmeans.fit_predict(df_samples_PCA_3D)

plot = Plot3D_WithLabels(
    df_samples_PCA_3D,
    kmeans.labels_,
    title,
    axes_PCA_3D,
    centroids_3D=kmeans.cluster_centers_,
)

### 3.3.3 t-SNE 2D

#### 3.3.3.1 Reducción de Dimensión

In [147]:
# Embed the MARIA embeddings into 2 dimensions with t-SNE; keep both the raw
# array and a DataFrame view with named columns.
tsne2D = TSNE(n_components=2)
samples_TSNE_2D = tsne2D.fit_transform(embeddings_maria)
df_samples_TSNE_2D = pd.DataFrame(
    data=samples_TSNE_2D,
    columns=["tsne0", "tsne1"],
)

#### 3.3.3.2 Modelación Nearest Neighbor

In [None]:
# k-distance plot over the 2D t-SNE samples with k=50, the same value the
# DBSCAN sweep that follows uses as min_samples.
PlotDistancesToKnearestNeighbor(samples_TSNE_2D, 50) # note that the input argument is NOT samples

#### 3.3.3.3 DBScan t-SNE 2D

##### 3.3.3.3.1 Epsilons Score Silhouette 

In [None]:
# Run DBSCAN on the 2D t-SNE samples once per candidate eps, keeping every
# result keyed by eps.
min_samples = 50
epsilon_values = [4.6, 4.7, 4.8, 4.9, 4.95, 5, 5.05, 5.1, 5.2, 5.4, 5.6]
aux_labels = {}
for epsilon in epsilon_values:
    aux_labels[epsilon] = ApplyDBScanToData(
        samples_TSNE_2D, epsilon, min_samples=min_samples
    )

In [150]:
# Pick one eps from the sweep and keep the result stored for it.
epsilon = 4.95
labels_TSNE_2D = aux_labels[epsilon]

##### 3.3.3.3.2 Visualización DBScan t-SNE 2D

In [None]:
# Plot the 2D t-SNE samples together with the labels of the selected eps.
# axes_TSNE_2D is defined earlier in the notebook.
title = f'DBScan(eps={epsilon}, MinPts={min_samples}) over 2D t-SNE transformed data'
Plot2D_WithLabels(
    samples_TSNE_2D,
    labels_TSNE_2D,
    title,
    axes_TSNE_2D,
)

### 3.3.4 t-SNE 3D

#### 3.3.4.1 Reducción de Dimensión

In [152]:
# Embed the MARIA embeddings into 3 dimensions with t-SNE; keep both the raw
# array and a DataFrame view with named columns.
tsne3D = TSNE(n_components=3)
samples_TSNE_3D = tsne3D.fit_transform(embeddings_maria)
df_samples_TSNE_3D = pd.DataFrame(
    data=samples_TSNE_3D,
    columns=["tsne0", "tsne1", "tsne2"],
)

#### 3.3.4.2 Modelación Nearest Neighbor

In [None]:
# k-distance plot over the 3D t-SNE samples with k=100, the same value the
# DBSCAN sweep that follows uses as min_samples.
PlotDistancesToKnearestNeighbor(samples_TSNE_3D, 100)

#### 3.3.4.3 DBScan t-SNE 3D

##### 3.3.4.3.1 Epsilons Score Silhouette

In [None]:
# Run DBSCAN on the 3D t-SNE samples once per candidate eps, keeping every
# result keyed by eps.
min_samples = 100
epsilon_values = [4, 4.1, 4.2, 4.3, 4.4, 4.5, 4.6, 4.7, 4.8, 4.9, 5, 5.1, 5.2]
aux_labels = {}
for epsilon in epsilon_values:
    aux_labels[epsilon] = ApplyDBScanToData(
        samples_TSNE_3D, epsilon, min_samples=min_samples
    )

In [155]:
# Pick one eps from the sweep and keep the result stored for it.
epsilon = 4.4
labels_TSNE_3D = aux_labels[epsilon]

##### 3.3.4.3.2 Visualización DBScan t-SNE 3D

In [None]:
# Plot the 3D t-SNE samples together with the labels of the selected eps.
# The title now also reports MinPts, matching the other DBSCAN plots in the
# notebook.
title = f'DBScan(eps={epsilon}, MinPts={min_samples}) over 3D t-SNE transformed data'
axes_TSNE_3D = {'x': 'tsne0', 'y': 'tsne1', 'z': 'tsne2'}
Plot3D_WithLabels(samples_TSNE_3D, labels_TSNE_3D, title, axes_TSNE_3D)

### 3.3.5 DBScan - PCA 2D

In [None]:
# k-distance plot over the 2D PCA projection; n_samples is the k passed to
# the helper.
n_samples = 20
PlotDistancesToKnearestNeighbor(df_samples_PCA_2D, n_samples)

In [None]:
# Run DBSCAN on the 2D PCA projection once per candidate eps, keeping every
# result keyed by eps.
min_samples = 20
epsis = [
    0.025, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.091, 0.092,
    0.093, 0.097, 0.1, 0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17,
]
aux_labels = {}
for epsilon in epsis:
    aux_labels[epsilon] = ApplyDBScanToData(
        df_samples_PCA_2D, epsilon, min_samples=min_samples
    )

In [None]:
# Pick one eps from the sweep and plot the 2D PCA projection with its labels.
epsilon = 0.13
labels_PCA_2D = aux_labels[epsilon]
# DBSCAN has no K; report eps and MinPts instead of the stale K-means
# n_clusters that the title used to show.
title = f'DBScan(eps={epsilon}, MinPts={min_samples}) over 2D PCA transformed data'
Plot2D_WithLabels(df_samples_PCA_2D, labels_PCA_2D, title, axes_PCA_2D)

### 3.3.6 DBScan - PCA 3D

In [None]:
# k-distance plot over the 3D PCA projection; n_samples is the k passed to
# the helper and is reused as min_samples by the sweep that follows.
n_samples = 20
PlotDistancesToKnearestNeighbor(df_samples_PCA_3D, n_samples)

In [None]:
# Run DBSCAN on the 3D PCA projection once per candidate eps, keeping every
# result keyed by eps. min_samples follows the k of the k-distance plot.
min_samples = n_samples
data_epsis = []
epsis = [0.07, 0.09, 0.0945, 0.1, 0.15, 0.16, 0.17, 0.2]

aux_labels = {}
for epsilon in epsis:
    aux_labels[epsilon] = ApplyDBScanToData(
        df_samples_PCA_3D, epsilon, min_samples=min_samples
    )

In [None]:
# Pick one eps from the sweep and plot the 3D PCA projection with its labels.
epsilon = 0.2
labels_PCA_3D = aux_labels[epsilon]
# The labels come from DBSCAN, so the title reports the DBSCAN parameters
# (it used to claim K-means with a stale n_clusters value).
title = f'DBScan(eps={epsilon}, MinPts={min_samples}) over 3D PCA transformed data'
Plot3D_WithLabels(df_samples_PCA_3D, labels_PCA_3D, title, axes_PCA_3D)

### 3.3.7 K-Means t-SNE 3D

#### 3.3.7.1 Búsqueda N° óptimo Clusters

In [None]:
# Fit a fresh 3D t-SNE embedding of the MARIA embeddings.
tsne3D = TSNE(n_components=3)
df_3D = tsne3D.fit_transform(embeddings_maria)
df_samples_tsne_3D = pd.DataFrame(data=df_3D, columns=["tsne0", "tsne1", "tsne2"])

# Candidate numbers of clusters and the silhouette score obtained for each.
clusters = [5, 6, 7, 8, 9, 10, 15]
silhouette_avg = []

for k in clusters:
    # Fit K-means with k clusters and get one label per sample.
    kmeans = KMeans(n_clusters=k)
    label = kmeans.fit_predict(df_samples_tsne_3D)
    u_labels = np.unique(label)

    # Quick 2D look at the clustering: only components 0 and 1 are drawn.
    for cluster_id in u_labels:
        mask = label == cluster_id
        plt.scatter(df_3D[mask, 0], df_3D[mask, 1], label=cluster_id)
    plt.legend()
    plt.show()

    # Mean silhouette over all samples for this k.
    score = silhouette_score(df_samples_tsne_3D, kmeans.labels_)
    silhouette_avg.append(score)

#### 3.3.7.2 Modelación N° clusters - Score Silhouette

In [None]:
# Silhouette score as a function of K, followed by the best K and its score.
plt.plot(clusters, silhouette_avg)
plt.title("Silhouette analysis For Optimal k")
plt.xlabel("Values of K")
plt.ylabel("Silhouette score")
plt.show()

best_idx = pd.Series(silhouette_avg).idxmax()
print("N° Clusters con mejor Score: ", clusters[best_idx], "\nScore: ", max(silhouette_avg))

#### 3.3.7.3 Modelado K-Means con N° Óptimo Clusters 3D

In [None]:
# Refit K-means with the K that achieved the highest silhouette score and
# plot the 3D t-SNE samples with the resulting labels and centroids.
n_clusters = clusters[pd.Series(silhouette_avg).idxmax()]
title = f'K-means(K={n_clusters}) over 3D t-SNE transformed data'
axes_tsne_3D = dict(x='tsne0', y='tsne1', z='tsne2')

kmeans = KMeans(n_clusters=n_clusters)
label = kmeans.fit_predict(df_samples_tsne_3D)

plot = Plot3D_WithLabels(
    df_samples_tsne_3D,
    kmeans.labels_,
    title,
    axes_tsne_3D,
    centroids_3D=kmeans.cluster_centers_,
)

### 3.3.8 K-Means t-SNE 2D

#### 3.3.8.1 Búsqueda N° óptimo Clusters

In [None]:
# Fit a fresh 2D t-SNE embedding of the MARIA embeddings.
tsne2D = TSNE(n_components=2)
df_2D = tsne2D.fit_transform(embeddings_maria)
# Build the DataFrame from the embedding computed just above. It used to be
# built from samples_TSNE_2D (an earlier, different t-SNE run), so K-means
# was fitted on one embedding while the scatter below drew the coordinates
# of another one.
df_samples_TSNE_2D = pd.DataFrame(data=df_2D, columns=["tsne0", "tsne1"])

# Candidate numbers of clusters and the silhouette score obtained for each.
clusters = [5, 6, 7, 8, 9, 10, 15]
silhouette_avg = []

for k in clusters:
    # Fit K-means with k clusters and get one label per sample.
    kmeans = KMeans(n_clusters=k)
    label = kmeans.fit_predict(df_samples_TSNE_2D)
    u_labels = np.unique(label)

    # One scatter series per cluster (distinct loop variable: the inner loop
    # used to reuse the outer loop's name).
    for cluster_id in u_labels:
        mask = label == cluster_id
        plt.scatter(df_2D[mask, 0], df_2D[mask, 1], label=cluster_id)
    plt.legend()
    plt.show()

    # Mean silhouette over all samples for this k.
    score = silhouette_score(df_samples_TSNE_2D, label)
    silhouette_avg.append(score)

#### 3.3.8.2 Modelación N° clusters - Score Silhouette

In [None]:
# Silhouette score as a function of K, followed by the best K and its score.
plt.plot(clusters, silhouette_avg)
plt.title("Silhouette analysis For Optimal k")
plt.xlabel("Values of K")
plt.ylabel("Silhouette score")
plt.show()

best_idx = pd.Series(silhouette_avg).idxmax()
print("N° Clusters con mejor Score: ", clusters[best_idx], "\nScore: ", max(silhouette_avg))

#### 3.3.8.3 Modelado K-Means con N° Óptimo Clusters 2D

In [None]:
# Refit K-means with the K that achieved the highest silhouette score and
# plot the 2D t-SNE samples with the resulting labels and centroids.
n_clusters = clusters[pd.Series(silhouette_avg).idxmax()]
title = f'K-means(K={n_clusters}) over 2D tSNE transformed data'
# NOTE: despite its name, axes_PCA holds the t-SNE column names here.
axes_PCA = dict(x='tsne0', y='tsne1')

kmeans = KMeans(n_clusters=n_clusters)
label = kmeans.fit_predict(df_samples_TSNE_2D)

plot = Plot2D_WithLabels(
    df_samples_TSNE_2D,
    kmeans.labels_,
    title,
    axes_PCA,
    centroids_2D=kmeans.cluster_centers_,
)