# Librerias Utilizadas

In [1]:
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.patches as mpatches
import statistics
import emoji

from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer 
from nltk.tokenize import word_tokenize
from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.mixture import GaussianMixture
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from nltk.corpus import stopwords
from gensim.models import Word2Vec
from string import punctuation
from nltk.tokenize import sent_tokenize, word_tokenize
from emot.emo_unicode import UNICODE_EMOJI
from emot.emo_unicode import EMOTICONS_EMO

from collections import Counter
from itertools import groupby
from mpl_toolkits.mplot3d import Axes3D
from scipy import stats

# Funciones Utilizadas

In [2]:
def stopwords(text):
    """Return ``text`` without the tokens that appear in STOPWORDS.

    ``text`` is converted with ``str`` and split on whitespace; the tokens
    that are kept are joined back with single spaces.
    """
    kept_tokens = []
    for token in str(text).split():
        if token in STOPWORDS:
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)

def Plot2D (samples_2D, title, axes = {'x': 'x', 'y': 'y'}):

  """
  Scatter-plot a set of 2-D points with seaborn.

  samples_2D: data for a two-column DataFrame, e.g. an embedding array of
        shape (num_samples, 2)

  title (string): title used for the figure

  axes (dict): column names / axis labels stored under the keys 'x' and 'y'
  """

  x_name, y_name = axes['x'], axes['y']
  frame = pd.DataFrame(data=samples_2D, columns=[x_name, y_name])

  # Global seaborn settings, applied in the same order as before.
  # NOTE(review): each sns.set call may re-apply defaults for the options it
  # is not given, so font_scale=3 might not survive the second call - confirm.
  sns.set(font_scale=3)
  sns.set(rc={'figure.figsize': (10, 10)})

  relplot_options = dict(height=10, legend="full", palette="bright")
  sns.relplot(data=frame, x=x_name, y=y_name, **relplot_options)

  set_size_letters(title, x_name, y_name, active_legend=False)
  plt.axis('equal')

def Plot2D_WithLabels (samples_2D, labels, title, axes = {'x': 'x', 'y': 'y'}, palette="bright", centroids_2D=None):

  """
  Scatter-plot 2-D points coloured by label, optionally with centroids.

  samples_2D: data for a two-column DataFrame, e.g. an embedding array of
        shape (num_samples, 2)

  labels: one label per sample; must provide ``tolist()``

  title (string): title used for the figure

  axes (dict): column names / axis labels stored under the keys 'x' and 'y'

  palette: palette forwarded to the seaborn plotting calls

  centroids_2D: optional centroid coordinates; when given, centroid i is
        drawn with label i and annotated through plot_2D_centroid_labels
  """

  x_name, y_name = axes['x'], axes['y']
  has_centroids = centroids_2D is not None

  # Samples plus their label column
  points = pd.DataFrame(data=samples_2D, columns=[x_name, y_name])
  points['label'] = labels.tolist()

  if has_centroids:
    # Centroids are labelled 0 .. len(centroids_2D) - 1
    centers = pd.DataFrame(data=centroids_2D, columns=[x_name, y_name])
    centers['label'] = np.arange(len(centroids_2D), dtype=int).tolist()

  sns.set(font_scale=3)
  sns.set(rc={'figure.figsize': (10, 10)})
  sns.relplot(data=points, x=x_name, y=y_name,
              hue="label", height=10, legend="full", palette=palette)

  if has_centroids:
    # Centroid markers first, then their text annotations on the current axes
    sns.scatterplot(data=centers, x=x_name, y=y_name,
                    hue="label", legend=False, palette=palette, s=100)
    plot_2D_centroid_labels(centers, plt.gca())

  set_size_letters(title, x_name, y_name, active_legend=False)
  plt.axis('equal')

def plot_2D_centroid_labels(df_centroids, ax):
    """Write a 'C<label>' annotation next to every centroid on ``ax``.

    df_centroids: DataFrame whose first two columns hold the x / y position
        and whose third column holds the centroid label.
    ax: object exposing a matplotlib-style ``text`` method.
    """
    for _, row in df_centroids.iterrows():
        # Explicit positional access: integer keys on a label-indexed Series
        # are deprecated in pandas.
        x_pos, y_pos, label = row.iloc[0], row.iloc[1], row.iloc[2]
        ax.text(x_pos + .5, y_pos, 'C' + str(int(label)), fontsize=20, color='black', weight='semibold')
def set_size_letters(title, x_name, y_name, title_size = 20, x_size = 18, y_size = 18, active_legend = True, legend_size = 14):

  """
  Set the title and the axis labels, with their font sizes, through pyplot.

  Parameters:
    title (string): title of the plot

    x_name (string): label of the x axis

    y_name (string): label of the y axis

    title_size, x_size, y_size, legend_size: font sizes of the title, the
        x label, the y label and the legend

    active_legend (bool): whether plt.legend() is called. Defaults to True

  """

  plt.title(title, fontsize=title_size)
  plt.xlabel(x_name, fontsize=x_size)
  plt.ylabel(y_name, fontsize=y_size)
  # Plain truthiness test instead of comparing against True
  if active_legend:
    plt.legend(fontsize=legend_size)

def Plot3D_WithLabels (samples_3D, labels, title, axes = {'x': 'x', 'y': 'y', 'z': 'z'}, centroids_3D=None):

  """
  Show an interactive plotly 3-D scatter of the samples coloured by label.

  samples_3D: data for a three-column DataFrame, e.g. an embedding array of
        shape (num_samples, 3)

  labels: one label per sample; must provide ``tolist()``

  title (string): title used for the figure

  axes (dict): column names stored under the keys 'x', 'y' and 'z'

  centroids_3D: optional centroid coordinates; when given, centroid i is
        added with label i, the text 'Ci' and a larger marker size
  """

  # First we create the dataframe
  df_samples_3D_labeled = pd.DataFrame(data=samples_3D, columns=[axes['x'], axes['y'], axes['z']])

  # Then we add the labels column
  df_samples_3D_labeled['label'] = labels.tolist()
  df_samples_3D_labeled['label'] = df_samples_3D_labeled["label"].astype(str)

  if centroids_3D is not None:
    # Same applies for centroids when these are provided as an argument
    labels_centroid_aux = np.arange(len(centroids_3D), dtype=int)
    df_centroids_3D_labeled = pd.DataFrame(data=centroids_3D, columns=[axes['x'], axes['y'], axes['z']])
    df_centroids_3D_labeled['label'] = labels_centroid_aux.tolist()

    # We create an additional column with the dot size used for each type of sample
    size_no_centroid = np.ones(len(samples_3D)) * 10 # for regular samples
    size_centroid = np.ones(len(centroids_3D)) * 50 # for centroids
    size_col = np.append(size_no_centroid, size_centroid) # new col to be added to the dataframe

    # We also create another additional column with the labels for each type of sample
    no_es_centroide_aux = [' '] * len(samples_3D) # empty label for regular samples
    es_centroide_aux = []
    for i in range(len(centroids_3D)):
      es_centroide_aux.append('C%d' % i) # Ci label for centroid i
    centroid_col = no_es_centroide_aux + es_centroide_aux # new col to be added to the dataframe

    # Next we concatenate both dataframes: first, regular samples, then, centroids
    # (size_col and centroid_col above were built in this same order)
    df_samples_and_centroids = pd.concat([df_samples_3D_labeled, df_centroids_3D_labeled], ignore_index=True)


    # We add the new column with the labels distinguishing regular samples from centroids
    df_samples_and_centroids['centroid'] = centroid_col

    # New column is re-casted as a string column
    df_samples_and_centroids['centroid'] = df_samples_and_centroids['centroid'].astype(str)

    # We add the new column with the corresponding size for both regular samples and centroids
    df_samples_and_centroids['size'] = size_col

    # We ensure that the 'label' column is numeric since we will sort the dataframe upon this one
    df_samples_and_centroids['label'] = pd.to_numeric(df_samples_and_centroids['label'])

    # We finally sort the dataframe by the 'label' column in ascending order
    df_samples_and_centroids_sorted = df_samples_and_centroids.sort_values(by=['label'], ascending=True)

    # And plot both the samples and their corresponding centroids
    fig = px.scatter_3d(df_samples_and_centroids_sorted, x=axes['x'], y=axes['y'], z=axes['z'], text='centroid', size='size', color='label')
  else:
    # We ensure that the 'label' column is numeric since we will sort the dataframe upon this one
    df_samples_3D_labeled['label'] = pd.to_numeric(df_samples_3D_labeled['label'])

    # We finally sort the dataframe by the 'label' column in ascending order
    df_samples_and_centroids_sorted = df_samples_3D_labeled.sort_values(by=['label'], ascending=True)

    # Without centroids every point gets the same marker size (an array of ones)
    fig = px.scatter_3d(df_samples_and_centroids_sorted, x=axes['x'], y=axes['y'], z=axes['z'], color='label', size=np.ones(len(samples_3D))) #, color_continuous_scale='delta')

  # Text placement, aspect mode and text size settings shared by both branches
  fig.update_traces(textposition='top center')
  fig.update_layout(scene_aspectmode='data')
  fig.update_layout(uniformtext_minsize=60)

  # Figure title: text, font size and position
  fig.update_layout(title_font_size=20,
                    title={
                    'text': title,
                    'y': 0.9,
                    'x': 0.05,
                    'xanchor': 'left',
                    'yanchor': 'top'})
  # tight layout
  fig.update_layout(autosize = True, margin = dict(l=50, r=0, b=10, t=30))
  fig.show()

def ApplyDBScanToData (samples, epsilon, min_samples = 30):

  """
  Fit DBSCAN on ``samples``, print a short report and return the labels.

  Parameters:

    samples: data accepted by ``DBSCAN.fit``, e.g. an embedding array with
        one row per example

    epsilon: value passed to DBSCAN as ``eps``

    min_samples (int): value passed to DBSCAN as ``min_samples``

  Return:

    labels: cluster label of every example. Label -1 means noise.

  The silhouette coefficient is computed over all labels, so noise points
  (label -1) count as one more group. silhouette_score needs between 2 and
  n_samples - 1 distinct labels; outside that range a notice is printed
  instead of letting the call raise.
  """

  labels = DBSCAN(eps=epsilon, min_samples=min_samples).fit(samples).labels_

  # Number of clusters in labels, ignoring noise if present.
  distinct_labels = set(labels)
  n_clusters_ = len(distinct_labels) - (1 if -1 in distinct_labels else 0)
  n_noise_ = list(labels).count(-1)

  print(f"\nTest for epsilon = {epsilon}")
  print('Estimated number of clusters: %d' % n_clusters_)
  print('Estimated number of noise points: %d' % n_noise_)
  if 2 <= len(distinct_labels) <= len(labels) - 1:
    print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(samples, labels))
  else:
    print("Silhouette Coefficient: undefined (needs between 2 and n_samples - 1 distinct labels)")

  return labels

def CreateFeatureVectors(data, num_nodes=63):
  """
  Concatenate every ``num_nodes`` consecutive rows of ``data`` into one row.

  Parameters:
    data: 2-D array-like of shape (num_rows, num_measures_per_node)

    num_nodes (int): number of consecutive rows merged into one feature
        vector. Defaults to 63, the value that used to be hard-coded.

  Return:
    A new float array of shape
    (num_rows // num_nodes, num_nodes * num_measures_per_node); row k holds
    rows k*num_nodes .. (k+1)*num_nodes - 1 of ``data`` laid end to end.

  Raises:
    ValueError: if ``data`` is not 2-D or num_rows is not a multiple of
        num_nodes (the element-wise copy used previously ran past the end of
        the output with an IndexError in that case).
  """
  data = np.asarray(data, dtype=float)
  if data.ndim != 2:
    raise ValueError("data must be a 2-D array")

  num_rows, num_measures_per_node = data.shape
  if num_rows % num_nodes != 0:
    raise ValueError(
        f"number of rows ({num_rows}) is not a multiple of num_nodes ({num_nodes})")

  num_samples = num_rows // num_nodes
  # Row-major reshape reproduces the row-by-row, element-by-element fill;
  # copy() so the result never shares memory with the input.
  return data.reshape(num_samples, num_nodes * num_measures_per_node).copy()

def PlotDistancesToKnearestNeighbor(data_vector, K):

  """
  Plot the sorted distances returned for the K-th neighbor of every sample.

  Parameters:
    data_vector: data the NearestNeighbors model is fitted on and queried
        with, e.g. an embedding array with one row per example

    K: number of neighbors requested; column K-1 of the returned distances
        is the one plotted
  """

  neighbor_model = NearestNeighbors(n_neighbors=K)
  neighbor_model.fit(data_vector)
  neighbor_distances, _ = neighbor_model.kneighbors(data_vector)

  # Sort each column independently, then keep the last requested column
  kth_distances = np.sort(neighbor_distances, axis=0)[:, K-1]

  plt.figure(figsize=(10,8))
  plot_title = f"Distancias al K-vecino más cercano (K={K})"
  x_label = f"Points sorted according to distance of the {K}-th nearest neighbor"
  y_label = f"{K}-th nearest neighbor distance"
  set_size_letters(plot_title, x_label, y_label)
  plt.plot(kth_distances)

def preprocess_text(text):
    """Normalize a string for the text pipeline.

    Steps, in order: lowercase, drop http/https URLs, drop standalone digit
    runs (with the whitespace that follows them), delete every character of
    ``string.punctuation`` and collapse all whitespace to single spaces.
    """
    lowered = text.lower()
    without_urls = re.sub(r"https?://\S+", "", lowered)
    without_numbers = re.sub(r"\b[0-9]+\b\s*", "", without_urls)
    # Deleting through a translation table removes the same character set
    without_punctuation = without_numbers.translate(str.maketrans("", "", punctuation))
    return " ".join(without_punctuation.split())

# Texto Tweets

In [None]:
# Read the tweets file: tab-delimited, no header row, UTF-8 encoded
dataset1 = pd.read_csv("APAUtweets.txt", header=None, delimiter = "\t", encoding = 'utf8')
# Column 0 becomes the index and column 1 is renamed to "text"
dataset1.set_index(0, inplace = True)
dataset1.rename(columns={1: "text"}, inplace = True)
dataset1

## Preprocesado

### Cambio a minusculas

In [None]:
# Apply preprocess_text to every tweet. Despite the column name, the result is
# not only lower-cased: preprocess_text also removes URLs, standalone numbers
# and punctuation.
dataset1["text_lower"] = dataset1["text"].map(preprocess_text)
dataset1

### Borramos palabras tipicas

In [5]:
# Build the set of Spanish stop words from the NLTK corpus.
# The corpus reader is imported under an alias so that it does not rebind the
# name `stopwords`, which this notebook also uses for its stop-word removal
# helper function.
from nltk.corpus import stopwords as nltk_stopwords
STOPWORDS = set(nltk_stopwords.words('spanish'))

In [None]:
def stopwords(text):
    """Return ``text`` without the whitespace-separated tokens found in STOPWORDS."""
    kept_tokens = [token for token in str(text).split() if token not in STOPWORDS]
    return " ".join(kept_tokens)

# New column with the stop words removed from the preprocessed text
dataset1["text_stop"] = dataset1["text_lower"].apply(stopwords)
dataset1.head()

### Contamos las más comunes

In [None]:
# Count how often each word occurs across the stop-word-filtered tweets

cnt = Counter()
cnt.update(word for text in dataset1["text_stop"].values for word in text.split())

# Ten most frequent words with their counts
cnt.most_common(10)

### Transformamos los emojis

In [8]:
def convert_emoji_to_text(emoji_text):
    """Return ``emoji_text`` after passing it through ``emoji.demojize``."""
    return emoji.demojize(emoji_text)


# Store the demojized version of 'text_stop' in the new 'text_rare' column
dataset1['text_rare'] = dataset1['text_stop'].apply(convert_emoji_to_text)

In [None]:
# Show the first rows, now including the 'text_rare' column
dataset1.head()

### Embedding con TfidfVectorizer

In [None]:
# Turn the processed tweets into a document-term matrix

lista_descriptiva = dataset1["text_rare"]
# use_idf=False disables the IDF weighting; min_df=.0005 is the minimum
# document-frequency threshold passed to the vectorizer
vec = TfidfVectorizer(use_idf=False, min_df=.0005)
matrix = vec.fit_transform(lista_descriptiva)
# Dense DataFrame with one column per feature name
matrix_text = pd.DataFrame(matrix.toarray(), columns=vec.get_feature_names_out())
samples = matrix_text
samples

### KMeans sin PCA

In [None]:
# K-means on the full feature matrix for k = 2..19, printing the silhouette
# score obtained with each k
for i in range(2,20):
    km = KMeans(n_clusters=i, init='random', max_iter=200, random_state=0).fit(samples)
    sample_km = km.labels_
    score = metrics.silhouette_score(samples, sample_km)
    print("cantidad de cluster:", i)
    print("Silhouette Coefficient: %0.3f" % score)

### Aplicación PCA

#### Creación del modelo

In [12]:
# Fit a 2-component PCA on the feature matrix
pca2D = PCA(2)
pca2D.fit(samples)
# Project the same data onto the two components
sample_PCA_2D = pca2D.transform(samples)

# DataFrame view of the projection with named columns
df_samples_PCA_2D = pd.DataFrame(data=sample_PCA_2D, columns=["pca0", "pca1"])

#### Representación del modelo

In [None]:
# Unlabelled scatter plot of the 2-D PCA projection
title = 'Original data after 2D PCA transform'
axes_PCA_2D = {'x': 'pca0', 'y': 'pca1'}
Plot2D (sample_PCA_2D, title, axes_PCA_2D)

#### Modelación vecinos cercanos

In [None]:
# k-distance plot (K=10) on the 2-D PCA projection
PlotDistancesToKnearestNeighbor(sample_PCA_2D, 10)

#### Aplicamos DBScan a los datos

In [None]:
# Run DBSCAN on the 2-D PCA projection for several epsilon values;
# ApplyDBScanToData prints a report for each run
min_samples = 10
epsilon_values = [0.028, 0.03, 0.032, 0.035 , 0.037, 0.04, 0.044, 0.046, 0.05, 0.06]
aux_labels = {}  # epsilon -> labels returned for that epsilon
for epsilon in epsilon_values:
  aux_labels[epsilon] = ApplyDBScanToData (sample_PCA_2D, epsilon, min_samples=min_samples)

# Keep the labels obtained with the selected epsilon
epsilon = 0.046
labels_PCA_2D = aux_labels[epsilon]

#### Representación del resultado

In [None]:
# The labels plotted here come from DBSCAN on the 2-D PCA projection, so the
# title names PCA
title = f'DBScan(eps={epsilon}, MinPts={min_samples}) over 2D PCA transformed data'
Plot2D_WithLabels (sample_PCA_2D, labels_PCA_2D, title, axes_PCA_2D)

#### Aplicación KMeans

In [None]:
# K-means on the 2-D PCA projection: silhouette score for every candidate k
clusters = list(range(2, 21))
silhouette_avg = []


for i in clusters:
    km = KMeans(n_clusters=i, init='random', max_iter=200, random_state=0).fit(sample_PCA_2D)
    sample_km_pca_2d = km.labels_

    # Computed once and reused for both the stored list and the printout
    score = silhouette_score(sample_PCA_2D, sample_km_pca_2d)
    silhouette_avg.append(score)

    print("cantidad de cluster:", i)
    print("Silhouette Coefficient: %0.3f" % score)

#### Gráfico K-Means 

In [None]:
# Refit K-means with the candidate k that obtained the highest silhouette score
n_clusters = clusters[pd.Series(silhouette_avg).idxmax()]
# NOTE(review): no random_state / init are given here, unlike in the search
# loop, so this refit may differ from the scored run - confirm if intended
kmeans = KMeans(n_clusters=n_clusters)
label = kmeans.fit_predict(sample_PCA_2D)

axes_PCA = {'x': 'pca0', 'y': 'pca1'}
title = f'K-means(K={n_clusters}) over 2D PCA transformed data'
# Plot2D_WithLabels has no return statement, so `plot` ends up as None
plot = Plot2D_WithLabels(sample_PCA_2D, kmeans.labels_, title, axes_PCA, centroids_2D=kmeans.cluster_centers_)

### Aplicación TSNE 2D

#### Creación del modelo

In [19]:
# 2-component t-SNE embedding of the feature matrix
tsne2D = TSNE(n_components=2)
samples_TSNE_2D = tsne2D.fit_transform(samples)
# DataFrame view of the embedding with named columns
df_samples_TSNE_2D = pd.DataFrame(data=samples_TSNE_2D, columns=["tsne0", "tsne1"])

#### Representación Visual

In [None]:
# Unlabelled scatter plot of the 2-D t-SNE embedding
title = 'Original data after 2D t-SNE transform'
axes_TSNE_2D = {'x': 'tsne0', 'y': 'tsne1'}
Plot2D (samples_TSNE_2D, title, axes_TSNE_2D)

#### Modelación KnearestNeighbor TSNE 2D

In [None]:
# k-distance plot (K=15) on the 2-D t-SNE embedding
PlotDistancesToKnearestNeighbor(samples_TSNE_2D, 15)

#### Aplicación KMeans

In [None]:
# K-means on the 2-D t-SNE embedding: silhouette score for every candidate k
clusters = list(range(2, 21))
silhouette_avg = []


for i in clusters:
    km = KMeans(n_clusters=i, init='random', max_iter=200, random_state=0).fit(samples_TSNE_2D)
    sample_km_tsne_2d = km.labels_

    # Computed once and reused for both the stored list and the printout
    score = silhouette_score(samples_TSNE_2D, sample_km_tsne_2d)
    silhouette_avg.append(score)

    print("cantidad de cluster:", i)
    print("Silhouette Coefficient: %0.3f" % score)

#### Gráfico K-Means

In [None]:
# Refit K-means on the t-SNE embedding with the k that obtained the highest
# silhouette score
n_clusters = clusters[pd.Series(silhouette_avg).idxmax()]
kmeans = KMeans(n_clusters=n_clusters)
label = kmeans.fit_predict(samples_TSNE_2D)

# Axis names and title describe the t-SNE embedding that is actually plotted
axes_TSNE_2D = {'x': 'tsne0', 'y': 'tsne1'}
title = f'K-means(K={n_clusters}) over 2D t-SNE transformed data'
plot = Plot2D_WithLabels(samples_TSNE_2D, kmeans.labels_, title, axes_TSNE_2D, centroids_2D=kmeans.cluster_centers_)

#### Aplicación DBScan

In [None]:
# Run DBSCAN on the 2-D t-SNE embedding for several epsilon values;
# ApplyDBScanToData prints a report for each run
min_samples = 20
epsilon_values = [3.2, 3.4, 3.6, 3.8, 3.9, 4, 4.05, 4.055,4.1, 4.3, 4.5, 4.7, 4.9]
aux_labels = {}  # epsilon -> labels returned for that epsilon
for epsilon in epsilon_values:
  aux_labels[epsilon] = ApplyDBScanToData (samples_TSNE_2D, epsilon, min_samples=min_samples)

# Keep the labels obtained with the selected epsilon
epsilon = 3.4
labels_TSNE_2D = aux_labels[epsilon]

#### Representación del resultado

In [None]:
# Scatter plot of the t-SNE embedding coloured by the selected DBSCAN labels
title = f'DBScan(eps={epsilon}, MinPts={min_samples}) over 2D t-SNE transformed data'
Plot2D_WithLabels (samples_TSNE_2D, labels_TSNE_2D, title, axes_TSNE_2D)

# Embedding BETO - MARIA

## Dataset BETO

### Cargado de Datos

In [None]:
# Load the embeddings file with np.loadtxt and display the array's shape
dataset2 = np.loadtxt("EmoEvalEs-embeddings-BETO.txt")
dataset2.shape

### Nearest Neighbor

In [None]:
# k-distance plot (K=10) on the unreduced embeddings
PlotDistancesToKnearestNeighbor(dataset2, 10)

### DBScan sin Reducción de Dimensionalidad

In [None]:
# Run DBSCAN on the unreduced embeddings for several epsilon values;
# ApplyDBScanToData prints a report for each run
min_samples = 10
epsilon_values = [4,4.1,4.2,4.3,4.4,4.5,4.6,4.7,4.8,5,6]
aux_labels = {}  # epsilon -> labels returned for that epsilon
for epsilon in epsilon_values:
  aux_labels[epsilon] = ApplyDBScanToData(dataset2, epsilon, min_samples=min_samples)

# Keep the labels obtained with the selected epsilon
epsilon = 4.3
labels = aux_labels[epsilon]

### K-Means sin PCA

In [None]:
# K-means on the unreduced embeddings for k = 2..19, printing the silhouette
# score obtained with each k
for i in range(2,20):
    km = KMeans(n_clusters=i, init='random', max_iter=200, random_state=0).fit(dataset2)
    sample_km = km.labels_
    score = metrics.silhouette_score(dataset2, sample_km)
    print("cantidad de cluster:", i)
    print("Silhouette Coefficient: %0.3f" % score)

### PCA 2D

#### Búsqueda N° óptimo Clusters

In [None]:
# Project the embeddings onto their first two principal components
pca2D = PCA(2)
df_2D = pca2D.fit_transform(dataset2)

df_samples_PCA_2D = pd.DataFrame(data=df_2D, columns=["pca0", "pca1"])

clusters = [5,6,7,8,10,15]
silhouette_avg = []

for i in clusters:
    # Fit K-means with the current k and get one label per sample
    kmeans = KMeans(n_clusters=i)
    label = kmeans.fit_predict(df_samples_PCA_2D)

    # One scatter series per cluster. The inner loop has its own variable so
    # that the outer `i` (the current k) is not overwritten.
    u_labels = np.unique(label)
    for cluster_id in u_labels:
        plt.scatter(df_2D[label == cluster_id , 0] , df_2D[label == cluster_id , 1] , label = cluster_id)
    plt.title(f"K-means (K={i})")
    plt.legend()
    plt.show()

    score = silhouette_score(df_samples_PCA_2D, kmeans.labels_)
    silhouette_avg.append(score)

#### Modelacion N° clusters - Score Silhouette

In [None]:
# Silhouette score against the number of clusters tried above
plt.plot(clusters,silhouette_avg)
plt.xlabel("Values of K") 
plt.ylabel("Silhouette score") 
plt.title("Silhouette analysis For Optimal k")
plt.show()
# Print the candidate k with the highest silhouette score, and that score
print("N° Clusters con mejor Score: ",clusters[pd.Series(silhouette_avg).idxmax()],"\nScore: ",max(silhouette_avg))

#### Modelado K-Means con N° Óptimo Clusters 2D

In [None]:
# Refit K-means with the candidate k that obtained the highest silhouette score
n_clusters = clusters[pd.Series(silhouette_avg).idxmax()]
kmeans = KMeans(n_clusters=n_clusters)
label = kmeans.fit_predict(df_samples_PCA_2D)

axes_PCA = {'x': 'pca0', 'y': 'pca1'}
title = f'K-means(K={n_clusters}) over 2D PCA transformed data'
# Plot2D_WithLabels has no return statement, so `plot` ends up as None
plot = Plot2D_WithLabels(df_samples_PCA_2D, kmeans.labels_, title, axes_PCA, centroids_2D=kmeans.cluster_centers_)

### PCA 3D

#### Búsqueda N° óptimo Clusters

In [None]:
# Project the embeddings onto their first three principal components
pca3D = PCA(n_components=3)
df_3D = pca3D.fit_transform(dataset2)
df_samples_PCA_3D = pd.DataFrame(data=df_3D, columns=["pca0", "pca1", "pca2"])

clusters = [5,6,7,8,10,15]
silhouette_avg = []

for i in clusters:
    # Fit K-means with the current k and get one label per sample
    kmeans = KMeans(n_clusters=i)
    label = kmeans.fit_predict(df_samples_PCA_3D)

    # One scatter series per cluster, drawn on the first two components only.
    # The inner loop has its own variable so that the outer `i` (the current
    # k) is not overwritten.
    u_labels = np.unique(label)
    for cluster_id in u_labels:
        plt.scatter(df_3D[label == cluster_id , 0] , df_3D[label == cluster_id , 1] , label = cluster_id)
    plt.title(f"K-means (K={i})")
    plt.legend()
    plt.show()

    score = silhouette_score(df_samples_PCA_3D, kmeans.labels_)
    silhouette_avg.append(score)

#### Modelacion N° clusters - Score Silhouette

In [None]:
# Silhouette score against the number of clusters tried above
plt.plot(clusters,silhouette_avg)
plt.xlabel("Values of K") 
plt.ylabel("Silhouette score") 
plt.title("Silhouette analysis For Optimal k")
plt.show()
# Print the candidate k with the highest silhouette score, and that score
print("N° Clusters con mejor Score: ",clusters[pd.Series(silhouette_avg).idxmax()],"\nScore: ",max(silhouette_avg))

#### Modelado K-Means con N° Óptimo Clusters 3D

In [None]:
# Refit K-means with the candidate k that obtained the highest silhouette score
n_clusters = clusters[pd.Series(silhouette_avg).idxmax()]
kmeans = KMeans(n_clusters=n_clusters)
label = kmeans.fit_predict(df_samples_PCA_3D)
axes_PCA_3D = {'x': 'pca0', 'y': 'pca1', 'z': 'pca2'}

title = f'K-means(K={n_clusters}) over 3D PCA transformed data'
# Plot3D_WithLabels shows the figure itself and has no return statement, so
# `plot` ends up as None
plot = Plot3D_WithLabels (df_samples_PCA_3D, kmeans.labels_, title, axes_PCA_3D, centroids_3D=kmeans.cluster_centers_)

### t-SNE 2D

#### Reducción de Dimensión

In [36]:
# 2-component t-SNE embedding of dataset2
tsne2D = TSNE(n_components=2)
samples_TSNE_2D = tsne2D.fit_transform(dataset2)
# DataFrame view of the embedding with named columns
df_samples_TSNE_2D = pd.DataFrame(data=samples_TSNE_2D, columns=["tsne0", "tsne1"])

#### Visualización t-SNE 2D

In [None]:
# Unlabelled scatter plot of the 2-D t-SNE embedding
title = 'Original data after 2D t-SNE transform'
axes_TSNE_2D = {'x': 'tsne0', 'y': 'tsne1'}
Plot2D (samples_TSNE_2D, title, axes_TSNE_2D)

#### DBScan t-SNE 2D

##### Modelación Nearest Neighbor

In [None]:
# k-distance plot (K=50) on the 2-D t-SNE embedding
PlotDistancesToKnearestNeighbor(samples_TSNE_2D, 50) # note: the input is the t-SNE embedding, NOT `samples`

##### Epsilons Score Silhouette 

In [None]:
# Run DBSCAN on the 2-D t-SNE embedding for several epsilon values;
# ApplyDBScanToData prints a report for each run
min_samples = 50
epsilon_values = [3.9, 4, 4.05, 4.055,4.1, 4.3, 4.5, 4.7, 4.9, 5, 5.2, 5.4, 5.6, 5.8]
aux_labels = {}  # epsilon -> labels returned for that epsilon
for epsilon in epsilon_values:
  aux_labels[epsilon] = ApplyDBScanToData (samples_TSNE_2D, epsilon, min_samples=min_samples)

# Keep the labels obtained with the selected epsilon
epsilon = 4.05
labels_TSNE_2D = aux_labels[epsilon]

##### Visualización DBScan

In [None]:
# Scatter plot of the t-SNE embedding coloured by the selected DBSCAN labels
title = f'DBScan(eps={epsilon}, MinPts={min_samples}) over 2D t-SNE transformed data'
Plot2D_WithLabels (samples_TSNE_2D, labels_TSNE_2D, title, axes_TSNE_2D)

### t-SNE 3D

#### Reducción de Dimensión

In [41]:
tsne3D = TSNE(n_components=3)
samples_TSNE_3D = tsne2D.fit_transform(dataset2)
df_samples_TSNE_3D = pd.DataFrame(data=samples_TSNE_3D, columns=["tsne0", "tsne1"])

#### Modelación Nearest Neighbor

In [None]:
PlotDistancesToKnearestNeighbor(samples_TSNE_3D, 100)

### DBScan t-SNE 3D

#### Epsilons Score Silhouette

In [None]:
min_samples = 100
epsilon_values = [4.5, 5, 5.5, 5.7, 5.9, 5.95, 6, 6.1, 6.2, 6.3, 6.5, 6.6,6.7, 6.9, 7, 7.5]
aux_labels = {}
for epsilon in epsilon_values:
  aux_labels[epsilon] = ApplyDBScanToData (samples_TSNE_3D, epsilon, min_samples=min_samples)

epsilon = 6
labels_TSNE_3D = aux_labels[epsilon]

#### Visualización t-SNE 3D

In [None]:
title = f'DBScan(eps={epsilon}, MinPts={min_samples}) over 2D t-SNE transformed data'
title = 'Original data after 3D t-SNE transform'
axes_TSNE_3D = {'x': 'tsne0', 'y': 'tsne1'}
Plot2D_WithLabels (samples_TSNE_3D, labels_TSNE_3D, title, axes_TSNE_3D)

### DBScan

#### DBScan - PCA 2D

In [None]:
# k-distance plot on the 2-D PCA projection; n_samples is the K passed in
n_samples = 40
PlotDistancesToKnearestNeighbor(df_samples_PCA_2D, n_samples)

In [None]:
epsis = [0.2,0.28,0.29,0.3,0.31,0.32,0.33,0.34,0.35,0.37,0.39,0.4]
# Use the K of the k-distance plot above instead of whatever value an earlier
# section left in `min_samples`
min_samples = n_samples
aux_labels = {}  # epsilon -> labels returned for that epsilon
for epsilon in epsis:
  aux_labels[epsilon] = ApplyDBScanToData (df_samples_PCA_2D, epsilon, min_samples=min_samples)

epsilon = 0.29
labels_PCA_2D = aux_labels[epsilon]
# Axis names and title are set here so the plot does not depend on values
# left behind by other sections
axes_PCA_2D = {'x': 'pca0', 'y': 'pca1'}
title = f'DBScan(eps={epsilon}, MinPts={min_samples}) over 2D PCA transformed data'
Plot2D_WithLabels (df_samples_PCA_2D, labels_PCA_2D, title, axes_PCA_2D, palette="Paired")

#### DBScan - PCA 3D

In [None]:
# k-distance plot on the 3-D PCA projection; n_samples is the K passed in
n_samples = 20
PlotDistancesToKnearestNeighbor(df_samples_PCA_3D, n_samples)

In [None]:
epsis = [0.2,0.28,0.29,0.3,0.31,0.32,0.33,0.34,0.35,0.37,0.39,0.4,0.41,0.42,0.43,0.44,0.45,0.46,0.47,0.48]
data_epsis = []
min_samples = n_samples

aux_labels = {}  # epsilon -> labels returned for that epsilon
for epsilon in epsis:
  aux_labels[epsilon] = ApplyDBScanToData (df_samples_PCA_3D, epsilon, min_samples=min_samples)

epsilon = 0.42
labels_PCA_3D = aux_labels[epsilon]
# The title is set here instead of reusing the one left by a previous section.
# Plot2D_WithLabels reads only the 'x' and 'y' entries of axes_PCA_3D, so the
# figure shows pca0 against pca1.
title = f'DBScan(eps={epsilon}, MinPts={min_samples}) over 3D PCA transformed data (pca0 vs pca1)'
Plot2D_WithLabels (df_samples_PCA_3D, labels_PCA_3D, title, axes_PCA_3D, palette="Paired")

## Dataset MARIA

### Cargado de Datos

In [None]:
# Load the embeddings file with np.loadtxt and display the array's shape
dataset3 = np.loadtxt("EmoEvalEs-embeddings-MARIA.txt")
dataset3.shape

### Nearest Neighbor

In [None]:
# k-distance plot (K=10) on the unreduced embeddings
PlotDistancesToKnearestNeighbor(dataset3, 10)

### DBScan sin Reducción de Dimensionalidad

In [None]:
# Run DBSCAN on the unreduced embeddings for several epsilon values;
# ApplyDBScanToData prints a report for each run
min_samples = 7
epsilon_values = [1,1.2,1.3,1.4,1.5,1.6,1.7,1.8]
aux_labels = {}  # epsilon -> labels returned for that epsilon
for epsilon in epsilon_values:
  aux_labels[epsilon] = ApplyDBScanToData(dataset3, epsilon, min_samples=min_samples)

# Keep the labels obtained with the selected epsilon
epsilon = 1.4
labels = aux_labels[epsilon]

### K-Means sin PCA

In [None]:
# K-means on the unreduced embeddings for k = 2..19, printing the silhouette
# score obtained with each k
for i in range(2,20):
    km = KMeans(n_clusters=i, init='random', max_iter=200, random_state=0).fit(dataset3)
    sample_km = km.labels_
    score = metrics.silhouette_score(dataset3, sample_km)
    print("cantidad de cluster:", i)
    print("Silhouette Coefficient: %0.3f" % score)

### PCA 2D

#### Búsqueda N° óptimo Clusters

In [None]:
# Project the embeddings onto their first two principal components
pca2D = PCA(2)
df_2D = pca2D.fit_transform(dataset3)

df_samples_PCA_2D = pd.DataFrame(data=df_2D, columns=["pca0", "pca1"])

clusters = [5,6,7,8,10,15]
silhouette_avg = []

for i in clusters:
    # Fit K-means with the current k and get one label per sample
    kmeans = KMeans(n_clusters=i)
    label = kmeans.fit_predict(df_samples_PCA_2D)

    # One scatter series per cluster. The inner loop has its own variable so
    # that the outer `i` (the current k) is not overwritten.
    u_labels = np.unique(label)
    for cluster_id in u_labels:
        plt.scatter(df_2D[label == cluster_id , 0] , df_2D[label == cluster_id , 1] , label = cluster_id)
    plt.title(f"K-means (K={i})")
    plt.legend()
    plt.show()

    score = silhouette_score(df_samples_PCA_2D, kmeans.labels_)
    silhouette_avg.append(score)

#### Modelacion N° clusters - Score Silhouette

In [None]:
# Silhouette score against the number of clusters tried above
plt.plot(clusters,silhouette_avg)
plt.xlabel("Values of K") 
plt.ylabel("Silhouette score") 
plt.title("Silhouette analysis For Optimal k")
plt.show()
# Print the candidate k with the highest silhouette score, and that score
print("N° Clusters con mejor Score: ",clusters[pd.Series(silhouette_avg).idxmax()],"\nScore: ",max(silhouette_avg))

#### Modelado K-Means con N° Óptimo Clusters 2D

In [None]:
# Refit K-means with the candidate k that obtained the highest silhouette score
n_clusters = clusters[pd.Series(silhouette_avg).idxmax()]
kmeans = KMeans(n_clusters=n_clusters)
label = kmeans.fit_predict(df_samples_PCA_2D)

axes_PCA = {'x': 'pca0', 'y': 'pca1'}
title = f'K-means(K={n_clusters}) over 2D PCA transformed data'
# Plot2D_WithLabels has no return statement, so `plot` ends up as None
plot = Plot2D_WithLabels(df_samples_PCA_2D, kmeans.labels_, title, axes_PCA, centroids_2D=kmeans.cluster_centers_)

### PCA 3D

#### Búsqueda N° óptimo Clusters

In [None]:
# Project the embeddings onto their first three principal components
pca3D = PCA(n_components=3)
df_3D = pca3D.fit_transform(dataset3)
df_samples_PCA_3D = pd.DataFrame(data=df_3D, columns=["pca0", "pca1", "pca2"])

clusters = [5,6,7,8,10,15]
silhouette_avg = []

for i in clusters:
    # Fit K-means with the current k and get one label per sample
    kmeans = KMeans(n_clusters=i)
    label = kmeans.fit_predict(df_samples_PCA_3D)

    # One scatter series per cluster, drawn on the first two components only.
    # The inner loop has its own variable so that the outer `i` (the current
    # k) is not overwritten.
    u_labels = np.unique(label)
    for cluster_id in u_labels:
        plt.scatter(df_3D[label == cluster_id , 0] , df_3D[label == cluster_id , 1] , label = cluster_id)
    plt.title(f"K-means (K={i})")
    plt.legend()
    plt.show()

    score = silhouette_score(df_samples_PCA_3D, kmeans.labels_)
    silhouette_avg.append(score)

#### Modelacion N° clusters - Score Silhouette

In [None]:
# Silhouette score against the number of clusters tried above
plt.plot(clusters,silhouette_avg)
plt.xlabel("Values of K") 
plt.ylabel("Silhouette score") 
plt.title("Silhouette analysis For Optimal k")
plt.show()
# Print the candidate k with the highest silhouette score, and that score
print("N° Clusters con mejor Score: ",clusters[pd.Series(silhouette_avg).idxmax()],"\nScore: ",max(silhouette_avg))

#### Modelado K-Means con N° Óptimo Clusters 3D

In [None]:
# Refit K-means with the candidate k that obtained the highest silhouette score
n_clusters = clusters[pd.Series(silhouette_avg).idxmax()]
kmeans = KMeans(n_clusters=n_clusters)
label = kmeans.fit_predict(df_samples_PCA_3D)
axes_PCA_3D = {'x': 'pca0', 'y': 'pca1', 'z': 'pca2'}

title = f'K-means(K={n_clusters}) over 3D PCA transformed data'
# Plot3D_WithLabels shows the figure itself and has no return statement, so
# `plot` ends up as None
plot = Plot3D_WithLabels (df_samples_PCA_3D, kmeans.labels_, title, axes_PCA_3D, centroids_3D=kmeans.cluster_centers_)

### t-SNE 2D

#### Reducción de Dimensión

In [62]:
# 2-component t-SNE embedding of dataset3
tsne2D = TSNE(n_components=2)
samples_TSNE_2D = tsne2D.fit_transform(dataset3)
# DataFrame view of the embedding with named columns
df_samples_TSNE_2D = pd.DataFrame(data=samples_TSNE_2D, columns=["tsne0", "tsne1"])

#### Visualización t-SNE 2D

In [None]:
# Unlabelled scatter plot of the 2-D t-SNE embedding
title = 'Original data after 2D t-SNE transform'
axes_TSNE_2D = {'x': 'tsne0', 'y': 'tsne1'}
Plot2D (samples_TSNE_2D, title, axes_TSNE_2D)

#### DBScan t-SNE 2D

##### Modelación Nearest Neighbor

In [None]:
# k-distance plot (K=50) on the 2-D t-SNE embedding
PlotDistancesToKnearestNeighbor(samples_TSNE_2D, 50)

##### Epsilons Score Silhouette 

In [None]:
# Run DBSCAN on the 2-D t-SNE embedding for several epsilon values;
# ApplyDBScanToData prints a report for each run
min_samples = 50
epsilon_values = [3.9, 4, 4.05, 4.055,4.1, 4.3, 4.5, 4.7, 4.9, 5, 5.2, 5.4, 5.6, 5.8]
aux_labels = {}  # epsilon -> labels returned for that epsilon
for epsilon in epsilon_values:
  aux_labels[epsilon] = ApplyDBScanToData (samples_TSNE_2D, epsilon, min_samples=min_samples)

# Keep the labels obtained with the selected epsilon
epsilon = 4.05
labels_TSNE_2D = aux_labels[epsilon]

##### Visualización DBScan

In [None]:
# Scatter plot of the t-SNE embedding coloured by the selected DBSCAN labels
title = f'DBScan(eps={epsilon}, MinPts={min_samples}) over 2D t-SNE transformed data'
Plot2D_WithLabels (samples_TSNE_2D, labels_TSNE_2D, title, axes_TSNE_2D)

### t-SNE 3D

#### Reducción de Dimensión

In [67]:
tsne3D = TSNE(n_components=3)
samples_TSNE_3D = tsne2D.fit_transform(dataset3)
df_samples_TSNE_3D = pd.DataFrame(data=samples_TSNE_3D, columns=["tsne0", "tsne1"])

#### Modelación Nearest Neighbor

In [None]:
PlotDistancesToKnearestNeighbor(samples_TSNE_3D, 100)

### DBScan t-SNE 3D

#### Epsilons Score Silhouette

In [None]:
min_samples = 100
epsilon_values = [4.5, 5, 5.5, 5.7, 5.9, 5.95, 6, 6.1, 6.2, 6.3, 6.5, 6.6,6.7, 6.9, 7, 7.5]
aux_labels = {}
for epsilon in epsilon_values:
  aux_labels[epsilon] = ApplyDBScanToData (samples_TSNE_3D, epsilon, min_samples=min_samples)

epsilon = 6
labels_TSNE_3D = aux_labels[epsilon]

#### Visualización t-SNE 3D

In [None]:
title = f'DBScan(eps={epsilon}, MinPts={min_samples}) over 2D t-SNE transformed data'
title = 'Original data after 3D t-SNE transform'
axes_TSNE_3D = {'x': 'tsne0', 'y': 'tsne1'}
Plot2D_WithLabels (samples_TSNE_3D, labels_TSNE_3D, title, axes_TSNE_3D)

### DBScan

#### DBScan - PCA 2D

In [None]:
# k-distance plot on the 2-D PCA projection; n_samples is the K passed in
n_samples = 20
PlotDistancesToKnearestNeighbor(df_samples_PCA_2D, n_samples)

In [None]:
epsis = [0.01,0.09]
data_epsis = []
min_samples = 7

for epsilon in epsis:
    db = DBSCAN(eps=epsilon, min_samples=min_samples, algorithm="ball_tree", metric='minkowski', leaf_size=90, p=2).fit(df_samples_PCA_2D)
    # Report on the labels of THIS fit (the assignment used to be commented
    # out, so labels left over from an earlier cell were being scored)
    labels = db.labels_
    distinct_labels = set(labels)
    # Only the noise label (-1), when present, is not a cluster
    n_clusters_ = len(distinct_labels) - (1 if -1 in distinct_labels else 0)
    n_noise_ = list(labels).count(-1)
    print(f"\nTest for epsilon = {epsilon}")
    print('Estimated number of clusters: %d' % n_clusters_)
    print('Estimated number of noise points: %d' % n_noise_)
    # silhouette_score needs between 2 and n_samples - 1 distinct labels
    if 2 <= len(distinct_labels) <= len(labels) - 1:
        print("Silhouette Coefficient: %0.3f" % silhouette_score(df_samples_PCA_2D, labels))
    else:
        print("Silhouette Coefficient: undefined (needs between 2 and n_samples - 1 distinct labels)")

#### DBScan - PCA 3D

In [None]:
# k-distance plot on the 3-D PCA projection; n_samples is the K passed in
n_samples = 7
PlotDistancesToKnearestNeighbor(df_samples_PCA_3D, n_samples)

In [None]:
epsis = [0.09,0.0945,0.1,0.15,0.16,0.17,0.2]
data_epsis = []
min_samples = n_samples

for epsilon in epsis:
    db = DBSCAN(eps=epsilon, min_samples=min_samples, algorithm="ball_tree", leaf_size=90, p=2).fit(df_samples_PCA_3D)
    labels = db.labels_
    distinct_labels = set(labels)
    # Only the noise label (-1), when present, is not a cluster; nothing is
    # subtracted when there is no noise
    n_clusters_ = len(distinct_labels) - (1 if -1 in distinct_labels else 0)
    n_noise_ = list(labels).count(-1)
    print(f"\nTest for epsilon = {epsilon}")
    print('Estimated number of clusters: %d' % n_clusters_)
    print('Estimated number of noise points: %d' % n_noise_)
    # silhouette_score needs between 2 and n_samples - 1 distinct labels
    if 2 <= len(distinct_labels) <= len(labels) - 1:
        print("Silhouette Coefficient: %0.3f" % silhouette_score(df_samples_PCA_3D, labels))
    else:
        print("Silhouette Coefficient: undefined (needs between 2 and n_samples - 1 distinct labels)")