In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.manifold import TSNE
from sklearn.decomposition import TruncatedSVD
import plotly.offline
import plotly.graph_objs as go

In [None]:
# Load the question logs; later cells read the "Pregunta" column.
LOGS_PATH = "logs_limpios.csv"
df = pd.read_csv(LOGS_PATH)

---
> ### 0: ***Preparación de la data para el ploteo***
---

---
> #### 0.1: ***Limpieza de los ejemplos***
---

In [None]:
# Drop every digit character from each question, leaving all other characters untouched.
df["Pregunta"] = df["Pregunta"].map(
    lambda pregunta: ''.join(ch for ch in pregunta if not ch.isdigit())
)

In [None]:
def data_plot_image(data_plot, model_cluster, text):
    """Build one Plotly scatter trace per cluster label.

    Args:
        data_plot: Indexable collection of points; element ``[0]`` of each
            point is used as the x coordinate and ``[1]`` as the y coordinate.
        model_cluster: Sequence of cluster labels, one per point. Labels are
            used as dictionary keys, so they must be hashable.
        text: Indexable collection of hover texts, parallel to
            ``model_cluster``.

    Returns:
        A list of ``go.Scatter`` traces, one per distinct label, in order of
        first appearance. Each trace is named ``str(label)``.

    Side effects:
        Prints the number of distinct labels found.
    """
    # label -> (xs, ys, hover texts). Each point is added exactly once; the
    # previous version seeded the lists with the first point of a cluster and
    # then appended that same point again, so it was plotted twice.
    grouped = {}
    for i, label in enumerate(model_cluster):
        xs, ys, hover_texts = grouped.setdefault(label, ([], [], []))
        xs.append(data_plot[i][0])
        ys.append(data_plot[i][1])
        hover_texts.append(text[i])

    # Number of distinct labels detected
    amount_clusters = len(grouped)

    plot = [
        go.Scatter(
            x = xs,
            y = ys,
            text = hover_texts,
            name = str(label),
            mode = 'markers',
            marker = dict(
                size = 7
            )
        )
        for label, (xs, ys, hover_texts) in grouped.items()
    ]

    print(f"---> La cantidad de clusters detectados fueron de {amount_clusters}")
    return plot

---
> ### 1: ***Sacar las stopwords del texto (el stemming está desactivado)***
---

In [None]:
from nltk import word_tokenize
from nltk.stem import SnowballStemmer
import nltk
from nltk.corpus import stopwords

# Spanish Snowball stemmer (clean_text does not currently call it).
stemmer = SnowballStemmer('spanish')

# Greetings and courtesy words to drop in addition to NLTK's Spanish list.
stopwords_add = ['buenos','dias', 'hola', 'gracias', 'muchas', 'ok']

# NLTK's Spanish stop words followed by the extra words above.
cachedStopWords = stopwords.words('spanish') + stopwords_add


def clean_text(text_list, stop_words=None):
    """Return a copy of ``text_list`` with stop words removed from each text.

    Each text is split on whitespace; a word is dropped when its lower-cased
    form is a stop word. Surviving words keep their original casing and are
    re-joined with single spaces. Stemming is not applied.

    The input is left untouched: the previous version overwrote the caller's
    list in place, so the "complete" texts reused later as hover labels were
    silently replaced by their cleaned versions.

    Args:
        text_list: Iterable of strings to clean.
        stop_words: Optional iterable of lower-case stop words. Defaults to
            the module-level ``cachedStopWords``.

    Returns:
        A new list of cleaned strings, in the same order as the input.
    """
    if stop_words is None:
        stop_words = cachedStopWords
    # Build the set once so each word lookup is O(1) instead of a list scan.
    stop_set = set(stop_words)
    return [
        ' '.join(word for word in text.split() if word.lower() not in stop_set)
        for text in text_list
    ]

In [None]:
# `text_list_complete` was used here without being defined in any visible
# cell, which raises NameError on a fresh kernel.
# NOTE(review): assumes the texts to cluster are the "Pregunta" column of
# `df` -- confirm against the notebook this was derived from.
text_list_complete = df["Pregunta"].to_list()

# Stop-word-free version of the questions, used as input to the vectorizer.
text_list = clean_text(text_list_complete)

---
> ### 2: ***Vectorizar el texto utilizando Tf-idf (TfidfVectorizer) o BOW (CountVectorizer)***

### Diferencias entre las dos
---

In [None]:
# TF-IDF weighting over unigrams and bigrams.
vectorizer_tfidf = TfidfVectorizer(ngram_range=(1,2))
# Raw term counts (bag of words) over unigrams only; not used by the cells visible here.
vectorizer_bow = CountVectorizer(ngram_range=(1,1))

In [None]:
# Fit the TF-IDF vocabulary on the cleaned questions and build the document-term matrix.
data = vectorizer_tfidf.fit_transform(text_list)

---
> ### ***Gráfico de energía***
---

In [None]:
from scipy.sparse.linalg import eigs, eigsh

# Document-by-document Gram matrix (data · dataᵀ). The product is taken on the
# sparse matrix and densified once, instead of calling .toarray() twice.
matrix = (data @ data.T).toarray()

# The Gram matrix is symmetric, so use the symmetric solver: it returns real
# eigenvalues, whereas the general `eigs` returns a complex array.
vals, vecs = eigsh(matrix, k=20)

# Order from largest to smallest eigenvalue so the energy plot decays left to right.
order = np.argsort(vals)[::-1]
vals, vecs = vals[order], vecs[:, order]

In [None]:
import matplotlib.pyplot as plt

# Energy plot: the 20 eigenvalues computed in the previous cell.
# np.real keeps the plot warning-free if `vals` is a complex array (as the
# general `eigs` solver returns); the matrix is symmetric, so the imaginary
# parts carry no information.
fig, ax = plt.subplots(figsize=(20, 10))
ax.plot(np.real(vals))
ax.set(title="Gráfico de energía", xlabel="Componente", ylabel="Autovalor");

---
> ### 3: ***Aplicar SVD/PCA y reducir las dimensiones dependiendo del gráfico de energía***
---

In [None]:
# Project the TF-IDF matrix onto 3 components.
# NOTE(review): 3 is presumably read off the energy plot above -- confirm.
# random_state is pinned (as in the clustering cells below) because the
# default solver is randomized and would otherwise change between runs.
svd = TruncatedSVD(n_components=3, random_state=0)
svd_truncated = svd.fit_transform(data)

---
> ### 3.1: ***Aplicar T-SNE para reducir todo a 2 dimensiones***
---

In [None]:
# method='exact' runs a more exact algorithm, but with O(N^2) complexity.
#
# - random_state is pinned so the embedding (and every clustering built on
#   it) is reproducible across runs.
# - The iteration count is left at its default of 1000, which is what the
#   original requested; the `n_iter` keyword was renamed to `max_iter` in
#   recent scikit-learn releases, so omitting it works on old and new versions.
tsne = TSNE(n_components=2, metric='cosine', random_state=0)
data_2d = tsne.fit_transform(svd_truncated)
data_plot = data_2d

---
> ### 4: ***Ploteo la data para ver qué algoritmo de clustering utilizar***
---

In [None]:
# Unclustered view of the 2-D embedding, used to judge which clustering
# algorithm fits the shape of the data.
pos_x = [point[0] for point in data_plot]
pos_y = [point[1] for point in data_plot]

# NOTE(review): the hover text was read from `df_caja`, which no visible cell
# defines (NameError on a fresh kernel). Assuming it is the same frame as
# `df` -- confirm.
trace = go.Scatter(
    x = pos_x,
    y = pos_y,
    text = df["Pregunta"].to_list(),
    mode = 'markers',
    marker = dict(
        size = 4
    )
)
plot = [trace]
plotly.offline.plot(plot)

---
> ### Tipo de clustering: ***K-MEANS++***
---

In [None]:
from sklearn.cluster import KMeans
# Fit K-Means with 15 clusters on the 2-D points; `kmeans` holds the
# per-point cluster labels returned by fit_predict, not the estimator.
kmeans_model = KMeans(n_clusters=15, random_state=0)
kmeans = kmeans_model.fit_predict(data_plot)

In [None]:
# One trace per K-Means cluster, rendered with Plotly's offline plotter.
kmeans_traces = data_plot_image(data_plot, kmeans, text_list_complete)
plotly.offline.plot(kmeans_traces);

---
> ### Tipo de clustering: ***Spectral Clustering***
---

In [None]:
%%time
from sklearn.cluster import SpectralClustering
# Spectral clustering with 15 clusters on the 2-D points; fit() returns the
# fitted estimator, whose labels_ are plotted in the next cell.
spectral_model = SpectralClustering(
    n_clusters=15,
    assign_labels="discretize",
    random_state=0,
)
spectral_clustering = spectral_model.fit(data_plot)

In [None]:
# One trace per spectral-clustering label, rendered with Plotly's offline plotter.
spectral_traces = data_plot_image(data_plot, spectral_clustering.labels_, text_list_complete)
plotly.offline.plot(spectral_traces);

---
> ### Tipo de clustering: ***DBSCAN***
---

In [None]:
from sklearn.cluster import DBSCAN

In [None]:
# eps is the hyper-parameter that sets the search radius around each point.
dbscan_model = DBSCAN(eps=4, min_samples=15)
dbscan_clustering = dbscan_model.fit(data_plot)

In [None]:
# One trace per DBSCAN label, rendered with Plotly's offline plotter.
dbscan_traces = data_plot_image(data_plot, dbscan_clustering.labels_, text_list_complete)
plotly.offline.plot(dbscan_traces);

---
> ### Tipo de clustering: ***HDBSCAN***
---

In [None]:
import hdbscan

In [None]:
%%time
# HDBSCAN with its default settings on the 2-D points; the fitted estimator's
# labels_ are plotted in the next cell.
hdbscan_cluster = hdbscan.HDBSCAN().fit(data_plot)

In [None]:
# One trace per HDBSCAN label, rendered with Plotly's offline plotter.
hdbscan_traces = data_plot_image(data_plot, hdbscan_cluster.labels_, text_list_complete)
plotly.offline.plot(hdbscan_traces);

---
> ### Clustering: ***Datos en N dimensiones***
---