In [96]:
import numpy as np
import pandas as pd
import plotly.offline
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.manifold import TSNE

In [10]:
# Load the question logs from a CSV in the working directory; the
# "Preguntas" column is the text used by the rest of the notebook.
df = pd.read_csv("logs_limpios.csv")

In [11]:
# Plain Python list with one entry per row of the "Preguntas" column.
# NOTE(review): clean_text() calls .split() on every entry, so this assumes
# the column holds only strings (no NaN) — confirm.
text_list = df["Preguntas"].to_list()

---
> ### 1: ***Stemizar y sacar las stopwords del texto***
---

In [147]:
from nltk import word_tokenize
from nltk.stem import SnowballStemmer
import nltk
from nltk.corpus import stopwords

# Spanish Snowball stemmer and Spanish stop-word list, created once at
# notebook level and read as globals by clean_text().
# NOTE(review): stopwords.words() presumably requires the NLTK "stopwords"
# corpus to be downloaded already — confirm for fresh environments.
stemmer = SnowballStemmer('spanish')
cachedStopWords = stopwords.words('spanish')


def clean_text(text_list):
    """Remove stop words from each text and stem the remaining words.

    Every entry is split on whitespace; tokens found in the global
    ``cachedStopWords`` are dropped and every other token is replaced by
    ``stemmer.stem(token)``. The resulting stems are re-joined with single
    spaces.

    Parameters
    ----------
    text_list : list of str
        Texts to clean. The list is modified IN PLACE: each element is
        overwritten with its cleaned version.

    Returns
    -------
    list of str
        The same ``text_list`` object that was passed in.
    """
    # Build the lookup set once: testing membership directly against the
    # stop-word sequence is a linear scan repeated for every single token.
    stop_words = set(cachedStopWords)

    # NOTE(review): the stop-word test is an exact, case-sensitive match, so
    # a capitalised stop word is kept and stemmed — confirm that the input
    # is already lower-cased.
    for i, text in enumerate(text_list):
        text_list[i] = ' '.join(
            stemmer.stem(word) for word in text.split() if word not in stop_words
        )
    return text_list

In [149]:
# Rebuild the raw list from the DataFrame on every run. clean_text()
# overwrites its argument in place, so feeding the already-cleaned
# `text_list` back in when this cell is re-run would stem the stems again.
text_list = clean_text(df["Preguntas"].to_list())

---
> ### 2: ***Vectorizar el texto utilizando Tf-idf (TfidfVectorizer) o BOW (CountVectorizer)***

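### Diferencias entre las dos

- **BOW (`CountVectorizer`)**: cada documento se representa con la cantidad de veces que aparece cada término.
- **Tf-idf (`TfidfVectorizer`)**: esas cantidades se ponderan por la rareza del término en el corpus, de modo que las palabras muy frecuentes en todos los documentos pesan menos.
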
---

In [202]:
# Two alternative text vectorizers, both with default settings.
# Only the TF-IDF one is fitted in the cells visible below; the bag-of-words
# vectorizer is created but not used there.
vectorizer_tfidf = TfidfVectorizer()
vectorizer_bow = CountVectorizer()

In [203]:
# Learn the vocabulary from the cleaned questions and encode them as a
# TF-IDF document-term matrix (one row per entry of text_list).
data = vectorizer_tfidf.fit_transform(text_list)

In [213]:
# Reduce the TF-IDF matrix to 10 components before running t-SNE.
# random_state is pinned (same seed as the clustering cells) so that the
# reduced matrix, and everything derived from it, is reproducible when the
# notebook is re-run.
svd = TruncatedSVD(n_components=10, random_state=0)
svd_truncated = svd.fit_transform(data)

In [214]:
# method='exact' would run a more exact algorithm, but with O(N^2)
# complexity; it is not passed here, so the default method is used.
#
# random_state is pinned so the 2-D embedding (and therefore every
# clustering and plot built on it) is the same on each run. The estimator
# gets its own name instead of temporarily living in `data_2d`.
# NOTE(review): recent scikit-learn releases rename `n_iter` to `max_iter`
# — confirm against the installed version.
tsne = TSNE(n_components=2, early_exaggeration=20, perplexity=50, n_iter=5000, random_state=0)
data_2d = tsne.fit_transform(svd_truncated)

In [226]:
# Alias for the 2-D t-SNE embedding; the clustering and plotting cells
# below all read `data_plot`.
data_plot = data_2d

---
> ### Tipo de clustering: ***K-MEANS***
---

In [263]:
from sklearn.cluster import KMeans

# K-Means with 20 clusters and a fixed seed, fitted on the 2-D embedding.
# Note that `kmeans` holds the per-point cluster labels, not the estimator.
kmeans_model = KMeans(n_clusters=20, random_state=0)
kmeans = kmeans_model.fit_predict(data_plot)

In [264]:
# Plot the embedding with one scatter trace per K-Means label.
# data_plot_image() is defined in the last section of the notebook and must
# have been run before this cell. The trailing ';' suppresses the echo of
# the call's return value.
plotly.offline.plot(data_plot_image(data_plot, kmeans));

---> La cantidad de clusters detectados fueron de 20


---
> ### Tipo de clustering: ***Spectral Clustering***
---

In [265]:
%%time
from sklearn.cluster import SpectralClustering

# Spectral clustering with 20 clusters and a fixed seed on the 2-D
# embedding; the fitted estimator (with its labels_) is kept.
spectral_model = SpectralClustering(n_clusters=20, assign_labels="discretize", random_state=0)
spectral_clustering = spectral_model.fit(data_plot)

CPU times: user 38 s, sys: 6.62 s, total: 44.6 s
Wall time: 11.3 s


In [266]:
# Plot the embedding with one scatter trace per spectral-clustering label.
# data_plot_image() is defined in the last section of the notebook and must
# have been run before this cell.
plotly.offline.plot(data_plot_image(data_plot, spectral_clustering.labels_));

---> La cantidad de clusters detectados fueron de 20


---
> ### Tipo de clustering: ***DBSCAN***
---

In [192]:
from sklearn.cluster import DBSCAN

In [267]:
# eps is the hyper-parameter that sets the neighbourhood radius: the maximum
# distance at which two points are still considered neighbours.
# min_samples is the number of points required inside that radius for a
# point to be treated as a core point.
dbscan_clustering = DBSCAN(eps=3.3, min_samples=15).fit(data_plot)

In [None]:
# Plot the embedding with one scatter trace per DBSCAN label.
# DBSCAN marks noise points with the label -1, and data_plot_image() counts
# every distinct label, so noise appears as one extra "cluster" in the
# printed total.
plotly.offline.plot(data_plot_image(data_plot, dbscan_clustering.labels_));

---
> ### Tipo de clustering: ***HDBSCAN***
---

In [260]:
import hdbscan

In [261]:
%%time
# Fit HDBSCAN with its default parameters on the 2-D embedding and keep the
# fitted estimator.
hdbscan_cluster = hdbscan.HDBSCAN().fit(data_plot)

CPU times: user 37.3 ms, sys: 2.65 ms, total: 39.9 ms
Wall time: 39.1 ms


In [262]:
# Plot the embedding with one scatter trace per HDBSCAN label.
# NOTE(review): HDBSCAN presumably labels noise as -1 like DBSCAN; if so,
# the printed cluster total includes that noise group — confirm.
plotly.offline.plot(data_plot_image(data_plot, hdbscan_cluster.labels_));

---> La cantidad de clusters detectados fueron de 41


---
> ### 5: ***Preparación de la data para el ploteo***
---

In [238]:
def data_plot_image(data_plot, model_cluster):
    """Build one Plotly scatter trace per cluster label.

    Parameters
    ----------
    data_plot : sequence of 2-element sequences
        Point coordinates; ``point[0]`` is plotted as x and ``point[1]`` as y.
    model_cluster : sequence
        Cluster label of each point, index-aligned with ``data_plot``.
        Labels are used as dictionary keys, so they must be hashable.

    Returns
    -------
    list
        One ``plotly.graph_objs.Scatter`` in ``'markers'`` mode per distinct
        label, in order of first appearance. Each trace carries the x/y
        coordinates of its points and, as hover text, the label of each point.

    Notes
    -----
    Prints the number of distinct labels found. Every distinct value counts,
    so a special label such as -1 is included in that total.
    """
    # Imported locally, as in the original cell. plotly.offline is not used
    # inside this function; the import is kept so the submodule is loaded for
    # the cells that pass the result to plotly.offline.plot().
    import plotly.offline
    import plotly.graph_objs as go

    # label -> [x coordinates, y coordinates, hover labels]
    dic_plot = {}
    for i, label in enumerate(model_cluster):
        point = data_plot[i]
        # setdefault creates the empty group on first sight of a label, so
        # each point is stored exactly once (the first point of a cluster
        # used to be added twice).
        xs, ys, texts = dic_plot.setdefault(label, [[], [], []])
        xs.append(point[0])
        ys.append(point[1])
        # Use the label passed in, not an unrelated global variable.
        texts.append(label)

    # Number of distinct labels (detected "intents").
    amount_clusters = len(dic_plot)

    plot = [
        go.Scatter(
            x=xs,
            y=ys,
            text=texts,
            mode='markers',
            marker=dict(size=7),
        )
        for xs, ys, texts in dic_plot.values()
    ]

    print(f"---> La cantidad de clusters detectados fueron de {amount_clusters}")
    return plot