In [None]:
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode
import numpy as np
import yaml
from data import ParsedCorpus
%matplotlib inline

# Initialise plotly's offline notebook mode before any iplot() call below.
# NOTE(review): connected=True presumably loads plotly.js from a CDN rather
# than embedding it in the notebook — confirm against the plotly docs.
init_notebook_mode(connected=True)

In [None]:
# Load the notebook configuration from setting.yaml.
# yaml.safe_load is used instead of yaml.load(stream): calling yaml.load
# without an explicit Loader is deprecated and raises TypeError on PyYAML >= 6,
# and the default loader could construct arbitrary Python objects.
with open("setting.yaml", "r") as stream:
    setting = yaml.safe_load(stream)

In [None]:
# Wrap the configured "dev" entry of parsed_data_path in a one-element list.
dev_data_path = setting["parsed_data_path"]["dev"]
base_dirs = [dev_data_path]
print("base_dirs are", base_dirs)

In [None]:
# Build the corpus object (ParsedCorpus is imported from `data`) from the
# list of configured directories.
corpus = ParsedCorpus(base_dirs)

In [None]:
# Index the "answer" records by fn (presumably a file name — confirm against
# ParsedCorpus); get_single("answer") is iterated as (answer, fn) pairs.
fn2answer = {fn: answer for answer, fn in corpus.get_single("answer")}

## T-SNE

In [None]:
# Split the mapping into two parallel sequences: the keys go to `fns`, and
# each record's "mean" entry becomes one element of the `events` array.
fns = list(fn2answer)
events = np.array([record["mean"] for record in fn2answer.values()])

events.shape

In [None]:
from sklearn.manifold import TSNE

# Project the event vectors to 2-D for plotting.
# t-SNE is stochastic: random_state is pinned so the embedding (and every
# plot built from it) is identical after Restart Kernel -> Run All.
events_2d = TSNE(n_components=2, random_state=42).fit_transform(events)

In [None]:
# Unpack the two embedding columns as the x / y plotting coordinates
# (transposing turns the two columns into two rows that can be unpacked).
events_X, events_Y = events_2d.T

In [None]:
# Plot every embedded point as a marker in a single scatter trace.
embedding_trace = go.Scatter(x=events_X, y=events_Y, mode="markers")
iplot([embedding_trace])

## Spectral Clustering

In [None]:
from sklearn.cluster import SpectralClustering
from sklearn import metrics

# Baseline: spectral clustering with the library's default hyper-parameters.
# random_state is fixed because the estimator uses random initialisation
# internally; without it the reported score changes from run to run.
y_pred = SpectralClustering(random_state=42).fit_predict(events)

print("Calinski-Harabasz Score", metrics.calinski_harabasz_score(events, y_pred))

In [None]:
# Grid search over gamma and the number of clusters, printing the
# Calinski-Harabasz score of each combination.
# The original wrapped both loops in enumerate() and bound the same unused
# `index` name twice (the inner loop shadowed the outer); the counters are
# dropped. random_state is fixed so scores are comparable across runs.
for gamma in (0.01, 0.1, 1, 10):
    for k in (4, 5, 6, 7):
        y_pred = SpectralClustering(n_clusters=k, gamma=gamma, random_state=42).fit_predict(events)
        score = metrics.calinski_harabasz_score(events, y_pred)
        print("Calinski-Harabasz Score with gamma=", gamma, "n_clusters=", k, "score:", score)

In [None]:
# Final clustering with gamma=0.01 and 4 clusters; the resulting labels are
# what the per-cluster plot and the group listing further down are built from.
# random_state is fixed so those downstream results are reproducible.
y_pred = SpectralClustering(gamma=0.01, n_clusters=4, random_state=42).fit_predict(events)
print("Calinski-Harabasz Score", metrics.calinski_harabasz_score(events, y_pred))

In [None]:
# Bucket the 2-D coordinates and the fn entries by predicted cluster label.
id2xs = {}
id2ys = {}
id2fns = {}

for x_coord, y_coord, name, label in zip(events_X, events_Y, fns, y_pred):
    # setdefault creates the per-cluster list the first time a label is seen.
    id2xs.setdefault(label, []).append(x_coord)
    id2ys.setdefault(label, []).append(y_coord)
    id2fns.setdefault(label, []).append(name)

# One scatter trace per cluster label.
cluster_traces = [
    go.Scatter(x=id2xs[label], y=id2ys[label], mode="markers")
    for label in id2xs
]
iplot(cluster_traces)

## Looking into news groups

In [None]:
# List the members of every cluster: clusters in sorted label order, members
# sorted within each cluster, and a blank line after each group.
for cluster_id in sorted(id2fns):
    print("GROUP %d" % cluster_id)
    members = sorted(id2fns[cluster_id])
    for member in members:
        print(member)
    print()