In [7]:
import pandas as pd
import time
from tqdm import tqdm_notebook
import pickle
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import plotly.graph_objects as go
from plotly.offline import init_notebook_mode, iplot

init_notebook_mode(connected=True) # initiate notebook for offline plot

with open("data/Wordvectors.pkl", "rb") as f:
    wordvectors = pickle.load(f)
    
print(len(wordvectors))
wordvectors = pd.DataFrame(wordvectors)


28577


In [2]:
# Do kmeans
number_clusters = 10
start_time = time.time()
kmeans = KMeans(n_clusters=number_clusters).fit(wordvectors)
print(f"Kmeans took {time.time()-start_time:.3f} seconds")
wordvector_labels = kmeans.labels_

Kmeans took 16.739 seconds


In [30]:
rndperm = np.random.permutation(wordvectors.shape[0])
N = 10000
n_indx = rndperm

df_subset = wordvectors.loc[n_indx,:].copy()
start_time = time.time()
tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300)

tsne_results = tsne.fit_transform(df_subset)
print(f"t-SNE took {time.time()-start_time:.3f} seconds")

[t-SNE] Computing 121 nearest neighbors...
[t-SNE] Indexed 28577 samples in 0.618s...
[t-SNE] Computed neighbors for 28577 samples in 190.908s...
[t-SNE] Computed conditional probabilities for sample 1000 / 28577
[t-SNE] Computed conditional probabilities for sample 2000 / 28577
[t-SNE] Computed conditional probabilities for sample 3000 / 28577
[t-SNE] Computed conditional probabilities for sample 4000 / 28577
[t-SNE] Computed conditional probabilities for sample 5000 / 28577
[t-SNE] Computed conditional probabilities for sample 6000 / 28577
[t-SNE] Computed conditional probabilities for sample 7000 / 28577
[t-SNE] Computed conditional probabilities for sample 8000 / 28577
[t-SNE] Computed conditional probabilities for sample 9000 / 28577
[t-SNE] Computed conditional probabilities for sample 10000 / 28577
[t-SNE] Computed conditional probabilities for sample 11000 / 28577
[t-SNE] Computed conditional probabilities for sample 12000 / 28577
[t-SNE] Computed conditional probabilities for 

In [33]:
print(n_indx)
print(wordvector_labels[n_indx])
df_subset['tsne-one'] = tsne_results[:,0]
df_subset['tsne-two'] = tsne_results[:,1]
# df_subset['tsne-three'] = tsne_results[:,2]
df_subset["class"] = wordvector_labels[n_indx]
# plt.figure(figsize=(16,10))
# sns.scatterplot(
#     x="tsne-2d-one", y="tsne-2d-two",
#     hue="class",
#     palette=sns.color_palette(n_colors=number_clusters),
#     data=df_subset,
#     legend="full",
#     alpha=0.3
# )
# plt.show()

trace1 = go.Scattergl(x=df_subset['tsne-one'], 
                    y=df_subset['tsne-two'],
#                     z=df_subset['tsne-three'],
                    text=df_subset["class"],
                    mode="markers",
                    marker=dict(
                        color=df_subset["class"],
                        opacity=0.8,
                        colorscale="viridis",
                        showscale=True
                        )
                   )

layout = go.Layout(title="t-SNE analysis")
fig = go.Figure(data=[trace1], layout=layout)
iplot(fig)

[16622 18765 24729 ... 18175 23989  9525]
[9 9 4 ... 4 2 7]


In [6]:
with open("data/Contexts.pkl", "rb") as f:
    prompts = pickle.load(f)
print(len(prompts))

# compare prompts from class 8 and 3 (farthest away)
class_ind1 = 5
class_ind2 = 3

samples = 10
for i in range(samples):
    print(f"Class {class_ind1} {prompts[rndperm[wordvector_labels==class_ind1][i]]}")
print()
for i in range(samples):
    print(f"Class {class_ind2} {prompts[rndperm[wordvector_labels==class_ind2][i]]}")

28577
Class 5 arranging dinner parties. Monica Geller thinks that polite society is
Class 5 promoting conservative values. Richard Nixon doesn't think that progressives are
Class 5 singing pop songs. Taylor Swift thinks that Eurovision is
Class 5 winning heavyweight titles. Mike Tyson thinks that knock outs are
Class 5 doing stand-up. Russell Brand thinks that humour is
Class 5 laughing at God. Lucifer doesn't think that dogma is
Class 5 making trains run on time. Benito Mussolini thinks that timetables are
Class 5 breaking bones. Bane thinks that fractures are
Class 5 solving mysteries. Tony DiNozzo doesn't think that cold cases are
Class 5 interpreting evidence. Inspector Jacques Clouseau thinks that fingerprints are

Class 3 spying for the enemy. Mata Hari thinks that espionage is
Class 3 modeling clothes. Kate Moss thinks that the fashion industry is
Class 3 testing scientific theories. Benjamin Franklin doesn't think that pseudo-science is
Class 3 promoting diversity. Professor Ch