In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import gensim.models.word2vec as w2v
import pickle
import numpy as np
from scipy.stats import pearsonr

In [None]:
# Load Twitch emotes
# NOTE(security): pickle.load can execute arbitrary code while unpickling, so
# 'emote_dict' must come from a trusted source.
with open('emote_dict', 'rb') as f:
    emotes = pickle.load(f)

# Turn the textual missing-value markers into real NaN. The result is assigned
# back (rather than using inplace=True) and the file handle is already closed
# at this point, so the cell only holds the file open for the load itself.
# NOTE(review): presumably `emotes` is a pandas DataFrame (it is later indexed
# by column name and used with `.str`) -- confirm against the pickle contents.
emotes = emotes.replace(['NaN', 'nan'], np.nan)

# Compare Models

In [None]:
# Here are some example models you could train and compare using the dataset provided
def load_embeddings(channel, models_dir='models'):
    """Load the Word2Vec model saved for one channel and return its word vectors.

    Parameters
    ----------
    channel : str
        Name of the sub-directory of `models_dir` that holds the saved model.
    models_dir : str, optional
        Directory containing one sub-directory per channel.

    Returns
    -------
    The `wv` attribute (word vectors) of the loaded model. The full model
    object is dropped when the function returns, so only the vectors stay in
    memory.
    """
    # Forward slashes are accepted by Python on Windows as well as POSIX,
    # whereas a backslash-separated path only resolves on Windows.
    model = w2v.Word2Vec.load(f'{models_dir}/{channel}/model')
    return model.wv


dbl_embeddings = load_embeddings('DoubleLift')
lcs_embeddings = load_embeddings('LCS')
lec_embeddings = load_embeddings('LEC')
nb3_embeddings = load_embeddings('Nightblue3')
sneaky_embeddings = load_embeddings('Sneaky')

In [None]:
def similarity_between_emotes(emote_list, embeddings):
    """Compute the cosine similarity of every unordered pair of known emotes.

    Parameters
    ----------
    emote_list : iterable of str
        Candidate emote tokens. Tokens missing from `embeddings` are skipped
        and repeated tokens are considered once (first occurrence wins).
    embeddings : gensim KeyedVectors
        Word vectors; must expose `key_to_index` and support `embeddings[token]`.

    Returns
    -------
    dict
        Maps `(emote1, emote2)` to their cosine similarity, where `emote1`
        precedes `emote2` in `emote_list` order. Each pair appears once and
        self-pairs are excluded. Empty when fewer than two emotes are known.
    """
    # key_to_index is a mapping, so each membership test is a hash lookup
    # rather than a linear scan of the index_to_key list.
    vocabulary = embeddings.key_to_index
    emote_vectors = {
        emote: embeddings[emote] for emote in emote_list if emote in vocabulary
    }

    # No pair can be formed from zero or one vector (and cosine_similarity
    # rejects an empty matrix), so return early.
    if len(emote_vectors) < 2:
        return {}

    names = list(emote_vectors)
    # One pairwise call yields the full similarity matrix (one row per emote)
    # instead of one library call per pair.
    pairwise = cosine_similarity(np.vstack([emote_vectors[name] for name in names]))

    similarity_scores = {}
    for i in range(len(names)):
        # Upper triangle only: each pair once, diagonal skipped.
        for j in range(i + 1, len(names)):
            similarity_scores[(names[i], names[j])] = pairwise[i, j]

    return similarity_scores

In [None]:
def compare_models(model1: w2v.KeyedVectors, model2: w2v.KeyedVectors, emote_names=None):
    """Compare how two embeddings relate the same set of emotes to each other.

    For every pair of emotes present in both vocabularies, the cosine
    similarity is computed in each embedding. The two resulting score lists
    are then summarised and the summary is printed.

    Parameters
    ----------
    model1, model2 : KeyedVectors
        The two sets of word vectors to compare.
    emote_names : iterable of str, optional
        Emote tokens to consider, used as given. When omitted, the lower-cased
        `GLOBAL_TWITCH` column of the notebook-level `emotes` table is used.

    Returns
    -------
    tuple
        `(mean_diff, std_dev_diff, corr_coef)`: mean and standard deviation of
        `model1 score - model2 score` over all pairs, and the Pearson
        correlation between the two score lists.

    Raises
    ------
    ValueError
        If fewer than two emote pairs are shared by both models, since the
        correlation is undefined in that case.
    """
    if emote_names is None:
        emote_names = emotes['GLOBAL_TWITCH'].str.lower()

    # Keep only emotes known to both models. dict.fromkeys drops repeats while
    # preserving order, and key_to_index gives hash lookups without building
    # vocabulary-sized sets.
    emotes_in_both = [
        name for name in dict.fromkeys(emote_names)
        if name in model1.key_to_index and name in model2.key_to_index
    ]

    # Compute similarity scores between global Twitch emotes in the two embeddings
    model1_similarities = similarity_between_emotes(emotes_in_both, model1)
    model2_similarities = similarity_between_emotes(emotes_in_both, model2)

    # Align the two score lists explicitly by pair key rather than relying on
    # both dictionaries iterating in the same order.
    pairs = [pair for pair in model1_similarities if pair in model2_similarities]
    if len(pairs) < 2:
        raise ValueError(
            "Need at least two emote pairs shared by both models to compare them; "
            f"found {len(pairs)}."
        )

    # Compute aggregate statistics on the similarity scores
    model1_scores = np.array([model1_similarities[pair] for pair in pairs])
    model2_scores = np.array([model2_similarities[pair] for pair in pairs])
    scores_diff = model1_scores - model2_scores

    mean_diff = np.mean(scores_diff)
    std_dev_diff = np.std(scores_diff)
    corr_coef, _ = pearsonr(model1_scores, model2_scores)

    # Print the aggregate statistics
    print("Aggregate statistics of similarity scores between global Twitch emotes in the two embeddings:")
    print(f"Mean difference: {mean_diff:.3f}")
    print(f"Standard deviation of difference: {std_dev_diff:.3f}")
    print(f"Correlation coefficient: {corr_coef:.3f}")

    return mean_diff, std_dev_diff, corr_coef

In [None]:
# Pairwise comparison of the NB3, Sneaky and Dbl embeddings.
for heading, first, second in (
    ('NB3 model : Sneaky model', nb3_embeddings, sneaky_embeddings),
    ('NB3 model : Dbl model', nb3_embeddings, dbl_embeddings),
):
    print(heading)
    compare_models(first, second)
    print()

# The last comparison stays a bare trailing expression so the notebook still
# echoes its returned tuple as the cell output.
print('Sneaky model : Dbl model')
compare_models(sneaky_embeddings, dbl_embeddings)

In [None]:
# Compare the LCS embeddings against the Sneaky, Dbl and NB3 embeddings.
for heading, other in (
    ('LCS model : Sneaky model', sneaky_embeddings),
    ('LCS model : Dbl model', dbl_embeddings),
):
    print(heading)
    compare_models(lcs_embeddings, other)
    print()

# The last comparison stays a bare trailing expression so the notebook still
# echoes its returned tuple as the cell output.
print('LCS model : NB3 model')
compare_models(lcs_embeddings, nb3_embeddings)

In [None]:
# Compare the LEC embeddings against the Sneaky, Dbl and NB3 embeddings.
for heading, other in (
    ('LEC model : Sneaky model', sneaky_embeddings),
    ('LEC model : Dbl model', dbl_embeddings),
):
    print(heading)
    compare_models(lec_embeddings, other)
    print()

# The last comparison stays a bare trailing expression so the notebook still
# echoes its returned tuple as the cell output.
print('LEC model : NB3 model')
compare_models(lec_embeddings, nb3_embeddings)

## Clustering Analysis

In [None]:
# Choose a model to analyze
# `scan_embeddings` is the set of word vectors every clustering cell below reads.
# NOTE(review): `twitch_500_all` is not assigned in any cell visible in this
# notebook, so on a fresh kernel this line raises NameError unless the name is
# defined elsewhere -- confirm where it is loaded, or point `scan_embeddings`
# at one of the embeddings loaded above (e.g. `lcs_embeddings`).
scan_embeddings = twitch_500_all

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import numpy as np

# Gather the vector of every vocabulary token, in index_to_key order, into one array
vocab = scan_embeddings.index_to_key
X = np.array([scan_embeddings[token] for token in vocab])

# Score k-means partitions for 2 through 9 clusters by their mean silhouette
for n_clusters in range(2, 10):
    # Fixed random_state keeps the centroid initialisation repeatable
    kmeans = KMeans(n_clusters=n_clusters, random_state=0)
    kmeans.fit(X)
    # Mean silhouette over all samples for this partition
    score = silhouette_score(X, kmeans.labels_)
    print("For n_clusters =", n_clusters, "The average silhouette_score is :", score)

In [None]:
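# Saved output from an earlier run. It includes cluster counts beyond the
# range(2, 10) loop in the KMeans cell, so re-running that cell will not
# reproduce the rows for n_clusters >= 10.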

# Silhouette Scores from KMeans:
#
# For n_clusters = 2 The average silhouette_score is : 0.29998946
# For n_clusters = 3 The average silhouette_score is : 0.34632137
# For n_clusters = 4 The average silhouette_score is : 0.0039549596
# For n_clusters = 5 The average silhouette_score is : 0.009397833
# For n_clusters = 6 The average silhouette_score is : 0.011708869
# For n_clusters = 7 The average silhouette_score is : 0.022252025
# For n_clusters = 8 The average silhouette_score is : 0.023897517
# For n_clusters = 9 The average silhouette_score is : 0.024429243
# For n_clusters = 10 The average silhouette_score is : 0.022800706
# For n_clusters = 20 The average silhouette_score is : -0.04120358
# For n_clusters = 30 The average silhouette_score is : -0.027650462
# For n_clusters = 40 The average silhouette_score is : -0.030612223
# For n_clusters = 50 The average silhouette_score is : -0.06241339
# For n_clusters = 60 The average silhouette_score is : -0.059461888
# For n_clusters = 70 The average silhouette_score is : -0.048446592
# For n_clusters = 80 The average silhouette_score is : -0.070959754
# For n_clusters = 90 The average silhouette_score is : -0.06987077
# For n_clusters = 100 The average silhouette_score is : -0.07816283
# For n_clusters = 200 The average silhouette_score is : -0.08500795
# For n_clusters = 300 The average silhouette_score is : -0.094385035
# For n_clusters = 400 The average silhouette_score is : -0.08832128
# For n_clusters = 500 The average silhouette_score is : -0.095110245
# For n_clusters = 1000 The average silhouette_score is : -0.09550029
# For n_clusters = 2000 The average silhouette_score is : -0.09596107
# For n_clusters = 5000 The average silhouette_score is : -0.119661234

In [None]:
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

# Apply t-SNE to reduce the word embeddings to two dimensions.
# n_components matches the number of axes actually plotted below (and the
# `embeddings_2d` name); requesting a third component would compute a 3-D
# layout and then silently discard one axis of it.
tsne = TSNE(n_components=2, random_state=0)
embeddings_2d = tsne.fit_transform(X)

# Plot the t-SNE output
plt.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1])
plt.title("t-SNE projection of word embeddings")
plt.xlabel("t-SNE component 1")
plt.ylabel("t-SNE component 2")
plt.show()

In [None]:
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score
from matplotlib import pyplot as plt

# Density-based clustering of the raw embedding vectors
dbscan = DBSCAN(eps=0.5, min_samples=20).fit(scan_embeddings.vectors)

labels = dbscan.labels_
distinct_labels = set(labels)
# DBSCAN marks noise points with the label -1; do not count it as a cluster
n_clusters = len(distinct_labels) - (1 if -1 in distinct_labels else 0)
n_noise = int(np.count_nonzero(labels == -1))

print("Estimated number of clusters: %d" % n_clusters)
print("Estimated number of noise points: %d" % n_noise)

# silhouette_score raises unless there are between 2 and n_samples - 1
# distinct labels (e.g. it fails when every point is noise), so only call it
# when the labelling qualifies. Noise points are scored as their own group,
# exactly as before.
if 1 < len(distinct_labels) < len(labels):
    print(f"Silhouette Coefficient: {silhouette_score(scan_embeddings.vectors, labels):.3f}")
else:
    print("Silhouette Coefficient: undefined (needs between 2 and n_samples - 1 distinct labels)")

In [None]:
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram

def plot_dendrogram(model, **kwargs):
    """Draw a dendrogram for a fitted hierarchical clustering model.

    Parameters
    ----------
    model : fitted clustering estimator
        Must expose `children_` (one row of two child ids per merge),
        `distances_` (one value per merge) and `labels_` (one entry per sample).
    **kwargs
        Forwarded unchanged to `scipy.cluster.hierarchy.dendrogram`.

    Returns
    -------
    None
    """
    merges = model.children_
    n_leaves = len(model.labels_)

    # Number of original samples beneath each merge node. Child ids below
    # n_leaves are single samples; larger ids refer to the earlier merge at
    # position (id - n_leaves), whose size has already been filled in.
    node_sizes = np.zeros(merges.shape[0])
    for row, pair in enumerate(merges):
        node_sizes[row] = sum(
            1 if child < n_leaves else node_sizes[child - n_leaves]
            for child in pair
        )

    # Assemble the four columns dendrogram() expects:
    # child a, child b, merge distance, sample count
    linkage_matrix = np.column_stack(
        (merges, model.distances_, node_sizes)
    ).astype(float)

    # Plot the corresponding dendrogram
    dendrogram(linkage_matrix, **kwargs)


# Fit the hierarchy with no preset cluster count and a zero distance threshold
clustering = AgglomerativeClustering(n_clusters=None, distance_threshold=0)
clustering.fit(scan_embeddings.vectors)

plt.figure(figsize=(15, 5))
plt.title("Hierarchical Clustering Dendrogram")
# Only the top three levels of the tree are drawn
plot_dendrogram(clustering, p=3, truncate_mode="level")
plt.xlabel("Number of points in node (or index of point if no parenthesis).")
plt.show()

# Mean silhouette of the agglomerative labelling for 2 through 29 clusters
range_n_clusters = range(2, 30)
for n_clusters in range_n_clusters:
    clustering = AgglomerativeClustering(n_clusters=n_clusters)
    clustering.fit(scan_embeddings.vectors)

    labels = clustering.labels_
    silhouette_avg = silhouette_score(scan_embeddings.vectors, labels)
    print("For n_clusters =", n_clusters, "The average silhouette_score is :", silhouette_avg)

In [None]:
from sklearn.mixture import GaussianMixture

# Candidate mixture sizes: 2 through 29 components
n_components = np.arange(2, 30)

# Fit one full-covariance Gaussian mixture per candidate size
models = []
for n in n_components:
    gmm = GaussianMixture(n, covariance_type='full', random_state=0)
    models.append(gmm.fit(scan_embeddings.vectors))

# Information criteria of each fitted mixture on the same data
bic_scores = [m.bic(scan_embeddings.vectors) for m in models]
aic_scores = [m.aic(scan_embeddings.vectors) for m in models]

# NOTE(review): markevery=[6] puts the diamond marker on the 7th point, which
# is n_components == 8 for the range above, while the final fit below uses 6
# components -- confirm which value is meant to be highlighted.
plt.plot(n_components, bic_scores, '-mD', markevery=[6], label='BIC')
plt.plot(n_components, aic_scores, '-yD', markevery=[6], label='AIC')
plt.legend(loc='best')
plt.xlabel('n_components')

# Final mixture with six components, same covariance type and seed as above
clustering = GaussianMixture(6, covariance_type='full', random_state=0)
clustering = clustering.fit(scan_embeddings.vectors)