In [None]:
import sys
# Put the repository checkout first on the module search path. Presumably this is
# what makes the `Code.*` packages imported in the next cell resolvable — confirm
# that the relative path matches the directory the notebook is launched from.
sys.path.insert(0, "../../code-2023-deephyptrails/")

In [None]:
import copy
import pickle

import hdbscan
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import umap
import umap.plot
from tqdm.notebook import tqdm

from Code.Dataset.ReviewDataset import ReviewsDataset, RFAMZWalkDataset
from Code.Models.RandoLMForest import generate, get_probabilities_per_token

In [None]:
import warnings

warnings.filterwarnings("ignore")

In [None]:
# Load the pickled model from disk.
# SECURITY: unpickling executes arbitrary code embedded in the file, so only load
# model files that come from a trusted source.
# The context manager guarantees the file handle is closed even if unpickling fails
# (the bare open() used previously was never closed).
with open("deeptrails/data/amz_real_data/randoLM_fkt.pkl", "rb") as model_file:
    model = pickle.load(model_file)

# Display the model together with its feature ordering.
model, model.feature_ordering

- sicherer Bindungsstil
- nur friendship complete => Rest friendship radius
- Cluster raus

In [None]:
# Load the review dataset from its JSONL file. Later cells read its
# `annotated_walks` and `args` attributes.
dataset = ReviewsDataset("deeptrails/data/amz_real_data/dataset.jsonl")
dataset

In [None]:
# Collect the distinct feature combinations that occur in the data. The feature
# dict is read from the last element of the first step of every walk.
#
# Each dict is turned into a hashable, key-sorted tuple of items so duplicates can
# be dropped. Deduplication uses dict.fromkeys instead of set(): a set of tuples
# containing strings iterates in an order that changes with hash randomisation,
# which would make `feature_index` (the position in this list) differ between
# kernel restarts. dict.fromkeys keeps first-seen order, so the list is stable for
# a given dataset.
unique_feature_items = dict.fromkeys(
    tuple(sorted(walk[0][-1].items())) for walk in dataset.annotated_walks
)
list_of_available_features = [dict(items) for items in unique_feature_items]
len(list_of_available_features)

In [None]:
# Print one generated sequence per feature combination.
# The loop variable holds a feature dict (not indices), so it is named accordingly;
# this also avoids reusing the name `feature_indices`, which a later cell binds to
# a list of integer indices.
# NOTE(review): `[0]` and `25` are presumably the start prefix and the generation
# length — confirm against `generate`'s signature.
for feature_combination in list_of_available_features:
    print(generate(model, [0], feature_combination, 25))

In [None]:
# Score every (input, target) sample of the real walks, then plot the mean
# probability the model assigns to each target token.
ds = RFAMZWalkDataset(dataset.annotated_walks, walk_type="fkt", args=dataset.args)
probs = get_probabilities_per_token(model, ds.inputs, ds.targets)

# Size the token axis from the data instead of a hard-coded 102, so a target id
# outside that range can no longer raise a KeyError.
n_tokens = max((int(t) for t in ds.targets), default=-1) + 1
probs_by_token = {i: [] for i in range(n_tokens)}
for p, t in zip(probs, ds.targets):
    probs_by_token[int(t)].append(p)

# Tokens that never occur as a target get an explicit NaN (shown as a gap in the
# line) rather than relying on np.mean([]) and its RuntimeWarning.
probs_by_token = {i: (np.mean(p) if p else np.nan) for i, p in probs_by_token.items()}

fig, ax = plt.subplots()
# Materialise the dict views as lists before handing them to matplotlib.
ax.plot(list(probs_by_token.keys()), list(probs_by_token.values()))
ax.set_xlabel("Token")
ax.set_ylabel("Probability")

In [None]:
# Build model samples for every (feature combination, walk) pair: each walk is
# re-annotated so that all of its steps carry the given feature combination, then
# converted to inputs/targets with RFAMZWalkDataset.
#
# The walks are deep-copied from the already loaded `dataset` (needs `copy` from
# the import cell) instead of re-reading the JSONL file twice per feature
# combination. The copy keeps `dataset.annotated_walks` untouched and guarantees
# that `walk_index` refers to the same walk order that later cells index into.
inputs = []
targets = []
feature_indices = []  # position of the feature combination in list_of_available_features
walk_indices = []  # position of the source walk in dataset.annotated_walks

for feature_index, feature in enumerate(list_of_available_features):
    for walk_index, original_walk in enumerate(dataset.annotated_walks):
        counterfactual_walk = copy.deepcopy(original_walk)
        for step in counterfactual_walk:
            # The last element of a step is its feature dict; overwrite it.
            step[-1] = feature
        fake_ds = RFAMZWalkDataset([counterfactual_walk], walk_type="fkt", args=dataset.args)
        n_samples = len(fake_ds.inputs)
        inputs.extend(fake_ds.inputs)
        targets.extend(fake_ds.targets)
        feature_indices.extend([feature_index] * n_samples)
        walk_indices.extend([walk_index] * n_samples)

In [None]:
# Score all counterfactual samples in one call and gather everything into a long
# table: one row per sample, tagged with the walk and feature combination it came from.
probs = get_probabilities_per_token(model, inputs, targets)
df = pd.DataFrame(
    dict(
        walk_index=walk_indices,
        feature_index=feature_indices,
        input=inputs,
        targets=targets,
        prob=probs,
    )
)
df

In [None]:
# Dense array indexed as [feature combination, walk, target token] holding the mean
# probability of each token within that (feature, walk) pair.
# NOTE(review): cells without observations stay at 0, so any later mean over the
# token axis counts unobserved tokens as probability 0. The token axis is sized by
# `model.n_features_in_` — confirm this is at least max(target) + 1.
n_feature_combinations = len(list_of_available_features)
n_walks = len(dataset.annotated_walks)
probability_observations_per_feature_and_walk_and_token = np.zeros(
    (n_feature_combinations, n_walks, model.n_features_in_)
)

# One row per (walk, feature, token) group with its mean probability.
mean_prob_per_group = (
    df.groupby(["walk_index", "feature_index", "targets"])["prob"].mean().reset_index()
)

# Groups are unique, so a single fancy-index assignment writes every cell once.
probability_observations_per_feature_and_walk_and_token[
    mean_prob_per_group["feature_index"].astype(int).to_numpy(),
    mean_prob_per_group["walk_index"].astype(int).to_numpy(),
    mean_prob_per_group["targets"].astype(int).to_numpy(),
] = mean_prob_per_group["prob"].to_numpy()

In [None]:
# Heatmap of the token-averaged probabilities: one row per feature combination,
# one column per walk. Tick labels and the colour range are left at their defaults.
mean_prob_per_feature_and_walk = probability_observations_per_feature_and_walk_and_token.mean(axis=-1)

fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(mean_prob_per_feature_and_walk, cmap="Blues", ax=ax)
ax.set_xlabel("Walk")
ax.set_ylabel("Feature combination")
ax.set_title("Probability of sequence given feature")
fig.savefig("code-2023-deephyptrails/data/potential-paper-figures/amz-heatmap.png")

In [None]:
# Same token-averaged probabilities as a transposed heatmap: one row per walk,
# one column per feature combination. Tick labels and colour range use defaults.
mean_prob_per_walk_and_feature = probability_observations_per_feature_and_walk_and_token.mean(axis=-1).T

fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(mean_prob_per_walk_and_feature, cmap="Blues", ax=ax)
ax.set_xlabel("Feature combination")
ax.set_ylabel("Walk")
ax.set_title("Probability of sequence given feature")

In [None]:
# Cluster the feature combinations by their probability profile over walks
# (token axis averaged out) and visualise them in a 2-D UMAP embedding.
feature_profiles = np.mean(probability_observations_per_feature_and_walk_and_token, axis=-1)

# Clusters in the original profile space; the following cell prints these.
feature_clusters = hdbscan.HDBSCAN(cluster_selection_method="leaf").fit(feature_profiles)

# UMAP is stochastic: pin the seed so the embedding, and the clustering computed on
# it, are reproducible across kernel restarts.
mapper = umap.UMAP(random_state=42)
reduced_data = mapper.fit_transform(feature_profiles)
reduced_cluster = hdbscan.HDBSCAN().fit(reduced_data)

# NOTE(review): the points are coloured by the clustering of the *embedded* data
# (`reduced_cluster`), whereas `feature_clusters` above comes from the original
# space — confirm which labelling the figure is meant to show.
umap.plot.points(mapper, labels=reduced_cluster.labels_)
plt.title("feature combinations")

In [None]:
# Report the members of every feature cluster. Labels start at 0, so points that
# HDBSCAN marked as noise (label -1) are not listed.
for feature_label in range(max(feature_clusters.labels_) + 1):
    member_indices = np.flatnonzero(feature_clusters.labels_ == feature_label)
    # dict.fromkeys drops duplicate descriptions like the former set did, but keeps
    # them in index order so the printed output is the same on every run.
    member_descriptions = dict.fromkeys(str(list_of_available_features[i]) for i in member_indices)
    print("Feature label:", feature_label)
    print("Number of feature combinations:", len(member_indices))
    print("Feature combinations:", "\n".join(member_descriptions))
    print()

In [None]:
# Cluster the walks by their probability profile over feature combinations
# (token axis averaged out, then transposed so that rows are walks) and show them
# in a 2-D UMAP embedding coloured by cluster label.
walk_profiles = np.mean(probability_observations_per_feature_and_walk_and_token, axis=-1).T

walk_clusters = hdbscan.HDBSCAN(cluster_selection_method="leaf").fit(walk_profiles)

# Only the fitted mapper is needed for plotting, so call fit() instead of
# discarding the return value of fit_transform(). The seed is pinned because UMAP
# is stochastic and the figure should be reproducible across kernel restarts.
mapper = umap.UMAP(random_state=42).fit(walk_profiles)
umap.plot.points(mapper, labels=walk_clusters.labels_)
plt.title("walks")

In [None]:
# Report the members of every walk cluster. Labels start at 0, so walks that
# HDBSCAN marked as noise (label -1) are not listed.
for walk_label in range(max(walk_clusters.labels_) + 1):
    member_indices = np.flatnonzero(walk_clusters.labels_ == walk_label)
    # Each walk is shown without the trailing feature dict of its steps. Walks that
    # render identically are printed once (as with the former set), but
    # dict.fromkeys keeps index order so the output is the same on every run.
    member_descriptions = dict.fromkeys(
        str([s[:-1] for s in dataset.annotated_walks[i]]) for i in member_indices
    )
    print("Walk label:", walk_label)
    print("Number of walks:", len(member_indices))
    print("Walks:", "\n".join(member_descriptions))
    print()

In [None]:
# Score every sample of the real walks, then plot the mean probability per value
# of `ds.indices` (presumably the index of the walk each sample came from —
# confirm against RFAMZWalkDataset).
ds = RFAMZWalkDataset(dataset.annotated_walks, walk_type="fkt", args=dataset.args)
probs = get_probabilities_per_token(model, ds.inputs, ds.targets)

# Size the axis from the data instead of a hard-coded 200, so an index outside
# that range can no longer raise a KeyError.
n_indices = max((int(i) for i in ds.indices), default=-1) + 1
probs_by_index = {i: [] for i in range(n_indices)}
for p, walk_index in zip(probs, ds.indices):
    probs_by_index[int(walk_index)].append(p)

# Indices without any sample get an explicit NaN (shown as a gap in the line)
# rather than relying on np.mean([]) and its RuntimeWarning.
probs_by_index = {i: (np.mean(p) if p else np.nan) for i, p in probs_by_index.items()}

fig, ax = plt.subplots()
# Materialise the dict views as lists before handing them to matplotlib.
ax.plot(list(probs_by_index.keys()), list(probs_by_index.values()))
ax.set_xlabel("Index")
ax.set_ylabel("Probability")