In [1]:
%load_ext autoreload

- This notebook is to explore what different AA models capture in texts when they are trained on the AA task (style?, topic?, discours?)
- We will use different AA models (StyleDistance, Wegmann, LUAR, etc.) and a sample from the HRS dataset.
- Method:
    - We cluster documents using the given AA model
    - Describe each cluster through a distribution over style, topic, dicourse, etc.
    - Quanitfy the alignment between these clusters and the corresponding distributions

- How to quantify the alignment?
    - Compute Avg similarity between distribution within the cluster vs. across the clusters
    - Compute correlation between cluster's distances and the corresponding representation 

In [2]:
import pandas as pd
import json
import sklearn
import glob
import pickle
import sys
import numpy as np
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from statsmodels.stats.dist_dependence_measures import distance_correlation
from sklearn.metrics.pairwise import cosine_distances
from sklearn.metrics.pairwise import cosine_similarity
from tabulate import tabulate

pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [67]:
%autoreload
sys.path.append('../../style_generation_pipeline/')

import generate_explanations
from utils import *
from data import *

In [4]:
path='/mnt/swordfish-pool2/milad/hiatus-data/phase_1/'

------------

In [191]:
def get_topic_representations(texts):
    topic_model = BERTopic(representation_model = KeyBERTInspired(), embedding_model="all-MiniLM-L6-v2")
    topic_model = topic_model.fit(texts)
    documents_props, _ = topic_model.approximate_distribution(texts)
    return documents_props

def get_cluster_topic_representations(cluster_df):
    document_texts = cluster_df.fullText.tolist()
    
    topic_model = BERTopic(representation_model = KeyBERTInspired(), embedding_model="all-MiniLM-L6-v2")
    topic_model = topic_model.fit(document_texts)
    
    cluster_to_documents_df = cluster_df.groupby('cluster_label').agg({'fullText': lambda x: list(x)}).reset_index()
    cluster_topic_reps = []
    for idx, row in cluster_to_documents_df.iterrows():
        documents_props, _ = topic_model.approximate_distribution(row['fullText'])
        #print(row['cluster_label'])
        #print(documents_props[0].shape)
        cluster_topic_reps.append(np.mean(documents_props, axis=0))

    return np.array(cluster_topic_reps)

def compute_style_overlap(interp_space, top_f=10):
    style_top_rep = [[y[0] for y in sorted(x[1][1].items(), key=lambda x:-x[1])[:top_f]] for x in interp_space.items()]
    perc_inters = 0
    total=0
    for i, rep1 in enumerate(style_top_rep):
        for j, rep2 in enumerate(style_top_rep):
            if i != j:
                reps_intersection = len(set(rep1).intersection(rep2))/(len(rep1) + len(rep2))
                #print(reps_intersection)
                #print(reps_intersection)
                perc_inters+= len(set(rep1).intersection(rep2))/(len(rep1) + len(rep2))
                total+=1

    return round(perc_inters/total, 3)

def analyze_correlation(model_interp_space, model_clustering_df, styles_df):
    # Compute latent, topic, and style representations
    sd_centroids  = np.array([val[0] for key, val in model_interp_space.items()])
    sd_style_dist = get_cluster_style_representations(styles_df, model_clustering_df)
    sd_topic_dist = get_cluster_topic_representations(model_clustering_df)

    # Compute pairwise distance according to each representation
    pairwise_style_dist  = cosine_distances(sd_style_dist)
    pairwise_interp_dist = cosine_distances(sd_centroids)
    pairwise_topic_dist  = cosine_distances(sd_topic_dist)

    style_corr= round(distance_correlation(pairwise_interp_dist, pairwise_style_dist), 2)
    topic_corr= round(distance_correlation(pairwise_interp_dist, pairwise_topic_dist), 2)
    # Compute the distance correlation
    print(style_corr, topic_corr)

    # Compute the style and topic representation for each document
    document_texts = model_clustering_df.fullText.tolist()
    document_ids   = model_clustering_df.documentID.tolist()
    doc_topic_reps = get_topic_representations(document_texts)
    doc_style_reps = get_document_style_representations(styles_df, document_ids)
    
    doc_topic_reps = {document_id: doc_topic_reps[i] for i, document_id in enumerate(document_ids)}
    doc_style_reps = {document_id: doc_style_reps[i] for i, document_id in enumerate(document_ids)}

    # Compute the average similarity between documents' representation in the same cluster
    avg_topic_sim = []
    avg_style_sim = []
    for cluster_id in model_clustering_df.cluster_label.unique():
        cluster_document_ids = model_clustering_df[model_clustering_df.cluster_label == cluster_id].documentID.tolist()
        other_document_ids   = model_clustering_df[model_clustering_df.cluster_label != cluster_id].documentID.tolist()
        #print(cluster_document_ids, other_document_ids)
        sim_between_clusters = np.mean(cosine_similarity([doc_topic_reps[i] for i in cluster_document_ids], [doc_topic_reps[i] for i in other_document_ids]))
        sim_within_cluster   = np.mean(cosine_similarity([doc_topic_reps[i] for i in cluster_document_ids]))
        avg_topic_sim.append(sim_within_cluster/sim_between_clusters)

        sim_between_clusters = np.mean(cosine_similarity([doc_style_reps[i] for i in cluster_document_ids], [doc_style_reps[i] for i in other_document_ids]))
        sim_within_cluster   = np.mean(cosine_similarity([doc_style_reps[i] for i in cluster_document_ids]))       
        avg_style_sim.append(sim_within_cluster/sim_between_clusters)

    avg_topic_sim = round(np.mean(avg_topic_sim), 2)
    avg_style_sim = round(np.mean(avg_style_sim), 2)
    
    print(avg_style_sim, avg_topic_sim)

    #compute top-style feats representation overlap between the clusters
    top_style_feats_overlap = compute_style_overlap(model_interp_space, top_f=10)
    
    return avg_topic_sim, topic_corr, avg_style_sim, style_corr, top_style_feats_overlap

### Generating clusters using Milad-TA2: 

In [137]:
! CUDA_VISIBLE_DEVICES=6 python ../cluster_documents.py --train-dir "/mnt/swordfish-pool2/milad/hiatus-data/phase_1/training_authors.json" \
--test-dir "/mnt/swordfish-pool2/milad/hiatus-data/phase_1/valid_authors.json" \
--save-dir "/mnt/swordfish-pool2/milad/hiatus-data/phase_1/explainability/" \
--model aa_model-luar \
--style-dir "/mnt/swordfish-pool2/milad/hiatus-data/phase_1/explainability/refined_and_aggregated_features_final.csv" --top_k_feats 10 --eps 0.16

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


### Generating clusters using StyleDistance: 

In [19]:
! CUDA_VISIBLE_DEVICES=6 python ../cluster_documents.py --train-dir "/mnt/swordfish-pool2/milad/hiatus-data/phase_1/training_authors.json" \
--test-dir "/mnt/swordfish-pool2/milad/hiatus-data/phase_1/valid_authors.json" \
--save-dir "/mnt/swordfish-pool2/milad/hiatus-data/phase_1/styledistance_explanations/" \
--model styledistance \
--style-dir "/mnt/swordfish-pool2/milad/hiatus-data/phase_1/explainability/refined_and_aggregated_features_final.csv"

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


You try to use a model that was created with version 2.7.0, however, your version is 2.5.0. This might cause unexpected behavior or errors. In that case, try to update to the latest version.



You try to use a model that was created with version 2.7.0, however, your version is 2.5.0. This might cause unexpected behavior or errors. In that case, try to update to the latest version.



Testing Different Epsilon Values:   0%|                  | 0/99 [00:00<?, ?it/s][(0.209, 0.073, 0.273), 71] 0.01
Testing Different Epsilon Values:   1%|1         | 1/99 [00:31<50:44, 31.07s/it][(0.223, 0.067, 0.263), 16] 0.02
Testing Different Epsilon Values:   2%|2         | 2/99 [01:02<50:11, 31.05s/it][(0.271, 0.052, 0.242), 8] 0.03
Testing Different Epsilon Values:   3%|3         | 3/99 [01:33<49:37, 31.02s/it][(0.329, 0.017, 0.192), 3] 0.04
Testing Different Epsilon Values:   4%|4         | 4/99 [02:04<49:07, 31.03s/it][(0.319, 0.027, 0.209), 4] 0.05
Testing Different Epsilon Values:   5%|5         |

In [134]:
! CUDA_VISIBLE_DEVICES=6 python ../cluster_documents.py --train-dir "/mnt/swordfish-pool2/milad/hiatus-data/phase_1/training_authors.json" \
--test-dir "/mnt/swordfish-pool2/milad/hiatus-data/phase_1/valid_authors.json" \
--save-dir "/mnt/swordfish-pool2/milad/hiatus-data/phase_1/styledistance_explanations/" \
--model styledistance \
--style-dir "/mnt/swordfish-pool2/milad/hiatus-data/phase_1/explainability/refined_and_aggregated_features_final.csv" --top_k_feats 10 --eps 0.01

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


You try to use a model that was created with version 2.7.0, however, your version is 2.5.0. This might cause unexpected behavior or errors. In that case, try to update to the latest version.



You try to use a model that was created with version 2.7.0, however, your version is 2.5.0. This might cause unexpected behavior or errors. In that case, try to update to the latest version.





### Generating clusters using Semantic Models (all-MiniLM-L6-v2): 

This is just to have a baseline for comparing. How much of the topic vs style correlation is there

In [25]:
# We can't find best clustering here, since we are comparing to ground-truth labels that reflect authorship attribution not topic. So let us just take a default value of 0.5
! CUDA_VISIBLE_DEVICES=6 python ../cluster_documents.py --train-dir "/mnt/swordfish-pool2/milad/hiatus-data/phase_1/training_authors.json" \
--test-dir "/mnt/swordfish-pool2/milad/hiatus-data/phase_1/valid_authors.json" \
--save-dir "/mnt/swordfish-pool2/milad/hiatus-data/phase_1/semantic_explanations/" \
--model semantic_model \
--model_path "all-MiniLM-L6-v2" \
--style-dir "/mnt/swordfish-pool2/milad/hiatus-data/phase_1/explainability/refined_and_aggregated_features_final.csv"

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Testing Different Epsilon Values:   0%|                  | 0/99 [00:00<?, ?it/s][(0.5, 0.002, 0.16), 1] 0.01
Testing Different Epsilon Values:   1%|1         | 1/99 [00:12<21:04, 12.90s/it][(0.5, 0.002, 0.16), 1] 0.02
Testing Different Epsilon Values:   2%|2         | 2/99 [00:25<20:51, 12.90s/it][(0.5, 0.002, 0.16), 1] 0.03
Testing Different Epsilon Values:   3%|3         | 3/99 [00:38<20:38, 12.90s/it][(0.5, 0.002, 0.16), 1] 0.04
Testing Different Epsilon Values:   4%|4         | 4/99 [00:51<20:25, 12.90s/it][(0.5, 0.002, 0.16), 1] 0.05
Testing Different Epsilon Values:   5%|5         | 5/99 [01:04<20:12, 12.89s/it][(0.5, 0.002, 0.16), 1] 0.060000000000000005
Testing Different Epsilon Values:   6%|6         | 6/99 [01:17<19:58, 12.89s/it][(0.5, 0.002, 0.16), 1] 0.06999999999999999
Testing Different Epsilon Values:   7%|7         | 7/99 [01:30<19:45, 12.89s/it][(0.5, 0.002, 0.16), 1] 0.08
Testing Different Epsilon Values:   8%|8         | 8/99 [01:43<19:32, 12.89s/it][(0.5, 0.002, 0.1

In [138]:
! CUDA_VISIBLE_DEVICES=6 python ../cluster_documents.py --train-dir "/mnt/swordfish-pool2/milad/hiatus-data/phase_1/training_authors.json" \
--test-dir "/mnt/swordfish-pool2/milad/hiatus-data/phase_1/valid_authors.json" \
--save-dir "/mnt/swordfish-pool2/milad/hiatus-data/phase_1/semantic_explanations/" \
--model semantic_model \
--model_path "all-MiniLM-L6-v2" \
--style-dir "/mnt/swordfish-pool2/milad/hiatus-data/phase_1/explainability/refined_and_aggregated_features_final.csv" --top_k_feats 10 --eps 0.5

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


### What do models capture?

In [139]:
styles_path = "/mnt/swordfish-pool2/milad/hiatus-data/phase_1/explainability/refined_and_aggregated_features_final.csv"
styles_df   = pd.read_csv(styles_path)
training_df = pd.read_json('/mnt/swordfish-pool2/milad/hiatus-data/phase_1/training_candidates_and_queries.jsonl', lines=True)

model_clusterings = {
    'styledistance': ['/mnt/swordfish-pool2/milad/hiatus-data/phase_1/styledistance_explanations/'],
    'ta2_hrs': ['/mnt/swordfish-pool2/milad/hiatus-data/phase_1/explainability/'],
    'semantic': ['/mnt/swordfish-pool2/milad/hiatus-data/phase_1/semantic_explanations/'],
}

for key, val in model_clusterings.items():
    # style-distance clustering
    interp_space = pd.read_pickle(val[0] + "/interpretable_space.pkl")
    clustering_df  = pd.read_pickle(val[0] + "/training_authors_documents.pkl")
    # DBSCAN generate a cluster -1 of all outliers. We don't want this cluster
    del interp_space[-1]
    clustering_df = clustering_df[clustering_df.cluster_label != -1]
    
    model_clusterings[key].append(interp_space)
    model_clusterings[key].append(clustering_df)

In [189]:
model = 'ta2_hrs'
cluster_documents_df = model_clusterings[model][2].groupby('cluster_label').agg({'documentID': lambda x: len(list(x))}).reset_index()
interp_space_df = pd.DataFrame([{'cluster_label': x[0], 'style_feats': sorted(x[1][1].items(), key=lambda x:-x[1])} for x in model_clusterings[model][1].items()])
interp_space_df['num_feats'] = interp_space_df.style_feats.apply(lambda x: len(x))
interp_space_df['num_feats'] = interp_space_df.cluster_label.apply(lambda x: cluster_documents_df[cluster_documents_df.cluster_label==x]['documentID'].tolist()[0])

In [190]:
interp_space_df.head(n=10)

Unnamed: 0,cluster_label,style_feats,num_feats
0,0,"[(technical language, 1009.8526756281261), (style is professional, 914.9718378792628), (style is formal, 910.2397054539705), (casual language, 820.9818080163645), (diverse sentence lengths, 660.5148621000728), (complex sentence structures, 483.2371057535076), (specialized language, 474.8632995820517), (simple sentence structures, 474.7968698254498), (diverse sentence structures, 463.2258508093673), (consistent verb tense, 400.4899182352229), (formal tone, 386.4420264088066), (technical terms, 341.2594263048606), (additional information, 331.6098630795412), (complex ideas, 293.9998719638971), (complex concepts, 271.64372237323744), (consistent style, 243.58607384048995), (various letter cases, 235.4010519902901), (specialized terminology, 205.36513404432188), (distinguish elements, 178.61819867106698), (mathematical notation, 175.8254859790866), (abstract ideas, 174.35828339173167), (various elements, 165.3241754582177), (specialized notation, 163.48837752568846), (better balance, 162.50057019512087), (various sentence structures, 162.08526753426588), (extra information, 157.3445447423922), (various techniques, 140.01501645497567), (subordinate clauses, 139.71332471822413), (different letter cases, 137.65647335237313), (different tenses, 137.08538862781245), (technical elements, 133.75601904442655), (different types, 123.67003733780933), (formal style, 122.97912191949146), (further explanation, 116.68940132289781), (additional explanation, 114.03553522016405), (technical terminology, 112.4206395798609), (various communication methods, 105.43019403331583), (consistent use, 103.17817916061342), (professional tone, 102.960898437783), (consistent notation, 102.55064775016093), (mathematical concepts, 100.80345907872106), (passive voice, 98.8512892957298), (writing is concise, 93.99471961116578), (abstract concepts, 93.42193001467912), (full words instead, 91.46029286120181), (various symbols, 90.85241468882033), (various sentence types, 89.49591535378957), (wide range, 88.22899852964501), (grammatical structures, 86.65513738295311), (tense usage, 85.82753241720421), (precise language, 85.82708094155267), (mathematical symbols, 83.8322705679984), (multiple elements, 78.4989661660417), (technical content, 77.87349830470313), (plain text, 77.3130016207041), (effective communication, 75.51001092778675), (various methods, 75.19577568893263), (overall rhythm, 75.17368664015004), (different elements, 73.82086368594945), (informal language, 72.76450660959097), (varied sentence lengths, 67.7798119648139), (mathematical elements, 67.230730432401), (straightforward sentence structures, 66.70734295992324), (unexplained abbreviations, 66.3291844894817), (specific language, 66.20641357074105), (effective sentence structures, 65.16481028487522), (content is consistently structured, 64.64344835356685), (clear explanations, 64.46934485899843), (difficult concepts, 64.22757688509189), (writing is clear, 64.09415484385059), (different forms, 63.25609223512153), (different letter case variations, 63.25609223512153), (foreign language phrases, 62.63515639173897), (lower case letters, 61.50868810515291), (specific details, 58.833223884882784), (consistent pattern, 58.173644206964816), (improved comprehension, 57.3450764076199), (intricate elements, 56.22763754233025), (content is difficult, 53.116493365086924), (mathematical notations, 53.116493365086924), (style is sophisticated, 52.94990149639451), (different sentence structures, 52.75363880940391), (smooth reading experience, 52.734149566113516), (specialized content, 52.29201211549225), (verb forms, 51.44787379190584), (specific symbols, 50.88731724725719), (verb tenses, 50.85300751465516), (subject matter, 50.68246009785069), (particular field, 50.278237608329775), (academic disciplines, 49.71793865461502), (limited range, 48.883885292950104), (different techniques, 47.79722876145254), (unique elements, 47.254515319070904), (common language rules, 47.254515319070904), (information is consistently clear, 45.528422884360225), (style is descriptive, 45.528422884360225), (better transitions, 45.528422884360225), (english text, 45.528422884360225), (clear sentence structures, 45.50761440081365), (intricate concepts, 44.85655209488097), ...]",1662
1,1,"[(better engagement, 6.9594618213043296), (different techniques, 5.310803195716948), (relaxed style, 5.285485387732658), (intricate sentence structures, 4.957981821094206), (diverse sentence structures, 3.9818841043785156), (various sentence structures, 3.9056690972112262), (different sentence structures, 3.5169092539602604), (informal language, 3.4649765052186177), (subordinate clauses, 3.2491470864703285), (extra information, 3.146890894847844), (better balance, 2.9545558217294703), (specialized language, 2.7933135269532454), (style is professional, 2.453007608255396), (formal tone, 2.4304529962818027), (casual language, 2.3456623086181843), (complex sentence structures, 2.1669825370112448), (overall rhythm, 2.147819618290001), (style is formal, 1.6950460064319748), (diverse sentence lengths, 1.5801790959331887), (consistent verb tense, 1.0794876502297113)]",6
2,2,"[(diverse sentence structures, 88.92874499778684), (various sentence structures, 82.01905104143574), (style is formal, 76.27707028943887), (smooth reading experience, 61.523174493799104), (consistent verb tense, 60.45130841286383), (informal language, 58.9046005887165), (varied sentence lengths, 54.223849571851126), (relaxed style, 52.854853877326576), (author is informal, 49.43812277259813), (style is casual, 39.92763064996758), (precise language, 39.61249889610123), (formal tone, 38.88724794050884), (different language styles, 36.30090645986773), (casual language, 35.18493462927276), (different sentence structures, 35.1690925396026), (everyday language, 35.03322375550467), (overall rhythm, 32.21729427435002), (subjective viewpoint, 31.858375782011056), (informal style, 30.855022304700302), (structure is somewhat complex, 29.086822103482408), (better rhythm, 28.687207144318982), (diverse sentence lengths, 28.4432237267974), (extra remarks, 24.934099271685575), (structure is balanced, 23.914530273170413), (complex sentence structures, 23.836807907123692), (technical language, 22.566540237500025), (literary devices, 21.91142912152046), (simple sentence structures, 21.697706416754425), (style is informal, 20.20956602226112), (multiple languages, 20.1692191297203), (visual appeal, 19.978732751778537), (descriptive language, 19.835342245079268), (specialized language, 19.55319468867272), (varied sentence structures, 18.99012438631366), (vocabulary levels, 18.51301338282018), (casual language elements, 16.5624353225733), (extra details, 15.819188603202303), (various sentence types, 15.661785186913175), (technical terms, 15.654102124076175), (good grasp, 15.443203746702453), (improved impact, 14.92849242053219), (intricate sentence structures, 14.873945463282618), (clear sentence structures, 14.221129500254266), (moderate complexity, 13.4461460864802), (various subjects, 13.245979169366233), (additional comments, 13.245979169366233), (external sources, 12.224327921834252), (casual mood, 12.11318821952463), (technical terminology, 12.045068526413669), (relaxed feel, 11.432536607650226), (various voice types, 11.320357674348138), (passive voice, 10.983476588414423), (advanced language, 10.185602087806314), (casual tone, 10.124683672836898), (sophisticated language, 10.085078418244539), (consistent tone, 10.085078418244539), (different tenses, 9.791813473415175), (subordinate clauses, 9.747441259410985), (dynamic text flow, 9.66751202240654), (unique writing style, 9.66751202240654), (friendly writing style, 9.66751202240654), (author is easy, 9.66751202240654), (vivid language, 9.45173919959447), (extra information, 9.440672684543532), (writing is engaging, 8.974364841846596), (casual vibe, 8.974364841846596), (extra elements, 8.708612086729506), (casual language instead, 8.56889973373843), (vivid setting, 8.56889973373843), (positive tone, 8.56889973373843), (professional tone, 8.23687187502264), (formal style, 7.934136898031707), (specific references, 7.875752553178485), (detailed descriptions, 7.588070480726704), (better flow, 7.364926929412494), (descriptive language effectively, 7.269616749608169), (diverse techniques, 7.269616749608169), (text is casual, 7.269616749608169), (dynamic rhythm, 7.182605372618539), (different aspects, 7.102562664945003), (various components, 7.028454692791281), (improved rhythm, 6.8949233001667585), (accessible language, 6.8949233001667585), (tense usage, 6.866202593376337), (consistent lengths, 6.834298678350324), (intricate sentence constructions, 6.7230730432401), (various sentence lengths, 6.409415484385058), (relaxed atmosphere, 6.371675156402211), (subject matter, 6.335307512231336), (moderate level, 6.201776119606813), (better readability, 6.1411514977903785), (advanced vocabulary, 6.1411514977903785), (figurative language, 6.08399308395043), (specialized vocabulary, 5.953939955702232), (sophisticated tone, 5.953939955702232), (diverse vocabulary, 5.929842404123172), (specific details, 5.883322388488279), (style is polished, 5.817364420696482), (appropriate language, 5.817364420696482), (better understanding, 5.390845903390485), ...]",154
3,3,"[(simple sentence structures, 529.6793037031227), (various sentence types, 498.93972809737687), (consistent verb tense, 396.17196763430405), (effective sentence structures, 244.36803856828206), (diverse sentence lengths, 192.78184970384902), (better rhythm, 121.1237634982357), (different tenses, 120.76569950545382), (brief sentences, 111.12099637740863), (straightforward sentence structures, 102.28459253854896), (verb tense, 90.90949189658322), (various sentence starters, 90.63427170712556), (diverse sentence structures, 90.25603969924634), (various techniques, 87.03636158012), (clear sentence structures, 82.48255110147474), (descriptive language, 79.34136898031707), (technical language, 78.98289083125009), (simpler options, 71.02562664945003), (complex ones, 63.71675156402211), (different types, 57.71268409097769), (vivid language, 56.710435197566824), (uniform structure, 56.421603316713906), (various tenses, 56.415266097020464), (precise language, 49.51562362012653), (different sentence structures, 49.236729555443645), (formal style, 47.604821388190246), (specialized language, 47.486329958205175), (tense usage, 44.63031685694619), (effective communication, 44.41765348693338), (limited range, 39.10710823436008), (simple language, 38.97377181447839), (different sentence styles, 37.45016736209161), (various sentence structures, 37.10385642350665), (complex sentence structures, 36.83870312919116), (simple language structures, 32.66008903238695), (various items, 31.676537561156678), (better readability, 30.705757488951892), (various sentence beginnings, 30.255235254733616), (overall rhythm, 30.069474656060017), (straightforward sentence constructions, 30.01975188138447), (formal tone, 29.165435955381632), (particular audience, 28.77744508489197), (technical terms, 28.177383823337113), (different ways, 24.015801505107575), (verb forms, 23.38539717813902), (various ways, 22.865073215300452), (limited sentence structures, 22.428276047440484), (basic language structures, 21.547816117855618), (specific type, 21.307687994835007), (casual language, 21.11096077756366), (certain level, 20.684769900500275), (active voice, 20.054206688996068), (verb tenses, 19.55884904409814), (concise sentence structures, 18.9006485772602), (structure is balanced, 17.93589770487781), (straightforward language, 17.516611877752336), (concise sentences, 16.821207035580365), (vocabulary styles, 16.573131888045022), (different sentence lengths, 16.48933831170791), (specialized terms, 16.214496436095672), (technical terminology, 16.060091368551557), (different techniques, 15.932409587150845), (structured writing style, 15.75150510635697), (dynamic language, 15.443203746702453), (prior knowledge, 15.176140961453408), (improved clarity, 15.176140961453408), (smooth transitions, 14.940574890140642), (style is professional, 14.718045649532378), (passive voice, 14.64463545121923), (consistent style, 14.615164430429395), (specific terms, 13.4461460864802), (smooth flow, 13.245979169366233), (various verb tenses, 13.245979169366233), (relaxed atmosphere, 12.743350312804422), (tense verbs, 12.53262928148877), (clear explanations, 11.72169906527244), (relaxed style, 10.570970775465316), (strong language, 9.66751202240654), (positive atmosphere, 9.66751202240654), (personal experiences, 9.66751202240654), (description is casual, 9.66751202240654), (comprehensive details, 9.66751202240654), (standardized naming method, 9.66751202240654), (consistent naming strategy, 9.66751202240654), (smooth transition, 9.66751202240654), (particular resources, 9.66751202240654), (efficient communication, 9.66751202240654), (diverse naming conventions, 9.66751202240654), (overall flow, 9.66751202240654), (concrete objects, 9.66751202240654), (various elements, 9.184676414345427), (clear formatting, 8.974364841846596), (specific area, 8.974364841846596), (different items, 8.974364841846596), (necessary items, 8.974364841846596), (related words, 8.974364841846596), (thorough information, 8.974364841846596), (different sections, 8.974364841846596), (formal tone throughout, 8.974364841846596), (distinct tone, 8.974364841846596), (present ideas, 8.974364841846596), ...]",553
4,4,"[(complex sentence structures, 97.51421416550602), (diverse sentence structures, 91.58333440070585), (formal tone, 75.34404288473588), (style is formal, 71.19193227014294), (extra information, 69.23159968665256), (professional tone, 65.89497500018112), (consistent verb tense, 65.84874666401238), (additional elements, 65.30057460244718), (subordinate clauses, 64.98294172940658), (overall rhythm, 62.286768930410034), (better rhythm, 60.56188174911785), (verb tenses, 58.67654713229442), (tense usage, 58.36272204369887), (better balance, 56.13656061285994), (simple sentence structures, 51.05342686295159), (various sentence structures, 46.86802916653471), (grammatical structures, 45.21137602588858), (diverse sentence lengths, 37.92429830239653), (improved writing, 31.858375782011056), (various sentence types, 31.32357037382635), (technical terms, 31.30820424815235), (specialized language, 30.7264487964857), (passive voice, 29.28927090243846), (vivid language, 28.355217598783412), (various voice types, 28.300894185870344), (literary devices, 27.389286401900574), (specific language, 25.464005219515787), (visual appeal, 24.97341593972317), (limited range, 24.441942646475052), (technical terminology, 24.090137052827338), (descriptive language, 23.802410694095123), (varied sentence lengths, 23.722934187684867), (precise language, 23.107291022725715), (technical language, 22.566540237500025), (active voice, 20.054206688996068), (clear sentence structures, 19.909581300355974), (clear language, 19.55355411718004), (verb tense, 19.138840399280678), (possessive pronouns, 19.115025469206632), (different voice styles, 18.60532835882044), (figurative language, 18.25197925185129), (straightforward sentence structures, 17.788624789312863), (effective communication, 17.767061394773354), (appropriate language, 17.452093262089445), (extra elements, 17.41722417345901), (different things, 17.13779946747686), (different sentence lengths, 16.48933831170791), (certain words, 15.176140961453408), (outside sources, 15.176140961453408), (longer words, 15.176140961453408), (various techniques, 15.136758535673044), (simple language, 14.615164430429395), (various perspectives, 14.205125329890006), (verb forms, 14.031238306883413), (wide range, 13.93089450468079), (descriptive words, 13.789846600333517), (basic language, 13.4461460864802), (smooth reading experience, 13.183537391528379), (different types, 12.367003733780933), (different styles, 12.34200892188012), (dynamic style, 12.11318821952463), (various languages, 12.059851725360309), (different voices, 11.812623813425954), (casual language, 11.728311543090921), (style is polished, 11.634728841392963), (limited sentence structures, 11.214138023720242), (intricate ideas, 11.017257878093737), (better understanding, 10.78169180678097), (informal language, 10.394929515655853), (consistent tone, 10.085078418244539), (style is professional, 9.812030433021585), (consistent style, 9.743442953619597), (various noun forms, 9.66751202240654), (style is often traditional, 9.66751202240654), (format is consistent, 9.66751202240654), (standard notation, 9.66751202240654), (proper citation formats, 9.66751202240654), (essential components, 9.66751202240654), (mythical creatures, 9.66751202240654), (historical references, 9.66751202240654), (impersonal pronouns, 9.66751202240654), (straightforward writing style, 9.66751202240654), (proper grammar rules, 9.66751202240654), (main character, 9.66751202240654), (vivid atmosphere, 9.66751202240654), (direct speech indication, 9.66751202240654), (poetic language, 9.66751202240654), (fictional world, 9.66751202240654), (specific order, 9.66751202240654), (possible outcomes, 9.66751202240654), (strong imagery, 9.66751202240654), (friendly breaks, 9.66751202240654), (style is dense, 9.66751202240654), (rhythm is crucial, 9.66751202240654), (creative writing, 9.66751202240654), (specific point, 9.66751202240654), (varied sentence structures, 9.49506219315683), (various elements, 9.184676414345427), (authentic atmosphere, 8.974364841846596), (simple sentence construction, 8.974364841846596), ...]",230
5,5,"[(clear sentence structures, 688.3026678123065), (overall rhythm, 642.1980658687104), (consistent verb tense, 566.7310163705985), (diverse sentence lengths, 480.37444516368936), (better rhythm, 449.43291192766407), (various sentence types, 434.05518946587944), (various sentence structures, 408.14242065857314), (varied sentence lengths, 396.51189999416135), (better balance, 378.1831451813722), (visual impact, 347.3432579183486), (formal tone, 296.5152655463799), (formal writing style, 263.7643929581414), (diverse sentence structures, 246.87681447146795), (complex language, 235.3991935502881), (precise language, 227.77186865258207), (style is formal, 218.66093482972474), (professional tone, 214.15866875058865), (specific words, 202.30085082100854), (different sentence structures, 182.87928120593355), (concise sentences, 151.39086332022327), (improved impact, 149.2849242053219), (overall quality, 139.30166452453324), (tense usage, 133.89095057083856), (consistent tone, 131.10601943717901), (specialized language, 128.49242223984928), (diverse elements, 110.5407269602268), (smooth reading experience, 109.86281159606983), (different sentence lengths, 107.18069902610142), (various languages, 96.47881380288247), (concise language, 96.14123226577587), (passive voice, 91.52897157012019), (additional information, 91.10161073613769), (simple sentence structures, 90.61983268173907), (formal style, 87.27550587834878), (various sources, 81.3256831741245), (particular style, 80.22305799222079), (communication is important, 78.12818931439503), (effective communication, 75.51001092778675), (different language styles, 68.064199612252), (grammatical structures, 67.81706403883288), (formal language, 62.55797671383638), (various language styles, 56.60178837174069), (technical terminology, 56.21031978993045), (multiple sources, 55.15938640133407), (active voice, 55.14906839473919), (descriptive language, 51.5718898372061), (external quotes, 51.55448850588746), (technical terms, 50.09312679704376), (external sources, 48.89731168733701), (different methods, 48.67194467160344), (neutral tone, 48.26446310116731), (enhanced understanding, 47.84009074845227), (appropriate language, 46.53891536557185), (various elements, 45.92338207172713), (proper nouns, 45.426207344410166), (various techniques, 45.41027560701913), (various punctuation marks, 44.189561576474965), (verb tense, 43.06239089838152), (technical language, 41.37199043541671), (clear communication, 40.66284158706225), (intricate sentence structures, 39.66385456875365), (direct speech, 38.608009366756136), (dynamic style, 36.33956465857389), (specific language, 35.6496073073221), (informal language, 34.649765052186176), (specific style, 34.47461650083379), (specific terms, 33.6153652162005), (cultural context, 33.1248706451466), (intricate ideas, 33.05177363428121), (different types, 32.97867662341582), (better understanding, 32.34507542034291), (diverse languages, 31.50301021271394), (verb tenses, 31.294158470557022), (sophisticated writing style, 30.886407493404906), (dynamic language, 30.886407493404906), (different elements, 28.392639879211327), (more information, 27.337194713401296), (advanced language, 25.464005219515787), (extra information, 25.17512715878275), (succinct sentences, 24.84365298385995), (overall impact, 24.84365298385995), (broad audience, 24.84365298385995), (different styles, 24.68401784376024), (clear language, 24.441942646475052), (specific references, 23.627257659535452), (specific details, 23.533289553953114), (emotional impact, 22.764211442180112), (diverse sources, 22.764211442180112), (various tenses, 21.698179268084793), (sophisticated language, 20.170156836489078), (simple language, 19.486885907239195), (complex ideas, 19.34209683973007), (shortened terms, 19.228246453155176), (varied sentence structures, 18.99012438631366), (concise wording, 17.94872968369319), (particular writing system, 17.94872968369319), (multiple resources, 17.94872968369319), (cultural influences, 17.94872968369319), (diverse perspectives, 17.94872968369319), (ongoing events, 17.94872968369319), ...]",1107
6,6,"[(various tenses, 26.037815121701755), (simple sentence structures, 21.697706416754425), (style is professional, 17.171053257787772), (casual language, 16.41963616032729), (various sentence types, 13.424387303068436), (different tenses, 13.0557512978869), (few clauses, 8.974364841846596), (diverse sentence lengths, 7.900895479665944), (consistent verb tense, 7.556413551607979), (technical language, 7.522180079166675), (relaxed tone, 6.8949233001667585), (clear explanations, 5.86084953263622), (consistent pattern, 5.817364420696482), (various ways, 5.716268303825113), (specialized language, 5.586627053906491), (better understanding, 5.390845903390485), (verb tense, 4.7847100998201695), (descriptive language, 3.9670684490158536), (formal style, 3.9670684490158536), (informal language, 3.4649765052186177), (clear sentence structures, 2.844225900050853)]",18
7,7,"[(overall rhythm, 12.886917709740008), (diverse word structures, 9.66751202240654), (intricate thoughts, 9.66751202240654), (personal bond, 8.974364841846596), (different sentence lengths, 8.244669155853956), (various sentence structures, 7.8113381944224525), (linguistic elements, 7.028454692791281), (relaxed atmosphere, 6.371675156402211), (abstract concepts, 5.838870625917445), (active voice, 5.013551672249017), (improved impact, 4.976164140177397), (vivid language, 4.725869599797235), (descriptive language, 3.9670684490158536), (formal style, 3.9670684490158536), (complex ideas, 3.868419367946014), (visual impact, 3.859369532426096), (passive voice, 3.6611588628048075), (informal language, 3.4649765052186177), (diverse sentence structures, 2.6545894029190102), (complex sentence structures, 2.1669825370112448), (consistent verb tense, 2.1589753004594225)]",8
8,8,"[(better rhythm, 9.56240238143966), (modern advancements, 8.974364841846596), (style is formal, 8.475230032159875), (improved clarity, 7.588070480726704), (additional context, 7.588070480726704), (descriptive language effectively, 7.269616749608169), (precise language, 6.602083149350205), (clear manner, 6.576469569048224), (proper nouns, 6.489458192058595), (style is polished, 5.817364420696482), (various voice types, 5.660178837174069), (different ideas, 5.336778682120209), (rhythmic effect, 5.236695223563227), (complex concepts, 4.05438391601847), (verb tenses, 3.9117698088196278), (simple sentence structures, 3.829007014721369), (technical language, 3.7610900395833373), (additional information, 3.6440644294455073), (subordinate clauses, 3.2491470864703285), (consistent verb tense, 3.238462950689134), (extra information, 3.146890894847844), (specialized language, 2.7933135269532454), (various sentence types, 2.2373978838447393), (complex sentence structures, 2.1669825370112448), (diverse sentence lengths, 1.5801790959331887), (diverse sentence structures, 1.3272947014595051)]",11
9,9,"[(diverse sentence structures, 17.254831118973566), (vivid language, 14.177608799391706), (various sentence structures, 13.669841840239291), (style is formal, 13.560368051455798), (overall rhythm, 12.886917709740008), (formal style, 11.901205347047561), (structure is somewhat complex, 11.634728841392963), (favorable atmosphere, 9.66751202240654), (objective reporting, 9.66751202240654), (present ideas, 8.974364841846596), (extra elements, 8.708612086729506), (visual impact, 7.718739064852192), (style is detailed, 7.588070480726704), (consistent verb tense, 7.556413551607979), (grammatical structures, 7.535229337648097), (various components, 7.028454692791281), (simple sentence structures, 6.381678357868949), (vocabulary levels, 6.17100446094006), (figurative language, 6.08399308395043), (appropriate language, 5.817364420696482), (various sentence beginnings, 5.042539209122269), (active voice, 5.013551672249017), (different language styles, 4.537613307483467), (various tenses, 4.339635853616959), (complex sentence structures, 4.3339650740224895), (verb tenses, 3.9117698088196278), (passive voice, 3.6611588628048075), (different sentence structures, 3.5169092539602604), (informal language, 3.4649765052186177), (varied sentence lengths, 3.3889905982406954), (subordinate clauses, 3.2491470864703285), (extra information, 3.146890894847844), (style is professional, 2.453007608255396), (formal tone, 2.4304529962818027), (casual language, 2.3456623086181843), (various sentence types, 2.2373978838447393), (diverse sentence lengths, 1.5801790959331887)]",20


In [140]:
#{x[0]: sorted(x[1][1].items(), key=lambda x:-x[1])[:10] for x in sd_interp_space.items()}
#sd_clustering.documentID.nunique()

In [195]:
# interp_space = model_clusterings['styledistance'][1]
# {x[0]: sorted(x[1][1].items(), key=lambda x:-x[1])[:10] for x in interp_space.items()}

In [196]:
models_analysis = {}

In [197]:
for model_name, model_interpretablity in model_clusterings.items():
    print(model_name, model_interpretablity[0])
    avg_topic_sim, topi_corr, avg_style_sim, style_corr = analyze_correlation(model_interpretablity[1], model_interpretablity[2], styles_df)
    models_analysis[model_name] = [avg_topic_sim, topi_corr, avg_style_sim, style_corr]

styledistance /mnt/swordfish-pool2/milad/hiatus-data/phase_1/styledistance_explanations/
0.43 0.69
Document 43d2e249-e16e-59e8-b271-4065734f42b0 doesn't exist!
Document f140817d-d77f-57cf-af98-b9e3bdefd608 doesn't exist!
2.05 1.76


ValueError: too many values to unpack (expected 4)

In [None]:
print(tabulate(
        [[name] + scores for name, scores in models_analysis.items()],
        headers=['Embed Model', 'Avg. topic sim', 'Topic corr', 'Avg. style sim', 'Style corr']
    ))

In [13]:
print(tabulate(
        [[name] + scores for name, scores in models_analysis.items()],
        headers=['Embed Model', 'Avg. topic sim', 'Topic corr', 'Avg. style sim', 'Style corr']
    ))

Embed Model      Avg. topic sim    Topic corr    Avg. style sim    Style corr
-------------  ----------------  ------------  ----------------  ------------
styledistance              4.29          0.75              2.05          0.7
ta2_hrs                    5.93          0.81              2.38          0.77
semantic                   9.93          0.93              2.47          0.93


In [113]:
#docs_topic_reps = get_topic_representations(training_df.fullText.tolist())

In [148]:
#docs_style_reps = get_style_representation(styles_path, training_df.documentID.tolist())

In [115]:
#docs_latents, docs_interps = generate_explanations.get_documents_rep_vectors(training_df.fullText.tolist(), 'styledistance', 
#                                                               '/mnt/swordfish-pool2/milad/hiatus-data/phase_1/styledistance_explanations/interpretable_space.pkl')

In [21]:
# interp_space_path = '/mnt/swordfish-pool2/milad/hiatus-data/phase_1/explanation_10k/interpretable_space.pkl'
# model_path = 'aa_model-luar'
# explanation_interf = explanation_interfaces.exp_interface_1
# input_path = '/mnt/swordfish-pool2/milad/hiatus-data/explainability-pilot-samples/'
# top_c=5