In [2]:
%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [45]:

import pandas as pd
import json
import sklearn
import glob
import pickle
from sklearn.model_selection import train_test_split
from collections import Counter


# Remove every pandas display limit so that wide or long frames are shown in full.
for _display_option in (
    'display.width',
    'display.max_colwidth',
    'display.max_rows',
    'display.max_columns',
):
    pd.set_option(_display_option, None)

In [52]:
# Reload modules that changed on disk (uses the autoreload extension loaded in an earlier cell).
%autoreload
import sys
# Make the pipeline sources importable; the path is relative to the kernel's working directory.
sys.path.insert(0, '../../style_generation_pipeline')

# Star imports from two project modules. The representation helpers called further down
# (generate_interpretable_space_representation, ...) presumably come from here — TODO confirm
# and switch to explicit imports.
from data import *
from cluster_representation import *

In [53]:
# Base directory that the cells below prefix onto the files they read and write.
# NOTE(review): the value ends with '/' and later cells append '/<name>', so the resulting paths contain '//'.
path='/mnt/swordfish-pool2/milad/hiatus-data/explainability_all_data/'

### Merging data from phase 1 and 2 that was used to generate style features

In [96]:
# Each file is JSON Lines: one document record per line.
_phase1_docs_file = '/mnt/swordfish-pool2/milad/hiatus-data/phase_1/explainability/training_candidates_and_queries.jsonl'
_phase2_docs_file = '/mnt/swordfish-pool2/milad/hiatus-data/phase_2/explainability/all_documents_in_cross_genre.jsonl'

phase1_docs = pd.read_json(path_or_buf=_phase1_docs_file, lines=True)
phase2_docs = pd.read_json(path_or_buf=_phase2_docs_file, lines=True)

In [97]:
# Stack both phases; reset_index() renumbers the rows and keeps the old labels as a column.
_phase_doc_frames = [phase1_docs, phase2_docs]
all_docs = pd.concat(_phase_doc_frames).reset_index()

In [101]:
# Export only the three columns listed here, one JSON record per line.
_exported_columns = ['authorID', 'fullText', 'documentID']
all_docs[_exported_columns].to_json(
    path + '/all_document.jsonl',
    orient='records',
    lines=True,
)

### Merging data from phase 1 and 2 to perform clustering

In [10]:
# Author-level splits of phase 2.
_phase2_train_file = '/mnt/swordfish-pool2/milad/hiatus-data/phase_2/explainability/train_authors.json'
_phase2_test_file = '/mnt/swordfish-pool2/milad/hiatus-data/phase_2/explainability/test_authors.json'
phase2_training_data = pd.read_json(_phase2_train_file)
phase2_test_data = pd.read_json(_phase2_test_file)

# Author-level splits of phase 1; its "valid" file is used as the test split here.
_phase1_train_file = '/mnt/swordfish-pool2/milad/hiatus-data/phase_1/training_authors.json'
_phase1_test_file = '/mnt/swordfish-pool2/milad/hiatus-data/phase_1/valid_authors.json'
phase1_training_data = pd.read_json(_phase1_train_file)
phase1_test_data = pd.read_json(_phase1_test_file)

In [17]:
# Concatenate phase 1 and phase 2 for each split.
# reset_index() is called without drop=True, so the previous row labels stay in the
# frame as an extra column (the `level_0` column in the column listing below).
training_authors = pd.concat([phase1_training_data, phase2_training_data])
training_authors = training_authors.reset_index()

test_authors = pd.concat([phase1_test_data, phase2_test_data])
test_authors = test_authors.reset_index()

In [18]:
# Show the schema of the merged training split.
training_authors.columns

Index(['level_0', 'index', 'authorID', 'fullText', 'documentID', 'source'], dtype='object')

In [19]:
# Distinct author counts per row: phase 1, phase 2, merged.
# First row is the training split, second row the test split.
for _phase1_split, _phase2_split, _merged_split in (
    (phase1_training_data, phase2_training_data, training_authors),
    (phase1_test_data, phase2_test_data, test_authors),
):
    print(
        _phase1_split.authorID.nunique(),
        _phase2_split.authorID.nunique(),
        _merged_split.authorID.nunique(),
    )

4142 1216 5358
635 305 940


In [20]:
# Write both merged splits under the shared output directory.
for _split_frame, _split_file in (
    (training_authors, '/train_authors.json'),
    (test_authors, '/test_authors.json'),
):
    _split_frame.to_json(path + _split_file)

### Merge the writing style features of both phases

In [12]:
def _load_phase_style_features(features_csv, ling_lvl_json):
    """Read one phase's style-feature table and tag every row with a linguistic level.

    Args:
        features_csv: path of the phase's refined/aggregated features CSV.
        ling_lvl_json: path of the JSON file mapping an original attribute name to its level.

    Returns:
        A ``(features_df, mapping)`` pair: the table with an added ``ling_lvl`` column,
        and the mapping loaded from ``ling_lvl_json``.
    """
    features_df = pd.read_csv(features_csv)
    # Use a context manager so the mapping file is closed once parsed
    # (a bare json.load(open(...)) leaves the handle open).
    with open(ling_lvl_json) as mapping_file:
        mapping = json.load(mapping_file)
    # Attribute names missing from the mapping are labelled 'other'.
    features_df['ling_lvl'] = features_df.original_attribute_name.apply(
        lambda name: mapping.get(name, 'other')
    )
    return features_df, mapping


phase1_df, feat_to_ling_lvl = _load_phase_style_features(
    '/mnt/swordfish-pool2/milad/hiatus-data/phase_1/explainability/filtered/refined_and_aggregated_features_final.csv',
    '/mnt/swordfish-pool2/milad/hiatus-data/phase_1/explainability/feats_to_ling_lvl.json',
)

# As before, `feat_to_ling_lvl` ends up holding the phase-2 mapping after this cell.
phase2_df, feat_to_ling_lvl = _load_phase_style_features(
    '/mnt/swordfish-pool2/milad/hiatus-data/phase_2/explainability/filtered/refined_and_aggregated_features_final.csv',
    '/mnt/swordfish-pool2/milad/hiatus-data/phase_2/explainability/feats_to_ling_lvl.json',
)

In [23]:
# Stack the feature tables of both phases; reset_index() keeps the old row labels as a new column.
df = pd.concat([phase1_df, phase2_df]).reset_index()

In [26]:
# Sanity numbers: documents, then distinct feature names per naming column, then linguistic levels.
print(df.documentID.nunique())
_name_columns = ('shortend_attribute_name.v2', 'aggregated_name', 'final_attribute_name')
print(*(df[_name_column].nunique() for _name_column in _name_columns))
print(df.ling_lvl.nunique())

22706
9314 9049 2120
4


In [34]:
# Write the merged feature table; index=False leaves the row labels out of the CSV.
df.to_csv(path + '/refined_and_aggregated_features_final.csv', index=False)

In [36]:
# One row per (final name, aggregated name) pair:
# the number of rows in the pair and the linguistic level of its first row.
_pair_summary = (
    df.groupby(['final_attribute_name', 'aggregated_name'])
    .agg({
        'documentID': lambda ids: len(ids),
        'ling_lvl': lambda levels: list(levels)[0],
    })
    .reset_index()
)
_pair_summary.to_csv(path + '/llm_generated_style_feats.csv')

In [28]:
def _occurrence_counts(names):
    """Return ``{value: number of occurrences}`` for the values of one group."""
    # Counter tallies the group in a single pass. The earlier
    # `feats.tolist().count(f) for f in set(feats)` form rebuilt and rescanned the whole
    # group once per distinct value. Only the key order of the resulting dict changes
    # (first-seen order instead of set order).
    return dict(Counter(names))


# For every final attribute name: which upstream names were folded into it (with their
# frequencies), how many rows it covers, and the linguistic level of each of those rows.
g_df = df.groupby('final_attribute_name').agg({
    'original_attribute_name': _occurrence_counts,
    'shortend_attribute_name.v1': _occurrence_counts,
    'shortend_attribute_name.v2': _occurrence_counts,
    'aggregated_name': _occurrence_counts,
    'documentID': lambda x: len(x),
    'ling_lvl': lambda x: list(x),
}).reset_index()

# A final attribute takes the linguistic level that is most frequent among its rows.
g_df['final_attribute_ling_lvl'] = g_df['ling_lvl'].apply(lambda x: Counter(x).most_common(1)[0][0])

In [30]:
# Dump the per-final-attribute summary as an indented JSON array of records.
g_df.to_json(path + '/style_features_corpus.json', orient='records', indent=2)

In [31]:
# Distinct aggregated and final names, followed by the number of final attributes per linguistic level.
for _recap_column in ('aggregated_name', 'final_attribute_name'):
    print(df[_recap_column].nunique())
g_df.final_attribute_ling_lvl.value_counts()

9049
2120


Semantic Level         673
Discourse Level        615
Syntactic Level        490
Morphological Level    342
Name: final_attribute_ling_lvl, dtype: int64

#### Loading the manually processed feats:

In [81]:
# Reload the merged feature table from the CSV written by an earlier cell.
llm_based_df = pd.read_csv(path + '/refined_and_aggregated_features_final.csv')

In [82]:
# Manually curated feature sheet (tab-separated).
cleaned_up_feats = pd.read_csv(
    '/mnt/swordfish-pool2/milad/hiatus-data/explainability_all_data/manuall-processed-feats-nov-23-24.tsv',
    sep='\t',
)
# Drop rows whose documentID cell holds the literal header text 'documentID'.
_is_data_row = cleaned_up_feats.documentID != 'documentID'
cleaned_up_feats = cleaned_up_feats[_is_data_row]

# Per processed feature: the sum of its documentID values (cast to int) and the list of
# raw feature names that were mapped onto it.
grouped_cleaned_up_feats_df = (
    cleaned_up_feats
    .groupby('full_feature_processed')
    .agg({
        'documentID': lambda counts: sum(int(count) for count in counts),
        'full_feature_name': lambda raw_names: list(raw_names),
    })
    .reset_index()
)

In [83]:
# Keep the processed features whose summed documentID value is above 50.
_is_frequent = grouped_cleaned_up_feats_df.documentID > 50
filtered_feats = grouped_cleaned_up_feats_df[_is_frequent]

# Lookup: raw feature name -> processed feature name.
filtered_feats_map = {}
for _processed_name, _raw_names in zip(
    filtered_feats.full_feature_processed.tolist(),
    filtered_feats.full_feature_name.tolist(),
):
    for _raw_name in _raw_names:
        filtered_feats_map[_raw_name] = _processed_name

In [84]:
# Five processed features with the largest summed documentID value.
_by_frequency = filtered_feats.sort_values('documentID', ascending=False)
_by_frequency[['full_feature_processed', 'documentID']].head(n=5)

Unnamed: 0,full_feature_processed,documentID
2793,the author uses diverse sentence structures,11550
79,complex sentence structures are used,7066
3008,the author uses simple sentence structures,5880
232,specialized language is used,5337
214,sentence structures are varied,4740


In [85]:
# Translate each aggregated name through the curated lookup; names without an entry become ''.
_curated_names = llm_based_df.aggregated_name.apply(
    lambda aggregated: filtered_feats_map.get(aggregated, '')
)
llm_based_df['final_attribute_name_manually_processed'] = _curated_names

# Keep only the rows that received a curated name.
_has_curated_name = llm_based_df['final_attribute_name_manually_processed'] != ''
llm_based_df = llm_based_df[_has_curated_name]

In [86]:
# Distinct curated features and distinct documents that remain after filtering.
for _label, _column in (
    ('features: ', 'final_attribute_name_manually_processed'),
    ('documents: ', 'documentID'),
):
    print(_label, llm_based_df[_column].nunique())

features:  332
documents:  22520


In [87]:
# Save the filtered table (only rows with a curated feature name), without the row labels.
llm_based_df.to_csv(path + '/refined_and_aggregated_features_final_manually_processed.csv', index=False)

#### Merge the Gram2vec style feats to our LLM-based style feats:

In [88]:
# Reload the curated table and collapse it to one row per document,
# holding the list of that document's curated feature names.
llm_based_df = pd.read_csv(path + '/refined_and_aggregated_features_final_manually_processed.csv')
llm_based_df = (
    llm_based_df
    .groupby('documentID')
    .agg({'final_attribute_name_manually_processed': lambda names: list(names)})
    .reset_index()
)

In [89]:
gram2vec_df = pd.read_json(path+ '/normalized_all_document_gram2vec_top_features.jsonl', lines=True)

# documentID -> the gram2vec_feats entry of that document.
gram2vec_dict = dict(zip(gram2vec_df.documentID.tolist(), gram2vec_df.gram2vec_feats.tolist()))

# Every gram2vec feature that occurs in at least one document.
gram2vec_feats = {
    _feature
    for _document_feats in gram2vec_dict.values()
    for _feature in _document_feats
}

In [90]:
# Append each document's gram2vec features to its list of curated LLM features.
# Re-running this cell appends them again, so run it once per reload of llm_based_df.
_llm_feature_column = 'final_attribute_name_manually_processed'
llm_based_df[_llm_feature_column] = llm_based_df.apply(
    lambda row: row[_llm_feature_column] + gram2vec_dict[row['documentID']],
    axis=1,
)

In [91]:
# Back to long format: one row per (document, feature) in both tables.
llm_based_df = llm_based_df.explode('final_attribute_name_manually_processed')
gram2vec_df = gram2vec_df.explode('gram2vec_feats')

In [92]:
# Long-format outputs: the combined LLM + gram2vec features, and the gram2vec features on their own.
llm_based_df.to_csv(path + '/llm_and_gram2vec_feats.csv', index=False)
gram2vec_df.to_csv(path+ '/gram2vec_feats.csv', index=False)

### Constructed Interpretable Space:

In [None]:
# Run the clustering script on the merged splits with the `aa_model-luar` model.
# No --eps is passed here; the captured output below shows the script testing a range of epsilon values.
! CUDA_VISIBLE_DEVICES=6 python ../cluster_documents.py --train-dir "/mnt/swordfish-pool2/milad/hiatus-data/explainability_all_data/train_authors.json" \
--test-dir "/mnt/swordfish-pool2/milad/hiatus-data/explainability_all_data/test_authors.json" \
--save-dir "/mnt/swordfish-pool2/milad/hiatus-data/explainability_all_data/" \
--model aa_model-luar \
--style-dir "/mnt/swordfish-pool2/milad/hiatus-data/explainability_all_data/refined_and_aggregated_features_final.csv"

Testing Different Epsilon Values:   0%|                  | 0/99 [00:00<?, ?it/s][(0.5, 0.001, 0.151, 0.998), 1, 0, 0] 0.01
Testing Different Epsilon Values:   1%|1         | 1/99 [00:28<47:20, 28.99s/it][(0.284, 0.018, 0.191, 0.68), 3, 0.9848099, 0] 0.02
Testing Different Epsilon Values:   2%|2         | 2/99 [01:00<49:27, 30.59s/it][(0.216, 0.065, 0.256, 0.643), 8, 0.9179425, 0] 0.03
Testing Different Epsilon Values:   3%|3         | 3/99 [01:32<49:57, 31.23s/it][(0.182, 0.114, 0.311, 0.64), 21, 0.6840293, 0] 0.04
Testing Different Epsilon Values:   4%|4         | 4/99 [02:04<49:47, 31.44s/it][(0.183, 0.123, 0.322, 0.648), 26, 0.5772858, 0] 0.05
Testing Different Epsilon Values:   5%|5         | 5/99 [02:36<49:22, 31.52s/it][(0.173, 0.139, 0.339, 0.645), 38, 0.5467682, 0] 0.06
Testing Different Epsilon Values:   6%|6         | 6/99 [03:07<48:54, 31.56s/it][(0.142, 0.163, 0.366, 0.639), 44, 0.5083162, 0] 0.07
Testing Different Epsilon Values:   7%|7         | 7/99 [03:39<48:27, 31.60s/

In [8]:
# Earlier invocation, kept for reference: eps 0.07, saving into the top-level directory, with --summarize_cluster_reps.
# ! CUDA_VISIBLE_DEVICES=6 python ../cluster_documents.py --train-dir "/mnt/swordfish-pool2/milad/hiatus-data/explainability_all_data/train_authors.json" \
# --test-dir "/mnt/swordfish-pool2/milad/hiatus-data/explainability_all_data/test_authors.json" \
# --save-dir "/mnt/swordfish-pool2/milad/hiatus-data/explainability_all_data/" \
# --model aa_model-luar \
# --style-dir "/mnt/swordfish-pool2/milad/hiatus-data/explainability_all_data/refined_and_aggregated_features_final.csv" \
# --style_feat_column 'final_attribute_name' \
# --top_k_feats 10\
# --summarize_cluster_reps\
# --eps 0.07

# Active invocation: eps 0.14, saving into interp_space_126_clusters/, cluster summarisation left off.
# `yes|` pipes a continuous stream of "y" lines into the script's stdin.
! yes| CUDA_VISIBLE_DEVICES=0 python ../cluster_documents.py --train-dir "/mnt/swordfish-pool2/milad/hiatus-data/explainability_all_data/train_authors.json" \
--test-dir "/mnt/swordfish-pool2/milad/hiatus-data/explainability_all_data/test_authors.json" \
--save-dir "/mnt/swordfish-pool2/milad/hiatus-data/explainability_all_data/interp_space_126_clusters/" \
--model  'aa_model-luar'\
--style-dir "/mnt/swordfish-pool2/milad/hiatus-data/explainability_all_data/refined_and_aggregated_features_final.csv" \
--style_feat_column 'final_attribute_name' \
--top_k_feats 10\
--eps 0.14
#--summarize_cluster_reps\

^C
Traceback (most recent call last):
  File "/home/ma4608/style_generation_pipeline/src-ipynb/../cluster_documents.py", line 250, in <module>
    main(args)
  File "/home/ma4608/style_generation_pipeline/src-ipynb/../cluster_documents.py", line 59, in main
    author_to_embeddings = {
                           ^
  File "/home/ma4608/style_generation_pipeline/src-ipynb/../cluster_documents.py", line 60, in <dictcomp>
    row["authorID"]: np.mean(model.encode(row["fullText"]), axis=0)
                             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/ma4608/style_generation_pipeline/aa_models/luar_pausit/__init__.py", line 22, in encode
    return self.model.encode(
           ^^^^^^^^^^^^^^^^^^
  File "/mnt/swordfish-pool2/milad/conda-envs/gpu-env/lib/python3.11/site-packages/sentence_transformers/SentenceTransformer.py", line 373, in encode
    embeddings = embeddings.cpu()
                 ^^^^^^^^^^^^^^^^
KeyboardInterrupt


### Build Clusters Style Representation

In [93]:
# Locations of the three style-feature corpora written by the merge cells above.
llm_style_feats = f'{path}/refined_and_aggregated_features_final_manually_processed.csv'
llm_and_g2v_style_feats = f'{path}/llm_and_gram2vec_feats.csv'
g2v_style_feats = f'{path}/gram2vec_feats.csv'

In [94]:
# df = pd.read_csv(llm_style_feats)
# df[['shortend_attribute_name.v2', 'original_attribute_name']].drop_duplicates().to_csv('/mnt/swordfish-pool2/milad/hiatus-data/explainability_all_data/original_feature_name.csv')

In [96]:
# Ad-hoc run on the clustering stored under interp_space_126_clusters/, using the curated LLM features.
feat_clm = 'final_attribute_name_manually_processed'
styles_corpus_path = llm_style_feats
_clustering_file = '/mnt/swordfish-pool2/milad/hiatus-data/explainability_all_data/interp_space_126_clusters/train_authors.pkl'

# Representative summarization
clusters_tfidf_rep_df = generate_interpretable_space_representation(
    _clustering_file,
    styles_corpus_path,
    feat_clm,
    'tfidf_rep',
    num_feats=10,
    summarize_with_gpt=False,
)
# Contrastive summarization
clusters_contra_rep_df = generate_interpretable_space_contra_representation(
    _clustering_file,
    styles_corpus_path,
    feat_clm,
    'con_rep',
    num_feats=10,
    summarize_with_gpt=False,
)

*

In [97]:
def build_cluster_representation(clustering_path, output_path, top_k=10,
                                 llm_corpus_path=None, g2v_corpus_path=None):
    """Build and save the LLM-based and gram2vec-based style representations of every cluster.

    For both style corpora, the representative ('tfidf_rep') and the contrastive
    ('con_rep') helper are run on the same clustering, and the four resulting
    columns are written to one JSON file.

    Args:
        clustering_path: clustering file passed on to the representation helpers.
        output_path: destination of the JSON file.
        top_k: forwarded to the helpers as ``num_feats``.
        llm_corpus_path: CSV with the LLM style features; when None, the
            notebook-level ``llm_style_feats`` is used (the previous behaviour).
        g2v_corpus_path: CSV with the gram2vec features; when None, the
            notebook-level ``g2v_style_feats`` is used (the previous behaviour).

    Returns:
        None. The result is written to ``output_path``.
    """
    # The globals are resolved at call time, exactly as before, unless a path is passed in.
    if llm_corpus_path is None:
        llm_corpus_path = llm_style_feats
    if g2v_corpus_path is None:
        g2v_corpus_path = g2v_style_feats

    def _representations(corpus_path, feat_clm):
        """Run the representative, then the contrastive summarization for one corpus."""
        tfidf_df = generate_interpretable_space_representation(
            clustering_path, corpus_path, feat_clm, 'tfidf_rep',
            num_feats=top_k, summarize_with_gpt=False)
        contra_df = generate_interpretable_space_contra_representation(
            clustering_path, corpus_path, feat_clm, 'con_rep',
            num_feats=top_k, summarize_with_gpt=False)
        return tfidf_df, contra_df

    clusters_tfidf_rep_df, clusters_contra_rep_df = _representations(
        llm_corpus_path, 'final_attribute_name_manually_processed')
    clusters_tfidf_rep_g2v_df, clusters_contra_rep_g2v_df = _representations(
        g2v_corpus_path, 'gram2vec_feats')

    # The LLM contrastive frame is the one that collects all four columns.
    # NOTE(review): the assignments align on the frames' index, so this assumes all four
    # frames index the clusters identically — TODO confirm against the helpers.
    clusters_contra_rep_df['llm_con_rep'] = clusters_contra_rep_df['con_rep']
    clusters_contra_rep_df['llm_tfidf_rep'] = clusters_tfidf_rep_df['tfidf_rep']
    clusters_contra_rep_df['g2v_tfidf_rep'] = clusters_tfidf_rep_g2v_df['tfidf_rep']
    clusters_contra_rep_df['g2v_con_rep'] = clusters_contra_rep_g2v_df['con_rep']

    output_columns = ['cluster_label', 'llm_con_rep', 'llm_tfidf_rep', 'g2v_con_rep', 'g2v_tfidf_rep']
    clusters_contra_rep_df[output_columns].to_json(output_path)

In [98]:
# Cluster representations for the clustering file stored directly under explainability_all_data/.
build_cluster_representation('/mnt/swordfish-pool2/milad/hiatus-data/explainability_all_data/train_authors.pkl', 
                             '/mnt/swordfish-pool2/milad/hiatus-data/explainability_all_data/interpretable_space_representations.json', top_k=10)

**

In [9]:
# Cluster representations for the clustering file stored under interp_space_126_clusters/.
build_cluster_representation('/mnt/swordfish-pool2/milad/hiatus-data/explainability_all_data/interp_space_126_clusters/train_authors.pkl', 
                             '/mnt/swordfish-pool2/milad/hiatus-data/explainability_all_data/interp_space_126_clusters/interpretable_space_representations.json', top_k=10)

**

In [147]:
# Converting the gram2vec style corpus from jsonl to csv
# g2v_style_corpus = pd.read_json('/mnt/swordfish-pool2/milad/hiatus-data/explainability_all_data/normalized_all_document_gram2vec_top_features.jsonl', lines=True)
# g2v_style_corpus = g2v_style_corpus.explode('gram2vec_feats')
# g2v_style_corpus.to_csv('/mnt/swordfish-pool2/milad/hiatus-data/explainability_all_data/normalized_all_document_gram2vec_top_features.csv', index=False)