In [9]:
%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [50]:
%autoreload
import sys
import copy
sys.path.append('../../style_generation_pipeline/')

import generate_explanations
from gradpyent.gradient import Gradient

In [51]:
import json
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import explanation_interfaces
import glob
from sklearn.preprocessing import minmax_scale

# Lift pandas' display limits (width, rows, columns, cell width) so frames and
# long document texts are rendered without truncation when inspected below.
pd.set_option('display.width', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

### Load data

In [52]:
def get_data(path_to_input):
    """Load one attribution sample: queries, candidates, labels and ground truth.

    The sample directory must contain ``data/*_candidates.jsonl``,
    ``data/*_queries.jsonl``, ``groundtruth/*_groundtruth.npy``,
    ``groundtruth/*_query-labels.txt`` and
    ``groundtruth/*_candidate-labels.txt``. The first glob match of every
    pattern is used.

    Args:
        path_to_input: Path of the sample directory.

    Returns:
        A tuple ``(query_authors, candidate_authors, queries_df,
        candidates_df, ground_truth_assignment)`` where the two author lists
        come from the label files, the two frames hold one row per
        ``authorID`` with a ``fullText`` column containing the list of that
        author's documents, and ``ground_truth_assignment`` is the array
        stored in the ``.npy`` file.

    Raises:
        FileNotFoundError: If one of the expected files cannot be found.
    """
    def _first_match(pattern):
        # Fail with the offending pattern instead of a bare IndexError.
        matches = glob.glob(path_to_input + pattern)
        if not matches:
            raise FileNotFoundError(
                'No file matching {!r} under {!r}'.format(pattern, path_to_input))
        return matches[0]

    def _read_labels(labels_file):
        # Each line wraps the id in 2 leading and 3 trailing characters
        # (presumably "('<id>',)" -- TODO confirm against the data export).
        # Blank lines are skipped, so the result no longer depends on whether
        # the file ends with a newline.
        with open(labels_file) as labels_handle:
            lines = labels_handle.read().splitlines()
        return [line[2:-3] for line in lines if line.strip()]

    candidates_file = _first_match('/data/*_candidates.jsonl')
    queries_file = _first_match('/data/*_queries.jsonl')
    ground_truth_file = _first_match('/groundtruth/*_groundtruth.npy')
    q_labels_file = _first_match('/groundtruth/*_query-labels.txt')
    c_labels_file = _first_match('/groundtruth/*_candidate-labels.txt')

    candidates_df = pd.read_json(candidates_file, lines=True)
    queries_df = pd.read_json(queries_file, lines=True)

    # Only the first id of each id list is used to identify the author.
    queries_df['authorID'] = queries_df.authorIDs.apply(lambda ids: ids[0])
    candidates_df['authorID'] = candidates_df.authorSetIDs.apply(lambda ids: ids[0])

    # Collapse to one row per author; fullText becomes the list of documents.
    queries_df = queries_df.groupby('authorID').agg({'fullText': lambda docs: list(docs)}).reset_index()
    candidates_df = candidates_df.groupby('authorID').agg({'fullText': lambda docs: list(docs)}).reset_index()

    # Passing the path lets numpy open and close the file itself.
    ground_truth_assignment = np.load(ground_truth_file)
    candidate_authors = _read_labels(c_labels_file)
    query_authors = _read_labels(q_labels_file)

    return query_authors, candidate_authors, queries_df, candidates_df, ground_truth_assignment

def build_explanation_interface1(explanation_interf, candidates_sim_to_query_exp, query_style_reps_summ, candidate_authors):
    """Fill the cluster-level explanation template.

    ``[style-N]`` placeholders (1-based) are replaced by the N-th entry of
    ``query_style_reps_summ``. ``[cand-C-style-N]`` placeholders are replaced
    by the similarity stored at row ``C-1``, column ``N-1`` of
    ``candidates_sim_to_query_exp``, rounded to two decimals and suffixed
    with ``%``. One row is consumed per entry of ``candidate_authors``.

    Returns the filled-in template.
    """
    filled = copy.copy(explanation_interf)
    similarity_rows = np.array(candidates_sim_to_query_exp)

    # Style descriptions of the query author.
    for style_no, summary in enumerate(query_style_reps_summ, start=1):
        filled = filled.replace('[style-{}]'.format(style_no), summary)

    # Per-candidate similarity to each of the query styles.
    for cand_no, _author_id in enumerate(candidate_authors, start=1):
        for style_no, similarity in enumerate(similarity_rows[cand_no - 1], start=1):
            if similarity > 2:
                print(similarity)
            placeholder = '[cand-{}-style-{}]'.format(cand_no, style_no)
            filled = filled.replace(placeholder, str(round(similarity, 2)) + '%')

    return filled

def get_color_gradient(input_list):
    """Return one HTML colour per value of ``input_list``.

    The colours are produced by a ``Gradient`` running from ``#90EE90`` to
    ``#013220``, requested in ``'html'`` format.
    """
    gradient_bounds = {
        'gradient_start': '#90EE90',
        'gradient_end': '#013220',
    }
    # opacity is optional for the generator (only used for KML output)
    color_generator = Gradient(opacity=1.0, **gradient_bounds)
    return color_generator.get_gradient_series(series=input_list, fmt='html')

def build_explanation_interface2(explanation_interf, query_author_style_feats, candidate_authors_style_feats, top_k=5):
    """Fill the feature-level explanation template with a table of style features.

    The ``top_k`` highest- and ``top_k`` lowest-weighted features of the query
    author are selected. Each selected feature becomes one table row showing
    its weight for the query author (column 0) and for the candidates
    (columns 1..), with the cell background taken from ``get_color_gradient``
    applied to that row's weights.

    The row template has four author columns (``author-0`` .. ``author-3``);
    placeholders of columns without a matching author are left untouched.

    Args:
        explanation_interf: HTML template containing a ``[table-body]``
            placeholder.
        query_author_style_feats: Mapping of feature name to weight for the
            query author.
        candidate_authors_style_feats: Sequence of such mappings, one per
            candidate author; each must contain every selected feature.
        top_k: Number of features taken from each end of the ranking.

    Returns:
        The template with ``[table-body]`` replaced by the generated rows.
    """
    # Rank the query author's features from highest to lowest weight.
    query_feats_ranked = sorted(query_author_style_feats.items(), key=lambda item: -item[1])
    top_query_feats = [name for name, _ in query_feats_ranked[:top_k]]
    # ranked[-0:] would return *every* feature, so guard top_k == 0 explicitly.
    bottom_query_feats = [name for name, _ in query_feats_ranked[-top_k:]] if top_k > 0 else []

    # With fewer than 2 * top_k features the two slices overlap; keep each
    # feature once (first occurrence wins) so no row is rendered twice.
    # The selected features can be also decided differently
    selected_feats = list(dict.fromkeys(top_query_feats + bottom_query_feats))

    # One row of weights per feature: query author first, then the candidates.
    all_authors = [query_author_style_feats] + list(candidate_authors_style_feats)
    feature_weights = [[author[f] for author in all_authors] for f in selected_feats]
    # Colours are computed per feature, i.e. relative to the authors in that row.
    feature_weights_color = [get_color_gradient(x) for x in feature_weights]

    cell_template = """
        <tr>
            <td style="width: 30%; background-color: rgb(209, 213, 216);">[feat-name]</td>
            <td style="width: 14%; background-color: [author-0-feat-color];"><div style="text-align: center;"><span style="background-color: rgb(239, 239, 239);">[author-0-feat-value]%</span></div></td>
            <td style="width: 14%; background-color: [author-1-feat-color];"><div style="text-align: center;"><span style="background-color: rgb(239, 239, 239);">[author-1-feat-value]%</span></div></td>
            <td style="width: 14%; background-color: [author-2-feat-color];"><div style="text-align: center;"><span style="background-color: rgb(239, 239, 239);">[author-2-feat-value]%</span></div></td>
            <td style="width: 14%; background-color: [author-3-feat-color];"><div style="text-align: center;"><span style="background-color: rgb(239, 239, 239);">[author-3-feat-value]%</span></div></td>
        </tr>
    """

    table_cells = []
    for feat_order, feat in enumerate(selected_feats):
        row_html = cell_template.replace("[feat-name]", feat)
        for author_order, author_feat_weight in enumerate(feature_weights[feat_order]):
            author_feat_weight_color = feature_weights_color[feat_order][author_order]
            row_html = row_html.replace("[author-{}-feat-color]".format(author_order), author_feat_weight_color)
            row_html = row_html.replace("[author-{}-feat-value]".format(author_order), str(round(author_feat_weight, 2)))
        table_cells.append(row_html)

    return explanation_interf.replace("[table-body]", "\n".join(table_cells))

In [53]:
def generate_hiatus_explanations(path_to_input, interp_space_path, interp_space_rep_path, style_feat_clm, style_feat_summary_clm, explanation_interf, cluster_lvl=True):
    """Build the explanation record for the sample stored in ``path_to_input``.

    NOTE: this function reads the notebook-level globals ``model_path`` and
    ``top_c``; both must be defined before it is called.

    Args:
        path_to_input: Sample directory, loaded with ``get_data``.
        interp_space_path: Forwarded to
            ``generate_explanations.explain_model_prediction_over_author``.
        interp_space_rep_path: Forwarded to the same call.
        style_feat_clm: Forwarded as ``style_feat_clm``.
        style_feat_summary_clm: Forwarded as ``style_feat_summary_clm``.
        explanation_interf: HTML template handed to
            ``build_explanation_interface1`` (cluster level) or
            ``build_explanation_interface2`` (feature level).
        cluster_lvl: Truthy for the cluster-level explanation, falsy for the
            feature-level one. Also forwarded to the explanation call.

    Returns:
        A dict with the representative query text, the latent and
        interpretable rankings, one ``a{i}_fullText`` / ``a{i}_authorID`` pair
        per candidate, the rendered ``explanation`` and ``gt_idx``. Only the
        record of the first query author is returned.
    """
    import warnings

    query_authors, candidate_authors, queries_df, candidates_df, ground_truth_assignment = get_data(path_to_input)

    # candidates_df is ordered by the groupby on authorID, while
    # candidate_authors follows the label file. Everything below indexes the
    # documents with positions taken from candidate_authors, so put the
    # documents into that order; otherwise a{i}_fullText may be paired with
    # another author's a{i}_authorID.
    docs_by_author = {str(author_id): docs for author_id, docs in zip(candidates_df.authorID, candidates_df.fullText)}
    if sorted(candidate_authors) == sorted(docs_by_author):
        c_author_documents = [docs_by_author[author_id] for author_id in candidate_authors]
    else:
        # Labels and data disagree; keep the previous behaviour but say so.
        warnings.warn(
            'Candidate labels do not match the candidate authorIDs in {!r}; '
            'documents are kept in data order and may be misaligned with the labels.'.format(path_to_input))
        c_author_documents = candidates_df.fullText.tolist()

    output_json = []
    for q_author_id in query_authors:

        q_author_documents = queries_df[queries_df.authorID == q_author_id]['fullText'].tolist()[0]

        result = generate_explanations.explain_model_prediction_over_author(model_path, interp_space_path, interp_space_rep_path, q_author_documents, c_author_documents, top_c=top_c, style_feat_clm=style_feat_clm, style_feat_summary_clm=style_feat_summary_clm, cluster_lvl=cluster_lvl)

        latent_rank = result[0]
        interp_rank = result[1]

        # The layout of the remaining entries depends on the explanation level.
        # Use plain truthiness here, exactly like the branch that renders the
        # explanation below, so both always take the same path.
        if cluster_lvl:
            query_style_reps_summ = result[3]
            candidates_sim_to_query_exp = result[4]
            query_author_rep_doc_id = result[5]
            candidate_authors_rep_doc_ids = result[6]
        else:
            query_author_style_feats = result[2]
            candidate_authors_style_feats = result[3]
            query_author_rep_doc_id = result[4]
            candidate_authors_rep_doc_ids = result[5]

        instance_json = {
            # Only the representative document of the query author is shown.
            "Q_fullText": q_author_documents[query_author_rep_doc_id],
            "Q_authorID": q_author_id,
            "latent_rank" : latent_rank.tolist(),
            # Candidates are presented 1-based ("Author 1", "Author 2", ...).
            "system_rank" : [{"id": x+1, "title": 'Author {}'.format(x+1), "body": ""} for x in latent_rank.tolist()],
            "rank_1": "1. Author {}".format(latent_rank[0]+1),
            "rank_2": "2. Author {}".format(latent_rank[1]+1),
            "rank_3": "3. Author {}".format(latent_rank[2]+1),
            "interp_rank" : interp_rank.tolist(),
        }

        # Attach each candidate's representative document and id, and record
        # the candidate's position in the label list.
        candidate_author_order = []
        for order, c_author_id in enumerate(candidate_authors):
            instance_json['a{}_fullText'.format(order)] = c_author_documents[order][candidate_authors_rep_doc_ids[order]]
            instance_json['a{}_authorID'.format(order)] = c_author_id
            candidate_author_order.append(candidate_authors.index(c_author_id))

        if cluster_lvl:
            instance_json['explanation'] = build_explanation_interface1(explanation_interf, candidates_sim_to_query_exp, query_style_reps_summ, candidate_authors)
        else:
            instance_json['explanation'] = build_explanation_interface2(explanation_interf, query_author_style_feats, candidate_authors_style_feats)

        # Find the ground-truth label (who wrote the query document): the
        # position of the 1 in this query author's row of the assignment.
        query_author_idx = query_authors.index(q_author_id)
        candidate_author_labels = [ground_truth_assignment[query_author_idx][a_idx] for a_idx in candidate_author_order]
        instance_json["gt_idx"] = candidate_author_labels.index(1)

        output_json.append(instance_json)

    return output_json[0] # it is always one query author per instance

In [35]:
#queries.to_json('/mnt/swordfish-pool2/milad/hiatus-data/test-samples-for-explanations/hrs_06-27-24_english_explainability-example_TA2_input_queries.json')
#candidates.to_json('/mnt/swordfish-pool2/milad/hiatus-data/test-samples-for-explanations/hrs_06-27-24_english_explainability-example_TA2_input_candidates.json')

## Generate document style explanations

In [36]:
# Notebook-level settings. `model_path` and `top_c` are read as globals by
# generate_hiatus_explanations, so this cell must run before that function is called.
model_path = 'aa_model-luar'
# Directory whose sub-directories are the individual samples to explain.
input_path = '/mnt/swordfish-pool2/milad/hiatus-data/explainability-pilot-samples/'
# Forwarded as `top_c` to the explanation calls below.
top_c=3

In [49]:
interp_space_path = '/mnt/swordfish-pool2/milad/hiatus-data/explainability_all_data/interpretable_space.pkl'
interp_space_rep_path = '/mnt/swordfish-pool2/milad/hiatus-data/explainability_all_data/interpretable_space_representations.json'

# For every sample directory, collect four records: no explanation,
# feature-level explanation, and one cluster-level explanation per feature column.
output = []
idx=0
for data_point in glob.glob(input_path + '*/'):
    print(data_point)

    # generate explanations on feature-level
    instance_exp = generate_hiatus_explanations(data_point, interp_space_path, interp_space_rep_path, 'llm_tfidf_weights', None, explanation_interfaces.exp_interface_2, cluster_lvl=False)
    instance_exp['source'] = 'feature_lvl_explanation_tfidf'

    # Create an instance with no explanation (shallow copy of the record above)
    instance_wo_exp = dict(instance_exp)
    instance_wo_exp['explanation'] = '<p>No Explanation</p>'
    instance_wo_exp['source'] = 'no_explanation'

    output.append(instance_wo_exp)
    output.append(instance_exp)

    # generate explanations on cluster level
    for feat_clm in ['llm_tfidf_rep', 'g2v_tfidf_rep']:
        instance_exp = generate_hiatus_explanations(data_point, interp_space_path, interp_space_rep_path, feat_clm, None, explanation_interfaces.exp_interface_1_1)
        instance_exp['source'] = feat_clm
        output.append(instance_exp)

    # instance_exp now holds the last cluster-level record ('g2v_tfidf_rep');
    # its HTML is saved next to the sample. The file is closed (and flushed)
    # as soon as the block ends.
    with open(data_point + '/explanation.html', 'w') as html_file:
        html_file.write(instance_exp['explanation'])
    idx+=1
    # NOTE(review): this stops after the first sample directory, so the JSON
    # below only contains that sample -- remove the break to process all samples.
    break

with open('/mnt/swordfish-pool2/milad/hiatus-data/explainability_all_data/explanations.json', 'w') as json_file:
    json.dump(output, json_file)

/mnt/swordfish-pool2/milad/hiatus-data/explainability-pilot-samples/samples_29_query_80_first/
# clusters: 43


IndexError: list index out of range

In [54]:
# Run the cluster-level explanation by hand for a single sample so the raw
# result tuple (`res`) can be inspected in the following cells.
query_authors, candidate_authors, queries_df, candidates_df, ground_truth_assignment = get_data('/mnt/swordfish-pool2/milad/hiatus-data/explainability-pilot-samples/samples_9_query_20_first/')
# Documents of the first query author, and the document lists of all candidates.
q_author_documents = queries_df[queries_df.authorID == query_authors[0]]['fullText'].tolist()[0]
c_author_documents = candidates_df.fullText.tolist()
res = generate_explanations.explain_model_prediction_over_author(model_path, interp_space_path, interp_space_rep_path, 
                                                                 q_author_documents, c_author_documents, top_c=3, 
                                                                 style_feat_clm='llm_tfidf_rep', style_feat_summary_clm=None, 
                                                                 cluster_lvl=True)

# clusters: 43


In [67]:
res

(array([1, 0, 2]),
 array([1, 0, 2]),
 [['the author maintains a neutral tone in their work',
   'simple sentence structure is used',
   'the use of connectors and transitional phrases is limited',
   'the author uses a limited range of verb forms',
   'the author uses a limited range of verb forms to highlight simplicity and compound structures'],
  ['the author uses complex concepts to explore deep ideas and themes',
   'the author maintains a neutral tone in their work',
   'the author uses a limited range of verb forms to highlight simplicity and compound structures',
   'the author uses a formal writing style with advanced language and intricate sentence structures',
   'the author provides clear explanations of technical terms'],
  ['the author provides clear explanations of technical terms',
   'the author maintains a neutral tone in their work',
   'the author avoids complex sentence structures',
   'the author maintains a professional tone in their work',
   'the author uses a

In [63]:
# Last two entries of the result tuple. The next cell treats them as
# per-document vectors (query author, then one collection per candidate).
# NOTE(review): generate_hiatus_explanations unpacks the trailing entries as
# representative-document ids -- confirm what the explanation call returns here.
query_docs = res[-2]
candid_docs = res[-1]

In [73]:
from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity between the mean of the query entries and each query entry.
print(cosine_similarity([np.mean(query_docs, axis=0)], query_docs))
# Same per candidate: similarity of each candidate entry to that candidate's mean.
print([cosine_similarity([c_author[0]], c_author[1]) for c_author in zip([np.mean(x, axis=0) for x in candid_docs], candid_docs)])

[[0.9639465 0.9605864]]
[array([[1.0000001]], dtype=float32), array([[0.9441627 , 0.967767  , 0.95038724, 0.95574105]], dtype=float32), array([[0.95748687, 0.9601768 , 0.9637917 ]], dtype=float32)]


## Analyze document style descriptions

In [199]:
# Settings for this section. Note that this re-binds `interp_space_path`,
# `model_path`, `input_path` and `top_c`, which were already set in the
# explanation-generation section above.
interp_space_path = '/mnt/swordfish-pool2/milad/hiatus-data/phase_2/explainability/interpretable_space.pkl'
model_path = 'aa_model-luar'
explanation_interf = explanation_interfaces.exp_interface_1
input_path = '/mnt/swordfish-pool2/milad/hiatus-data/explainability-pilot-samples/'
top_c=3

In [167]:
# Sample directory analysed in the cells below.
path_to_input = '/mnt/swordfish-pool2/milad/hiatus-data/explainability-pilot-samples/samples_26_query_79_first/'

In [168]:
# Load the sample through the shared get_data helper rather than repeating its
# body here. The intermediate file-path variables are therefore no longer
# defined at notebook level.
query_authors, candidate_authors, queries_df, candidates_df, ground_truth_assignment = get_data(path_to_input)

In [7]:
query_authors

['9265b1f9-31f5-58e5-bfb7-c5979de68d47']

In [8]:
candidate_authors

['07a9a4f0-faed-9b92-fd68-8f47cdc969d5',
 'a83b5d4d-0e79-7ee6-f61b-29b0b059f220',
 'd53e9f35-4c07-3b87-43fc-d52e5f127c6c']

In [9]:
candidate_authors[-1]

'd53e9f35-4c07-3b87-43fc-d52e5f127c6c'

In [10]:
ground_truth_assignment

array([[1, 0, 0]])

In [11]:
# Documents of the candidate with the hard-coded id below, of one other
# candidate, and of the first query author.
# NOTE(review): the id is hard-coded; presumably it is the ground-truth match
# for this sample -- confirm it still exists when `path_to_input` changes,
# otherwise `.tolist()[0]` raises IndexError.
matched_candidate_docs = candidates_df[candidates_df.authorID == '07a9a4f0-faed-9b92-fd68-8f47cdc969d5'].fullText.tolist()[0]
other_candidate_docs = candidates_df[candidates_df.authorID != '07a9a4f0-faed-9b92-fd68-8f47cdc969d5'].fullText.tolist()[0] #just pick one other candidate author
query_documents = queries_df.fullText.tolist()[0]

In [12]:
matched_candidate_docs

['The elves had a very distinguished and strict military system, and it included a complex training system that ensured the continued supply of experienced fighters to the army, and this system was based on the soldiers who survived the battles training the new soldiers, as the military system stipulated that the fighters who spent more than 30 years in the war should go to their homeland to train new soldiers in the use of weapons, where the soldier had to prove his competence and strength in order to have the honor of training soldiers.\n\n  But because of the many wars caused by <PERSON> in the first age, and the wars caused by <PERSON> in the second and third ages, where the battles lasted for months, and the siege of castles lasted for years, the elves suffered great losses in their lives, but these losses It was concentrated in the ranks of the swords and spearmen who were always forced to physically interact with the enemy forces, which exposed them to great and permanent danger

In [13]:
other_candidate_docs

["Burning wood has been a local source of warmth for ages, however, while it may provide warmth, it is important to know that burning wood can produce harmful gases such as benzene, and other gases such as carbon monoxide and nitrogen oxides. When organic matter burns, large polycyclic aromatic compounds are released due to combustion. This occurs not only with wood, but also with any other organic matter. Understanding why benzene form in a wood fire, will require us to know what wood is, and how it burns.\n\n  Wood is made up of biomass (matter composed primarily of carbon, hydrogen, and oxygen). This includes things like dry grass, leaves, fibers, animal carcasses, sugar, fat, and coal etc. All of these things contain cellulose or similar molecules, which consists of long chains of aromatic rings.\n\n  When wood is heated up, the molecules composing it start to break apart, in the presence of oxygen, the wood will start to oxidize, this process is what we know as combustion. If ther

In [14]:
query_documents

['The Sahara Desert extends over an area of 9,200,000 square kilometers and is the largest desert in the world. It is one of the driest regions in the world with an average of 25 mm of rain per year. In the past, it was a green area full of rivers, 3 million years ago, until climate change, and the occurrence of several major earthquakes led to the formation of new mountains that blocked the moisture of the sea, and changed the course of the rivers in them, all of this led to the cutting off of water resources and turning it into a dry area with high temperatures, unfit for most types of life.\n\n  Climate change is expected to continue to cause bad consequences for the Sahara desert, as studies indicate that rainfall is expected to decrease by 20 to 30 percent in the next 40 years, and this will lead to more desertification and the destruction of more green spaces on The borders of deserts in general, and will lead to an increase in the borders of the Sahara desert by no less than 7 p

In [15]:
# Style descriptions and cluster assignments for the query author's documents.
q_documents_reps , q_documents_clusters = generate_explanations.get_documents_style_descriptions(query_documents, model_path, interp_space_path, top_c=3, top_k=5)

# clusters: 14


In [16]:
# Same call for the documents in `matched_candidate_docs`.
c_documents_reps , c_documents_clusters = generate_explanations.get_documents_style_descriptions(matched_candidate_docs, model_path, interp_space_path, top_c=3, top_k=5)

# clusters: 14


In [17]:
# Same call for the documents in `other_candidate_docs`.
o_documents_reps , o_documents_clusters = generate_explanations.get_documents_style_descriptions(other_candidate_docs, model_path, interp_space_path, top_c=3, top_k=5)

# clusters: 14


In [18]:
q_documents_clusters

[[(13, 0.6829973459243774), (5, 0.5885012149810791), (3, 0.5295000076293945)],
 [(13, 0.42635929584503174),
  (3, 0.42302098870277405),
  (5, 0.3870851993560791)],
 [(13, 0.7052289247512817), (5, 0.6760463714599609), (12, 0.5887666344642639)],
 [(3, 0.4849829375743866), (13, 0.4269305467605591), (5, 0.3840888440608978)]]

In [19]:
c_documents_clusters

[[(5, 0.3803691864013672),
  (13, 0.28969496488571167),
  (7, 0.25092387199401855)]]

In [20]:
o_documents_clusters

[[(3, 0.9635261297225952), (13, 0.8992846608161926), (12, 0.7395482063293457)]]

In [21]:
q_documents_reps

[[(13,
   ('The approach used by the author is systematic, starting with a general statement and then exploring more details.',
    19.92006319968675)),
  (13,
   ('Revised: The author prefers a diverse writing style with structured sentences.',
    13.515608204437266)),
  (13,
   ('The author expresses strong emotions and perspectives through the use of impactful language.',
    13.069321101808846)),
  (13,
   ('The writer employs diverse sentence structures for improved clarity.',
    12.39637662856642)),
  (13,
   ('The author effectively conveys emotions through descriptive language.',
    11.893747772004609)),
  (5,
   ('The writer uses various voices to create different effects.',
    18.470951754889448)),
  (5,
   ('The use of figurative language conveys complex ideas.',
    13.069321101808846)),
  (5,
   ('Passive voice is utilized in various situations.', 11.683026740688955)),
  (5,
   ('Rewritten: The author utilizes different sentence lengths for conveying diverse emotions a

In [22]:
c_documents_reps

[[(5,
   ('The writer uses various voices to create different effects.',
    18.470951754889448)),
  (5,
   ('The use of figurative language conveys complex ideas.',
    13.069321101808846)),
  (5,
   ('Passive voice is utilized in various situations.', 11.683026740688955)),
  (5,
   ('Rewritten: The author utilizes different sentence lengths for conveying diverse emotions and ideas.',
    11.683026740688955)),
  (5,
   ('The author creates a connection with the reader through direct engagement.',
    10.806516878826645)),
  (13,
   ('The approach used by the author is systematic, starting with a general statement and then exploring more details.',
    19.92006319968675)),
  (13,
   ('Revised: The author prefers a diverse writing style with structured sentences.',
    13.515608204437266)),
  (13,
   ('The author expresses strong emotions and perspectives through the use of impactful language.',
    13.069321101808846)),
  (13,
   ('The writer employs diverse sentence structures for imp

In [217]:
o_documents_reps

[[(1, ('limited range', 90.29001541922472)),
  (1, ('declarative sentences', 87.53142177173797)),
  (1, ('simple sentence structures', 84.48013863066261)),
  (1, ('present tense', 82.12878113649731)),
  (1, ('consistent verb tense', 60.12597809047695)),
  (4, ('present tense', 9.662209545470272)),
  (4, ('short sentences', 9.107428308901413)),
  (4, ('simple sentence structures', 8.960014703252094)),
  (4, ('declarative sentences', 8.142457839231438)),
  (4, ('complex verb forms', 6.77546121578783)),
  (10, ('present tense', 7.246657159102704)),
  (10, ('declarative sentences', 6.106843379423578)),
  (10, ('limited range', 5.209039351109118)),
  (10, ('occasional switches', 4.655197679587739)),
  (10, ('simple sentence structures', 3.840006301393755))]]

In [None]:
# `pkl` is not imported in any of the cells above, so import it here to keep
# this cell runnable on a fresh kernel.
import pickle as pkl

# SECURITY NOTE: unpickling can execute arbitrary code -- only load files from
# a trusted source.
with open(interp_space_path, 'rb') as space_file:
    interpretable_space = pkl.load(space_file)

# DBSCAN generates a cluster -1 of all outliers. We don't want this cluster.
# pop() with a default does not fail when the space has no outlier cluster.
interpretable_space.pop(-1, None)
print("# clusters:", len(interpretable_space))
# Each entry is indexed positionally: element 0 goes to dimension_to_latent,
# element 1 to dimension_to_style.
dimension_to_latent = {key: value[0] for key, value in interpretable_space.items()}
dimension_to_style  = {key: value[1] for key, value in interpretable_space.items()}