In [1]:
import sys
# Put the pipeline directory at the front of the module search path.
# NOTE(review): presumably this is what lets `from data import *` below resolve — confirm.
sys.path.insert(0, '../../style_generation_pipeline')

In [2]:
import glob
import json
import pickle

import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split

# Lift every pandas display limit (width, rows, columns, column width) by
# setting each option to None, so frames render without truncation.
for _display_option in (
    'display.width',
    'display.max_rows',
    'display.max_columns',
    'display.max_colwidth',
):
    pd.set_option(_display_option, None)

In [3]:
from data import *

In [4]:
# Root data directory; the output files written later in this notebook are placed under it.
path = "/mnt/swordfish-pool2/milad/hiatus-data/phase_2"

### Compute correlation:

In [45]:
# Read the LLM feature table and the gram2vec feature table from disk.
llm_feats_df = pd.read_csv(
    filepath_or_buffer='/mnt/swordfish-pool2/milad/hiatus-data/phase_2/explainability/filtered/refined_and_aggregated_features_final.csv'
)
gram2vec_feats_df = pd.read_csv(
    filepath_or_buffer='/mnt/swordfish-pool2/milad/hiatus-data/phase_2/explainability/gram2vec_feats.csv'
)

In [46]:
# Count the distinct document IDs present in the LLM feature table.
total_num_documents = llm_feats_df['documentID'].nunique()

In [49]:
# Prepare the co-occurrence computation between feature pairs: collapse each
# table to one row per feature, holding the list of documentIDs it occurs in.
llm_feats_df = (
    llm_feats_df
    .groupby('final_attribute_name')
    .agg({'documentID': list})
    .reset_index()
)
gram2vec_feats_df = (
    gram2vec_feats_df
    .groupby('gram2vec_feats')
    .agg({'documentID': list})
    .reset_index()
)

In [50]:
# Map each feature name to its list of documentIDs, one dictionary per feature family.
llm_feats_dict = dict(zip(
    llm_feats_df['final_attribute_name'].tolist(),
    llm_feats_df['documentID'].tolist(),
))
g2v_feats_dict = dict(zip(
    gram2vec_feats_df['gram2vec_feats'].tolist(),
    gram2vec_feats_df['documentID'].tolist(),
))

In [63]:
# Zero-initialised matrix: one row per gram2vec feature, one column per LLM feature.
num_g2v_feats = len(g2v_feats_dict)
num_llm_feats = len(llm_feats_dict)
occ_matrix = np.zeros((num_g2v_feats, num_llm_feats))

In [64]:
# Fill occ_matrix[i, j] with the share of documents carrying LLM feature j that
# also carry gram2vec feature i, i.e. an estimate of P(g2v_feat | llm_feat).
#
# The document lists are de-duplicated once, up front. A documentID repeated in
# a feature's list would otherwise inflate the denominator but not the
# set-based intersection, understating the ratio. Building the sets here also
# avoids rebuilding them for every (i, j) pair.
g2v_doc_sets = [set(docs) for docs in g2v_feats_dict.values()]
llm_doc_sets = [set(docs) for docs in llm_feats_dict.values()]

for i, g2v_docs in enumerate(g2v_doc_sets):
    for j, llm_docs in enumerate(llm_doc_sets):
        # A ratio of two proportions over the same document total reduces to a
        # ratio of counts, so the counts are divided directly.
        # An LLM feature with no documents yields 0.0 instead of a ZeroDivisionError.
        occ_matrix[i, j] = len(g2v_docs & llm_docs) / len(llm_docs) if llm_docs else 0.0

In [67]:
# Wrap the matrix in a frame with one column per LLM feature, then append the
# gram2vec feature name of each row as a trailing 'g2v_feat' column.
llm_feature_names = list(llm_feats_dict)
g2v_feature_names = list(g2v_feats_dict)
occ_matrix_df = pd.DataFrame(occ_matrix, columns=llm_feature_names)
occ_matrix_df['g2v_feat'] = g2v_feature_names

In [69]:
# Write the co-occurrence table under the data root, omitting the row index.
occ_matrix_df.to_csv(f'{path}/g2v_and_llm_occ_matrix.csv', index=False)

### Analysing the interp spaces:

In [90]:
# Load the gram2vec and LLM-based feature tables, then collect the distinct
# feature names of each family in order of first appearance.
llm_feats_df = pd.read_csv(
    filepath_or_buffer='/mnt/swordfish-pool2/milad/hiatus-data/phase_2/explainability/filtered/refined_and_aggregated_features_final.csv'
)
gram2vec_feats_df = pd.read_csv(
    filepath_or_buffer='/mnt/swordfish-pool2/milad/hiatus-data/phase_2/explainability/gram2vec_feats.csv'
)
llm_feats = llm_feats_df['final_attribute_name'].unique().tolist()
gram2vec_feats = gram2vec_feats_df['gram2vec_feats'].unique().tolist()

In [91]:
# Load the interpretable spaces.
# Each entry starts as [directory] and is extended in place by the loop below to
# [directory, interp_space, clustering_df, interp_space_df].
interp_spaces = {
    'gram2vec': ['/mnt/swordfish-pool2/milad/hiatus-data/phase_2/explainability/gram2vec'],
    'llm': ['/mnt/swordfish-pool2/milad/hiatus-data/phase_2/explainability/llm/'],
    'both': ['/mnt/swordfish-pool2/milad/hiatus-data/phase_2/explainability/llm_and_gram2vec'],
}

for key, val in interp_spaces.items():
    space_dir = val[0]
    # style-distance clustering
    # NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
    interp_space = pd.read_pickle(space_dir + "/interpretable_space.pkl")
    clustering_df = pd.read_pickle(space_dir + "/train_authors_documents.pkl")

    # DBSCAN generates a cluster -1 holding all outliers; we don't want it.
    # pop() with a default (instead of del) keeps the cell working when a space
    # happens to contain no outlier cluster.
    interp_space.pop(-1, None)
    clustering_df = clustering_df[clustering_df.cluster_label != -1]

    # Number of documents per cluster.
    cluster_documents_df = clustering_df.groupby('cluster_label').agg({'documentID': lambda x: len(list(x))}).reset_index()
    # Index the counts by cluster label once, so each cluster is a direct lookup
    # rather than a re-filter of the whole frame.
    docs_per_cluster = cluster_documents_df.set_index('cluster_label')['documentID']

    # One row per cluster; its features are sorted by descending score.
    interp_space_df = pd.DataFrame([
        {
            'cluster_label': label,
            'style_feats': sorted(entry[1].items(), key=lambda feat: -feat[1]),
        }
        for label, entry in interp_space.items()
    ])
    interp_space_df['num_feats'] = interp_space_df.style_feats.apply(len)
    # .loc raises KeyError for a cluster without documents, so a mismatch between
    # the space and the clustering frame still fails loudly.
    interp_space_df['num_documents'] = interp_space_df.cluster_label.apply(lambda label: docs_per_cluster.loc[label])

    interp_spaces[key].append(interp_space)
    interp_spaces[key].append(clustering_df)
    interp_spaces[key].append(interp_space_df)

In [92]:
# Collect, for every cluster of the LLM space, the ranked feature names from the
# gram2vec, LLM and combined spaces, and save the result as JSON.
def _ranked_feature_names(space_df, cluster_label):
    """Return the feature names stored for `cluster_label` in `space_df`, in stored order.

    `space_df` must have `cluster_label` and `style_feats` columns, where
    `style_feats` holds a sequence of (feature_name, score) pairs. Raises
    IndexError when the cluster label is not present in the frame.
    """
    ranked_feats = space_df[space_df.cluster_label == cluster_label]['style_feats'].tolist()[0]
    return [feat[0] for feat in ranked_feats]


# The per-cluster lookups go through the helper above, so the loop no longer
# rebinds the module-level `llm_feats` list of feature names to one cluster's
# (name, score) tuples.
all_in_one_interp_space = {}
for c_label in interp_spaces['llm'][-1].cluster_label.tolist():
    all_in_one_interp_space[c_label] = {
        'ranked-llm-feats'  : _ranked_feature_names(interp_spaces['llm'][-1], c_label),
        'ranked-g2v-feats'  : _ranked_feature_names(interp_spaces['gram2vec'][-1], c_label),
        'ranked-both-feats' : _ranked_feature_names(interp_spaces['both'][-1], c_label),
    }

# Use a context manager so the file is flushed and closed even if serialisation fails.
with open(path + '/explainability/interp_space_style_representations.json', 'w') as out_file:
    json.dump(all_in_one_interp_space, out_file)

In [93]:
# Find the source (gram2vec vs. LLM) of the top-ranked features in each
# cluster's combined representation, and report each source's overall share.
g2v_appearing = 0
llm_appearing = 0
top_k=10
# Set for O(1) membership tests instead of scanning the list for every feature.
gram2vec_feat_set = set(gram2vec_feats)
for c_label, rep in all_in_one_interp_space.items():
    top_k_feats = rep['ranked-both-feats'][:top_k]
    print(top_k_feats)
    # Any feature that is not a known gram2vec feature is attributed to the LLM.
    top_k_feats_sources = ['g2v' if f in gram2vec_feat_set else 'llm' for f in top_k_feats]
    print(top_k_feats_sources)
    g2v_appearing+= top_k_feats_sources.count('g2v')
    llm_appearing+= top_k_feats_sources.count('llm')

# Normalise by the number of features actually inspected rather than a
# hard-coded 10 * number of clusters. The shares then stay correct when top_k
# is changed or a cluster has fewer than top_k features, and always sum to 1.
# max(..., 1) avoids a ZeroDivisionError when there are no clusters.
num_inspected = max(g2v_appearing + llm_appearing, 1)
print('llm:', llm_appearing/num_inspected, 'g2v:', g2v_appearing/num_inspected)

['Part-of-Speech Unigram:Other (foreign words, typos, abbreviations)', 'Part-of-Speech Unigram:Proper noun', 'Part-of-Speech Bigram:Proper noun followed by Other', 'tense usage', 'style is formal', 'precise language', 'Morphology Tag:Comma punctuation type', 'more information', 'additional clauses', 'Dependency Label:Object of preposition']
['g2v', 'g2v', 'g2v', 'llm', 'llm', 'llm', 'g2v', 'llm', 'llm', 'g2v']
['passive voice', 'overall rhythm', 'complex ideas', 'cohesive narrative', 'vivid language', 'better engagement', 'Dependency Label:Object of preposition', 'rhetorical devices', 'dynamic narrative flow', 'conceptual ideas']
['llm', 'llm', 'llm', 'llm', 'llm', 'llm', 'g2v', 'llm', 'llm', 'llm']
['improved writing', 'dynamic style', 'Part-of-Speech Unigram:Adverb', 'unique word selections', 'casual tone', 'strong visuals', 'hearted atmosphere', 'style is informal', 'literary devices', 'tone is positive']
['llm', 'llm', 'g2v', 'llm', 'llm', 'llm', 'llm', 'llm', 'llm', 'llm']
['compl

In [94]:
# Show the first five rows of the last item stored for the gram2vec space.
gram2vec_space_summary = interp_spaces['gram2vec'][-1]
gram2vec_space_summary.head(n=5)

Unnamed: 0,cluster_label,style_feats,num_feats,num_documents
0,0,"[(Part-of-Speech Unigram:Other (foreign words, typos, abbreviations), 131.02775417812242), (Part-of-Speech Unigram:Proper noun, 120.37283153046812), (Part-of-Speech Bigram:Proper noun followed by Other, 102.20907778184412), (Morphology Tag:Comma punctuation type, 55.79294317523215), (Dependency Label:Object of preposition, 46.84675719527894), (Part-of-Speech Bigram:Other followed by Proper noun, 42.02924258041938), (Part-of-Speech Bigram:Other followed by Punctuation, 39.20883457530391), (Dependency Label:Appositional modifier, 34.478090413118295), (punctuation:,, 33.43863491498796), (Dependency Label:Prepositional modifier, 24.66571461542978), (Part-of-Speech Unigram:Adposition (preposition or postposition), 19.997055686634315), (Part-of-Speech Unigram:Punctuation, 15.388914333349422), (Dependency Label:Noun phrase as adverbial modifier, 14.902192988849615), (Part-of-Speech Bigram:Other followed by Other, 11.785903752756516), (Dependency Label:Punctuation, 10.809595623388658), (Part-of-Speech Bigram:Proper noun followed by Proper noun, 9.486092586645196), (Part-of-Speech Bigram:Punctuation followed by Other, 8.837390855544697), (Morphology Tag:Third person, 6.972482233897174), (Dependency Label:Nominal modifier, 6.004177511488482), (Morphology Tag:Past tense, 4.694256129153166), (Part-of-Speech Bigram:Noun followed by Punctuation, 3.529123158143493), (Part-of-Speech Unigram:Verb, 2.6354238904222647), (Part-of-Speech Unigram:Pronoun, 1.5749697240010292), (Morphology Tag:Personal pronoun type, 1.505021649615636), (Part-of-Speech Unigram:Noun, 0.6339761166199728), (Dependency Label:Nominal subject, 0.36624160262986766), (Morphology Tag:Singular number, 0.020477816023608483)]",27,47
1,1,"[(Dependency Label:Object of preposition, 8.198182509173813), (Part-of-Speech Unigram:Adverb, 6.864553609200628), (Morphology Tag:Positive degree, 6.576629541298957), (Dependency Label:Adverbial modifier, 6.539921988538429), (Part-of-Speech Unigram:Adjective, 6.391763000506573), (Dependency Label:Adjectival modifier, 5.789182960835995), (Dependency Label:Prepositional modifier, 5.6920879881761035), (Part-of-Speech Unigram:Determiner, 4.271459574169029), (Part-of-Speech Unigram:Adposition (preposition or postposition), 4.209906460344066), (Morphology Tag:Article pronoun type, 4.017109289939661), (Dependency Label:Determiner, 3.733700886050131), (Part-of-Speech Bigram:Determiner followed by Noun, 3.206179073723333), (Part-of-Speech Unigram:Punctuation, 2.946813382981804), (Morphology Tag:Present tense, 2.915812435900883), (Part-of-Speech Unigram:Proper noun, 2.616800685444959), (Morphology Tag:Third person, 2.3241607446323913), (Dependency Label:Punctuation, 2.0699225661808067), (Part-of-Speech Unigram:Auxiliary verb, 1.891376864445471), (Part-of-Speech Unigram:Verb, 1.3177119452111323), (Dependency Label:Nominal subject, 1.098724807889603), (Part-of-Speech Unigram:Pronoun, 1.0499798160006861), (Part-of-Speech Unigram:Noun, 0.18847938602215406), (Morphology Tag:Singular number, 0.004792680345950921)]",23,11
2,2,"[(Part-of-Speech Unigram:Adverb, 8.580692011500785), (Morphology Tag:Positive degree, 6.576629541298957), (Dependency Label:Adverbial modifier, 6.539921988538429), (Dependency Label:Determiner, 6.533976550587729), (Part-of-Speech Unigram:Determiner, 5.980043403836641), (Dependency Label:Object of preposition, 5.8558446494098675), (Dependency Label:Prepositional modifier, 5.6920879881761035), (Part-of-Speech Bigram:Noun followed by Adposition, 5.470095025558225), (Part-of-Speech Unigram:Adjective, 5.113410400405258), (Part-of-Speech Unigram:Adposition (preposition or postposition), 4.736144767887074), (Morphology Tag:Present tense, 2.915812435900883), (Dependency Label:Adjectival modifier, 2.8945914804179975), (Dependency Label:Direct object, 2.891970246938123), (Part-of-Speech Unigram:Other (foreign words, typos, abbreviations), 2.848429438654835), (Part-of-Speech Unigram:Punctuation, 1.6371185461010023), (Dependency Label:Punctuation, 1.3799483774538712), (Part-of-Speech Unigram:Verb, 1.17129950685434), (Part-of-Speech Unigram:Pronoun, 1.0499798160006861), (Dependency Label:Nominal subject, 0.7324832052597353), (Part-of-Speech Unigram:Noun, 0.17134489638377642), (Morphology Tag:Singular number, 0.004356982132682655)]",21,10
3,3,"[(Morphology Tag:Finite verb form, 9.328611254094541), (Dependency Label:Auxiliary verb, 9.151480422533972), (Morphology Tag:Present tense, 8.747437307702649), (Part-of-Speech Unigram:Auxiliary verb, 7.565507457781884), (Morphology Tag:Infinitive verb form, 6.640166278208479), (Part-of-Speech Bigram:Verb followed by Pronoun, 5.403403651059552), (Morphology Tag:Personal pronoun type, 4.5150649488469075), (Dependency Label:Adverbial modifier, 3.923953193123058), (Morphology Tag:Plural number, 3.917409929716573), (Dependency Label:Nominal subject, 3.6624160262986765), (Part-of-Speech Unigram:Adverb, 3.432276804600314), (Dependency Label:Root of the sentence, 3.084818216719065), (Dependency Label:Adjectival modifier, 2.8945914804179975), (Part-of-Speech Unigram:Pronoun, 2.6249495400017153), (Part-of-Speech Unigram:Adposition (preposition or postposition), 2.104953230172033), (Dependency Label:Prepositional modifier, 1.897362662725368), (Dependency Label:Determiner, 1.8668504430250654), (Part-of-Speech Unigram:Determiner, 1.7085838296676117), (Part-of-Speech Unigram:Punctuation, 1.6371185461010023), (Part-of-Speech Unigram:Verb, 1.464124383567925), (Part-of-Speech Unigram:Adjective, 1.2783526001013146), (Dependency Label:Object of preposition, 1.1711689298819734), (Dependency Label:Punctuation, 1.1499569812115593), (Part-of-Speech Unigram:Noun, 0.17134489638377642), (Morphology Tag:Singular number, 0.004356982132682655)]",25,10
4,4,"[(Part-of-Speech Unigram:Adverb, 8.580692011500785), (Dependency Label:Adverbial modifier, 7.847906386246116), (Dependency Label:Determiner, 7.467401772100262), (Morphology Tag:Third person, 6.972482233897174), (Dependency Label:Object of preposition, 5.8558446494098675), (Morphology Tag:Present tense, 5.831624871801766), (Part-of-Speech Unigram:Determiner, 5.125751489002835), (Dependency Label:Prepositional modifier, 4.74340665681342), (Part-of-Speech Unigram:Adjective, 3.835057800303944), (Part-of-Speech Unigram:Adposition (preposition or postposition), 3.683668152801058), (Morphology Tag:Positive degree, 3.2883147706494786), (Part-of-Speech Unigram:Punctuation, 2.946813382981804), (Dependency Label:Direct object, 2.891970246938123), (Part-of-Speech Unigram:Other (foreign words, typos, abbreviations), 2.848429438654835), (Dependency Label:Punctuation, 2.5299053586654305), (Dependency Label:Nominal subject, 1.8312080131493382), (Part-of-Speech Unigram:Verb, 1.17129950685434), (Part-of-Speech Unigram:Pronoun, 0.5249899080003431), (Part-of-Speech Unigram:Noun, 0.18847938602215406), (Morphology Tag:Singular number, 0.004792680345950921)]",20,11


In [95]:
# Show the first five rows of the last item stored for the LLM space.
llm_space_summary = interp_spaces['llm'][-1]
llm_space_summary.head(n=5)

Unnamed: 0,cluster_label,style_feats,num_feats,num_documents
0,0,"[(tense usage, 75.39465300649763), (style is formal, 74.2656332899042), (precise language, 64.89579206072429), (more information, 53.21230144554308), (additional clauses, 50.54654731343905), (professional tone, 45.220097300138846), (effective communication, 43.75176563490477), (formal manner, 43.307045578469875), (clear sentence style, 41.348012969058935), (various sentence structures, 39.01428002735918), (methodical approach, 36.13903865732184), (parallel structures, 34.69948778854563), (diverse sentence structures, 30.563382515793197), (transitional elements, 29.983581266085498), (consistent tone, 26.554425596411537), (subordinate clauses, 23.95746896636315), (interpersonal relationships, 23.215900065691077), (possessive pronouns, 22.02016453489306), (style is professional, 21.362952329727957), (transitional phrases effectively, 21.136458524011243), (extra details, 20.5002638677571), (transitional phrases, 19.918756484602568), (previous events, 19.057016982331408), (extra information, 17.924861079259195), (different sentence structures, 17.3474628812382), (relative clauses, 17.176173063101245), (more context, 16.737447317630963), (better understanding, 16.62422633368242), (rhythmic impact, 16.308144786708944), (tone is contemplative, 15.678983193762733), (rhetorical devices, 15.488973612870613), (diverse information, 14.090972349340829), (verb tense, 14.090972349340829), (passive voice, 13.803324277842982), (comprehensive summary, 13.515608204437266), (emotional level, 13.2800421331245), (structured manner, 11.785613329464057), (literary devices, 11.718512771362786), (structured approach, 11.492406381080306), (advanced sentence structures, 11.403502855938639), (overall impact, 11.318383627101047), (various sentence structures effectively, 11.158298211753975), (different lengths, 11.01008226744653), (particular structure, 10.968529304736109), (diverse sentence lengths, 10.87855215591072), (direct engagement, 9.623787906326639), (varied sentence structures, 
9.325716748005664), (formal language, 8.484251587527758), (figurative language, 8.202114773173552), (overall flow, 7.738633355230359), (specific words, 7.738633355230359), (additional details, 7.738633355230359), (sentence is complex, 7.738633355230359), (chronological narrative structure, 7.738633355230359), (different methods, 7.286150558516912), (important information, 7.2278077314643685), (various connectors, 7.20160736213704), (multiple elements, 7.0454861746704145), (overall rhythm, 6.936647958165022), (selective use, 6.8913354948431556), (interpersonal connections, 6.8913354948431556), (detailed language, 6.757804102218633), (style is polite, 6.439350371100098), (casual language, 6.3578122485224915), (detailed information, 6.064656921658687), (detailed descriptions, 5.659191813550524), (simple language structure, 5.579149105876987), (complex sentence structures, 5.519659081534761), (logical flow, 5.436048262236314), (parallel structure, 5.371509741098743), (structured word order, 5.148366189784533), (improved flow, 5.123673577194161), (rhythmic effect, 5.076045528204907), (specific pattern, 5.008604247409374), (additional information, 4.951886332650948), (descriptive language, 4.744963108934412), (accessible language, 4.742901081676369), (consistent verb tense, 4.734388547404105), (formal style, 4.480536817208877), (smooth connections, 4.44279648922603), (smooth transitions, 4.44279648922603), (elicit emotions, 4.418405036101871), (simple sentence structures, 4.405224572073567), (style is clear, 4.164416809436563), (different ideas, 4.136765278106052), (friendly tone, 4.058122150786939), (better rhythm, 4.033224599165212), (formal tone, 3.3483079177417827), (neutral tone, 3.335987433353742), (different techniques, 2.9237426382601988)]",90,47
1,1,"[(passive voice, 13.803324277842982), (overall rhythm, 11.561079930275035), (complex ideas, 10.197499004925822), (cohesive narrative, 9.810840022348287), (vivid language, 8.90619839672284), (better engagement, 8.444250254114419), (rhetorical devices, 7.744486806435306), (dynamic narrative flow, 7.450951282778578), (conceptual ideas, 7.2278077314643685), (extra components, 6.8913354948431556), (selective use, 6.8913354948431556), (tone is formal, 6.757804102218633), (neutral tone, 6.671974866707484), (verb tenses effectively, 6.64002106656225), (diverse sentence structures, 6.269411798111425), (intricate sentences, 6.19818831428321), (wide range, 5.659191813550524), (complex language, 5.618369819030268), (analysis is comprehensive, 5.54140877789414), (important ideas, 5.403258439413323), (improved flow, 5.123673577194161), (literary devices, 5.02221975915548), (vivid imagery, 4.987098042188411), (casual tone, 4.983218566139747), (advanced ideas, 4.966044632990578), (additional information, 4.951886332650948), (relative clauses, 4.907478018028927), (different methods, 4.857433705677941), (abstract concepts, 4.480536817208877), (dynamic style, 4.430526396634216), (improved impact, 4.371337525243885), (subordinate clauses, 4.355903448429664), (technical terms, 4.272897452430633), (different elements, 4.272897452430633), (intense feelings, 4.202516655668833), (writing is clear, 4.041455098301728), (structure is clear, 3.993058557439878), (intricate sentence structures, 3.924590758162417), (complex sentences, 3.80029304148484), (style is formal, 3.7132816644952102), (author is informal, 3.7014472068482074), (extra elements, 3.606137027043882), (various connectors, 3.60080368106852), (active voice, 3.1847564636298187), (descriptive language, 3.163308739289608), (different techniques, 2.9237426382601988), (grammatical structures, 2.8186524294022344), (complex sentence structures, 2.7598295407673805), (rhetorical questions, 2.5218876423761345), (figurative language, 
2.050528693293388), (various sentence structures, 1.9507140013679587), (style is informal, 1.692049508901298), (casual language, 1.5894530621306229), (consistent verb tense, 1.5781295158013684), (diverse sentence lengths, 1.2087280173234134), (informal language, 1.17854608563017)]",56,11
2,2,"[(improved writing, 9.420222517706755), (dynamic style, 8.861052793268431), (unique word selections, 7.738633355230359), (casual tone, 7.474827849209619), (strong visuals, 7.450951282778578), (hearted atmosphere, 7.450951282778578), (style is informal, 6.768198035605192), (literary devices, 6.696293012207306), (tone is positive, 6.64002106656225), (deep themes, 6.534660550904423), (subordinate clauses enhances, 6.534660550904423), (diverse sentence structures, 6.269411798111425), (common language, 6.19818831428321), (various sentence structures, 5.852142004103876), (different techniques, 5.8474852765203975), (personal opinions, 5.792723206175046), (analysis is comprehensive, 5.54140877789414), (straightforward approach, 5.54140877789414), (logical flow, 5.436048262236314), (proper grammar, 5.371509741098743), (smooth reading experience, 5.371509741098743), (relaxed style, 5.015049476751545), (descriptive language, 4.744963108934412), (informal language, 4.71418434252068), (brief language, 4.6783625605387975), (overall rhythm, 4.624431972110014), (emotional expression, 4.603139139301209), (simple sentence structures, 4.405224572073567), (friendly tone, 4.058122150786939), (informal style, 3.860511901477895), (complex ideas, 3.399166334975274), (diverse vocabulary, 3.2920681994189067), (casual language, 3.1789061242612457), (grammatical structures, 2.8186524294022344), (rhetorical devices, 2.581495602145102), (additional information, 2.475943166325474), (relative clauses, 2.4537390090144635), (diverse sentence lengths, 2.417456034646827), (figurative language, 2.050528693293388), (style is casual, 1.8960555888300947), (consistent verb tense, 1.5781295158013684), (particular structure, 1.566932757819444)]",42,10
3,3,"[(complex sentence structures, 10.349360777877678), (particular structure, 9.401596546916664), (straightforward sentence structures, 9.325716748005664), (diverse sentence lengths, 8.461096121263894), (rhetorical questions, 7.565662927128404), (additional information, 7.427829498976422), (style is informal, 6.768198035605192), (complex language structures, 6.696615835483565), (vivid language, 6.6796487975421295), (author is moderately complex, 6.64002106656225), (linear narrative format, 6.534660550904423), (style is contemplative, 6.352338994110469), (straightforward structure, 6.004032299842253), (rhetorical questions enhances reader engagement, 5.792723206175046), (thoughtful reflection, 5.340738082431989), (specific pattern, 5.008604247409374), (personal stories, 4.966044632990578), (different tones, 4.945425345787842), (strong feelings, 4.829912458665998), (relaxed feel, 4.455219009224588), (prior knowledge, 4.242125793763879), (limited range, 4.164416809436563), (writing is clear, 4.041455098301728), (diverse sentence structures, 3.9183823738196404), (style is casual, 3.7921111776601895), (verb tenses, 3.774650610871502), (various connectors, 3.60080368106852), (literary devices, 3.348146506103653), (active voice, 3.1847564636298187), (different types, 2.5911388784169063), (casual tone, 2.4916092830698733), (style is friendly, 2.3225329530259393), (passive voice, 2.300554046307164), (different voices, 2.0873144501098992), (figurative language, 2.050528693293388), (various sentence types, 1.995630167420877), (various sentence structures, 1.9507140013679587), (casual language, 1.5894530621306229), (descriptive language, 1.581654369644804), (consistent verb tense, 1.5781295158013684), (informal language, 1.17854608563017)]",41,10
4,4,"[(passive voice, 16.10387832415015), (different sentence structures, 12.143224016866743), (descriptive language, 11.071580587513628), (various connectors, 10.80241104320556), (rhetorical devices, 10.325982408580408), (relative clauses, 9.814956036057854), (active voice, 9.554269390889456), (overall rhythm, 9.248863944220028), (different elements, 8.545794904861266), (technical terms, 8.545794904861266), (writer is attentive, 7.738633355230359), (consistent rhythm, 7.738633355230359), (stylistic purposes, 7.450951282778578), (style is formal, 7.4265633289904205), (unbiased evaluation, 7.0454861746704145), (extra details, 6.8334212892523665), (various contexts, 6.757804102218633), (smooth idea transitions, 6.64002106656225), (subordinate clauses, 6.533855172644495), (diverse voices, 6.352338994110469), (author is conversational, 6.19818831428321), (verb structures, 6.19818831428321), (evaluative language, 6.19818831428321), (dynamic effect, 6.004032299842253), (thorough examination, 5.892806664732029), (personal views, 5.792723206175046), (advanced sentence structures, 5.701751427969319), (overall impact, 5.659191813550524), (clear perspective, 5.659191813550524), (engaging tone, 5.505041133723265), (various voice types, 5.199659484172083), (literary devices, 5.02221975915548), (specific pattern, 5.008604247409374), (additional information, 4.951886332650948), (thorough information, 4.945425345787842), (diverse sentence structures, 4.702058848583569), (varied sentence lengths, 4.662858374002832), (critical thinking, 4.617737938722362), (unique sentence structures, 4.5745657668571535), (tone is informal, 4.560579524882414), (smooth transitions, 4.44279648922603), (simple sentence structures, 4.405224572073567), (improved impact, 4.371337525243885), (precise language, 4.326386137381619), (transitional elements, 4.283368752297928), (style is clear, 4.164416809436563), (different ideas, 4.136765278106052), (figurative language, 4.101057386586776), (verb tenses, 
3.774650610871502), (professional tone, 3.7683414416782375), (extra elements, 3.606137027043882), (different language styles, 3.494911391933658), (brief sentences, 3.421145241694049), (transitional phrases, 3.3197927474337616), (particular structure, 3.133865515638888), (more information, 3.130135379149593), (different techniques, 2.9237426382601988), (additional clauses, 2.5273273656719524), (relaxed style, 2.5075247383757726), (vivid language, 2.22654959918071), (different voices, 2.0873144501098992), (various sentence types, 1.995630167420877), (various sentence structures, 1.9507140013679587), (style is casual, 1.8960555888300947), (style is informal, 1.692049508901298), (casual language, 1.5894530621306229), (consistent verb tense, 1.5781295158013684), (informal language, 1.17854608563017), (complex sentence structures, 0.6899573851918451)]",69,11


In [96]:
# Show the first five rows of the last item stored for the combined (LLM + gram2vec) space.
both_space_summary = interp_spaces['both'][-1]
both_space_summary.head(n=5)

Unnamed: 0,cluster_label,style_feats,num_feats,num_documents
0,0,"[(Part-of-Speech Unigram:Other (foreign words, typos, abbreviations), 131.02107444239587), (Part-of-Speech Unigram:Proper noun, 120.45769411351003), (Part-of-Speech Bigram:Proper noun followed by Other, 102.2060283372733), (tense usage, 75.39465300649763), (style is formal, 74.2656332899042), (precise language, 64.89579206072429), (Morphology Tag:Comma punctuation type, 55.79149105876987), (more information, 53.21230144554308), (additional clauses, 50.54654731343905), (Dependency Label:Object of preposition, 46.84094872942977), (professional tone, 45.220097300138846), (effective communication, 43.75176563490477), (formal manner, 43.307045578469875), (Part-of-Speech Bigram:Other followed by Proper noun, 42.02822609889577), (clear sentence style, 41.348012969058935), (Part-of-Speech Bigram:Other followed by Punctuation, 39.20796330542654), (various sentence structures, 39.01428002735918), (methodical approach, 36.13903865732184), (Dependency Label:Appositional modifier, 34.75633923705028), (parallel structures, 34.69948778854563), (punctuation:,, 33.43761843346435), (diverse sentence structures, 30.563382515793197), (transitional elements, 29.983581266085498), (consistent tone, 26.554425596411537), (Dependency Label:Prepositional modifier, 24.661939112627824), (subordinate clauses, 23.95746896636315), (interpersonal relationships, 23.215900065691077), (possessive pronouns, 22.02016453489306), (style is professional, 21.362952329727957), (transitional phrases effectively, 21.136458524011243), (extra details, 20.5002638677571), (Part-of-Speech Unigram:Adposition (preposition or postposition), 19.991537644077606), (transitional phrases, 19.918756484602568), (previous events, 19.057016982331408), (extra information, 17.924861079259195), (different sentence structures, 17.3474628812382), (relative clauses, 17.176173063101245), (more context, 16.737447317630963), (better understanding, 16.62422633368242), (rhythmic impact, 16.308144786708944), (tone is contemplative, 
15.678983193762733), (rhetorical devices, 15.488973612870613), (Part-of-Speech Unigram:Punctuation, 15.382089385976649), (Dependency Label:Noun phrase as adverbial modifier, 14.901902565557156), (diverse information, 14.090972349340829), (verb tense, 14.090972349340829), (passive voice, 13.803324277842982), (comprehensive summary, 13.515608204437266), (emotional level, 13.2800421331245), (structured manner, 11.785613329464057), (Part-of-Speech Bigram:Other followed by Other, 11.785613329464057), (literary devices, 11.718512771362786), (structured approach, 11.492406381080306), (advanced sentence structures, 11.403502855938639), (overall impact, 11.318383627101047), (various sentence structures effectively, 11.158298211753975), (different lengths, 11.01008226744653), (particular structure, 10.968529304736109), (diverse sentence lengths, 10.87855215591072), (Dependency Label:Punctuation, 10.802770676015886), (direct engagement, 9.623787906326639), (Part-of-Speech Bigram:Proper noun followed by Proper noun, 9.519416399985499), (varied sentence structures, 9.325716748005664), (Part-of-Speech Bigram:Punctuation followed by Other, 8.83724564389847), (formal language, 8.484251587527758), (figurative language, 8.202114773173552), (overall flow, 7.738633355230359), (specific words, 7.738633355230359), (additional details, 7.738633355230359), (sentence is complex, 7.738633355230359), (chronological narrative structure, 7.738633355230359), (different methods, 7.286150558516912), (important information, 7.2278077314643685), (various connectors, 7.20160736213704), (multiple elements, 7.0454861746704145), (Morphology Tag:Third person, 6.972046598958486), (overall rhythm, 6.936647958165022), (selective use, 6.8913354948431556), (interpersonal connections, 6.8913354948431556), (detailed language, 6.757804102218633), (style is polite, 6.439350371100098), (casual language, 6.3578122485224915), (detailed information, 6.064656921658687), (Dependency Label:Nominal modifier, 
6.004032299842253), (detailed descriptions, 5.659191813550524), (simple language structure, 5.579149105876987), (complex sentence structures, 5.519659081534761), (logical flow, 5.436048262236314), (parallel structure, 5.371509741098743), (structured word order, 5.148366189784533), (improved flow, 5.123673577194161), (rhythmic effect, 5.076045528204907), (specific pattern, 5.008604247409374), (additional information, 4.951886332650948), (descriptive language, 4.744963108934412), (accessible language, 4.742901081676369), (consistent verb tense, 4.734388547404105), (Morphology Tag:Past tense, 4.694110917506936), (formal style, 4.480536817208877), (smooth connections, 4.44279648922603), ...]",117,47
1,1,"[(passive voice, 13.803324277842982), (overall rhythm, 11.561079930275035), (complex ideas, 10.197499004925822), (cohesive narrative, 9.810840022348287), (vivid language, 8.90619839672284), (better engagement, 8.444250254114419), (Dependency Label:Object of preposition, 8.197166027650209), (rhetorical devices, 7.744486806435306), (dynamic narrative flow, 7.450951282778578), (conceptual ideas, 7.2278077314643685), (extra components, 6.8913354948431556), (selective use, 6.8913354948431556), (Part-of-Speech Unigram:Adverb, 6.8639727626157105), (tone is formal, 6.757804102218633), (neutral tone, 6.671974866707484), (verb tenses effectively, 6.64002106656225), (Morphology Tag:Positive degree, 6.576339118006499), (Dependency Label:Adverbial modifier, 6.539195930307283), (Part-of-Speech Unigram:Adjective, 6.391036942275427), (diverse sentence structures, 6.269411798111425), (intricate sentences, 6.19818831428321), (Dependency Label:Adjectival modifier, 5.7888925375435365), (Dependency Label:Prepositional modifier, 5.691216718298729), (wide range, 5.659191813550524), (complex language, 5.618369819030268), (analysis is comprehensive, 5.54140877789414), (important ideas, 5.403258439413323), (improved flow, 5.123673577194161), (literary devices, 5.02221975915548), (vivid imagery, 4.987098042188411), (casual tone, 4.983218566139747), (advanced ideas, 4.966044632990578), (additional information, 4.951886332650948), (relative clauses, 4.907478018028927), (different methods, 4.857433705677941), (abstract concepts, 4.480536817208877), (dynamic style, 4.430526396634216), (improved impact, 4.371337525243885), (subordinate clauses, 4.355903448429664), (technical terms, 4.272897452430633), (different elements, 4.272897452430633), (Part-of-Speech Unigram:Determiner, 4.270733515937883), (Part-of-Speech Unigram:Adposition (preposition or postposition), 4.208744767174233), (intense feelings, 4.202516655668833), (writing is clear, 4.041455098301728), (Morphology Tag:Article pronoun 
type, 4.016964078293432), (structure is clear, 3.993058557439878), (intricate sentence structures, 3.924590758162417), (complex sentences, 3.80029304148484), (Dependency Label:Determiner, 3.733120039465214), (style is formal, 3.7132816644952102), (author is informal, 3.7014472068482074), (extra elements, 3.606137027043882), (various connectors, 3.60080368106852), (Part-of-Speech Bigram:Determiner followed by Noun, 3.209624530207832), (active voice, 3.1847564636298187), (descriptive language, 3.163308739289608), (Part-of-Speech Unigram:Punctuation, 2.945506478165741), (different techniques, 2.9237426382601988), (Morphology Tag:Present tense, 2.9156672242546535), (grammatical structures, 2.8186524294022344), (complex sentence structures, 2.7598295407673805), (Part-of-Speech Unigram:Proper noun, 2.6186455242067397), (rhetorical questions, 2.5218876423761345), (Morphology Tag:Third person, 2.324015532986162), (Dependency Label:Punctuation, 2.068615661364744), (figurative language, 2.050528693293388), (various sentence structures, 1.9507140013679587), (Part-of-Speech Unigram:Auxiliary verb, 1.8912316527992417), (style is informal, 1.692049508901298), (casual language, 1.5894530621306229), (consistent verb tense, 1.5781295158013684), (Part-of-Speech Unigram:Verb, 1.317918026865811), (diverse sentence lengths, 1.2087280173234134), (informal language, 1.17854608563017), (Dependency Label:Nominal subject, 1.0989175109990812), (Part-of-Speech Unigram:Pronoun, 1.0493989694157693), (Part-of-Speech Unigram:Noun, 0.18850699328847678), (Morphology Tag:Singular number, 0.0047933765011537845)]",79,11
2,2,"[(improved writing, 9.420222517706755), (dynamic style, 8.861052793268431), (Part-of-Speech Unigram:Adverb, 8.579965953269639), (unique word selections, 7.738633355230359), (casual tone, 7.474827849209619), (strong visuals, 7.450951282778578), (hearted atmosphere, 7.450951282778578), (style is informal, 6.768198035605192), (literary devices, 6.696293012207306), (tone is positive, 6.64002106656225), (Morphology Tag:Positive degree, 6.576339118006499), (Dependency Label:Adverbial modifier, 6.539195930307283), (deep themes, 6.534660550904423), (subordinate clauses enhances, 6.534660550904423), (Dependency Label:Determiner, 6.532960069064125), (diverse sentence structures, 6.269411798111425), (common language, 6.19818831428321), (Part-of-Speech Unigram:Determiner, 5.979026922313037), (Dependency Label:Object of preposition, 5.855118591178721), (various sentence structures, 5.852142004103876), (different techniques, 5.8474852765203975), (personal opinions, 5.792723206175046), (Dependency Label:Prepositional modifier, 5.691216718298729), (analysis is comprehensive, 5.54140877789414), (straightforward approach, 5.54140877789414), (Part-of-Speech Bigram:Noun followed by Adposition, 5.469949813911995), (logical flow, 5.436048262236314), (proper grammar, 5.371509741098743), (smooth reading experience, 5.371509741098743), (Part-of-Speech Unigram:Adjective, 5.112829553820342), (relaxed style, 5.015049476751545), (descriptive language, 4.744963108934412), (Part-of-Speech Unigram:Adposition (preposition or postposition), 4.734837863071012), (informal language, 4.71418434252068), (brief language, 4.6783625605387975), (overall rhythm, 4.624431972110014), (emotional expression, 4.603139139301209), (simple sentence structures, 4.405224572073567), (friendly tone, 4.058122150786939), (informal style, 3.860511901477895), (complex ideas, 3.399166334975274), (diverse vocabulary, 3.2920681994189067), (casual language, 3.1789061242612457), (Morphology Tag:Present tense, 
2.9156672242546535), (Dependency Label:Adjectival modifier, 2.8944462687717682), (Dependency Label:Direct object, 2.891825035291894), (Part-of-Speech Unigram:Other (foreign words, typos, abbreviations), 2.8482842270086057), (grammatical structures, 2.8186524294022344), (rhetorical devices, 2.581495602145102), (additional information, 2.475943166325474), (relative clauses, 2.4537390090144635), (diverse sentence lengths, 2.417456034646827), (figurative language, 2.050528693293388), (style is casual, 1.8960555888300947), (Part-of-Speech Unigram:Punctuation, 1.6363924878698564), (consistent verb tense, 1.5781295158013684), (particular structure, 1.566932757819444), (Dependency Label:Punctuation, 1.3790771075764963), (Part-of-Speech Unigram:Verb, 1.1714826905473876), (Part-of-Speech Unigram:Pronoun, 1.0493989694157693), (Dependency Label:Nominal subject, 0.7326116739993874), (Part-of-Speech Unigram:Noun, 0.17136999389861524), (Morphology Tag:Singular number, 0.004357615001048895)]",63,10
3,3,"[(complex sentence structures, 10.349360777877678), (particular structure, 9.401596546916664), (Morphology Tag:Finite verb form, 9.327885195863395), (straightforward sentence structures, 9.325716748005664), (Dependency Label:Auxiliary verb, 9.151044787595284), (Morphology Tag:Present tense, 8.747001672763961), (diverse sentence lengths, 8.461096121263894), (rhetorical questions, 7.565662927128404), (Part-of-Speech Unigram:Auxiliary verb, 7.564926611196967), (additional information, 7.427829498976422), (style is informal, 6.768198035605192), (Morphology Tag:Infinitive verb form, 6.757804102218633), (complex language structures, 6.696615835483565), (vivid language, 6.6796487975421295), (author is moderately complex, 6.64002106656225), (linear narrative format, 6.534660550904423), (style is contemplative, 6.352338994110469), (straightforward structure, 6.004032299842253), (rhetorical questions enhances reader engagement, 5.792723206175046), (Part-of-Speech Bigram:Verb followed by Pronoun, 5.403258439413323), (thoughtful reflection, 5.340738082431989), (specific pattern, 5.008604247409374), (personal stories, 4.966044632990578), (different tones, 4.945425345787842), (strong feelings, 4.829912458665998), (Morphology Tag:Personal pronoun type, 4.5146293139082205), (relaxed feel, 4.455219009224588), (prior knowledge, 4.242125793763879), (limited range, 4.164416809436563), (writing is clear, 4.041455098301728), (Dependency Label:Adverbial modifier, 3.92351755818437), (diverse sentence structures, 3.9183823738196404), (Morphology Tag:Plural number, 3.917264718070344), (style is casual, 3.7921111776601895), (verb tenses, 3.774650610871502), (Dependency Label:Nominal subject, 3.663058369996937), (various connectors, 3.60080368106852), (Part-of-Speech Unigram:Adverb, 3.4319863813078553), (literary devices, 3.348146506103653), (active voice, 3.1847564636298187), (Dependency Label:Root of the sentence, 3.084673005072836), (Dependency Label:Adjectival modifier, 
2.8944462687717682), (Part-of-Speech Unigram:Pronoun, 2.623497423539423), (different types, 2.5911388784169063), (casual tone, 2.4916092830698733), (style is friendly, 2.3225329530259393), (passive voice, 2.300554046307164), (Part-of-Speech Unigram:Adposition (preposition or postposition), 2.1043723835871164), (different voices, 2.0873144501098992), (figurative language, 2.050528693293388), (various sentence types, 1.995630167420877), (various sentence structures, 1.9507140013679587), (Dependency Label:Prepositional modifier, 1.8970722394329096), (Dependency Label:Determiner, 1.866560019732607), (Part-of-Speech Unigram:Determiner, 1.7082934063751534), (Part-of-Speech Unigram:Punctuation, 1.6363924878698564), (casual language, 1.5894530621306229), (descriptive language, 1.581654369644804), (consistent verb tense, 1.5781295158013684), (Part-of-Speech Unigram:Verb, 1.4643533631842345), (Part-of-Speech Unigram:Adjective, 1.2782073884550855), (informal language, 1.17854608563017), (Dependency Label:Object of preposition, 1.1710237182357441), (Dependency Label:Punctuation, 1.1492309229804134), (Part-of-Speech Unigram:Noun, 0.17136999389861524), (Morphology Tag:Singular number, 0.004357615001048895)]",66,10
4,4,"[(passive voice, 16.10387832415015), (different sentence structures, 12.143224016866743), (descriptive language, 11.071580587513628), (various connectors, 10.80241104320556), (rhetorical devices, 10.325982408580408), (relative clauses, 9.814956036057854), (active voice, 9.554269390889456), (overall rhythm, 9.248863944220028), (Part-of-Speech Unigram:Adverb, 8.579965953269639), (different elements, 8.545794904861266), (technical terms, 8.545794904861266), (Dependency Label:Adverbial modifier, 7.84703511636874), (writer is attentive, 7.738633355230359), (consistent rhythm, 7.738633355230359), (Dependency Label:Determiner, 7.466240078930428), (stylistic purposes, 7.450951282778578), (style is formal, 7.4265633289904205), (unbiased evaluation, 7.0454861746704145), (Morphology Tag:Third person, 6.972046598958486), (extra details, 6.8334212892523665), (various contexts, 6.757804102218633), (smooth idea transitions, 6.64002106656225), (subordinate clauses, 6.533855172644495), (diverse voices, 6.352338994110469), (author is conversational, 6.19818831428321), (verb structures, 6.19818831428321), (evaluative language, 6.19818831428321), (dynamic effect, 6.004032299842253), (thorough examination, 5.892806664732029), (Dependency Label:Object of preposition, 5.855118591178721), (Morphology Tag:Present tense, 5.831334448509307), (personal views, 5.792723206175046), (advanced sentence structures, 5.701751427969319), (overall impact, 5.659191813550524), (clear perspective, 5.659191813550524), (engaging tone, 5.505041133723265), (various voice types, 5.199659484172083), (Part-of-Speech Unigram:Determiner, 5.12488021912546), (literary devices, 5.02221975915548), (specific pattern, 5.008604247409374), (additional information, 4.951886332650948), (thorough information, 4.945425345787842), (Dependency Label:Prepositional modifier, 4.7426805985822735), (diverse sentence structures, 4.702058848583569), (varied sentence lengths, 4.662858374002832), (critical thinking, 
4.617737938722362), (unique sentence structures, 4.5745657668571535), (tone is informal, 4.560579524882414), (smooth transitions, 4.44279648922603), (simple sentence structures, 4.405224572073567), (improved impact, 4.371337525243885), (precise language, 4.326386137381619), (transitional elements, 4.283368752297928), (style is clear, 4.164416809436563), (different ideas, 4.136765278106052), (figurative language, 4.101057386586776), (Part-of-Speech Unigram:Adjective, 3.8346221653652566), (verb tenses, 3.774650610871502), (professional tone, 3.7683414416782375), (Part-of-Speech Unigram:Adposition (preposition or postposition), 3.682651671277454), (extra elements, 3.606137027043882), (different language styles, 3.494911391933658), (brief sentences, 3.421145241694049), (transitional phrases, 3.3197927474337616), (Morphology Tag:Positive degree, 3.2881695590032494), (particular structure, 3.133865515638888), (more information, 3.130135379149593), (Part-of-Speech Unigram:Punctuation, 2.945506478165741), (different techniques, 2.9237426382601988), (Dependency Label:Direct object, 2.891825035291894), (Part-of-Speech Unigram:Other (foreign words, typos, abbreviations), 2.8482842270086057), (Dependency Label:Punctuation, 2.5283080305569094), (additional clauses, 2.5273273656719524), (relaxed style, 2.5075247383757726), (vivid language, 2.22654959918071), (different voices, 2.0873144501098992), (various sentence types, 1.995630167420877), (various sentence structures, 1.9507140013679587), (style is casual, 1.8960555888300947), (Dependency Label:Nominal subject, 1.8315291849984685), (style is informal, 1.692049508901298), (casual language, 1.5894530621306229), (consistent verb tense, 1.5781295158013684), (informal language, 1.17854608563017), (Part-of-Speech Unigram:Verb, 1.1714826905473876), (complex sentence structures, 0.6899573851918451), (Part-of-Speech Unigram:Pronoun, 0.5246994847078846), (Part-of-Speech Unigram:Noun, 0.18850699328847678), (Morphology Tag:Singular 
number, 0.0047933765011537845)]",89,11
