In [1]:
from contrastive_keyword_extraction import contrastive_extraction, final_score, combine_keywords
import sqlite3
import pandas as pd
from policy_processing import *
from cleantext import clean
from baselines import *
from tqdm import trange
import string
import pickle
import sentence_comparision
import sentence_importance
import summary



In [2]:
# Load the whole `small10k` table into memory, then release the database
# handle: nothing below this cell queries sqlite again, and an open connection
# would otherwise linger for the lifetime of the kernel.
conn = sqlite3.connect('../datasets/small10k.sqlite')
try:
    df = pd.read_sql("SELECT * FROM small10k", con=conn)
finally:
    conn.close()

In [3]:
def cleaning_func(text):
    """Run ``clean`` on ``text`` with this notebook's fixed option set.

    Only ``fix_unicode`` and ``to_ascii`` are switched on. Every other switch
    (lowercasing, line-break stripping, URL / e-mail / phone / number / digit /
    currency replacement, punctuation removal) is passed as ``False``; the
    ``replace_with_*`` tokens are still forwarded unchanged.
    """
    options = {
        # switches that are enabled
        "fix_unicode": True,        # fix various unicode errors
        "to_ascii": True,           # transliterate to closest ASCII representation
        # switches that are disabled
        "lower": False,
        "no_line_breaks": False,
        "no_urls": False,
        "no_emails": False,
        "no_phone_numbers": False,
        "no_numbers": False,
        "no_digits": False,
        "no_currency_symbols": False,
        "no_punct": False,
        # replacement tokens handed to `clean` alongside the switches
        "replace_with_punct": "",
        "replace_with_url": "<URL>",
        "replace_with_email": "<EMAIL>",
        "replace_with_phone_number": "<PHONE>",
        "replace_with_number": "<NUMBER>",
        "replace_with_digit": "0",
        "replace_with_currency_symbol": "<CUR>",
        "lang": "en",
    }
    return clean(text, **options)

In [4]:
# Restore the pickled site ids from disk.
# NOTE(review): pickle.load executes arbitrary code from the file; presumably
# "usefull_ids.pkl" is produced by this project -- confirm before loading a copy
# obtained from anywhere else.
with open("usefull_ids.pkl", mode="rb") as ids_file:
    all_usefull_ids = pickle.load(ids_file)

In [5]:
def create_collection(df, 
                      sites, 
                      ke_extractor = keyword_extraction.extract_yake, 
                      num_keywords=10,
                      max_ngram=2, 
                      sentence_matcher = sentence_comparision.match_sentences_semantic_search,
                      importance_estimator = sentence_importance.text_rank_importance,
                      use_furthest=False, 
                      name_prefix="",
                      make_data_persistent=False, 
                      path="dataframes"):
    """Run the contrastive keyword pipeline and a baseline extractor per site.

    For every id in ``sites`` the function builds the site's documents
    (``create_data`` -> ``get_policy_texts`` -> ``clean_text`` with the
    notebook-level ``cleaning_func``), runs ``contrastive_extraction`` on them
    and runs ``ke_extractor`` on the added content via ``baseline_diff_content``.
    Three frames are built per site and optionally written as CSV.

    Parameters
    ----------
    df : DataFrame that is sorted by ``['year', 'phase']`` and handed to
        ``create_data`` together with the site id.
    sites : indexable, sized collection of site ids.
    ke_extractor : baseline keyword extractor; called as
        ``ke_extractor(x, max_ngram_size=max_ngram, numOfKeywords=num_keywords)``.
    num_keywords : forwarded to ``ke_extractor`` as ``numOfKeywords``.
    max_ngram : forwarded to both ``contrastive_extraction`` and ``ke_extractor``.
    sentence_matcher : forwarded to ``contrastive_extraction`` as ``match_sentences``.
    importance_estimator : forwarded to ``contrastive_extraction``.
    use_furthest : when true, only the first and the last document are compared.
    name_prefix : prefix of the CSV file names.
    make_data_persistent : when true, write the three frames of every site to
        ``{path}/{name_prefix}_{inter_keywords|keywords|baseline_keywords}_{site_id}.csv``.
    path : output directory; created if it does not exist yet.

    Returns
    -------
    (total_frame, intermediate_frame, baseline_frame) of the LAST processed
    site only -- earlier sites are available solely through the CSV files.
    When ``sites`` is empty, three empty frames with the usual columns are
    returned instead of failing on unbound names.
    """
    import os  # local import: only needed to prepare the output directory

    # Defaults for an empty `sites`; overwritten on every loop iteration.
    total_frame = pd.DataFrame({'keyword': [], 'score': []})
    intermediate_frame = pd.DataFrame({'delta': [], 'keyword': [], 'score': []})
    baseline_frame = pd.DataFrame({'delta': [], 'keyword': [], 'score': []})

    # `to_csv` does not create missing directories, so make sure the target exists
    # before spending minutes on extraction.
    if make_data_persistent and path:
        os.makedirs(path, exist_ok=True)

    for i in trange(len(sites)):

        site_id = sites[i]

        # sort first by year, then by phase
        data = create_data(df.sort_values(by=['year', 'phase']), site_id)

        # get the actual strings
        policy_texts = get_policy_texts(data)

        # cleaned documents using the notebook-level cleaning_func
        documents = clean_text(policy_texts, cleaning_func)

        # use only the first and last version
        if use_furthest:
            documents = [documents[0], documents[-1]]

        # run CKE-pipeline; only the keywords and the added content are used below
        keywords, _, _, added, _ = contrastive_extraction(documents,
                                                          max_ngram=max_ngram,
                                                          min_ngram=1, show_changes=False,
                                                          symbols_to_remove=string.punctuation,
                                                          match_sentences=sentence_matcher,
                                                          importance_estimator=importance_estimator)

        # combined keywords over all versions
        total_keywords = combine_keywords(keywords)

        # materialise the dict views so the frame is built from plain lists
        total_frame = pd.DataFrame({'keyword': list(total_keywords.keys()),
                                    'score': list(total_keywords.values())})

        # intermediate keywords
        inter_kws, inter_scores, delta_int = create_inter_frame(keywords)

        intermediate_frame = pd.DataFrame({'delta': delta_int, 'keyword': inter_kws, 'score': inter_scores})

        # run the specified baseline extractor on the added content
        baseline_keywords = baseline_diff_content(added, lambda x: ke_extractor(x, max_ngram_size=max_ngram,
                                                                               numOfKeywords=num_keywords))

        baseline_kws, baseline_scores, delta_list = create_baseline_frame(baseline_keywords)

        baseline_frame = pd.DataFrame({'delta': delta_list, 'keyword': baseline_kws, 'score': baseline_scores})

        if make_data_persistent:

            intermediate_frame.to_csv(f"{path}/{name_prefix}_inter_keywords_{site_id}.csv", index=False)

            total_frame.to_csv(f"{path}/{name_prefix}_keywords_{site_id}.csv", index=False)

            baseline_frame.to_csv(f"{path}/{name_prefix}_baseline_keywords_{site_id}.csv", index=False)

    return total_frame, intermediate_frame, baseline_frame

In [6]:
from itertools import product


def cartesian_product(params):
    """Return every combination of the value lists in ``params`` as a list of tuples.

    Each tuple holds one element per key, in the dict's key order.
    """
    value_lists = tuple(params.values())
    return [combo for combo in product(*value_lists)]



In [7]:
# Search grid for the pipeline. Key order matters: create_cartesian_collection
# unpacks every combination positionally as (matcher, importance_estimator).
parameters = dict(
    matcher=[
        sentence_comparision.match_sentences_semantic_search,
        sentence_comparision.match_sentences_tfidf_weighted,
    ],
    importance=[
        sentence_importance.text_rank_importance,
        sentence_importance.yake_weighted_importance,
    ],
)


In [8]:
def create_cartesian_collection(params,
                                df,
                                sites,
                                baseline_ke_extractor = keyword_extraction.extract_yake,
                                num_keywords=10,
                                max_ngram=2,
                                use_furthest=False,
                                make_data_persistent=False,
                                path="dataframes",
                                compare_k = 15):
    """Run ``create_collection`` once per parameter combination and summarise each run.

    ``params`` is expanded with ``cartesian_product``; every resulting tuple is
    unpacked positionally as ``(matcher, importance_estimator)``, so ``params``
    must yield exactly two values per combination, in that order. Run ``n``
    (0-based) uses the file-name prefix ``combination_n``.

    After each run ``summary.extensive_summary`` is called with ``k=compare_k``
    on the ``combination_n_inter_keywords`` and ``combination_n_baseline_keywords``
    names under ``path``.
    NOTE(review): those names match the CSV files that ``create_collection`` only
    writes when ``make_data_persistent`` is true -- presumably the summary reads
    them from disk; confirm behaviour when persistence is switched off.

    The remaining arguments are forwarded unchanged to ``create_collection``.
    Nothing is returned.
    """
    for run_index, (matcher, importance_estimator) in enumerate(cartesian_product(params)):

        print(f"Contrastive Keyword Extraction pipeline is being ran with combination {run_index}:")

        # The frames returned here describe only the last site; they are not needed.
        create_collection(
            df=df,
            sites=sites,
            ke_extractor=baseline_ke_extractor,
            num_keywords=num_keywords,
            max_ngram=max_ngram,
            sentence_matcher=matcher,
            importance_estimator=importance_estimator,
            use_furthest=use_furthest,
            name_prefix=f"combination_{run_index}",
            make_data_persistent=make_data_persistent,
            path=path,
        )

        summary.extensive_summary(
            sites,
            show_results=True,
            k=compare_k,
            name_a=f"combination_{run_index}_inter_keywords",
            name_b=f"combination_{run_index}_baseline_keywords",
            save_prefix=f"combination_{run_index}_",
            path=path,
        )
        

In [None]:
# Full grid run on the first 50 site ids; results are written below "combination/".
create_cartesian_collection(
    params=parameters,
    df=df,
    sites=all_usefull_ids[:50],
    baseline_ke_extractor=keyword_extraction.extract_yake,
    num_keywords=15,
    max_ngram=3,
    use_furthest=True,  # only compare the first and last document
    make_data_persistent=True,
    path="combination",
    compare_k=20,
)

Contrastive Keyword Extraction pipeline is being ran with combination 0:


100%|██████████| 50/50 [04:32<00:00,  5.45s/it]


Unnamed: 0,Site,Delta,F1,Precision,Recall,IoU,#overlaps
0,106506,0,0.0,0.0,0.0,0.0,0
1,98325,0,0.166667,0.333333,0.25,0.166667,5
2,106533,0,0.06,0.2,0.15,0.096774,3
3,90158,0,0.166667,0.333333,0.25,0.166667,5
4,90162,0,0.426667,0.533333,0.4,0.296296,8
5,98356,0,0.326667,0.466667,0.35,0.28,7
6,106560,0,0.15,0.5,0.15,0.130435,3
7,106565,0,0.0,0.0,0.0,0.0,0
8,98377,0,0.5,0.5,0.5,0.333333,1
9,90195,0,0.514286,0.6,0.45,0.346154,9


Contrastive Keyword Extraction pipeline is being ran with combination 1:


 74%|███████▍  | 37/50 [03:09<01:24,  6.49s/it]