# Keyword Collection Creation

In [1]:
# NOTE: the original import order is kept on purpose - the two star imports
# may shadow earlier names, so reordering could change which definition wins.
from contrastive_keyword_extraction import contrastive_extraction, final_score
import os
import sqlite3
import pandas as pd
from policy_processing import *
from cleantext import clean
from baselines import *
from tqdm import trange
import string
import pickle
import sentence_comparision
import sentence_importance
import summary
import utilities
import news_processing



In [2]:
# Directory holding the SQLite datasets. Set the NEWSEDITS_DATA_DIR environment
# variable to run the notebook on another machine; the fallback is the path the
# notebook was originally developed against.
DATA_DIR = os.environ.get(
    "NEWSEDITS_DATA_DIR",
    "/home/lukas/Documents/semester6/NLP/newsEdits/datasets",
)

# Policy database: create_collection reads its `small10k` table.
conn = sqlite3.connect(os.path.join(DATA_DIR, "small10k.sqlite"))
# News database (AP matched sentences); presumably meant for runs with
# is_policy=False - it is not used by the visible cells.
conn_news = sqlite3.connect(os.path.join(DATA_DIR, "ap-matched-sentences.db"))


In [3]:
def cleaning_func(text):
    """Clean ``text`` with ``cleantext.clean`` using a conservative configuration.

    Only unicode fixing and ASCII transliteration are switched on; every
    ``no_*`` switch and lowercasing are off. The ``replace_with_*`` tokens are
    still passed so the configuration stays explicit (presumably they have no
    effect while the matching ``no_*`` switch is False).

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    Whatever ``clean`` returns for ``text`` with the settings below.
    """
    return clean(
        text,
        fix_unicode=True,               # fix various unicode errors
        to_ascii=True,                  # transliterate to closest ASCII representation
        lower=False,                    # lowercase text
        no_line_breaks=False,           # fully strip line breaks as opposed to only normalizing them
        no_urls=False,                  # replace all URLs with a special token
        no_emails=False,                # replace all email addresses with a special token
        no_phone_numbers=False,         # replace all phone numbers with a special token
        no_numbers=False,               # replace all numbers with a special token
        no_digits=False,                # replace all digits with a special token
        no_currency_symbols=False,      # replace all currency symbols with a special token
        no_punct=False,                 # remove punctuations
        replace_with_punct="",          # instead of removing punctuations you may replace them
        replace_with_url="<URL>",
        replace_with_email="<EMAIL>",
        replace_with_phone_number="<PHONE>",
        replace_with_number="<NUMBER>",
        replace_with_digit="0",
        replace_with_currency_symbol="<CUR>",
        lang="en",
    )

In [4]:
def _keyword_frame(keywords, scores):
    """Build the two-column ``keyword``/``score`` DataFrame used for every result table."""
    return pd.DataFrame({'keyword': keywords, 'score': scores})


def create_collection(conn, 
                      sites, 
                      ke_extractor = keyword_extraction.extract_yake, 
                      num_keywords=10,
                      max_ngram=2, 
                      sentence_matcher = sentence_comparision.match_sentences_semantic_search,
                      importance_estimator = sentence_importance.text_rank_importance,
                      use_furthest=False, 
                      name_prefix="",
                      make_data_persistent=False, 
                      path="dataframes",
                      threshold=0.6,
                      stopwords=None,
                      combinator=utilities.alpha_combination,
                      gamma = 0.5,
                      num_splits=1,
                      is_policy=True,
                      matching_model="all-MiniLM-L6-v2"):
    """Run contrastive keyword extraction plus four baselines for every site.

    For each id in ``sites`` the first and the last available document version
    are compared with ``contrastive_extraction``. The pipeline keywords and the
    keywords of the four baselines are turned into ``keyword``/``score``
    DataFrames and, if requested, written to CSV.

    Parameters
    ----------
    conn : database connection passed to ``pd.read_sql`` (policy mode) or to
        ``news_processing.parse_html_to_string`` (news mode).
    sites : sequence of ids; indexed positionally.
    ke_extractor : keyword extractor forwarded to baselines 1 and 2.
    num_keywords : number of keywords requested from baseline 1.
    max_ngram : maximum n-gram size for the pipeline and all baselines.
    sentence_matcher : forwarded to ``contrastive_extraction`` as
        ``match_sentences``.
    importance_estimator : forwarded to ``contrastive_extraction``.
    use_furthest : accepted for backward compatibility but not read by this
        function; the first and last version are always the ones compared.
    name_prefix, path : CSV files are written to
        ``{path}/{name_prefix}_<method>_keywords_{site_id}.csv``.
    make_data_persistent : when True, write the five frames of every site to
        CSV (``path`` is created if it does not exist).
    threshold : forwarded to ``contrastive_extraction``.
    stopwords : extra stopwords for the pipeline and baseline 4; ``None``
        (the default) means no extra stopwords.
    combinator, gamma : forwarded to ``contrastive_extraction`` as
        ``combinator`` and ``alpha_gamma``.
    num_splits : forwarded as ``top_k`` after conversion with ``int``.
    is_policy : True loads documents from the ``small10k`` table; False loads
        them through ``news_processing.parse_html_to_string``.
    matching_model : forwarded to ``contrastive_extraction``.

    Returns
    -------
    tuple of five DataFrames ``(pipeline, baseline1, baseline2, baseline3,
    baseline4)`` belonging to the LAST site processed. When ``sites`` is empty
    the five frames are empty.
    """
    # A None sentinel avoids sharing one mutable default list between calls.
    stopwords = [] if stopwords is None else stopwords

    # Create the output directory up front so the first to_csv cannot fail on
    # a missing folder.
    if make_data_persistent and path:
        os.makedirs(path, exist_ok=True)

    # The table does not depend on the site, so it is read once instead of
    # once per loop iteration.
    policy_table = pd.read_sql("SELECT * FROM small10k", con=conn) if is_policy else None

    # Defined before the loop so an empty `sites` returns empty frames instead
    # of raising UnboundLocalError at the return statement.
    pipeline_frame, baseline_frame1, baseline_frame2, baseline_frame3, baseline_frame4 = (
        _keyword_frame([], []) for _ in range(5)
    )

    for i in trange(len(sites)):

        site_id = sites[i]

        if is_policy:
            # sort first by year, then by phase; sort_values returns a new
            # frame, so create_data gets a fresh object for every site
            data = create_data(policy_table.sort_values(by=['year', 'phase']), site_id)

            # get the actual strings
            policy_texts = get_policy_texts(data)

            # cleaned documents using cleaning_func
            documents = clean_text(policy_texts, cleaning_func)
        else:
            # the loop variable is the article id in news mode (the previous
            # code referenced an undefined name `article_id` here)
            documents = news_processing.parse_html_to_string(site_id, conn)

        # only the first and the last version are compared
        documents = [documents[0], documents[-1]]

        # run CKE-pipeline: extract keywords and matched sentences; results
        # that are not needed below are bound to underscore names
        (keywords, _matched_dict, _changed_indices, additions, deletions,
         _new_indices, _ranking, _removed, _matched_indices,
         unified_deletions) = contrastive_extraction(documents,
                                                     max_ngram=max_ngram,
                                                     min_ngram=1,
                                                     show_changes=False,
                                                     symbols_to_remove=string.punctuation,
                                                     importance_estimator=importance_estimator,
                                                     match_sentences=sentence_matcher,
                                                     threshold=threshold,
                                                     extra_stopwords=stopwords,
                                                     top_k=int(num_splits),
                                                     combinator=combinator,
                                                     alpha_gamma=gamma,
                                                     matching_model=matching_model)

        # pipeline keywords
        pipeline_frame = _keyword_frame(*extract_from_dict(keywords))

        # baseline 1: keyword extraction on the diff content
        baseline_keywords = baseline_diff_content(additions, unified_deletions, ke_extractor, num_keywords, max_ngram)
        baseline_frame1 = _keyword_frame(*extract_from_tuple_list(baseline_keywords))

        # baseline 2: keywords of the documents that occur in the diff
        baseline_keywords2 = baseline_keywords_in_diff(documents, ke_extractor, additions, deletions, candidates=50, max_ngram=max_ngram)
        baseline_frame2 = _keyword_frame(*extract_from_dict(baseline_keywords2))

        # baseline 3
        baseline_keywords3 = baseline3(documents, additions, unified_deletions, max_ngram)
        baseline_frame3 = _keyword_frame(*extract_from_dict(baseline_keywords3))

        # baseline 4
        baseline_keywords4 = baseline4(documents, max_ngram, stopwords)
        baseline_frame4 = _keyword_frame(*extract_from_dict(baseline_keywords4))

        # decide whether to actually save the data
        if make_data_persistent:

            pipeline_frame.to_csv(f"{path}/{name_prefix}_pipeline_keywords_{site_id}.csv", index=False)

            baseline_frame1.to_csv(f"{path}/{name_prefix}_baseline1_keywords_{site_id}.csv", index=False)

            baseline_frame2.to_csv(f"{path}/{name_prefix}_baseline2_keywords_{site_id}.csv", index=False)

            baseline_frame3.to_csv(f"{path}/{name_prefix}_baseline3_keywords_{site_id}.csv", index=False)

            baseline_frame4.to_csv(f"{path}/{name_prefix}_baseline4_keywords_{site_id}.csv", index=False)

    return pipeline_frame, baseline_frame1, baseline_frame2, baseline_frame3, baseline_frame4

In [5]:
# Site ids processed by the standard run (passed as `sites` to create_collection).
ids = [
    90536, 90344, 98640, 98585, 99880,
    108079, 90555, 90545, 98553, 98572,
    98659, 98706, 108052, 108097, 100541,
    108771, 2435, 100595, 108778, 100419,
    108438, 108835, 106348, 106486, 90041,
]

In [6]:
# Standard run over `ids`: YAKE baseline extractor, semantic-search sentence
# matching and TextRank sentence importance. With make_data_persistent=True the
# pipeline frame and the four baseline frames of every site are written to
# dataframes/standard_*_keywords_<site_id>.csv.
create_collection(
    conn,
    ids,
    ke_extractor=keyword_extraction.extract_yake,
    num_keywords=15,
    max_ngram=2,
    sentence_matcher=sentence_comparision.match_sentences_semantic_search,
    importance_estimator=sentence_importance.text_rank_importance,
    use_furthest=False,
    name_prefix="standard",
    make_data_persistent=True,
    path="dataframes",
    threshold=0.65,
    stopwords=nltk.corpus.stopwords.words("english"),
    combinator=utilities.alpha_combination,
    gamma=0.5,
    num_splits=1,
    is_policy=True,
    matching_model="msmarco-distilbert-base-v4",
)

100%|██████████| 25/25 [11:17<00:00, 27.11s/it]


Unnamed: 0,keyword,score
0,app,0.1324
1,mobile,0.057117
2,mobile app,0.057117
3,may,0.0378
4,certain,0.037578
5,additional,0.03115
6,choice,0.03115
7,within,0.03115
8,listed,0.028183
9,use,0.022545


In [7]:
from itertools import product


def cartesian_product(params):
    """Return every combination of the candidate values in ``params``.

    ``params`` maps a parameter name to an iterable of candidate values. The
    result is a list of tuples, one per combination, whose entries follow the
    order of the dict's keys.
    """
    value_lists = params.values()
    return [combination for combination in product(*value_lists)]



# Parameters to Use

In [12]:
# Search grid: each key lists the candidate values that are combined with
# every value of the other keys.
parameters = {
    "matcher": [
        sentence_comparision.match_sentences_semantic_search,
        sentence_comparision.match_sentences_tfidf_weighted,
    ],
    "ie": [
        sentence_importance.text_rank_importance,
        sentence_importance.yake_weighted_importance,
    ],
    "threshold": [0.5, 0.6, 0.7],
}


In [13]:
# Size of the search grid: 2 matchers x 2 importance estimators x 3 thresholds.
len(cartesian_product(parameters))

12

In [None]:
def create_cartesian_collection(params, 
                                df,
                                sites, 
                                baseline_ke_extractor = keyword_extraction.extract_yake, 
                                num_keywords=10, 
                                max_ngram=2, 
                                use_furthest=False,
                                make_data_persistent=False,
                                file_prefix = "combination",
                                path="dataframes",
                                compare_k = 15):
    """Run ``create_collection`` once per combination of a parameter grid.

    Every combination produced by ``cartesian_product(params)`` is run through
    ``create_collection`` with the file prefix ``{file_prefix}_{count}``, and
    afterwards ``summary.extensive_summary`` is called for baseline 1 and
    baseline 2.

    Parameters
    ----------
    params : dict mapping grid keys to lists of candidate values. Supported
        keys are ``"matcher"`` (sentence matcher), ``"ie"`` (importance
        estimator) and ``"threshold"``; keys that are absent fall back to the
        defaults of ``create_collection``.
    df : forwarded to ``create_collection`` as its ``conn`` argument. Despite
        the name this has to be whatever ``create_collection`` accepts there.
    sites : ids forwarded to ``create_collection`` and to the summary.
    baseline_ke_extractor : forwarded as ``ke_extractor``.
    num_keywords, max_ngram, use_furthest, make_data_persistent, path :
        forwarded unchanged to ``create_collection``.
    file_prefix : prefix of all generated file names.
    compare_k : forwarded to ``summary.extensive_summary`` as ``k``.

    Raises
    ------
    ValueError
        If ``params`` contains a key other than the supported ones.

    Notes
    -----
    NOTE(review): ``summary.extensive_summary`` presumably reads the CSV files
    written by ``create_collection`` - if so, ``make_data_persistent`` must be
    True for the summary step to find them. Confirm against ``summary``.
    """
    # Grid keys are matched by name, so the grid may hold any subset of them in
    # any order (the previous positional unpacking into two names failed for a
    # three-key grid and silently dropped the importance estimator).
    grid_to_kwarg = {
        "matcher": "sentence_matcher",
        "ie": "importance_estimator",
        "threshold": "threshold",
    }
    unknown_keys = [key for key in params if key not in grid_to_kwarg]
    if unknown_keys:
        raise ValueError(f"Unsupported parameter grid keys: {unknown_keys}")

    grid_keys = list(params)

    for count, combination in enumerate(cartesian_product(params)):

        overrides = {grid_to_kwarg[key]: value for key, value in zip(grid_keys, combination)}

        print(f"Contrastive Keyword Extraction pipeline is being ran with combination {count}:")

        # The returned frames only describe the last site, so they are not
        # kept; the per-site results are the CSV files written by this call.
        create_collection(conn=df,
                          sites=sites,
                          ke_extractor=baseline_ke_extractor,
                          num_keywords=num_keywords,
                          max_ngram=max_ngram,
                          use_furthest=use_furthest,
                          name_prefix=f"{file_prefix}_{count}",
                          make_data_persistent=make_data_persistent,
                          path=path,
                          **overrides)

        baselines = ["baseline1", "baseline2"]
        for baseline in baselines:
            # name_a follows the "<prefix>_pipeline_keywords" pattern that
            # create_collection uses for its CSV files, mirroring name_b.
            # NOTE(review): previously "<prefix>_inter_keywords", which no
            # visible code writes - confirm against summary.extensive_summary.
            summary.extensive_summary(sites,
                                      show_results=False,
                                      k=compare_k,
                                      name_a=f"{file_prefix}_{count}_pipeline_keywords",
                                      name_b=f"{file_prefix}_{count}_{baseline}_keywords",
                                      save_prefix=f"{file_prefix}_{count}_{baseline}_",
                                      path=path)
        

In [None]:
# Second batch of site ids; concatenated with `small` for the grid run.
med_ids = [
    106750, 108123, 90555, 98640,
    108026, 90344, 98553, 98659,
    108079, 90536, 98572, 98706,
    108120, 90545, 98585, 99880,
]

In [None]:
# Grid run: one create_collection pass per combination in `parameters`, with
# results written under Combinations/ using the "threshold" file prefix.
# NOTE(review): `df` and `small` are not defined in the visible cells, so this
# cell raises NameError on a fresh kernel. Define both before running it
# (presumably `df` is the policy data source and `small` another id list -
# confirm with the author).
create_cartesian_collection(parameters,
                            df,
                            small+med_ids,
                            baseline_ke_extractor = keyword_extraction.extract_yake,
                            num_keywords=20,
                            max_ngram=2,
                            use_furthest=True, # only compare the first and last document
                            make_data_persistent=True,
                            file_prefix="threshold",
                            path="Combinations")