In [36]:
import numpy as np
import pandas as pd
import json
from transformers import AutoModel, AutoTokenizer,BertModel
from summ import AnnotatedExtractiveSummarizer, NDCG
import os
import altair as alt
import sys
import random

In [25]:
# Make the parent directory importable (presumably where the local `summ`
# package lives -- TODO confirm). The membership check keeps the cell
# idempotent: re-running it no longer appends duplicate entries to sys.path.
# NOTE(review): the first cell already imports from `summ`; on a fresh kernel
# this path setup has to run before that import.
if os.path.abspath("..") not in sys.path:
    sys.path.append(os.path.abspath(".."))

In [26]:
import summ

In [27]:
# Directory with the annotated documents; every entry is later opened and
# parsed with json.load by the evaluation sweep.
jsons_dir = 'Labeled'
# os.listdir returns names in arbitrary order and also lists sub-directories
# (e.g. Jupyter's .ipynb_checkpoints). Keep regular files only and sort them so
# every run processes the same documents in the same order.
labeled = sorted(
    name for name in os.listdir(jsons_dir)
    if os.path.isfile(os.path.join(jsons_dir, name))
)

In [28]:
# Tokenizer and encoder are loaded from the same checkpoint name so that they
# stay in sync.
pretrained_name = "dbmdz/bert-base-turkish-cased"

tokenizer = AutoTokenizer.from_pretrained(pretrained_name)

# Hidden states are requested explicitly; presumably the summarizer picks its
# sentence representations from them via `layers=[...]` -- TODO confirm.
model = BertModel.from_pretrained(
    pretrained_name,
    output_hidden_states=True,
)

### Scores by Summary Percentage

In [49]:
# Sweep over summary sizes (10 % ... 100 % of each document's sentences) and
# score two selection strategies per document:
#   * "bert"   - AnnotatedExtractiveSummarizer, scored via summarizer.score()
#   * "random" - k sentence indices drawn at random, scored with NDCG(k)
# scores_per_perc[method][i] ends up holding one score per document for p[i].
scores_per_perc = {
    "bert":[],
    "random":[],
}
# Paths for which the BERT summarizer raised (one entry per failing
# document/percentage pair); a NaN score is stored for them.
unsuccess = []
#p = [0.05,.1,.15,.2,.25,.30,.35,.40,.45,.5,.55,.6,.65,.7,.75,.8,.85,.9,.95,1.0]
p = [i/10 for i in range(1,11,1)]
print(p)

# One seed for both strategies. The random baseline draws from its own seeded
# generator so that re-running the cell reproduces the same baseline scores
# without touching the global `random` state.
SEED = 15
baseline_rng = random.Random(SEED)

for perc in p:
    print(f'Summarizing {perc*100} %')
    scores = {
        "bert":[],
        "random":[],
    }
    for path in labeled:

        jpath = f'{jsons_dir}/{path}'

        # The context manager closes the file; no explicit close() is needed.
        with open(jpath,'rb') as f:
            j = json.load(f)

        len_text = len(j['sentences'])
        # Number of sentences to keep; int() truncates, so short documents can
        # yield k == 0 at small percentages.
        k = int(len_text * perc)

        ndcg_score = NDCG(k=k)

        ## BERT
        try:
            summarizer = AnnotatedExtractiveSummarizer(
                tokenizer,
                model,
                k=k,
                layers=[11],
                random_state=SEED,
                doEval=True,
                use_cuda=True,
            )
            summary = summarizer.summarize(jpath)
            scores["bert"].append(summarizer.score())

        except Exception as exc:
            # Best effort: keep the sweep going, but only for ordinary errors
            # (KeyboardInterrupt / SystemExit still propagate) and report why
            # the document failed instead of hiding it.
            print(f'  BERT summarization failed for {jpath}: {exc!r}')
            scores["bert"].append(np.nan)
            unsuccess.append(jpath)

        ## RANDOM
        labels = np.array([sentence['deletedInRound'] for sentence in j['sentences']]) ## scores for each sentence
        rand_selected_indices = baseline_rng.sample(range(len_text), k)
        rand_score = ndcg_score(labels, rand_selected_indices)
        scores["random"].append(rand_score)

    # `method` rather than `k`, so the summary size above is not shadowed.
    for method in scores_per_perc.keys():
        scores_per_perc[method].append(scores[method])
    
    
    
    
    

[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
Summarizing 10.0 %
Summarizing 20.0 %
Summarizing 30.0 %
Summarizing 40.0 %
Summarizing 50.0 %
Summarizing 60.0 %
Summarizing 70.0 %
Summarizing 80.0 %
Summarizing 90.0 %
Summarizing 100.0 %


  return self.fit(X, sample_weight=sample_weight)._transform(X)
  return self.fit(X, sample_weight=sample_weight)._transform(X)


In [40]:
# Paths for which the BERT summarizer raised during the sweep (a NaN score was
# recorded for each of them).
unsuccess

[]

In [85]:
# Flatten the per-percentage score lists into three parallel sequences
# (score, method label, summary fraction) for a long-format table.
bert_scores = np.array(scores_per_perc["bert"])
random_scores = np.array(scores_per_perc["random"])

# Row i of each score array belongs to p[i]. Taking the fraction from `p`
# (instead of re-deriving it as (i+1)/10) keeps the labels correct if the
# percentage grid is ever changed.
assert len(bert_scores) == len(random_scores) == len(p)

percs_labels = []
for method_scores in (bert_scores, random_scores):
    for perc, row in zip(p, method_scores):
        percs_labels.extend([perc] * len(row))

# Row-major flattening matches the order in which percs_labels was built;
# all BERT scores come first, then all random-baseline scores.
all_scores = np.concatenate((bert_scores.ravel(), random_scores.ravel()))
labels = ["bert"] * bert_scores.size + ["random"] * random_scores.size

# The three sequences become DataFrame columns and must line up.
assert len(labels) == len(percs_labels) == len(all_scores)
print(len(labels), len(percs_labels))

1940 1940


In [98]:
# Long-format table for plotting: one row per (summary size, document, method).
# Fractions are rescaled to percentages for the x axis.
source = pd.DataFrame(
    {
        'Summary Percentage': np.asarray(percs_labels) * 100,
        'scores': all_scores,
        'labels': labels,
    }
)
source

Unnamed: 0,Summary Percentage,scores,labels
0,10.0,0.708333,bert
1,10.0,0.583333,bert
2,10.0,0.714286,bert
3,10.0,0.928571,bert
4,10.0,0.500000,bert
...,...,...,...
1935,100.0,1.000000,random
1936,100.0,1.000000,random
1937,100.0,1.000000,random
1938,100.0,1.000000,random


In [124]:
# N-DCG score versus summary percentage: mean score per method as points,
# with standard-deviation error bars for each method layered underneath.

def _stdev_error_bars(method):
    """Return the error-bar chart (extent="stdev") for the rows of `source` labelled `method`."""
    method_rows = source[source.labels == method]
    return alt.Chart(method_rows).mark_errorbar(
        extent="stdev",
        ticks=True,
    ).encode(
        x="Summary Percentage",
        y=alt.Y("scores",title='N-DCG Score', scale=alt.Scale(domain=(0.4, 1))),
        color="labels:N",
    )


# Error bars, one chart per method.
bert_bars = _stdev_error_bars("bert")
rand_bars = _stdev_error_bars("random")

# Mean score per summary percentage and method.
points = alt.Chart(source).mark_point(
    size=50,
    filled=True,
).encode(
    x=alt.X('Summary Percentage', scale=alt.Scale(domain=(0, 100))),
    y=alt.Y('mean(scores):Q', scale=alt.Scale(domain=(0.4, 1))),
    color="labels:N",
)

# Layer the three charts; the last expression renders the figure.
bert_bars+rand_bars+points