In [11]:
import numpy as np
import pandas as pd
import json
from transformers import AutoModel, AutoTokenizer,BertModel
from summ import AnnotatedExtractiveSummarizer
import os
import altair as alt

In [12]:
# Directory whose entries are handed, one by one, to the summarizer as JSON paths.
jsons_dir = 'Labeled'
# os.listdir returns entries in arbitrary, platform-dependent order; sort them so
# the per-document columns of the score matrices built below are reproducible.
labeled = sorted(os.listdir(jsons_dir))

In [13]:
# Tokenizer and encoder are loaded from the same pretrained checkpoint.
MODEL_NAME = "dbmdz/bert-base-turkish-cased"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# output_hidden_states=True makes the model return every layer's hidden states,
# not only the last one (the summarizer below is given specific layer indices).
model = BertModel.from_pretrained(MODEL_NAME, output_hidden_states=True)

In [14]:
# Fixed-length sweep: for each summary length k in 1..20, summarize every
# labeled document and record its evaluation score.
# lens[a][b] is the score of document b at the a-th summary length.
lens = []
for k in range(1, 21):
    print(f'Summary Length {k}')
    scores = []
    for file_name in labeled:
        jpath = f'{jsons_dir}/{file_name}'
        # A fresh summarizer is built for every document, as in the original run.
        summarizer = AnnotatedExtractiveSummarizer(
            tokenizer,
            model,
            k=k,
            layers=[2, 3, 8],
            random_state=15,
            doEval=True,
        )
        # The summary text itself is not needed here; summarize() is called
        # before score() exactly as the original cell did.
        summarizer.summarize(jpath)
        scores.append(summarizer.score())
    lens.append(scores)

Summary Length 1
Summary Length 2
Summary Length 3
Summary Length 4
Summary Length 5
Summary Length 6
Summary Length 7
Summary Length 8
Summary Length 9
Summary Length 10
Summary Length 11
Summary Length 12
Summary Length 13
Summary Length 14
Summary Length 15
Summary Length 16
Summary Length 17
Summary Length 18
Summary Length 19
Summary Length 20


In [15]:
# Stack the per-k score lists into a single 2-D array for vectorised statistics.
all_scores_k = np.asarray(lens)

In [16]:
# Sanity check: one row per summary length k, one column per labeled document.
all_scores_k.shape

(20, 97)

In [17]:
# Collapse the document axis (axis 1): mean score and standard deviation per k.
n_lengths = all_scores_k.shape[0]
source_k = pd.DataFrame({
    'k': list(range(1, n_lengths + 1)),
    'y': all_scores_k.mean(axis=1),
    'yerr': all_scores_k.std(axis=1),
})

In [18]:
# Shared base chart: derive both ends of the error bar (y - yerr, y + yerr)
# so the layers below can reference them as ymin / ymax.
base = alt.Chart(source_k).transform_calculate(
    ymin="datum.y-datum.yerr",
    ymax="datum.y+datum.yerr",
)

# Layer 1: one filled blue dot per summary length, on fixed axis ranges.
points = base.mark_point(filled=True, size=50, color='blue').encode(
    x=alt.X('k', scale=alt.Scale(domain=(0, 21))),
    y=alt.Y('y', scale=alt.Scale(domain=(0.2, 1))),
)

# Layer 2: error bars spanning ymin..ymax; this layer carries the axis titles.
errorbars = base.mark_errorbar().encode(
    x=alt.X("k", title='# of Summary Sentences'),
    y=alt.Y("ymin:Q", title='N-DCG Score'),
    y2="ymax:Q",
)

# Overlay the two layers; as the last expression of the cell it is rendered.
alt.layer(points, errorbars)

### Score vs. Summary Length as a Percentage of the Document

In [27]:
# Percentage sweep: summarize every labeled document to a fraction of its own
# sentence count and record the evaluation score.
percs = []      # percs[a][b]: score of document b at fraction p[a] (NaN on failure)
unsuccess = []  # paths of documents whose summarization raised an exception
p = [0.05, .1, .15, .2, .25, .30, .35, .40, .45, .5,
     .55, .6, .65, .7, .75, .8, .85, .9, .95, 1.0]
for perc in p:
    # Format with one decimal so float noise (0.55 * 100 == 55.00000000000001)
    # does not leak into the progress log.
    print(f'Summarizing {perc * 100:.1f} %')
    scores = []
    for path in labeled:
        jpath = f'{jsons_dir}/{path}'

        # The with-block closes the file on exit; no explicit close() is needed.
        with open(jpath, 'rb') as f:
            j = json.load(f)

        len_text = len(j['sentences'])

        # Number of sentences to keep: floor(len_text * perc). The tiny epsilon
        # guards against products that land just below an integer in floating
        # point (e.g. 0.7 * 90 == 62.99999999999999), which int() would
        # otherwise truncate to one sentence too few.
        k = int(len_text * perc + 1e-9)

        try:
            summarizer = AnnotatedExtractiveSummarizer(
                tokenizer, model, k=k, layers=[2, 3, 8], random_state=15, doEval=True
            )
            summarizer.summarize(jpath)
            scores.append(summarizer.score())
        except Exception:
            # Best effort: keep the sweep running, mark this document as missing
            # and remember its path. Catching Exception (not a bare except)
            # lets KeyboardInterrupt / SystemExit still stop the cell.
            scores.append(np.nan)
            unsuccess.append(jpath)

    percs.append(scores)
    
    
    
    
    

Summarizing 5.0 %
Summarizing 10.0 %
Summarizing 15.0 %
Summarizing 20.0 %
Summarizing 25.0 %
Summarizing 30.0 %
Summarizing 35.0 %
Summarizing 40.0 %
Summarizing 45.0 %
Summarizing 50.0 %
Summarizing 55.00000000000001 %
Summarizing 60.0 %
Summarizing 65.0 %
Summarizing 70.0 %
Summarizing 75.0 %
Summarizing 80.0 %
Summarizing 85.0 %
Summarizing 90.0 %
Summarizing 95.0 %


  return self.fit(X, sample_weight=sample_weight)._transform(X)


Summarizing 100.0 %


  return self.fit(X, sample_weight=sample_weight)._transform(X)
  return self.fit(X, sample_weight=sample_weight)._transform(X)


In [28]:
# Paths whose summarization raised during the percentage sweep; an empty list
# means every document was scored at every percentage.
unsuccess

[]

In [29]:
# Stack the per-percentage score lists into one 2-D array (rows follow p).
all_scores = np.asarray(percs)

In [33]:
# Mean score and standard deviation per summary percentage, taken across
# documents (axis 1). The sweep records np.nan for documents that failed, and a
# single NaN would turn a plain mean/std of that row into NaN, so NaN-aware
# reductions are used; with no NaNs present they give the same values.
source = pd.DataFrame({
    'Summary Percentage': 100 * np.array(p),
    'y': np.nanmean(all_scores, axis=1),
    'yerr': np.nanstd(all_scores, axis=1),
})

In [36]:
# Shared base chart: derive both ends of the error bar (y - yerr, y + yerr)
# so the layers below can reference them as ymin / ymax.
base = alt.Chart(source).transform_calculate(
    ymin="datum.y-datum.yerr",
    ymax="datum.y+datum.yerr",
)

# Layer 1: one filled blue dot per summary percentage, on fixed axis ranges.
points = base.mark_point(filled=True, size=50, color='blue').encode(
    x=alt.X('Summary Percentage', scale=alt.Scale(domain=(0, 100))),
    y=alt.Y('y', scale=alt.Scale(domain=(0.2, 1))),
)

# Layer 2: error bars spanning ymin..ymax; this layer carries the y-axis title.
errorbars = base.mark_errorbar().encode(
    x="Summary Percentage",
    y=alt.Y("ymin:Q", title='N-DCG Score'),
    y2="ymax:Q",
)

# Overlay the two layers; as the last expression of the cell it is rendered.
alt.layer(points, errorbars)