In [2]:
############Bibliotheken###############
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize, sent_tokenize
import bs4 as BeautifulSoup
import urllib.request
from tensorflow.core.example import example_pb2
import struct
###########Code##############

# <s> and </s> are used in the data files to segment the abstracts into sentences. They don't receive vocab ids.

SENTENCE_START = '<s>'
SENTENCE_END = '</s>'
PAD_TOKEN = '[PAD]' # This has a vocab id, which is used to pad the encoder input, decoder input and target sequence
UNKNOWN_TOKEN = '[UNK]' # This has a vocab id, which is used to represent out-of-vocabulary words
START_DECODING = '[START]' # This has a vocab id, which is used at the start of every decoder input sequence
STOP_DECODING = '[STOP]' # This has a vocab id, which is used at the end of untruncated target sequences

def abstract2sents(abstract):
  """
  Splits abstract text from datafile into list of sentences.
  Args:
    abstract: string containing <s> and </s> tags for starts and ends of sentences
  Returns:
    sents: List of sentence strings (no tags)
  """
  cur = 0
  sents = []
  while True:
    try:
      start_p = str(abstract).index(SENTENCE_START, cur)
      end_p = str(abstract).index(SENTENCE_END, start_p + 1)
      cur = end_p + len(SENTENCE_END)
      sents.append(abstract[start_p+len(SENTENCE_START):end_p])
    except ValueError as e: # no more sentences
      return sents

def text_generator(example_generator):
    """
        Generates article and abstract text from tf.Example.
        Args:
        example_generator: a generator of tf.Examples from file. See data.example_generator
    """
    while True:
      e = next(example_generator) # e is a tf.Example
      try:
        article_text = e.features.feature['article'].bytes_list.value[0] # the article text was saved under the key 'article' in the data files
        abstract_text = e.features.feature['abstract'].bytes_list.value[0] # the abstract text was saved under the key 'abstract' in the data files
      except ValueError:
        tf.logging.error('Failed to get article or abstract from example')
        continue
      if len(article_text)==0: # See https://github.com/abisee/pointer-generator/issues/1
        tf.logging.warning('Found an example with empty article text. Skipping it.')
      else:
        yield (article_text, abstract_text)

def example_generator(data_path):
    reader = open(data_path, 'rb')
    while True:
        len_bytes = reader.read(8)
        if not len_bytes: break # finished reading this file
        str_len = struct.unpack('q', len_bytes)[0]
        example_str = struct.unpack('%ds' % str_len, reader.read(str_len))[0]  
        #print(example_str)
        yield example_pb2.Example.FromString(example_str)





In [3]:
import numpy as np
from rouge import Rouge
from sklearn.ensemble import RandomForestClassifier
from tqdm import tqdm

###################
x = text_generator(example_generator("2_abs.bin"))
(article, abstract) = next(x)
article = str(article)
abstract = str(abstract)
ArtAndAbs=article+abstract
###################
def create_universal_dict(x,fromTuple,untilTuple,q):
    article=[""]*untilTuple
    abstract=[""]*untilTuple
    x = text_generator(example_generator("2_abs.bin"))
    q=0
    dict_table=dict()
    for i in range(0,untilTuple):
        (article[i], abstract[i]) = next(x)
        article[i] = str(article[i])
        abstract[i] = str(abstract[i])
        ArtAndAbs=article[i]+abstract[i]
        dict_table,q = _create_dictionary_table(ArtAndAbs,dict_table,q)
    return dict_table,q,article,abstract,x



def _create_dictionary_table(text_string,dict_table,q) -> dict:
    stop_words = set(stopwords.words("english"))
    words = word_tokenize(text_string)
    for wd in words:
        if wd in stop_words:
            continue
        if wd in dict_table:
            continue
        else:
            dict_table[wd] = q
            q+=1
    return dict_table,q

def _create_frequenzy_table(sentence,dict_table,list_of_keys) -> dict:
    stop_words = set(stopwords.words("english"))
    words = word_tokenize(sentence)
    frequency_table=dict()
    for i in range(0,len(dict_table)):
        frequency_table[list_of_keys[i]]=0
        
    for wd in words:
        if wd in stop_words:
            continue
        if wd in frequency_table:
            frequency_table[wd] += 1
        else:
            #frequency_table[wd] = 1
            continue
    return list(frequency_table.values())



def prepare_X_Y(article,abstract,dict_table):
    sentencesArt=[]
    sentencesArt=sent_tokenize(article)
    sentencesAbs=[]
    sentencesAbs=sent_tokenize(abstract)
    ArtAndAbs=article+abstract
    r=Rouge()
    rscore=[]
    for c in range(0,len(sentencesArt)):
        rscore=rscore+r.get_scores(sentencesArt[c],abstract)
    X=np.arange(len(dict_table)*len(sentencesArt)).reshape(len(sentencesArt),len(dict_table))
    Y=[0]*len(sentencesArt)
    for i in range(0,len(sentencesArt)):
        X[i]=list(_create_frequenzy_table(sentencesArt[i],dict_table,list(dict_table.keys())))
    rscore=([r["rouge-1"]["r"] for r in rscore])
    for i in range(0,len(sentencesArt)):
        if rscore[i]>0.15:
            Y[i]=1
        else:
            Y[i]=0

    return X,Y,len(sentencesArt),

def train_dict_prepare_x_y_comb(fromTuple,untilTuple):
    q=0
    x = text_generator(example_generator("2_abs.bin"))
    dict_table,q,article,abstract,x=create_universal_dict(x,fromTuple,untilTuple,q)
    X,Y,length=[],[],[]
    
    for i in tqdm(range(fromTuple,untilTuple)):
        try:
            a,b,c=prepare_X_Y(article[i],abstract[i],dict_table)
            X.extend(a)
            Y.extend(b)
            length.append(c)
        except:
            continue
    return X,Y,length,article,abstract,dict_table,x

X,Y,length,article,abstract,dict_table,x=train_dict_prepare_x_y_comb(0,230)
R= RandomForestClassifier()
R.fit(X,Y)




100%|████████████████████████████████████████████████████████████████████████████████| 230/230 [00:40<00:00,  5.70it/s]


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [48]:
X,Y,c=prepare_X_Y(article[18],abstract[18],dict_table)
n=R.predict(X)
a=""
r=Rouge()
art=sent_tokenize(article[18])
for i in range(0,len(art)):
    if n[i]==1:
        a+=art[i]
print("RandomForest-Abstract")
print(a)
print()
print("CNN-Article")
print(article[18])

RandomForest-Abstract
b"-lrb- cnn -rrb- two years ago in a less turbulent time , mike premeau and kathy danke launched a small business , memories gourmet pizza co. , in their wisconsin town .but all that was threatened this week when people began mistaking them for memories pizza of walkerton , indiana , which made national headlines after its owners said they would refuse to cater a same-sex wedding .people posted angry comments on memories gourmet pizza 's facebook page and called its phone number to protest .and almost overnight , premeau and danke found themselves thrust unwillingly into a national debate over indiana 's controversial religious freedom restoration act .`` all of a sudden , our facebook page started getting flooded , '' premeau said .`` i do n't have much of a message , except that i hope your bigotry puts you out of business very quickly , '' a man said in a voice mail .indiana pizzeria finds itself at the center of ` religious freedom ' debate .premeau thinks the

In [40]:
(arti,abstra)=next(x)
arti=str(arti)
abstra=str(abstra)
artiSen=sent_tokenize(arti)
X=np.arange(len(dict_table)*len(artiSen)).reshape(len(artiSen),len(dict_table))
for i in range(0,len(artiSen)):
    X[i]=_create_frequenzy_table(artiSen[i],dict_table,list(dict_table.keys()))
a=""
n=R.predict(X)
print("ML-abstract")
for i in range(0,len(artiSen)):
    if n[i]==1:
        print(art[i])
        #print(r.get_scores(a,abstra))
            
print()
print("article")
print(arti)

ML-abstract

article
b"newport gwent dragons reached the european challenge cup semi-finals with a thrilling 25-21 victory over cardiff blues . tries from wing hallam amos , replacement nic cudd , a penalty try plus penalties from tom prydie and dorian jones and two prydie conversions brought the dragons back from a big deficit to victory . scrum-half lloyd williams , fly-half gareth anscombe and number eight josh navidi crossed for the blues , while anscombe added three conversions . dragons replacement nic cudd powers over for try against cardiff at rodney parade . the draw could not have set-up a more mouth-watering welsh clash than this encounter between the old rivals , with an away trip to either london irish or edinburgh the reward for the victors . cardiff held the upper hand historically , having won the challenge cup in 2010 when beating french outfit toulon 28-21 , while the furthest the dragons had gone in europe 's second tournament was the semi-final stage . the blues fie