In [4]:
import numpy as np
import re
import networkx as nx
import spacy
from summa import keywords
from summa.summarizer import summarize

# Laden des Spacy-Modells
import evaluate
import nltk
from nltk.tokenize import sent_tokenize
from transformers import AutoModelForSeq2SeqLM, PegasusForConditionalGeneration, PegasusTokenizer, AutoTokenizer
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
import math
from transformers import PreTrainedTokenizerFast
from tqdm import tqdm
import random
from transformers import pipeline
import torch

In [5]:

def reduce_repetitions(text):
    text = re.sub(r'\.{2,}', '.', text)
    text = re.sub(r'\!{2,}', '!', text)
    text = re.sub(r'\,{2,}', ',', text)
    text = re.sub(r'\;{2,}', ';', text)
    return text




def textrank_extractive(text, compression_rate=0.5,split='\. '):
    # Tokenisierung
    nlp = spacy.load("en_core_web_lg")
    #doc = nlp(text.replace("\n\n", " "))

    # Split the text at each ". " that is not followed by a single letter
    doc = re.split(fr'(?<!\b\w\w){split}', reduce_repetitions(re.sub(' +', ' ', text.replace("\n", " ").replace('-',' ').replace('_',' ').replace("\'", "").replace("!", ".").replace("?", ".").replace(";", ""))))
    sentences = [sent for sent in doc if len(sent.replace("-", " ").split()) > 2]

    # Speichern der Spacy-Dokumente der Sätze für spätere Verwendung
    sentence_docs = [nlp(sentence) for sentence in sentences]

    # Extrahiere Schlüsselsätze mit TextRank
    num_sentences = max(1, int(len(sentences) * compression_rate))
    extracted_sentences = summarize(text, words=num_sentences, split=True)

    # Erzeuge eine Matrix mit den Ähnlichkeiten zwischen den Sätzen
    similarity_matrix = np.zeros((len(sentences), len(sentences)))
    for i, doc_i in enumerate(sentence_docs):
        for j, doc_j in enumerate(sentence_docs):
            similarity = similarity_function(doc_i, doc_j)
            similarity_matrix[i, j] = similarity
    
    # Konstruiere einen Graphen mit den Sätzen als Knoten und den Ähnlichkeiten als Kanten
    graph = nx.from_numpy_array(similarity_matrix)

    # Berechne den TextRank-Score für jeden Satz
    scores = nx.pagerank_numpy(graph)

    # Wähle die besten Sätze basierend auf ihren TextRank-Scores aus
    top_sentences = sorted(scores, key=scores.get, reverse=True)[:num_sentences]

    # Sortiere die ausgewählten Sätze nach ihrer Position im Text
    top_sentences = sorted(top_sentences)

    # Gebe die extrahierten Schlüsselsätze zurück
    extracted_sentences = [sentences[index] for index in top_sentences]
    return extracted_sentences


def similarity_function(doc1, doc2):
    # Berechne die Cosinus-Ähnlichkeit zwischen den beiden Dokumenten
    similarity = doc1.similarity(doc2)
    return similarity




def compression_ratio(text, summary):
    # Berechne das Verhältnis der Anzahl der Wörter in der Zusammenfassung zur Anzahl der Wörter im Ausgangstext
    num_words_text = len(text.split())
    num_words_summary = len(summary.split())
    ratio = num_words_summary / num_words_text
    return ratio



def compression(text, compression_rate,split='\. '):
    max_iterations = 20
    iterations = 0
    #compression_rate -= 0.05
    
    extracted = textrank_extractive(text, compression_rate,split)
    summary = '. '.join(extracted)
    compression_rate_renwed = compression_rate


    while compression_ratio(text, summary) < compression_rate and iterations < max_iterations:
        iterations += 1
        compression_rate_renwed += 0.05
        if compression_rate_renwed > 1:
            compression_rate_renwed = 1
        extracted = textrank_extractive(text, compression_rate=compression_rate_renwed)
        summary = '. '.join(extracted)
    return summary




In [185]:
def token_count(text):
    tokens = text.split()
    return len(tokens)


def adjust_length(text):
    length = token_count(text)
    if length <20:
        min_length = length + int(length * 0.05)
        max_length = min_length +min_length
    elif length <50:
        min_length = length + int(length * 0.05)
        max_length = min_length +min_length* 0.5
    elif length <60:
        min_length = length + int(length * 0.05)
        max_length = min_length +min_length* 0.4
    elif length < 80:
        min_length = length + int(length * 0.05)
        max_length = min_length + min_length* 0.25
    elif length < 100:
        min_length = length + int(length * 0.3)
        max_length = min_length + 100
    else:
        min_length = math.ceil(length / 50) * 70
        max_length = min_length + 100
    return min_length, max_length

In [176]:
def batch_sent(sentenc,splitt=180,split='\. '):
    
    sentences = re.split(fr'(?<!\b\w\w){split}', sentenc.lower())

    # Erstellen Sie Batches von Sätzen, die weniger als 1024 Tokens enthalten
    batches = []
    batch = []
    batch_len = 0
    for sentence in sentences:
        
        sentence_len = len(tokenizer.tokenize(sentence))
        if sentence_len + batch_len > splitt:
            if sentence_len < splitt:  # überspringen Sie Sätze, die länger als 256 Tokens sind
                batches.append(batch)
                batch = [sentence]
                batch_len = sentence_len
            # wenn ein Satz alleine 1024 Tokens überschreitet, wird er übersprungen
        else:
            batch.append(sentence)
            batch_len += sentence_len
    batches.append(batch)
    return batches

In [177]:
def text_rank_algo(dictionary,komp='compression',split='\. ',random_T=True,column='text'):
   
    text = dictionary[column].replace("\n", " ")
    if random_T:
        random_value = dictionary[komp]  # Zufälliger Wert zwischen 0.2 und 0.8 auf zwei Stellen nach dem Komma begrenzt
    else:
        if dictionary['reduction_multiplier']<0.8:
            random_value=dictionary['desired_compression_rate']
        elif dictionary['reduction_multiplier']<0.9:
            random_value=dictionary['reduction_multiplier']
        else:
            random_value=1
    text_rank_text= compression(text.replace("\n\n", " "),random_value,split)
    compression_ratio_value=compression_ratio(text, compression(text,random_value,split))
    text=re.sub(' +', ' ', text.replace("\n", " ").replace('-',' ').replace('_',' ').replace("\'", "").replace("!", ".").replace("?", ".").replace(";", ""))
    text_rank_text=re.sub(' +', ' ', text_rank_text.replace("\n", " ").replace('-',' ').replace('_',' ').replace("\'", "").replace("!", ".").replace("?", ".").replace(";", ""))
    if random_T:
        dictionary['text'] = text
        dictionary['text_rank_text'] = text_rank_text
        dictionary['tokens_gesamt'] = len(text.split(' '))
        dictionary['token_text_rank'] = len(text_rank_text.split(' '))
        dictionary['desired_compression_rate'] = random_value
        dictionary['text_rank_compression_rate'] = compression_ratio_value
    else:
        dictionary['text_rank_text_2'] = text_rank_text
        dictionary['tokens_gesamt_2'] = len(text.split(' '))
        dictionary['token_text_rank_2'] = len(text_rank_text.split(' '))
        dictionary['desired_compression_rate_2'] = random_value
        dictionary['text_rank_compression_rate_2'] = compression_ratio_value
                
    return dictionary

In [178]:
def check_class_and_get_model_name(input_dict, class_key):
    class_value = input_dict.get(class_key)

    if class_value is None:
        raise ValueError(f"'{class_key}' nicht im Eingabedictionary gefunden")

    if class_value == 'Scientific':
        model_name = 'NICFRU/bart-base-paraphrasing-science'
    elif class_value == 'news':
        model_name = 'NICFRU/bart-base-paraphrasing-news'
    elif class_value == 'story':
        model_name = 'NICFRU/bart-base-paraphrasing-story'
    elif class_value == 'reviews':
        model_name = 'NICFRU/bart-base-paraphrasing-review'
    else:
        return False
    
    return model_name

In [179]:
def create_model(dictionary):
    model_name=check_class_and_get_model_name(dictionary, 'classification')
    global summarizer
    global tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    summarizer = pipeline("text2text-generation", model=model_name)
    

In [180]:
def paraphrase_of_text(dictionary,text_name='text',komp_name='reduction_multiplier',split='\. '):
    text_gesamt_list=[]
    batch_text_list=[]
    
    output_text_list=[]
    text=dictionary[text_name]
    komp = dictionary[komp_name]
    for batch in tqdm(batch_sent(text,split=split), desc='Verarbeite Batches'):
            # Zusammenfügen der Sätze in einem Batch
            #print(batch)
            if len(batch):
                #print('Läuft')
                batch_text = '. '.join(batch)
                batch_text += "."
                batch_text_list.append(batch_text)
                min_length_test, max_length_test = adjust_length(batch_text)
                ext_summary=summarizer(batch_text, max_length=int(round(max_length_test*komp,0)), min_length=int(round(min_length_test*komp,0)),length_penalty=100,num_beams=2)
            # Erstellen Sie einen read_csv für die aktuellen Ergebnisse

            text_gesamt_list.append(ext_summary[0]['generated_text'])
         
    text_gesamt = '. '.join(text_gesamt_list)
    actual_compression_rate = len(text_gesamt.split(' '))/len(text.split(' '))*100
    dictionary['Zusammenfassung'] = text_gesamt
    dictionary['Endgueltige_Kompressionsrate'] = actual_compression_rate
    dictionary['länge Zusammenfassung'] = len(text_gesamt.split(' '))
    dictionary['länge Ausgangstext'] = len(text.split(' '))
    dictionary['batch_texts'] = batch_text_list
    dictionary['batch_output'] = text_gesamt_list
    return dictionary
    

In [168]:
def calculate_compression(input_dict, total_tokens_col, current_tokens_col, desired_compression_rate):
    input_dict['current_compression_rate'] = input_dict[current_tokens_col] / input_dict[total_tokens_col]
    input_dict['compression_difference'] = input_dict[desired_compression_rate] - input_dict['current_compression_rate']
    input_dict['reduction_multiplier'] = input_dict[desired_compression_rate] / input_dict['current_compression_rate']
    return input_dict

In [169]:
def execute_text_gen(dictionary,split='\. ',seed=10):
    dictionary_copy=text_rank_algo(dictionary,split=split)
    dictionary_copy=calculate_compression(dictionary_copy, 'tokens_gesamt', 'token_text_rank', 'desired_compression_rate')
    create_model(dictionary_copy)
    dictionary_copy=paraphrase_of_text(dictionary_copy,text_name='text_rank_text',split=split)
    dictionary_copy['ent_com_rate']=dictionary_copy['länge Zusammenfassung'] / dictionary_copy['tokens_gesamt']
    dictionary_copy=calculate_compression(dictionary_copy, 'tokens_gesamt', 'länge Zusammenfassung', 'desired_compression_rate')
    dictionary_copy=text_rank_algo(dictionary_copy,random_T=False,column = 'Zusammenfassung')
    dictionary_copy['ent_com_rate']=dictionary_copy['länge Zusammenfassung'] / dictionary_copy['tokens_gesamt']
    return dictionary_copy

In [170]:
df_test=pd.read_csv('data/data_test.csv')

In [181]:
test_dict=df_test[df_test.classification=='news'].reset_index(drop=True)[['classification','text']].reset_index(drop=True)[['classification', 'text']][0:1].to_dict('records')[0]


In [182]:
test_dict['compression'] = 0.54

In [183]:
test_dict

{'classification': 'news',
 'text': 'RJD Chief Lalu Prasad Yadav on Wednesday said that Bihar Chief Minister Nitish Kumar has not asked for his son Tejashwi Yadav\'s resignation after the CBI raided him over corruption allegations. "We will not tolerate any disrespect towards him (Nitish Kumar). We have formed the grand alliance, made Nitish CM. Why will we break the alliance," Lalu said.',
 'compression': 0.54}

In [184]:
test=execute_text_gen(test_dict,split='\. ',seed=10)
test

  scores = nx.pagerank_numpy(graph)
NetworkX version 3.0.
  M = google_matrix(
Verarbeite Batches: 100%|██████████| 1/1 [00:01<00:00,  1.25s/it]


{'classification': 'news',
 'text': 'RJD Chief Lalu Prasad Yadav on Wednesday said that Bihar Chief Minister Nitish Kumar has not asked for his son Tejashwi Yadavs resignation after the CBI raided him over corruption allegations. "We will not tolerate any disrespect towards him (Nitish Kumar). We have formed the grand alliance, made Nitish CM. Why will we break the alliance," Lalu said.',
 'compression': 0.54,
 'text_rank_text': 'RJD Chief Lalu Prasad Yadav on Wednesday said that Bihar Chief Minister Nitish Kumar has not asked for his son Tejashwi Yadavs resignation after the CBI raided him over corruption allegations. "We will not tolerate any disrespect towards him (Nitish Kumar). We have formed the grand alliance, made Nitish CM. Why will we break the alliance," Lalu said.',
 'tokens_gesamt': 58,
 'token_text_rank': 58,
 'desired_compression_rate': 0.54,
 'text_rank_compression_rate': 1.0,
 'current_compression_rate': 0.3793103448275862,
 'compression_difference': 0.1606896551724138