In [1]:
import transformers
from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer
import nltk
import gradio as gr


# Pfad zur Wikipedia-Textdatei
dateipfad = r'..\task1\Case Dataset\Data files\NER_text_Wikipedia_crawl.txt'

# Datei einlesen
with open(dateipfad, 'r', encoding='utf-8') as file:
    text = file.read()

# Text in Sätze aufteilen
sentences = nltk.tokenize.sent_tokenize(text)

# Farbcodes für Entitätstypen
entity_colors = {
    "PER": ("yellow", "black"),       # Person
    "LOC": ("green", "white"),        # Ort
    "ORG": ("blue", "white"),         # Organisation
    "MISC": ("gray", "white"),        # Miscellaneous (Nationalitäten, Religionen, Produkte oder Ereignisse)
    "DATE": ("orange", "black"),      # Datum
    "TIME": ("purple", "white"),      # Zeit
    "MONEY": ("red", "white"),        # Geld
    "PERCENT": ("lightblue", "black"),# Prozent
    "QUANTITY": ("pink", "black"),    # Menge
    "LAW": ("lightgreen", "black"),   # Rechtliche Hinweise
    "LANGUAGE": ("beige", "black")    # Sprache
}

def get_entity_style(entity_type):
    return entity_colors.get(entity_type, ("white", "black"))

# NER-Modelle (https://huggingface.co/models?pipeline_tag=token-classification&language=en&sort=downloads&search=ner)
models = {
    "BERT Large (CoNLL-03 English)": "dbmdz/bert-large-cased-finetuned-conll03-english",
    "Wikineural Multilingual NER (Babelscape)": "Babelscape/wikineural-multilingual-ner",
    "BERT Large NER (dslim)": "dslim/bert-large-NER"
}

def load_model(model_name):
    if model_name in ["Babelscape/wikineural-multilingual-ner", "dslim/bert-large-NER"]:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForTokenClassification.from_pretrained(model_name)
        return pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
    else:
        return pipeline("ner", model=model_name)

def ner_predict(sentence, model_name):
    ner_model = load_model(models[model_name])
    ner_results = ner_model(sentence)
    highlighted_text = sentence
    offset = 0

    # Text markieren
    for entity in ner_results:
        entity_type = entity.get('entity_group', entity.get('entity'))
        start = entity['start'] + offset
        end = entity['end'] + offset
        entity_text = highlighted_text[start:end]
        bg_color, text_color = get_entity_style(entity_type)
        highlighted_text = (highlighted_text[:start] +
                            f"<mark style='background-color: {bg_color}; color: {text_color}'>{entity_text}</mark>" +
                            highlighted_text[end:])
        offset += len(f"<mark style='background-color: {bg_color}; color: {text_color}'>{entity_text}</mark>") - len(entity_text)

    return highlighted_text + legend_html

legend_html = """
<div style='margin-top: 20px;'>
    <strong>Legende:</strong><br>
    <mark style='background-color: yellow; color: black;'>Person</mark>
    <mark style='background-color: green; color: white;'>Ort</mark>
    <mark style='background-color: blue; color: white;'>Organisation</mark>
    <mark style='background-color: gray; color: white;'>Miscellaneous</mark>
    <mark style='background-color: orange; color: black;'>Datum</mark>
    <mark style='background-color: purple; color: white;'>Zeit</mark>
    <mark style='background-color: red; color: white;'>Geld</mark>
    <mark style='background-color: lightblue; color: black;'>Prozent</mark>
    <mark style='background-color: pink; color: black;'>Menge</mark>
    <mark style='background-color: lightgreen; color: black;'>Rechtliche Hinweise</mark>
    <mark style='background-color: beige; color: black;'>Sprache</mark>
</div>
"""

# Gradio Interface
demo = gr.Interface(
    fn=ner_predict,
    inputs=[
        gr.Dropdown(choices=sentences, label="Wähle einen Satz"),
        gr.Dropdown(choices=list(models.keys()), label="Wähle ein NER-Modell")
    ],
    outputs=gr.HTML(label="NER-Ergebnisse"),
    title="Named Entity Recognition (NER)",
    description="Wähle einen Satz und ein Modell aus den Dropdown-Listen, um die benannten Entitäten hervorzuheben."
)

# Starte die Gradio-App
demo.launch()

Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.




In [2]:
import pandas as pd

# Pfad zur CSV-Datei
dateipfad = r'..\task1\Case Dataset\Data files\Translation_Training.csv'

# CSV-Datei laden
translation_data = pd.read_csv(dateipfad, delimiter=';')

# Anzeigen der ersten Zeilen
translation_data.head()

Unnamed: 0,id,split,en_US,de_DE,es_ES,fr_FR,it_IT
0,1847,train,order me a cheese burger from tommy's burgers,bestell mir einen cheeseburger von tommy's bur...,pídeme una hamburguesa de queso del mcdonalds,commande moi un burger au fromage chez tommy's...,ordinami un cheese burger da america graffiti
1,876,train,play kari jobe for me,spiel kari jobe für mich,pon melendi para mi,mets jacques brel ne me quitte pas,metti laura pausini per me
2,14494,train,what is i. b. m.'s stock worth,was ist i. b. m.'s aktie wert,cuál es el valor de las acciones del ibm,quelle est la valeur des actions d'i. b. m.,qual è il valore delle azioni generali
3,14366,train,will it be good to buy nike stock today,wäre es gut heute volkswagen aktien zu kaufen,será bueno comprar acciones de nike hoy dia,sera-t-il bon d'acheter des actions nike aujou...,oggi è un buon giorno per comprare le azioni d...
4,1977,train,please remove the alarm which i set for today ...,bitte lösche den wecker den ich für heute früh...,por favor borrar la alarma que tenía activada ...,veuillez retirer l'alarme que j'ai réglée pour...,rimuovi la sveglia impostata per questa mattina


In [2]:
#!pip install langdetect #https://www.edenai.co/post/top-free-language-detection-tools-apis-and-open-source-models

In [3]:
#!pip install sentencepiece

In [3]:
import gradio as gr
from transformers import pipeline
from langdetect import detect
import itertools
from random import choice

In [4]:
# Liste der Sprachen gemäß .csv-Datei
languages = ['en', 'de', 'es', 'fr', 'it']
models = ['Helsinki', 'Facebook']
language_pairs = list(itertools.permutations(languages, 2))

translators = {}

# Übersetzungsmodell laden, basierend auf Eingaben für Quellsprache (src), Zielsprache (dest) und Modell (model_choice).
def load_model(src, dest, model_choice):
    if model_choice == 'Helsinki':
        model_name = f'Helsinki-NLP/opus-mt-{src}-{dest}'
        translator = pipeline('translation', model=model_name)
    elif model_choice == 'Facebook':
        model_name = 'facebook/m2m100_418M'
        translator = pipeline('translation', model=model_name, 
                              tokenizer=model_name, src_lang=src, tgt_lang=dest)
    else:
        return None, f"Kein Modell verfügbar für die Kombination {src}-{dest} mit {model_choice}"

    print(f"Modell geladen: {model_name}")
    return translator, None

#Verwendung des ausgewählten Modells zu übersetzen
def translate(text, src, dest, model_choice):
    model_key = (src, dest, model_choice)
    if model_key not in translators:
        translator, error = load_model(src, dest, model_choice)
        if error:
            return error
        translators[model_key] = translator

    try:
        translation = translators[model_key](text)[0]['translation_text']
        return translation
    except Exception as e:
        return f"Fehler bei der Übersetzung: {e}"

# Gradio Interface
demo = gr.Interface(
    fn=lambda text, dest, model_choice: translate(text, detect(text), dest, model_choice),
    inputs=[
        gr.Textbox(lines=2, placeholder="Hier Text eingeben..."), 
        gr.Dropdown(choices=languages, label="Zielsprache auswählen"),
        gr.Dropdown(choices=models, label="Modell auswählen")
    ],
    outputs="text"
)

demo.launch()

Running on local URL:  http://127.0.0.1:7861

To create a public link, set `share=True` in `launch()`.




Modell geladen: Helsinki-NLP/opus-mt-en-de
Modell geladen: facebook/m2m100_418M


In [1]:
import pandas as pd

# Pfad zur CSV-Datei
dateipfad = r'..\task1\Case Dataset\Data files\Summarization_Training.csv'

# CSV-Datei laden
summarization_data = pd.read_csv(dateipfad, delimiter=';')

# Anzeigen der ersten Zeilen
summarization_data.head()

Unnamed: 0,article,highlights
0,"By . Anthony Bond . PUBLISHED: . 07:03 EST, 2 ...",John and .\nAudrey Cook were discovered alongs...
1,UNITED NATIONS (CNN) -- A rare meeting of U.N....,NEW: Libya can serve as example of cooperation...
2,Cover-up: Former Archbishop Lord Hope allowed ...,Very Reverend Robert Waddington sexually abuse...
3,"By . Kristie Lau . PUBLISHED: . 10:48 EST, 14 ...",Monday night's episode showed Buddy Valastro t...
4,'The lamps are going out all over Europe. We s...,People asked to turn out lights for hour betwe...


In [2]:
import gradio as gr
from transformers import pipeline

# Datei laden
file_path = r'..\task1\Case Dataset\Data files\Summarization_Training.csv'
data = pd.read_csv(file_path, delimiter=';')

# Artikeltexte extrahieren
articles = data['article'].tolist()

# Modelle
model_options = {
    "BART Large CNN": "facebook/bart-large-cnn",
    "PEGASUS Newsroom": "google/pegasus-newsroom",
    "BART Large CNN SAMSum": "philschmid/bart-large-cnn-samsum"
}

# Initialisierung des Modells
def load_model(model_choice):
    try:
        model_name = model_options[model_choice]
        summarizer = pipeline("summarization", model=model_name)
        print(f"Modell geladen: {model_name}")
        return summarizer
    except Exception as e:
        return None, str(e)


# Zusammenfassung ausführen
def summarize_article(article_index, model_choice):
    result = load_model(model_choice)
    if isinstance(result, tuple): 
        summarizer, error = result
        return error

    summarizer = result

    # Artikeltext holen
    article_text = articles[int(article_index)]
    
    # Zusammenfassung erstellen
    summary = summarizer(article_text, max_length=130, min_length=30, do_sample=False)
    return summary[0]['summary_text']
    
# Dropdown zur Auswahl eines Artikels
indices = [str(i) for i in range(len(articles))]

# Gradio-Benutzeroberfläche einrichten
demo = gr.Interface(
    fn=summarize_article,
    inputs=[
        gr.Dropdown(choices=indices, label="Wähle einen Artikel anhand des Indexes"),
        gr.Dropdown(choices=list(model_options.keys()), label="Wähle ein Modell")
    ],
    outputs=[gr.Textbox(label="Zusammengefasster Artikel")]
)

demo.launch()

Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.




Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-newsroom and are newly initialized: ['model.encoder.embed_positions.weight', 'model.decoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Token indices sequence length is longer than the specified maximum sequence length for this model (673 > 512). Running this sequence through the model will result in indexing errors


Modell geladen: google/pegasus-newsroom


Traceback (most recent call last):
  File "C:\Users\papad\anaconda3\envs\datascience\lib\site-packages\gradio\queueing.py", line 527, in process_events
    response = await route_utils.call_process_api(
  File "C:\Users\papad\anaconda3\envs\datascience\lib\site-packages\gradio\route_utils.py", line 270, in call_process_api
    output = await app.get_blocks().process_api(
  File "C:\Users\papad\anaconda3\envs\datascience\lib\site-packages\gradio\blocks.py", line 1887, in process_api
    result = await self.call_function(
  File "C:\Users\papad\anaconda3\envs\datascience\lib\site-packages\gradio\blocks.py", line 1472, in call_function
    prediction = await anyio.to_thread.run_sync(
  File "C:\Users\papad\anaconda3\envs\datascience\lib\site-packages\anyio\to_thread.py", line 28, in run_sync
    return await get_asynclib().run_sync_in_worker_thread(func, *args, cancellable=cancellable,
  File "C:\Users\papad\anaconda3\envs\datascience\lib\site-packages\anyio\_backends\_asyncio.py", line 8

Modell geladen: philschmid/bart-large-cnn-samsum
