In [1]:
# Imports
import pandas as pd
import gradio as gr
from transformers import pipeline, AutoTokenizer

In [2]:
# Data for Named Entity Recognition
# The crawl contains non-ASCII text (e.g. Ancient Greek), so decode it
# explicitly as UTF-8 instead of relying on the platform's default encoding.
with open('Case Dataset/Data files/NER_text_Wikipedia_crawl.txt', 'r', encoding='utf-8') as file:
    named_entity_recognition_data = file.read()

In [3]:
import re

def map_sentences_to_descriptions(file):
    """Split a text into sentences and label each one with a short description.

    Parameters
    ----------
    file : str
        The full text to split (despite the name, this is the text content,
        not a file handle or a path).

    Returns
    -------
    dict
        Mapping of description -> stripped sentence, in sentence order.
        The n-th sentence gets the n-th entry of the built-in description
        list; sentences beyond the end of the list reuse the last entry.
        A description that is needed more than once is suffixed with its
        occurrence number (e.g. "Hosting (2)"), so that later sentences do
        not overwrite earlier ones under the same key.
    """
    # Basic sentence splitting using regular expressions: split at a
    # whitespace character that directly follows '.' or '?', unless the
    # preceding text matches word-char + '.' + word-char + any-char, or
    # capital letter + lowercase letter + '.'.
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', file)

    # Provide a descriptive word for each sentence
    descriptions = [
        "Introduction", "Importance", "Frequency", "Inspiration", "Founding",
        "Governance", "Evolution", "Adjustments", "Endorsements", "Adaptation",
        "Changes", "Media", "Cancellations", "Components", "Responsibilities",
        "Programme", "Symbols", "Participation", "Awards", "Growth",
        "Challenges", "Exposure", "Showcase", "Ancient", "Legend",
        "Myth", "Tradition", "Religious", "Decline", "Revival", "Modern",
        "Forerunners", "National", "Festival", "Event", "Reconstruction",
        "Historic", "Legacy", "Promotion", "Development", "Re-establishment",
        "Success", "Athens", "Paris", "Stagnation", "Survival", "Rebounding",
        "Popularity", "Winter", "Figure Skating", "Ice Hockey", "Expansion",
        "Congress", "Host", "International", "Agreement", "Official", "Youth",
        "Games", "Opportunities", "Nonprofit", "Effect", "Displacement",
        "Business", "Infrastructure", "Sponsorship", "Revenue", "Expenditures",
        "Financial", "Profit", "Exclusivity", "Marketing", "Broadcasting",
        "Audience", "Television", "Viewership", "Commercialisation",
        "Economic", "Investment", "International", "Symbolism", "Flag",
        "Motto", "Flame", "Mascot", "Ceremony", "Parade", "Athletes",
        "Hosts", "Medals", "Victory", "Events", "Governing", "Demonstration",
        "Recognized", "Professionalism", "Controversy", "Amateurism",
        "Participation", "Boycotts", "Politics", "Protest", "Doping", "Scandal",
        "Testing", "Banning", "Citizenship", "Medallists", "Athletes",
        "Nations", "Hosting"
    ]

    last_index = len(descriptions) - 1
    occurrences = {}
    sentence_description_map = {}
    for position, sentence in enumerate(sentences):
        # Sentences past the end of the list reuse the last description.
        base = descriptions[min(position, last_index)]
        occurrences[base] = occurrences.get(base, 0) + 1
        # Keep the first occurrence's key untouched; number the repeats so
        # every sentence stays reachable under its own key.
        if occurrences[base] == 1:
            key = base
        else:
            key = f"{base} ({occurrences[base]})"
        sentence_description_map[key] = sentence.strip()

    return sentence_description_map

# The same name holds the raw text before this cell and the mapping after it.
# Only convert while it is still the raw string, so re-running this cell does
# not feed the already-built dict back into the sentence splitter.
if isinstance(named_entity_recognition_data, str):
    named_entity_recognition_data = map_sentences_to_descriptions(named_entity_recognition_data)

In [4]:
# Display the description -> sentence mapping for inspection.
named_entity_recognition_data

{'Introduction': 'The modern Olympic Games or Olympics (French: Jeux olympiques)[a][1] are the leading international sporting events featuring summer and winter sports competitions in which thousands of athletes from around the world participate in a variety of competitions.',
 'Importance': "The Olympic Games are considered the world's foremost sports competition with more than 200 teams, representing sovereign states and territories participating; by default the Games substitute for any World Championships the year in which they take place (however, each class maintains their own records).[2] The Olympic Games are normally held every four years, and since 1994, have alternated between the Summer and Winter Olympics every two years during the four-year period.",
 'Frequency': 'Their creation was inspired by the ancient Olympic Games (Ancient Greek: Ὀλυμπιακοί Ἀγῶνες), held in Olympia, Greece from the 8th century BC to the 4th century AD.',
 'Inspiration': 'Baron Pierre de Coubertin fo

In [5]:
# Data for Translation
# The training file is a semicolon-separated CSV.
translation_data = pd.read_csv(
    filepath_or_buffer='Case Dataset/Data files/Translation_Training.csv',
    sep=';',
)

In [6]:
# Reduce locale-style column names (e.g. en_US) to the part before the first underscore (en)
translation_data.columns = [name.partition('_')[0] for name in translation_data.columns]

In [7]:
# Display the translation DataFrame after the column rename.
translation_data

Unnamed: 0,id,split,en,de,es,fr,it
0,1847,train,order me a cheese burger from tommy's burgers,bestell mir einen cheeseburger von tommy's bur...,pídeme una hamburguesa de queso del mcdonalds,commande moi un burger au fromage chez tommy's...,ordinami un cheese burger da america graffiti
1,876,train,play kari jobe for me,spiel kari jobe für mich,pon melendi para mi,mets jacques brel ne me quitte pas,metti laura pausini per me
2,14494,train,what is i. b. m.'s stock worth,was ist i. b. m.'s aktie wert,cuál es el valor de las acciones del ibm,quelle est la valeur des actions d'i. b. m.,qual è il valore delle azioni generali
3,14366,train,will it be good to buy nike stock today,wäre es gut heute volkswagen aktien zu kaufen,será bueno comprar acciones de nike hoy dia,sera-t-il bon d'acheter des actions nike aujou...,oggi è un buon giorno per comprare le azioni d...
4,1977,train,please remove the alarm which i set for today ...,bitte lösche den wecker den ich für heute früh...,por favor borrar la alarma que tenía activada ...,veuillez retirer l'alarme que j'ai réglée pour...,rimuovi la sveglia impostata per questa mattina
...,...,...,...,...,...,...,...
9995,4897,train,the lights are too bright,die lichter sind zu hell,las luces están muy brillantes,les lumières sont trop vives,le luci sono troppo intense
9996,16484,train,create new email address,erstelle eine neue email adeesse,crear una nueva dirección de correo electrónico,créer une nouvelle adresse e-mail,crea nuovo indirizzo email
9997,8087,train,can i remove a reminder,kann ich die erinnerung löschen,puedo quitar un recordatorio,puis-je supprimer un rappel,posso rimuovere un promemoria
9998,8090,train,what time is my appointment on saturday date,mein termin samstag datum,a que hora es mi cita el sábado,à quelle heure est mon rendez-vous pour samedi,a che ora è il mio appuntamento di sabato


In [8]:
# Data for Summarization
# The training file is a semicolon-separated CSV.
summarization_data = pd.read_csv(
    filepath_or_buffer='Case Dataset/Data files/Summarization_Training.csv',
    sep=';',
)

In [9]:
# Display the summarization DataFrame (articles and their highlights).
summarization_data

Unnamed: 0,article,highlights
0,"By . Anthony Bond . PUBLISHED: . 07:03 EST, 2 ...",John and .\nAudrey Cook were discovered alongs...
1,UNITED NATIONS (CNN) -- A rare meeting of U.N....,NEW: Libya can serve as example of cooperation...
2,Cover-up: Former Archbishop Lord Hope allowed ...,Very Reverend Robert Waddington sexually abuse...
3,"By . Kristie Lau . PUBLISHED: . 10:48 EST, 14 ...",Monday night's episode showed Buddy Valastro t...
4,'The lamps are going out all over Europe. We s...,People asked to turn out lights for hour betwe...
...,...,...
4495,Martin O'Neill has told the Tartan Army that b...,Scotland boss Gordon Strachan said it was okay...
4496,By . Tim Shipman . and Peter Campbell . PUBLIS...,Two companies expected to apply for extensions...
4497,By . Ellie Zolfagharifard . PUBLISHED: . 08:59...,James Wannerton spent 49 years creating the ma...
4498,By . Jack Crone for MailOnline . A school nurs...,"Donna Moore met Helena Farrell, 15, three time..."


In [10]:
# Named Entity Recognition
# Display label -> model id passed to the 'ner' pipeline.
NAMED_ENTITY_RECOGNITION_MODELS = {
    "BERT (CoNLL-03 English)":
        "dbmdz/bert-large-cased-finetuned-conll03-english",
    "Wikineural Multilingual (Babelscape)":
        "Babelscape/wikineural-multilingual-ner",
    "BERT (dslim)":
        "dslim/bert-large-NER",
    "Biomedical (d4data)":
        "d4data/biomedical-ner-all",
}

# Display label -> tokenizer id. Every model uses the tokenizer published
# under its own id, except the two BERT entries, which use bert-base-cased.
TOKENIZER = {
    **NAMED_ENTITY_RECOGNITION_MODELS,
    "BERT (CoNLL-03 English)": "google-bert/bert-base-cased",
    "BERT (dslim)": "google-bert/bert-base-cased",
}

# Entity tag -> human-readable name; the I- and B- variants of a tag share
# the same name.
ENTITY_DESCRIPTIONS = {
    f"{prefix}-{tag}": readable_name
    for prefix in ("I", "B")
    for tag, readable_name in (
        ("PER", "Person"),
        ("ORG", "Organization"),
        ("LOC", "Location"),
        ("MISC", "Miscellaneous"),
    )
}

In [11]:
# Models for Summarization
# Display label -> model id passed to the 'summarization' pipeline.
# NOTE(review): both T5 labels resolve to the same "t5-large" id - confirm
# whether dataset-specific checkpoints were intended.
SUMMARIZATION_MODELS = dict([
    ("BART (CNN/DailyMail)", "facebook/bart-large-cnn"),
    ("T5 (CNN/DailyMail)", "t5-large"),
    ("Pegasus (Newsroom)", "google/pegasus-newsroom"),
    ("BART (XSum)", "facebook/bart-large-xsum"),
    ("T5 (XSum)", "t5-large"),
    ("Pegasus (XSum)", "google/pegasus-xsum"),
])

In [12]:
# Common functions
def input_preprocessing(selected_input: str = None, text: str = None, sample: str = None, file: str = None):
    """Resolve the text to process from whichever input source was selected.

    Parameters
    ----------
    selected_input : str
        One of "text", "sample" or "file".
    text : str
        Free text; returned unchanged when ``selected_input == "text"``.
    sample : str
        Used when ``selected_input == "sample"``. If it is a key of
        ``named_entity_recognition_data`` the mapped sentence is returned,
        otherwise the sample string itself is returned.
    file : str or object with a ``name`` attribute
        Used when ``selected_input == "file"``. Either the path of the file
        to read, or an object exposing that path as ``.name``.

    Returns
    -------
    str
        The resolved text.

    Raises
    ------
    ValueError
        If ``selected_input`` is not one of the supported values, or if
        "file" is selected but no file was provided.
    """
    if selected_input == "text":
        return text

    if selected_input == "sample":
        return named_entity_recognition_data[sample] if sample in named_entity_recognition_data else sample

    if selected_input == "file":
        if file is None:
            raise ValueError("Input type 'file' was selected but no file was provided.")
        # Accept both a plain path and an upload object carrying the path in `.name`.
        path = getattr(file, "name", file)
        with open(path, 'r', encoding='utf-8') as f:
            return f.read()

    # Previously this fell through to `return retval` with retval unbound.
    raise ValueError(f"Unknown input type: {selected_input!r} (expected 'text', 'sample' or 'file').")

In [13]:
# Named Entity Recognition Code
def named_entity_recognition(model: str = None, model_to_compare: str = None, selected_input: str = "text", text: str = None, sample: str = None, file: str = None):
    """Run two NER models on the same text and return their results for display.

    Parameters
    ----------
    model, model_to_compare : str
        Keys of ``NAMED_ENTITY_RECOGNITION_MODELS`` / ``TOKENIZER``.
    selected_input, text, sample, file
        Forwarded to ``input_preprocessing`` to resolve the text to analyse.

    Returns
    -------
    tuple
        ``(result_1, legend_1, result_2, legend_2)`` where each result is a
        dict ``{"text": ..., "entities": [...]}`` and each legend is the
        string produced by ``generate_legend``. Every entity keeps the
        "entity", "score", "index", "start" and "end" fields of the
        pipeline output.
    """
    # Normalise a missing input to an empty string so the result dicts stay well-formed.
    text = input_preprocessing(selected_input, text, sample, file) or ""
    model_names = [model, model_to_compare]

    entities = {}
    for model_name in model_names:
        if model_name in entities:
            # Same model chosen twice: reuse the result instead of loading
            # and running the model a second time.
            continue
        if not text:
            # Nothing to analyse; skip loading the model altogether.
            entities[model_name] = []
            continue
        tokenizer = AutoTokenizer.from_pretrained(TOKENIZER[model_name], model_max_length=512)
        ner_pipeline = pipeline('ner', model=NAMED_ENTITY_RECOGNITION_MODELS[model_name], tokenizer=tokenizer)
        entities[model_name] = [
            {"entity": ent["entity"], "score": ent["score"], "index": ent["index"], "start": ent["start"], "end": ent["end"]}
            for ent in ner_pipeline(text)
        ]

    first_entities = entities[model_names[0]]
    second_entities = entities[model_names[1]]
    legend_text_1 = generate_legend(first_entities)
    legend_text_2 = generate_legend(second_entities)
    return {"text": text, "entities": first_entities}, legend_text_1, {"text": text, "entities": second_entities}, legend_text_2

def generate_legend(entities):
    """Summarise how often each entity tag occurs, formatted as Markdown.

    Parameters
    ----------
    entities : iterable of dict
        Entity records; only the "entity" key (the tag) is read.

    Returns
    -------
    str
        A "Entities Legend:" header followed by one Markdown list item per
        distinct tag, in first-seen order, formatted as
        "- <description> (<tag>): <count>". Tags missing from
        ``ENTITY_DESCRIPTIONS`` are shown under their raw tag. When there
        are no entities a short placeholder line is emitted instead.
    """
    counts = {}
    for ent in entities:
        tag = ent['entity']
        counts[tag] = counts.get(tag, 0) + 1

    if not counts:
        return "Entities Legend:\n\n_No entities found._"

    # The legend is rendered as Markdown, where single newlines do not start
    # a new line; a blank line plus list items keeps one entry per row.
    rows = [f"- {ENTITY_DESCRIPTIONS.get(tag, tag)} ({tag}): {count}" for tag, count in counts.items()]
    return "Entities Legend:\n\n" + "\n".join(rows)

In [14]:
# Code for Translation
def helsinki_translation(source_language: str = None, target_language: str = None, text: str = None):
    """Translate ``text`` with the Helsinki-NLP opus-mt model for the given language pair.

    The model id is built from ``source_language`` and ``target_language``;
    the ``translation_text`` of the first pipeline result is returned.
    """
    model_id = f"Helsinki-NLP/opus-mt-{source_language}-{target_language}"
    translator = pipeline('translation', model=model_id)
    first_result = translator(text)[0]
    return first_result['translation_text']

def facebook_translation(source_language: str = None, target_language: str = None, text: str = None):
    """Translate ``text`` with the facebook/m2m100_418M model.

    The source and target languages are passed to the pipeline as
    ``src_lang`` / ``tgt_lang``; the ``translation_text`` of the first
    pipeline result is returned.
    """
    translator = pipeline(
        'translation',
        model="facebook/m2m100_418M",
        src_lang=source_language,
        tgt_lang=target_language,
    )
    first_result = translator(text)[0]
    return first_result['translation_text']

def translation(source_language: str = None, target_language: str = None, selected_input: str = None, text: str = None, sample: str = None, file: str = None):
    """Translate the selected input with both models and look up the reference translation.

    Parameters
    ----------
    source_language, target_language : str
        Language codes; also used as column names of ``translation_data``.
    selected_input, text, sample, file
        Forwarded to ``input_preprocessing`` to resolve the text to translate.

    Returns
    -------
    tuple of str
        ``(helsinki_result, facebook_result, ground_truth)``. The ground
        truth is the ``target_language`` value of the first row of
        ``translation_data`` whose ``source_language`` value equals the
        input. Free text or file input usually has no such row; a
        placeholder message is returned then instead of raising IndexError.
    """
    source_text = input_preprocessing(selected_input, text, sample, file)
    first_translation = helsinki_translation(source_language=source_language, target_language=target_language, text=source_text)
    second_translation = facebook_translation(source_language=source_language, target_language=target_language, text=source_text)

    references = translation_data.loc[translation_data[source_language] == source_text, target_language]
    if references.empty:
        ground_truth = "No ground truth available for this input."
    else:
        ground_truth = references.iloc[0]
    return first_translation, second_translation, ground_truth

In [15]:
# Code for Summarization
def summarization(model: str = None, selected_input: str = None, text: str = None, sample: str = None, file: str = None):
    """Summarise the selected input and look up its reference highlights.

    Parameters
    ----------
    model : str
        Key of ``SUMMARIZATION_MODELS``.
    selected_input, text, sample, file
        Forwarded to ``input_preprocessing`` to resolve the text to summarise.

    Returns
    -------
    tuple of str
        ``(summary, ground_truth)``. The ground truth is the ``highlights``
        value of the first row of ``summarization_data`` whose ``article``
        equals the input. Free text or file input usually has no such row;
        a placeholder message is returned then instead of raising IndexError.
    """
    text = input_preprocessing(selected_input, text, sample, file)
    summarization_pipeline = pipeline('summarization', model=SUMMARIZATION_MODELS[model])
    # Ask the pipeline to truncate over-long inputs to the model's maximum
    # length instead of passing them through untruncated.
    summary = summarization_pipeline(text, truncation=True)

    references = summarization_data.loc[summarization_data['article'] == text, 'highlights']
    if references.empty:
        ground_truth = "No ground truth available for this input."
    else:
        ground_truth = references.iloc[0]
    return summary[0]['summary_text'], ground_truth

In [16]:
# Code for the Gradio Interface
def update_legend_and_output(model, model_to_compare, selected_input, text, samples, file):
    """Gradio callback: run both NER models and return (result_1, legend_1, result_2, legend_2)."""
    ner_outputs = named_entity_recognition(model, model_to_compare, selected_input, text, samples, file)
    first_result, first_legend, second_result, second_legend = ner_outputs
    return (first_result, first_legend, second_result, second_legend)

def named_entity_recognition_interface():
    """Build the Gradio Blocks page for comparing two NER models on the same input.

    Returns
    -------
    gr.Blocks
        The assembled page (used as one tab by ``build_interface``).
    """
    with gr.Blocks() as blocks:
        # Input controls: the two models to compare and the three possible text sources.
        model = gr.Dropdown(list(NAMED_ENTITY_RECOGNITION_MODELS.keys()), label="Model", value="BERT (CoNLL-03 English)")
        model_to_compare = gr.Dropdown(list(NAMED_ENTITY_RECOGNITION_MODELS.keys()), label="Model to Compare", value="Wikineural Multilingual (Babelscape)")
        text = gr.Textbox(lines=5, label="Input", value="Please enter the text to analyze.")
        samples = gr.Dropdown(list(named_entity_recognition_data.keys()), label="Samples", value="Introduction")
        file = gr.File(label="Upload a file")
        selected_input = gr.Radio([("Text", "text"), ("Sample", "sample"), ("File", "file")], label="Select Input Type", value="sample")

        # Output widgets: highlighted text plus a legend, once per model.
        highlighted_text_1 = gr.HighlightedText(label="Model 1 Result")
        legend_1 = gr.Markdown("Entities Legend for Model 1 will be shown here")
        
        highlighted_text_2 = gr.HighlightedText(label="Model 2 Result")
        legend_2 = gr.Markdown("Entities Legend for Model 2 will be shown here")

        # Recompute both results whenever any input component changes.
        # NOTE(review): every call goes through named_entity_recognition, which
        # creates the tokenizers and pipelines again - confirm this is acceptable.
        inputs = [model, model_to_compare, selected_input, text, samples, file]
        for input_component in inputs:
            input_component.change(update_legend_and_output, inputs=inputs, outputs=[highlighted_text_1, legend_1, highlighted_text_2, legend_2])

        # The same callback, inputs and outputs are additionally wrapped in a
        # gr.Interface created inside this Blocks context.
        gr.Interface(
            fn=update_legend_and_output, 
            inputs=inputs, 
            outputs=[highlighted_text_1, legend_1, highlighted_text_2, legend_2], 
            title="Named Entity Recognition"
        )
        
    return blocks

def update_samples(source_language):
    """Return a Gradio update that fills the sample dropdown with the chosen language's sentences.

    The choices are all values of ``translation_data[source_language]``;
    the first one becomes the selected value.
    """
    language_samples = [*translation_data[source_language]]
    first_sample = language_samples[0]
    return gr.update(choices=language_samples, value=first_sample)

def translation_interface():
    """Build the Gradio Blocks page for translating text with two models.

    Returns
    -------
    gr.Blocks
        The assembled page (used as one tab by ``build_interface``).
    """
    with gr.Blocks() as blocks:
        # Language pickers: the dropdown values are the language codes
        # ("en", "es", ...) that are passed on to `translation`.
        source_language = gr.Dropdown(
            choices=[("English", "en"), ("Spanish", "es"), ("French", "fr"), ("German", "de"), ("Italian", "it")], 
            label="Source Language", 
            multiselect=False, 
            value="en"
        )

        target_language = gr.Dropdown(
            choices=[("English", "en"), ("Spanish", "es"), ("French", "fr"), ("German", "de"), ("Italian", "it")], 
            label="Target Language", 
            multiselect=False, 
            value="de"
        )

        text = gr.Textbox(lines=5, label="Input", value="Please enter the text to translate.")

        # Sample sentences start out as the English column, matching the
        # default source language above.
        sample = gr.Dropdown(
            choices=list(translation_data["en"]), 
            label="Samples", 
            value=translation_data["en"][0]
        )

        file = gr.File(label="Upload a file")

        selected_input = gr.Radio(
            choices=[("Text", "text"), ("Sample", "sample"), ("File", "file")], 
            label="Select Input Type", 
            value="sample"
        )

        # Refresh the sample choices whenever the source language changes.
        source_language.change(update_samples, inputs=source_language, outputs=sample)

        # Outputs: the two model translations and the dataset's reference translation.
        gr.Interface(
            fn=translation, 
            inputs=[source_language, target_language, selected_input, text, sample, file], 
            outputs=[gr.Textbox(label="Translation"), gr.Textbox(label="Alternative"), gr.Textbox(label="Ground Truth")], 
            title="Translation"
        )
        
    return blocks

def summarization_interface():
    """Build the Gradio Blocks page for summarizing text with a selectable model.

    Returns
    -------
    gr.Blocks
        The assembled page (used as one tab by ``build_interface``).
    """
    with gr.Blocks() as blocks:
        # Model picker: the labels are the keys of SUMMARIZATION_MODELS.
        model = gr.Dropdown(list(SUMMARIZATION_MODELS.keys()), label="Model", value="BART (CNN/DailyMail)")
        
        text = gr.Textbox(lines=5, label="Input", value="Please enter the text to summarize.")
        
        # Every article of the summarization dataset is offered as a sample.
        sample = gr.Dropdown(list(summarization_data["article"]), label="Samples", value=summarization_data["article"][0])
        
        file = gr.File(label="Upload a file")
        
        selected_input = gr.Radio(
            choices=[("Text", "text"), ("Sample", "sample"), ("File", "file")], 
            label="Select Input Type", 
            value="sample"
        )

        # Outputs: the generated summary and the dataset's reference highlights.
        gr.Interface(
            fn=summarization, 
            inputs=[model, selected_input, text, sample, file], 
            outputs=[gr.Textbox(label="Summary"), gr.Textbox(label="Highlights (Ground Truth)")], 
            title="Summarization"
        )
        
    return blocks

def build_interface():
    """Assemble the three task pages into one tabbed Gradio app titled "NLP Toolkit"."""
    # (tab title, page builder) pairs, in display order.
    tab_specs = (
        ("Named Entity Recognition", named_entity_recognition_interface),
        ("Translation", translation_interface),
        ("Summarization", summarization_interface),
    )
    pages = [build_page() for _, build_page in tab_specs]
    tab_titles = [tab_title for tab_title, _ in tab_specs]
    return gr.TabbedInterface(pages, tab_titles, title="NLP Toolkit")

# Build the tabbed app and launch it.
interface = build_interface()
interface.launch()

Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.


