In [None]:
from transformers import pipeline

In [None]:

# Load model directly
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# One checkpoint id shared by both loaders so tokenizer and model cannot drift apart.
MODEL_NAME = "Helsinki-NLP/opus-mt-de-en"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)


In [None]:
# Smoke test: push a single German word through the loaded tokenizer/model pair.
german_text = "Gesamtfläche"

# Encode to a PyTorch tensor of token ids, generate, then decode the first
# returned sequence with special tokens stripped.
input_ids = tokenizer.encode(german_text, return_tensors="pt")
output = model.generate(input_ids)
english_translation = tokenizer.decode(output[0], skip_special_tokens=True)

# Show source and result on separate lines.
for label, text in (("Deutsch:", german_text), ("Englisch:", english_translation)):
    print(label, text)

In [None]:
import pandas as pd
import numpy as np

# The file is semicolon-separated with decimal commas. The identifier and name
# columns are forced to str (presumably so codes keep leading zeros — confirm).
inkar = pd.read_csv(
    "../../data/inkar_2022.csv",
    sep=";",
    decimal=",",
    dtype={"Kennziffer": str, "Kennziffer_EU": str, "Name": str},
    low_memory=False,
)
inkar.head()

In [None]:
# Build the indicator "menu": one row per (Bereich, Indikator, ID) with the
# covered Zeitbezug range, an availability flag per spatial level, and empty
# placeholder columns for the translated / Dutch metadata.

# Spatial levels (Raumbezug values) the menu is restricted to.
raumbezug_selection = ["Kreise", "Gemeinden", "Gemeindeverbände"]
temp = inkar.loc[inkar["Raumbezug"].isin(raumbezug_selection)]

# Keys that identify one indicator.
indicator_keys = ["Bereich", "Indikator", "ID"]

# Earliest and latest Zeitbezug per indicator; the two-level column header is
# flattened to "Zeitbezug_min" / "Zeitbezug_max".
menu = temp.groupby(indicator_keys).agg({"Zeitbezug": ["min", "max"]})
menu.columns = ["_".join(parts) for parts in menu.columns.to_flat_index()]

# Count of Kennziffer values per indicator and spatial level, joined on the
# shared three-level index.
level_counts = temp.pivot_table(
    index=indicator_keys,
    columns="Raumbezug",
    values="Kennziffer",
    aggfunc=np.count_nonzero,
)
menu = menu.merge(level_counts, left_index=True, right_index=True)

# Collapse each count into a marker: "X" where the count is positive, "" otherwise.
for column in raumbezug_selection:
    has_data = menu[column] > 0
    menu[column] = has_data.replace({True: "X", False: ""})

# Metadata columns: the German fields come from the index levels, the others
# start out empty.
bereich_values = menu.index.get_level_values("Bereich")
indikator_values = menu.index.get_level_values("Indikator")
id_values = menu.index.get_level_values("ID")

menu = menu.assign(
    select="",
    category="",
    category_de=bereich_values,
    category_nl="",
    name="",
    name_de=indikator_values,
    name_nl="",
    source_de="Inkar",
    source_id_de=id_values,
    source_nl="",
    source_id_nl="",
)

# Final column layout of the menu table.
column_order = [
    "select",
    "category", "category_de", "category_nl",
    "name", "name_de", "name_nl",
    "source_de", "source_id_de",
    "source_nl", "source_id_nl",
    "Zeitbezug_min", "Zeitbezug_max",
    "Gemeinden", "Gemeindeverbände", "Kreise",
]
menu = menu[column_order]

# Drop the three-level index; its values now live in the *_de columns.
menu.reset_index(drop=True, inplace=True)
menu

In [None]:
# Translate the German category labels into the `category` column.
#
# Each distinct label is translated once and mapped back onto the rows, so the
# model runs once per category instead of once per row.
# The column is assigned as a whole instead of `menu['category'][index] = ...`:
# that chained assignment triggers SettingWithCopyWarning and does not write
# to `menu` at all when pandas Copy-on-Write is active.
category_translations = {}
for german_text in menu['category_de'].unique():
    input_ids = tokenizer.encode(german_text, return_tensors="pt")
    output = model.generate(input_ids)
    english_translation = tokenizer.decode(output[0], skip_special_tokens=True)

    category_translations[german_text] = english_translation
    # One line per distinct category to follow progress.
    print(german_text, "->", english_translation)

menu['category'] = menu['category_de'].map(category_translations)

print(menu['category'])



In [None]:
# Translate the German indicator names into the `name` column.
#
# Distinct names are translated once and mapped back onto the rows; rows that
# share a name therefore share one model call (this assumes generation is
# deterministic for identical input — TODO confirm for this model's config).
# The column is assigned as a whole instead of `menu['name'][index] = ...`:
# that chained assignment triggers SettingWithCopyWarning and does not write
# to `menu` at all when pandas Copy-on-Write is active.
name_translations = {}
for position, german_text in enumerate(menu['name_de'].unique()):
    # Progress counter over distinct names (not the row label of `menu`).
    print(position)

    input_ids = tokenizer.encode(german_text, return_tensors="pt")
    output = model.generate(input_ids)
    english_translation = tokenizer.decode(output[0], skip_special_tokens=True)

    name_translations[german_text] = english_translation

menu['name'] = menu['name_de'].map(name_translations)

In [None]:
# Write the menu, including its row index, to an Excel workbook.
# NOTE(review): this is an absolute path on one developer's machine; the cell
# fails anywhere else. Replace it with a path relative to the project once the
# intended output directory is confirmed.
menu.to_excel("C:/Users/juliu/VSCODE/VSCODE/inkar-indicators/src/out/output.xlsx", index=True)

# Preview goes last: Jupyter only renders a cell's final expression, so a
# `menu.head()` placed before the export is evaluated and discarded.
menu.head()


In [None]:
import pandas as pd

# Load the indicator overview (semicolon-separated, decimal commas) from the
# notebook's working directory.
inkar_übersicht = pd.read_csv("übersicht.csv", sep=";", decimal=",", low_memory=False)

# List the column labels; later cells address columns by these exact labels.
columns = inkar_übersicht.columns
for col in columns:
    print(col)

# Preview goes last: Jupyter only renders a cell's final expression, so a bare
# `inkar_übersicht` in the middle of the cell shows nothing.
inkar_übersicht.head()

In [None]:

# Translate "<description> <German name>" for each menu row and store the
# result in `menu['name']`.
#
# NOTE(review): rows are paired purely by position (overview row i with menu
# row i); nothing here checks that both rows describe the same indicator.
for index, ind in zip(inkar_übersicht.index, menu.index):
    # Rows 0-2 of the overview are skipped (presumably title/header rows of
    # the export — confirm against the file).
    if index in (0, 1, 2):
        continue
    print(index)
    beschreibung = inkar_übersicht.loc[index, 'Unnamed: 5']
    german_text = menu.loc[ind, 'name_de']

    # A missing description would otherwise be stringified to the literal
    # "nan" and fed to the translator; fall back to the name alone.
    if pd.notna(beschreibung):
        text_all = str(beschreibung) + " " + german_text
    else:
        text_all = german_text

    input_ids = tokenizer.encode(text_all, return_tensors="pt")
    output = model.generate(input_ids)
    english_translation = tokenizer.decode(output[0], skip_special_tokens=True)

    # Write with .loc and the menu label `ind`: chained assignment
    # (`menu['name'][...] = ...`) triggers SettingWithCopyWarning and does not
    # write to `menu` when pandas Copy-on-Write is active.
    menu.loc[ind, 'name'] = english_translation

In [None]:
# For every overview row, translate "<description> <indicator name>" and store
# it in `menu['name']` for all menu rows whose German name equals the overview
# name.
for index in inkar_übersicht.index:
    # Rows 0-2 of the overview are skipped (presumably title/header rows of
    # the export — confirm against the file).
    if index in (0, 1, 2):
        continue
    print(index)
    beschreibung = inkar_übersicht.loc[index, 'Unnamed: 5']
    name_beschreibung = inkar_übersicht.loc[index, 'INKAR 2022 – Indikatorenübersicht:   Raumbeobachtung Deutschland']
    print(name_beschreibung)

    # Vectorised equality replaces the inner Python loop over `menu`, which
    # also reused (and so shadowed) the outer `index` variable. A missing name
    # compares unequal to every row and yields no match.
    matching_rows = menu['name_de'] == name_beschreibung
    if not matching_rows.any():
        continue

    # On a match the menu name equals the overview name.
    german_text = name_beschreibung
    print(name_beschreibung + " " + german_text)

    # A missing description would otherwise be stringified to the literal
    # "nan" and fed to the translator; fall back to the name alone.
    if pd.notna(beschreibung):
        text_all = str(beschreibung) + " " + german_text
    else:
        text_all = german_text

    # The input text is identical for every matching row, so translate once.
    input_ids = tokenizer.encode(text_all, return_tensors="pt")
    output = model.generate(input_ids)
    english_translation = tokenizer.decode(output[0], skip_special_tokens=True)

    # Write with .loc: chained assignment (`menu['name'][...] = ...`) triggers
    # SettingWithCopyWarning and does not write to `menu` when pandas
    # Copy-on-Write is active.
    menu.loc[matching_rows, 'name'] = english_translation

In [None]:
# Write the menu, including its row index, to an Excel workbook.
# NOTE(review): this is an absolute path on one developer's machine; the cell
# fails anywhere else. Replace it with a path relative to the project once the
# intended output directory is confirmed.
menu.to_excel("C:/Users/juliu/VSCODE/VSCODE/inkar-indicators/src/out/output_beschreibung.xlsx", index=True)

# Preview goes last: Jupyter only renders a cell's final expression, so a
# `menu.head()` placed before the export is evaluated and discarded.
menu.head()