### Data Preparation

#### Einlesen der Daten

In [None]:
# Sammlung aller importe
import os
import pandas as pd
import re

import tensorflow as tf
from datasets import load_dataset
from transformers import TFMT5ForConditionalGeneration, MT5Tokenizer, DataCollatorForSeq2Seq
from tensorflow.keras.optimizers import Adam

In [2]:
#Definition der Funktion zum Einlesen aller TXT Dateien aus den Daten
def find_and_read_txt_files(directory,string):
    # Liste zum Speichern des Inhalts der gefundenen .txt-Dateien und der zugehörigen Dateinamen-Nummer
    data = []

    # Durchsucht die Ordnerstruktur rekursiv nach .txt-Dateien
    for root, dirs, files in os.walk(directory):
        for file in files:
            # Überprüft, ob es eine .txt-Datei ist und das Wort "source" im Dateinamen steht
            if file.endswith('.txt') and string in file:
                file_path = os.path.join(root, file)

                # Liest den Inhalt der .txt-Datei
                with open(file_path, 'r', encoding="latin1") as f:
                    content = f.read()

                    # Findet die Zahl im Dateinamen mithilfe eines regulären Ausdrucks
                    number = re.search(r'\d+', file).group()

                    # Fügt den Inhalt und die Zahl dem Datensatz hinzu
                    data.append([content, number])

    # Erstellt einen DataFrame aus den gesammelten Daten
    df = pd.DataFrame(data, columns=[string, 'number'])
    return df



In [None]:
directory_path = "/Users/huyduc/Documents/GitHub/Hettich/FileArchive_EDIFACT 2"  # Pfad zum Übergeordneten Ordner
df_target = find_and_read_txt_files(directory_path,'target')
df_source = find_and_read_txt_files(directory_path,'source')
#print(df)

In [3]:
df_merged = pd.merge(df_target, df_source, on='number', how='outer')

In [4]:
df_merged.isna().sum()

target      0
number      0
source    280
dtype: int64

In [5]:
df_merged2=df_merged.dropna()

In [6]:
df_merged2=df_merged2.sample(1000)

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
X_train, X_test, y_train, y_test = train_test_split(df_merged2["source"], df_merged2["target"], test_size=0.2, random_state=42)

In [9]:
train=pd.DataFrame(columns=["source","target"])#y_train.to_csv("train.csv")

In [10]:
train["source"]=X_train
train["target"]=y_train
train=train.reset_index(drop=True)

In [11]:
train.to_csv("train.csv")

In [12]:
train

Unnamed: 0,source,target
0,UNA:+.? 'UNB+UNOC:3+3025940000000:14+400805700...,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<ORDER..."
1,UNA:+.? 'UNB+UNOC:3+4333990000009:14+400805700...,"<?xml version=""1.0"" encoding=""UTF-8""?><ORDERS0..."
2,UNA:+.? 'UNB+UNOC:3+4399902231817:14+402300900...,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<MULTI..."
3,UNA:+.? 'UNB+UNOC:3+4304449000000:14:431556399...,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<ORDER..."
4,UNA:+.? 'UNB+UNOC:3+3025940000000:14+400805700...,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<ORDER..."
...,...,...
795,UNA:+.? 'UNB+UNOD:3+4399901757592:14+400805700...,"<?xml version=""1.0"" encoding=""UTF-8""?><MULTIPL..."
796,UNA:+.? 'UNB+UNOC:3+4333990000009:14+400805700...,"<?xml version=""1.0"" encoding=""UTF-8""?><ORDERS0..."
797,UNA:+.? 'UNB+UNOC:3+4333990000009:14+400805700...,"<?xml version=""1.0"" encoding=""UTF-8""?><ORDERS0..."
798,UNA:+.? 'UNB+UNOC:3+4250517300001:14+9019970:1...,"<?xml version=""1.0"" encoding=""UTF-8""?><ORDERS0..."


In [13]:


tokenizer = MT5Tokenizer.from_pretrained("google/mt5-small")
model = TFMT5ForConditionalGeneration.from_pretrained("google/mt5-small")

All model checkpoint layers were used when initializing TFMT5ForConditionalGeneration.

All the layers of TFMT5ForConditionalGeneration were initialized from the model checkpoint at google/mt5-small.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMT5ForConditionalGeneration for predictions without further training.


In [14]:

dataset = load_dataset("csv", data_files="train.csv")
dataset = dataset["train"].shuffle(seed=42)

def preprocess_function(examples):
    padding = "max_length"
    max_length = 200

    inputs = [ex for ex in examples["source"]]
    targets = [ex for ex in examples["target"]]
    model_inputs = tokenizer(inputs, max_length=max_length, padding=padding, truncation=True)
    labels = tokenizer(targets, max_length=max_length, padding=padding, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

Downloading and preparing dataset csv/default to /Users/huyduc/.cache/huggingface/datasets/csv/default-22411ccb5ea29676/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /Users/huyduc/.cache/huggingface/datasets/csv/default-22411ccb5ea29676/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1. Subsequent calls will reuse this data.


  if _pandas_api.is_sparse(col):


  0%|          | 0/1 [00:00<?, ?it/s]

In [15]:
dataset

Dataset({
    features: ['Unnamed: 0', 'source', 'target'],
    num_rows: 800
})

In [16]:
train_dataset = dataset.map(preprocess_function, batched=True, desc="Running tokenizer")

data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=tokenizer.pad_token_id,
    pad_to_multiple_of=64,
    return_tensors="np")

Running tokenizer:   0%|          | 0/800 [00:00<?, ? examples/s]

In [17]:
tf_train_dataset = model.prepare_tf_dataset(
    train_dataset,
    collate_fn=data_collator,
    batch_size=8,
    shuffle=True)

In [19]:
model.compile(optimizer=Adam(3e-5))
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=3)
model.fit(tf_train_dataset, epochs=10, callbacks=[early_stopping])



Epoch 1/10


2023-10-25 21:22:59.234715: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.
2023-10-25 21:23:00.259324: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:961] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node Adam/AssignAddVariableOp_10.


  1/100 [..............................] - ETA: 3:21:37 - loss: 29.6417