# Initialize

## Packages

In [None]:
# Google-Drive Mounting
import os
import sys

# Pandas
import pandas as pd

# Transformers
!pip install transformers
import transformers

# Datasets
!pip install datasets
import datasets

#NLTK
!pip install nltk
import nltk

#Sklearn (Performance Metric Calculation)
import sklearn

#Random for setting seeds
import random

#Install Google Translator
!pip install googletrans==4.0.0-rc1

#Import pickle
import pickle




## Mounting

In [None]:
# Google-Drive Mounting

from google.colab import drive
drive.mount('/content/drive')

# Anpassen der Working Directory
os.chdir('drive/MyDrive')

# Working directory
!ls


# Data

## Read files

In [None]:
#Read data
def read_excel(excel_path, textcolumn = None, labelcolumn = None, maxrow = None):
  """Load an Excel sheet and optionally reduce it to a text/label pair.

  Args:
    excel_path: Path of the workbook, handed to ``pd.read_excel``.
    textcolumn: Name of the column holding the text. If ``None``, all columns are kept.
    labelcolumn: Name of the column holding the label; required when ``textcolumn`` is set.
    maxrow: If given, only the first ``maxrow`` rows are returned (useful for quick tests).

  Returns:
    DataFrame with all columns of the sheet, or with exactly the columns
    ``['text', 'labels']`` when ``textcolumn`` is given.

  Raises:
    ValueError: If ``textcolumn`` is given without ``labelcolumn``.
  """
  df = pd.read_excel(excel_path)
  if textcolumn is not None:
    if labelcolumn is None:
      raise ValueError("labelcolumn is required when textcolumn is given")
    #Copy so that renaming does not operate on a slice of the full sheet
    df = df[[textcolumn, labelcolumn]].copy()
    #Rename columns: Text in column 0, label in column 1 (necessary for model-training)
    df.columns = ['text', 'labels']
  #Limit data to maximum number of rows (for testing)
  if maxrow is not None:
    df = df.iloc[:maxrow, :]
  return df



In [None]:

#Every split is read twice: the *_nlp frames only hold the renamed text/label columns,
#the *_full frames keep all columns of the sheet.
#Training data
training_nlp = read_excel('0_Data_General/04_RP_Unfalltyp2TRAIN.xlsx', 'Description', 'AccidentType')
training_full = read_excel('0_Data_General/04_RP_Unfalltyp2TRAIN.xlsx')

#Test data
testing_nlp = read_excel('0_Data_General/04_RP_Unfalltyp2TEST.xlsx', 'Description', 'AccidentType')
testing_full = read_excel('0_Data_General/04_RP_Unfalltyp2TEST.xlsx')

#Validation data
validation_nlp = read_excel('0_Data_General/04_RP_Unfalltyp2VALID.xlsx','Description', 'AccidentType')
validation_full = read_excel('0_Data_General/04_RP_Unfalltyp2VALID.xlsx')

In [None]:
#All possible turning 3ATs
#The position of a code in this list is the class index produced by recode();
#the length of the list is passed as num_labels when the models are created.
classes_list = [201, 202, 203, 204, 209,
                211, 212, 213, 214, 215, 219,
                221, 222, 223, 224, 225, 229,
                231, 232, 233, 239,
                241, 242, 243, 244, 245, 249,
                251, 252, 259,
                261, 262, 269,
                271, 272, 273, 274, 275, 279,
                281, 282, 283, 284, 285, 286, 289,
                299]

#Recode
def recode(df, classes_list):
  """Replace the class codes in ``labels`` by their position in ``classes_list``.

  Args:
    df: DataFrame with a ``labels`` column; it is not modified.
    classes_list: Sequence of all admissible class codes.

  Returns:
    Copy of ``df`` whose ``labels`` column holds zero-based class indices.

  Raises:
    ValueError: If ``labels`` contains a value that is not part of ``classes_list``.
  """
  #Copy existing dataframe
  df_new = df.copy()
  #One lookup table instead of a linear list search per row
  code_to_index = {}
  for position, code in enumerate(classes_list):
    #Keep the first position of a repeated code, like list.index does
    code_to_index.setdefault(code, position)
  unknown = [code for code in df_new['labels'].unique() if code not in code_to_index]
  if unknown:
    raise ValueError("Labels not contained in classes_list: " + str(unknown))
  #Set labels to zero-based indices according to classes_list
  df_new['labels'] = df_new['labels'].map(code_to_index)
  return df_new



In [None]:
#Let data start at zero
#Each *_nlp frame is copied by recode(); the labels become indices into classes_list.
train = recode(training_nlp, classes_list)

test = recode(testing_nlp, classes_list)

valid = recode(validation_nlp, classes_list)

# Text preprocessing



## Align / correct accident descriptions

In [None]:
#Input
#Text | Label
#Align accident descriptions
def correct_participant(df):
  """Harmonise participant references and remove line-break artefacts in ``text``.

  The codes "ON 01", "ON01", "01" and the wording "Teilnehmer 1" become "Beteiligter 1";
  the same applies to participant 2. Matching is case-insensitive. A code is only
  replaced when it stands alone, so that numbers such as "101", dates such as
  "01.02.2020", times such as "10:01" and words such as "von" stay intact.
  "_x000D_" and line breaks are replaced by a blank.

  Args:
    df: DataFrame with a ``text`` column; it is not modified.

  Returns:
    Copy of ``df`` with the corrected ``text`` column. Missing texts stay missing.
  """
  #A code must not be glued to letters/digits and must not be part of a date or time
  not_preceded = r"(?<![0-9A-Za-zÄÖÜäöüß])(?<!\d[.:])"
  not_followed = r"(?![0-9A-Za-zÄÖÜäöüß])(?![.:]\d)"

  df_new = df.copy()
  text = df_new["text"]
  for number in ("1", "2"):
    target = "Beteiligter " + number
    #"ON 01", "ON01" or a bare "01" (respectively "02")
    code_pattern = not_preceded + r"(?:ON\s*)?0" + number + not_followed
    text = text.str.replace(code_pattern, target, case = False, regex = True)
    #"Teilnehmer" is relevant for data augmentation (wording after back-translation)
    text = text.str.replace("Teilnehmer " + number, target, case = False, regex = True)
  #Excel carriage-return artefacts and line breaks
  text = text.str.replace(r"_x000D_|\n", " ", case = False, regex = True)
  df_new["text"] = text
  #Return
  return df_new

#Fill empty accident descriptions
def correct_na(df):
  """Fill missing accident descriptions with a placeholder sentence.

  Args:
    df: DataFrame with a ``text`` column; it is not modified.

  Returns:
    Copy of ``df`` in which missing ``text`` values are replaced by
    'Keine Unfallbeschreibung vorhanden.'.
  """
  #Work on a copy so that the caller's frame keeps its missing values
  df_new = df.copy()
  df_new["text"] = df_new["text"].fillna('Keine Unfallbeschreibung vorhanden.')
  return df_new

#Split into single paragraphs
def split_paragraphs(data):
  """Replace every entry of ``text`` by the list of its sentences.

  The passed DataFrame is modified in place and returned.
  """
  #Download the NLTK resource "punkt"
  nltk.download("punkt")

  def to_sentences(record):
    return nltk.tokenize.sent_tokenize(record["text"])

  #One list of sentences per row
  sentence_lists = data.apply(to_sentences, axis = 1)
  data["text"] = sentence_lists
  return data


#Transform to huggingface dataset
def transform_dataset(data):
  """Convert a pandas DataFrame into a Hugging Face ``Dataset``."""
  from datasets import Dataset
  return Dataset.from_pandas(data)


#Correct language main function
def main_correct(df):
  """Run the text corrections on ``df`` and return the result as a dataset.

  Steps: correct_participant -> correct_na -> transform_dataset.
  Sentence splitting (split_paragraphs) is currently disabled.
  """
  corrected = correct_participant(df)
  without_na = correct_na(corrected)
  return transform_dataset(without_na)

#Create datasetdict
def create_datasetdict(train, test = None, valid = None):
  """Build a ``DatasetDict`` from the corrected train/test/validation frames.

  Every split that is not ``None`` is passed through ``main_correct`` and stored under
  the key "train", "test" or "valid". A message is printed for every split.

  Args:
    train: DataFrame with the training data, or ``None``.
    test: DataFrame with the test data, or ``None``.
    valid: DataFrame with the validation data, or ``None``.

  Returns:
    DatasetDict holding the splits that were passed.

  Errors raised while correcting a split are propagated. They are no longer reported
  as "no data inserted", which left the split silently missing.
  """
  from datasets import DatasetDict
  #Create DatasetDict
  ds = DatasetDict()
  #(key in the DatasetDict, data, wording used in the printed messages)
  splits = (
    ("train", train, "training"),
    ("test", test, "test"),
    ("valid", valid, "validation"),
  )
  for key, frame, wording in splits:
    if frame is None:
      print("No " + wording + " data inserted")
      continue
    ds[key] = main_correct(frame)
    print(wording + " data included")
  #Return
  return ds


In [None]:
#Create final Dataset for model training
#Bundles the recoded train/test/valid frames (corrected texts) into one DatasetDict

dataset = create_datasetdict(train, test, valid)

In [None]:
# Save Dataset (before tokenization starts)
import pickle
import sys

#Target file, resolved against the current working directory
abspath = os.path.abspath('0_Ergebnisse/220308_dataset.pkl')
print(abspath)

#Serialise the untokenized DatasetDict
with open(str(abspath), 'wb') as handle:
  pickle.dump(dataset, handle, protocol = pickle.HIGHEST_PROTOCOL)

## Data Augmentation

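Data augmentation by back-translation: the German training texts are translated to English and back to German, and the result is appended to the original training data.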

In [None]:
from googletrans import Translator
import time
from tqdm import tqdm


#Function for translation to English of one sentence
def translate_sent(sentence, language, timestop):
  """Translate one sentence into ``language`` after waiting ``timestop`` seconds."""
  #Wait first so that consecutive calls do not overload the API
  time.sleep(timestop)
  #A new Translator is created for every call
  translation = Translator().translate(sentence, dest = language)
  return translation.text


#Function to translate whole dataframe
def main_translate(df, language, timestop):
  """Translate the ``text`` column of ``df`` row by row with a progress bar.

  Returns a copy of ``df``; the passed frame is left untouched.
  """
  #Makes progress_apply available
  tqdm.pandas()

  def translate_row(row):
    return translate_sent(sentence = row["text"], language = language, timestop = timestop)

  translated = df.copy()
  translated["text"] = translated.progress_apply(translate_row, axis = 1)
  return translated


In [None]:
###Translate train data into English
#Save translation
import pickle
import sys

#Apply correct_participant -> replaces ON01 etc.
train_corrected = correct_participant(train)
#Remove NAs
train_corrected = correct_na(train_corrected)

#Translate into English with a 4 second break before each translation (original note: works with 5 seconds)
train_en = main_translate(df = train_corrected, language = "en", timestop = 4)

#Save translated data
abspath = os.path.abspath('0_Ergebnisse/230422_train_en.pkl')
print(abspath)

with open(str(abspath), 'wb') as handle:
  pickle.dump(train_en, handle, protocol = pickle.HIGHEST_PROTOCOL)



In [None]:
####Translate data back from English to German
import pickle
import sys

#File which should be translated back from English to German
abspath = os.path.abspath('0_Ergebnisse/230422_train_en.pkl')

# Read the pickle. Original note: Colab only supports pickle protocol 4, whereas the files from Doc2Vec were stored with pickle protocol 5
# NOTE(review): pickle.load executes code contained in the file - only load files written by this notebook
with open(str(abspath), 'rb') as pkl:
  train_en = pickle.load(pkl)


#Translate back from English to German with a 4 second break before each translation (original note: works with 5 seconds)
train_en_de = main_translate(df = train_en, language = "de", timestop = 4)


#Save translated data
abspath = os.path.abspath('0_Ergebnisse/230422_train_en_de.pkl')
print(abspath)

with open(str(abspath), 'wb') as handle:
  pickle.dump(train_en_de, handle, protocol = pickle.HIGHEST_PROTOCOL)




In [None]:
#Combination of single data with already augmented data

#File with augmented data
abspath = os.path.abspath('0_Ergebnisse/230422_train_en_de.pkl')

#Read the back-translated training data
#NOTE(review): pickle.load executes code contained in the file - only load files written by this notebook
with open(str(abspath), 'rb') as pkl:
  train_en_de = pickle.load(pkl)

#Correct augmented data (the back-translation introduces e.g. "Teilnehmer 1")
train_en_de_corrected = correct_participant(train_en_de)

#Combine original training data with augmented training data.
#ignore_index gives every row a unique label; without it the labels of both frames repeat.
train_aug = pd.concat([train, train_en_de_corrected], ignore_index = True)

In [None]:
#Create final augmented Dataset for model training
#Only the training split contains augmented data; test and valid are the same frames as before

dataset_aug = create_datasetdict(train_aug, test, valid)

In [None]:
#Save augmented dataset (before tokenization)

abspath = os.path.abspath('0_Ergebnisse/220425_dataset_aug.pkl')
print(abspath)

with open(str(abspath), 'wb') as handle:
  pickle.dump(dataset_aug, handle, protocol = pickle.HIGHEST_PROTOCOL)

In [None]:
#Import augmented dataset

#File with augmented dataset
abspath = os.path.abspath('0_Ergebnisse/220425_dataset_aug.pkl')

# Read the pickle (overwrites dataset_aug with the stored version)
with open(str(abspath), 'rb') as pkl:
  dataset_aug = pickle.load(pkl)

## Tokenization

## Tokenizer: Functions

In [None]:
#Tokenize Data
#Import AutoTokenizer
from transformers import AutoTokenizer


#Tokenize Function
def tokenize_function(data):
  """Tokenize the ``text`` entries of ``data`` with truncation enabled.

  Uses the module-level ``tokenizer``, which has to exist when this function is called.
  """
  return tokenizer(data["text"], truncation = True)

#Tokenize and set format: Takes Dataset as Input (Train and Test data)
def main_tokenize(dataset, tokenizer):
  """Tokenize ``dataset`` with the given tokenizer and set the tensor format.

  Args:
    dataset: Object offering ``map(function, batched=True)``; the mapped result has to
      offer ``set_format`` (e.g. a Dataset or DatasetDict with a ``text`` column).
    tokenizer: Callable used as ``tokenizer(texts, truncation=True)``.

  Returns:
    The mapped dataset, formatted as 'torch' with the columns
    ``input_ids``, ``attention_mask`` and ``labels``.
  """
  #Explanation: https://huggingface.co/docs/transformers/preprocessing
  #Bind the tokenizer that was passed in. Previously the parameter was ignored and the
  #global variable "tokenizer" was used through tokenize_function.
  def tokenize_batch(batch):
    return tokenizer(batch["text"], truncation = True)

  #Tokenize
  tokenized = dataset.map(tokenize_batch, batched = True)
  #Set Format and return tensors
  tokenized.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
  #Return
  return tokenized


In [None]:
#Tokenize and preprocess data for model: Without augmentation
#Initialize tokenizer (stored in the global name "tokenizer", which tokenize_function reads)
tokenizer = AutoTokenizer.from_pretrained('bert-base-german-cased')

#Dataset Bert-Base-German-Cased (BBGC)
dataset_bbgc = main_tokenize(dataset, tokenizer)

In [None]:
#Tokenize and preprocess data for model: With augmentation
#Initialize tokenizer (same checkpoint as for the data without augmentation)
tokenizer = AutoTokenizer.from_pretrained('bert-base-german-cased')

#Dataset Bert-Base-German-Cased (BBGC) with augmentation
dataset_aug_bbgc = main_tokenize(dataset_aug, tokenizer)

In [None]:
#Save tokenized datasets: Without augmentation
#The pickle is read again by the import cell for tokenized datasets


abspath = os.path.abspath('0_Ergebnisse/220308_dataset_bbgc_pure.pkl')
print(abspath)

with open(str(abspath), 'wb') as handle:
  pickle.dump(dataset_bbgc, handle, protocol = pickle.HIGHEST_PROTOCOL)

In [None]:
#Save tokenized datasets: With augmentation
#The pickle is read again by the import cell for tokenized datasets


abspath = os.path.abspath('0_Ergebnisse/220425_dataset_aug_bbgc_pure.pkl')
print(abspath)

with open(str(abspath), 'wb') as handle:
  pickle.dump(dataset_aug_bbgc, handle, protocol = pickle.HIGHEST_PROTOCOL)

In [None]:
#Import tokenized datasets

#Tokenized dataset without data augmentation
abspath = os.path.abspath('0_Ergebnisse/220308_dataset_bbgc_pure.pkl')

# Read the pickle (overwrites dataset_bbgc with the stored version)
with open(str(abspath), 'rb') as pkl:
  dataset_bbgc = pickle.load(pkl)



#Tokenized dataset with data augmentation
abspath = os.path.abspath('0_Ergebnisse/220425_dataset_aug_bbgc_pure.pkl')

# Read the pickle (overwrites dataset_aug_bbgc with the stored version)
with open(str(abspath), 'rb') as pkl:
  dataset_aug_bbgc = pickle.load(pkl)

# Modelling



## Model: General functions

In [None]:
#Calculate Model Performance Metrics

#Function for metric computation
def compute_metrics(eval_pred):
  """Compute accuracy and micro/macro precision, recall and F1 for a prediction.

  Args:
    eval_pred: Pair ``(logits, labels)``. ``logits`` may also be a tuple whose first
      entry holds the logits, which is the case when the model additionally returns
      hidden states.

  Returns:
    Dict with the keys ``accuracy``, ``precision_micro``, ``recall_micro``, ``f1_micro``,
    ``precision_macro``, ``recall_macro`` and ``f1_macro``.
  """
  #Sci-Kit-learn for evaluation
  from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
  #Numpy
  import numpy as np

  #Get metrics
  logits, labels = eval_pred
  if isinstance(logits, tuple):
    #(logits, hidden_states, ...): only the logits are needed for the class prediction
    logits = logits[0]
  predictions = np.argmax(logits, axis = -1)

  metrics = {"accuracy": accuracy_score(y_true = labels, y_pred = predictions)}
  scorers = (("precision", precision_score), ("recall", recall_score), ("f1", f1_score))
  for average in ("micro", "macro"):
    for name, scorer in scorers:
      #zero_division = 0 is the value scikit-learn falls back to anyway; stating it
      #avoids a warning for classes that never occur in the predictions
      metrics[name + "_" + average] = scorer(y_true = labels, y_pred = predictions, average = average, zero_division = 0)
  return metrics


## Model: Finetuning BBGC with Data without Augmentation (BBGCPURE)

In [None]:
#Reference: https://huggingface.co/course/chapter3/3?fw=pt
#Data Collator, which concatenates the single tensors within a batch
#Finetuning model: https://huggingface.co/course/chapter3/2?fw=pt
#Trainer class: https://huggingface.co/docs/transformers/main_classes/trainer

from transformers import Trainer, AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, set_seed

#Set Seed: set_seed seeds Python's random, NumPy and PyTorch. random.seed alone did not
#cover the random initialisation of the classification head.
set_seed(10)

#Initialize Tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-german-cased')
#Dynamic padding using data collator
data_collator = DataCollatorWithPadding(tokenizer)
#Specify training arguments: output_dir is the path where the model is stored (limitation via "max_steps = X" is also possible)
#seed is passed explicitly because the Trainer uses the seed of its TrainingArguments
#NOTE(review): newer transformers releases rename evaluation_strategy to eval_strategy - confirm against the installed version
training_args = TrainingArguments(output_dir = "bertbasegermancased_pure", overwrite_output_dir = False,  evaluation_strategy = "steps", num_train_epochs = 20, load_best_model_at_end = True, seed = 10)


In [None]:
#Model configuration:
#Bert-Base-German-Cased | Pure training data without any augmentation, just the basic language correction | 20 epochs

#Determine model (one output per entry of classes_list)
model_bbgc_puredata = AutoModelForSequenceClassification.from_pretrained('bert-base-german-cased', num_labels=len(classes_list))
#Specify trainer
#NOTE(review): the "test" split is used for evaluation during training - confirm that this is intended
trainer_bbgc_puredata = Trainer(
    model_bbgc_puredata,
    training_args,
    train_dataset= dataset_bbgc["train"],
    eval_dataset= dataset_bbgc["test"],
    data_collator= data_collator,
    compute_metrics=compute_metrics
)

In [None]:
#Execute training (model without data augmentation)
trainer_bbgc_puredata.train()

In [None]:
#Evaluate model on test data set
#predict returns the raw predictions, the labels and the metrics from compute_metrics
preds_results, preds_labels, preds_metrics = trainer_bbgc_puredata.predict(dataset_bbgc["test"])  #All metrics and predictions

print(preds_metrics)

## Model: Finetuning BBGC with Data with Augmentation (BBGCAUG)

In [None]:
#Reference: https://huggingface.co/course/chapter3/3?fw=pt
#Data Collator, which concatenates the single tensors within a batch
#Finetuning model: https://huggingface.co/course/chapter3/2?fw=pt
#Trainer class: https://huggingface.co/docs/transformers/main_classes/trainer

from transformers import Trainer, AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, set_seed

#Set Seed: set_seed seeds Python's random, NumPy and PyTorch. random.seed alone did not
#cover the random initialisation of the classification head.
set_seed(10)

#Initialize Tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-german-cased')
#Dynamic padding using data collator
data_collator = DataCollatorWithPadding(tokenizer)
#Specify training arguments: output_dir is the path where the model is stored (limitation via "max_steps = X" is also possible)
#seed is passed explicitly because the Trainer uses the seed of its TrainingArguments
#NOTE(review): newer transformers releases rename evaluation_strategy to eval_strategy - confirm against the installed version
training_args = TrainingArguments(output_dir = "bertbasegermancased_aug", overwrite_output_dir = False,  evaluation_strategy = "steps", num_train_epochs = 20, load_best_model_at_end = True, seed = 10)


In [None]:
#Model configuration:
#Bert-Base-German-Cased | training data with data augmentation, data was translated to english and back again and then just appended to the original training data | 20 epochs

#Determine model (one output per entry of classes_list)
model_bbgc_augdata = AutoModelForSequenceClassification.from_pretrained('bert-base-german-cased', num_labels=len(classes_list))
#Specify trainer
#NOTE(review): the "test" split is used for evaluation during training - confirm that this is intended
trainer_bbgc_augdata = Trainer(
    model_bbgc_augdata,
    training_args,
    train_dataset= dataset_aug_bbgc["train"],
    eval_dataset= dataset_aug_bbgc["test"],
    data_collator= data_collator,
    compute_metrics=compute_metrics
)

In [None]:
#Execute training (model with data augmentation)
trainer_bbgc_augdata.train()

In [None]:
#Model BBGCAUG:
#Log output of a previous training run, kept for reference
#NOTE(review): the logged checkpoint path names "bertbasegermancased_pure" although this section trains with output_dir "bertbasegermancased_aug" - verify which run the log belongs to
#Loading best model from bertbasegermancased_pure/checkpoint-3500 (score: 0.6726800799369812).
#TrainOutput(global_step=26000, training_loss=0.30083916932000565, metrics={'train_runtime': 6944.3512, 'train_samples_per_second': 29.941, 'train_steps_per_second': 3.744, 'total_flos': 2.552274605904876e+16, 'train_loss': 0.30083916932000565, 'epoch': 20.0})

#Evaluate model on test data set
preds_results, preds_labels, preds_metrics = trainer_bbgc_augdata.predict(dataset_aug_bbgc["test"])  #All metrics and all predictions


print(preds_metrics)

# Prediction

### Trainer: BBGCPure with Finetuning and without Data augmentation

BertBaseGermanCased Model trained with data without additional data augmentation

In [None]:
#Import necessary classes
from transformers import Trainer, AutoModelForSequenceClassification, TrainingArguments, AutoTokenizer, DataCollatorWithPadding

#Set Seed
random.seed(10)

#Load pretrained and finetuned model from storage:
path_bbgcpure = "bertbasegermancased_pure/checkpoint-1500"

#Initialize loaded model
#Set output_hidden_states = True when sentence embeddings are needed: make_prediction reads the hidden states
model_bbgcpure = AutoModelForSequenceClassification.from_pretrained(path_bbgcpure, num_labels=len(classes_list), output_hidden_states = False)

#Set model in evaluation mode: Just forward propagation
#model_bbgcpure.eval() #activate for embeddings / vectors

#Initialize Tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-german-cased')
#Dynamic padding using data collator
data_collator = DataCollatorWithPadding(tokenizer)


#Define prediction trainer.
#It is built once; the former extra Trainer(model_bbgcpure) was overwritten immediately.
trainer_bbgcpure = Trainer(
    model_bbgcpure,
    data_collator= data_collator,
    compute_metrics=compute_metrics, #activate for embeddings / vectors
)



### Trainer: BBGCAug with Finetuning and with Data augmentation

BertBaseGermanCased Model trained with data with additional data augmentation

In [None]:
#Import necessary classes
from transformers import Trainer, AutoModelForSequenceClassification, TrainingArguments, AutoTokenizer, DataCollatorWithPadding

#Set Seed
random.seed(10)

#Load pretrained and finetuned model from storage:
path_bbgc_augdata = "bertbasegermancased_aug/checkpoint-3500"

#Initialize loaded model
#Set output_hidden_states = True when sentence embeddings are needed: make_prediction reads the hidden states
model_bbgc_augdata = AutoModelForSequenceClassification.from_pretrained(path_bbgc_augdata, num_labels=len(classes_list), output_hidden_states = False)

#Set model in evaluation mode: Just forward propagation
#model_bbgc_augdata.eval() #activate for sentence embeddings

#Initialize Tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-german-cased')
#Dynamic padding using data collator
data_collator = DataCollatorWithPadding(tokenizer)


#Define prediction trainer.
#It is built once; the former extra Trainer(model_bbgc_augdata) was overwritten immediately.
trainer_bbgc_augdata = Trainer(
    model_bbgc_augdata,
    data_collator= data_collator,
    compute_metrics=compute_metrics, #activate for vectors
)



### Prediction: General functions

In [None]:
# Functions for predictions

#Slice dataset into several batches, since otherwise the RAM is not enough.
#https://huggingface.co/docs/datasets/v1.1.1/package_reference/main_classes.html#datasets.Dataset.shard


from torch import torch
import numpy as np


def flatten_list(listtobeflatten):
  """Return the items of all sub-lists as one flat list, keeping their order."""
  flat = []
  for sublist in listtobeflatten:
    for item in sublist:
      flat.append(item)
  return flat



def make_prediction(dataset, numberofbatches, trainer, startlayer, endlayer):
  """Compute one sentence embedding per example from the hidden states of the model.

  The dataset is processed in ``numberofbatches`` contiguous shards to save RAM. For
  every example the hidden states of the layers ``startlayer`` to ``endlayer`` (both
  inclusive) are averaged, followed by an average over the token axis.

  Index 0 of the hidden states is the input embedding layer; a BERT base model has 12
  further layers. Recommended values are therefore: startlayer = 1, endlayer = 12.

  Args:
    dataset: Object offering ``shard(num_shards, index, contiguous=True)``.
    numberofbatches: Number of shards the dataset is divided into.
    trainer: Object whose ``predict(shard)`` returns a tuple-like result whose first
      entry is ``(logits, hidden_states)``; ``hidden_states`` holds one NumPy array of
      the form (examples, tokens, features) per layer.
    startlayer: Index of the first layer that is averaged.
    endlayer: Index of the last layer that is averaged (inclusive).

  Returns:
    List with one 1-D tensor (features) per example, in the order of the dataset.
    The list is empty when ``numberofbatches`` is 0.

  Raises:
    ValueError: If the prediction does not contain hidden states.
  """
  result = []

  for shard in range(numberofbatches):
    #Divide dataset into batches to save RAM
    data = dataset.shard(numberofbatches, shard, contiguous = True)

    #Make prediction
    with torch.no_grad():
      preds_overview = trainer.predict(data)

    #Predictions have the format of the model output: https://huggingface.co/docs/transformers/main_classes/output
    #With hidden states enabled this is a tuple: (logits, hidden_states)
    predictions = preds_overview[0]
    if not isinstance(predictions, (tuple, list)) or len(predictions) < 2:
      raise ValueError("The prediction contains no hidden states; load the model with output_hidden_states = True")

    #Hidden states: one array per layer (input embeddings + model layers)
    hidden_states = predictions[1]

    #Combine the layers to one tensor: layers, examples, tokens, features
    token_embeddings = torch.stack([torch.from_numpy(layer) for layer in hidden_states], dim = 0)

    #Reorder to: examples, layers, tokens, features
    token_embeddings = token_embeddings.permute(1, 0, 2, 3)

    #Select start to end layer. endlayer + 1, because the end layer itself belongs to
    #the selection (a plain slice would drop the last layer).
    selected_layers = token_embeddings[:, startlayer:endlayer + 1]

    #Average layers -> (examples, tokens, features), then tokens -> (examples, features)
    #NOTE(review): the token average includes padded positions; Trainer.predict presumably
    #pads shorter batches when concatenating them - TODO confirm and mask if necessary
    shard_embeddings = selected_layers.mean(dim = 1).mean(dim = 1)

    #One independent tensor per example (clone, so that no tensor refers to the memory of the whole shard)
    result.extend(row.clone() for row in shard_embeddings)

  return result





### Prediction: BBGCPure with Finetuning and without data augmentation

In [None]:
#Perform prediction
#Test data
#Prediction with Bert internal classifier

#Prediction: raw predictions, labels and metrics of the test split
v7_preds_results_test, v7_preds_labels_test, v7_preds_metrics_test = trainer_bbgcpure.predict(dataset_bbgc["test"])

In [None]:
#Perform prediction
#Validation Data
#Prediction with Bert internal classifier

#Prediction: raw predictions, labels and metrics of the validation split
v7_preds_results, v7_preds_labels, v7_preds_metrics = trainer_bbgcpure.predict(dataset_bbgc["valid"])

In [None]:
#Results
#Test data
#Prediction:
import numpy as np

print(v7_preds_metrics_test)

#Table with the true label (Y_test) and the predicted class index (Pred_test, argmax over the last axis)
v7_results_test = pd.DataFrame(dataset_bbgc["test"]["labels"])
v7_results_test.loc[:, "Pred_test"] = pd.DataFrame(np.argmax(v7_preds_results_test, axis = -1))
v7_results_test.columns = ["Y_test", "Pred_test"]
v7_results_test

In [None]:
#Save prediction results (test split, model without augmentation)

v7_results_test.to_pickle('0_Ergebnisse/Bert/results/V7_220703_test_preds_0.81')


In [None]:
#Results
#Validation data
#Prediction:
import numpy as np

print(v7_preds_metrics)

#Table with the true label (Y_valid) and the predicted class index (Pred_valid, argmax over the last axis)
v7_results = pd.DataFrame(dataset_bbgc["valid"]["labels"])
v7_results.loc[:, "Pred_valid"] = pd.DataFrame(np.argmax(v7_preds_results, axis = -1))
v7_results.columns = ["Y_valid", "Pred_valid"]
v7_results


In [None]:
#Save prediction results (validation split, model without augmentation)

v7_results.to_pickle('0_Ergebnisse/Bert/results/V7_220702_valid_preds_0.80')


In [None]:
# Perform prediction
#test = dataset_bbgc["valid"].select(range(0,100,1))

#Get sentence embeddings for the validation, training and test dataset with bbgcpure
#Configuration: Dataset, 100 batches, Trainer, Startlayer = second layer, EndLayer = final layer
#make_prediction reads hidden states from the prediction, so the model of trainer_bbgcpure has to return them
sent_embed_valid_bbgcpure = make_prediction(dataset_bbgc["valid"], 100, trainer_bbgcpure, 1, 12)
sent_embed_train_bbgcpure = make_prediction(dataset_bbgc["train"], 100, trainer_bbgcpure, 1, 12)
sent_embed_test_bbgcpure = make_prediction(dataset_bbgc["test"], 100, trainer_bbgcpure, 1, 12)

#Check length
#len(sent_embed_valid_bbgcpure)

In [None]:
#Save predicted sentence embeddings

import pickle
import sys


#File naming: Date_ExperimentNumber_Model_Augmentation_Dataset_Type.pkl
abspath_valid_bbgcpure = os.path.abspath('0_Ergebnisse/220402_0_bbgc_pure_valid_embeddings.pkl')
abspath_train_bbgcpure = os.path.abspath('0_Ergebnisse/220402_0_bbgc_pure_train_embeddings.pkl')
abspath_test_bbgcpure = os.path.abspath('0_Ergebnisse/220402_0_bbgc_pure_test_embeddings.pkl')


#One pickle per split
with open(str(abspath_valid_bbgcpure), 'wb') as handle:
  pickle.dump(sent_embed_valid_bbgcpure, handle, protocol = pickle.HIGHEST_PROTOCOL)

with open(str(abspath_train_bbgcpure), 'wb') as handle:
  pickle.dump(sent_embed_train_bbgcpure, handle, protocol = pickle.HIGHEST_PROTOCOL)

with open(str(abspath_test_bbgcpure), 'wb') as handle:
  pickle.dump(sent_embed_test_bbgcpure, handle, protocol = pickle.HIGHEST_PROTOCOL)

### Prediction: BBGCAug with Finetuning and with augmentation data

In [None]:
#Prediction with Bert internal classifier
#Test data
#Returns the raw predictions, labels and metrics of the test split

v8_preds_results_test, v8_preds_labels_test, v8_preds_metrics_test = trainer_bbgc_augdata.predict(dataset_aug_bbgc["test"])

In [None]:
#Results
#Test data
#Prediction:
import numpy as np

print(v8_preds_metrics_test)

#Table with the true label (Y_test) and the predicted class index (Pred_test).
#The labels are taken from dataset_aug_bbgc, i.e. the dataset the prediction was made
#on, so this section no longer requires dataset_bbgc to be loaded.
v8_results_test = pd.DataFrame(dataset_aug_bbgc["test"]["labels"])
v8_results_test.loc[:, "Pred_test"] = pd.DataFrame(np.argmax(v8_preds_results_test, axis = -1))
v8_results_test.columns = ["Y_test", "Pred_test"]
v8_results_test

In [None]:
#Save prediction results (test split, model with augmentation)

v8_results_test.to_pickle('0_Ergebnisse/Bert/results/V8_220703_test_preds_0.80')

In [None]:
#Prediction with Bert internal classifier
#Validation data
#Returns the raw predictions, labels and metrics of the validation split

v8_preds_results, v8_preds_labels, v8_preds_metrics = trainer_bbgc_augdata.predict(dataset_aug_bbgc["valid"])



In [None]:
#Results
#Validation data
#Prediction:
import numpy as np

print(v8_preds_metrics)

#Table with the true label (Y_valid) and the predicted class index (Pred_valid).
#The labels are taken from dataset_aug_bbgc, i.e. the dataset the prediction was made
#on, so this section no longer requires dataset_bbgc to be loaded.
v8_results = pd.DataFrame(dataset_aug_bbgc["valid"]["labels"])
v8_results.loc[:, "Pred_valid"] = pd.DataFrame(np.argmax(v8_preds_results, axis = -1))
v8_results.columns = ["Y_valid", "Pred_valid"]
v8_results


In [None]:
#Save prediction results (validation split, model with augmentation)

v8_results.to_pickle('0_Ergebnisse/Bert/results/V8_220702_valid_preds_0.79')

In [None]:
#Get sentence embeddings for the validation, training and test dataset with bbgcaug
#Configuration: Dataset, 100 batches, Trainer, Startlayer = second layer, EndLayer = final layer
#make_prediction reads hidden states from the prediction, so the model of trainer_bbgc_augdata has to return them
sent_embed_valid_bbgcaug = make_prediction(dataset_aug_bbgc["valid"], 100, trainer_bbgc_augdata, 1, 12)
sent_embed_train_bbgcaug = make_prediction(dataset_aug_bbgc["train"], 100, trainer_bbgc_augdata, 1, 12)
sent_embed_test_bbgcaug = make_prediction(dataset_aug_bbgc["test"], 100, trainer_bbgc_augdata, 1, 12)


In [None]:
#Save predicted sentence embeddings

#File naming: Date_ExperimentNumber_Model_Augmentation_Dataset_Type.pkl
abspath_valid_bbgcaug = os.path.abspath('0_Ergebnisse/220426_0_bbgc_aug_valid_embeddings.pkl')
abspath_train_bbgcaug = os.path.abspath('0_Ergebnisse/220426_0_bbgc_aug_train_embeddings.pkl')
abspath_test_bbgcaug = os.path.abspath('0_Ergebnisse/220426_0_bbgc_aug_test_embeddings.pkl')


#One pickle per split
with open(str(abspath_valid_bbgcaug), 'wb') as handle:
  pickle.dump(sent_embed_valid_bbgcaug, handle, protocol = pickle.HIGHEST_PROTOCOL)

with open(str(abspath_train_bbgcaug), 'wb') as handle:
  pickle.dump(sent_embed_train_bbgcaug, handle, protocol = pickle.HIGHEST_PROTOCOL)

with open(str(abspath_test_bbgcaug), 'wb') as handle:
  pickle.dump(sent_embed_test_bbgcaug, handle, protocol = pickle.HIGHEST_PROTOCOL)

# Postprocessing

###Postprocessing: General Functions

In [None]:
#Functions to process the predicted data and to combine it with the original dataframe

from torch import take
import pandas as pd

#Convert sentence embeddings to pandas dataframe
def embeddings_to_df(embeddings):
  """Convert a list of sentence embeddings into a DataFrame with one row per embedding.

  Args:
    embeddings: Sequence of 1-D tensors of equal length (objects offering ``.numpy()``).

  Returns:
    DataFrame with one row per embedding and one column per feature; rows and columns
    are numbered from 0. An empty DataFrame is returned for an empty sequence.
  """
  import numpy as np

  #Nothing to stack (np.stack and pd.concat both reject an empty list)
  if len(embeddings) == 0:
    return pd.DataFrame()

  #Stack all embeddings into one (rows, features) array and build the frame once,
  #instead of creating and concatenating one single-row dataframe per embedding
  matrix = np.stack([tensor.numpy() for tensor in embeddings])

  #Return
  return pd.DataFrame(matrix)

#Merge sentence embeddings dataframe with original dataset
def merge_full_embeddings(dataset_full, sentence_embeddings):
  """Join the original dataset and the embedding dataframe on their row index.

  Only index values that occur in both dataframes are kept (default merge type).
  """
  merged = dataset_full.merge(sentence_embeddings, left_index = True, right_index = True)
  return merged

#Main function for postprocessing
def main_postprocess(dataset_full, embeddings):
  """Attach the sentence embeddings to the full dataset.

  The embeddings are converted with embeddings_to_df and then joined to
  ``dataset_full`` with merge_full_embeddings.
  """
  return merge_full_embeddings(dataset_full, embeddings_to_df(embeddings))







###Postprocessing: BBGCPure with Finetuning and without data augmentation

In [None]:
#Do postprocessing: Combine dataframes
#Joins the *_full frames with the sentence embeddings of the model without augmentation

validation_bbgcpure = main_postprocess(validation_full, sent_embed_valid_bbgcpure)
training_bbgcpure = main_postprocess(training_full, sent_embed_train_bbgcpure)
testing_bbgcpure = main_postprocess(testing_full, sent_embed_test_bbgcpure)

#validation_bbgcpure

In [None]:
#Save final combined dataframe

import pickle
import sys


#File naming: Date_ExperimentNumber_Model_Augmentation_Dataset_Type.pkl
#Validation
abspath = os.path.abspath('0_Ergebnisse/220402_0_bbgc_pure_valid_fulldataset.pkl')

with open(str(abspath), 'wb') as handle:
  pickle.dump(validation_bbgcpure, handle, protocol = pickle.HIGHEST_PROTOCOL)

#Training
abspath = os.path.abspath('0_Ergebnisse/220402_0_bbgc_pure_train_fulldataset.pkl')

with open(str(abspath), 'wb') as handle:
  pickle.dump(training_bbgcpure, handle, protocol = pickle.HIGHEST_PROTOCOL)



#Testing
abspath = os.path.abspath('0_Ergebnisse/220402_0_bbgc_pure_test_fulldataset.pkl')

with open(str(abspath), 'wb') as handle:
  pickle.dump(testing_bbgcpure, handle, protocol = pickle.HIGHEST_PROTOCOL)





# Save as excel (same three dataframes, in addition to the pickles)

validation_bbgcpure.to_excel('0_Ergebnisse/220402_0_bbgc_pure_valid_fulldataset.xlsx')

training_bbgcpure.to_excel('0_Ergebnisse/220402_0_bbgc_pure_train_fulldataset.xlsx')

testing_bbgcpure.to_excel('0_Ergebnisse/220402_0_bbgc_pure_test_fulldataset.xlsx')


###Postprocessing: BBGCAug with Finetuning and with data augmentation

In [None]:
#Do the postprocessing
#Joins the *_full frames with the sentence embeddings of the model with augmentation
#NOTE(review): sent_embed_train_bbgcaug stems from the augmented training split, which has more rows than training_full; the index join keeps only matching index values - verify the row alignment

validation_bbgcaug = main_postprocess(validation_full, sent_embed_valid_bbgcaug)
training_bbgcaug = main_postprocess(training_full, sent_embed_train_bbgcaug)
testing_bbgcaug = main_postprocess(testing_full, sent_embed_test_bbgcaug)

validation_bbgcaug

In [None]:
#Save final combined data frames

#File naming: Date_ExperimentNumber_Model_Augmentation_Dataset_Type.pkl
#Validation
abspath = os.path.abspath('0_Ergebnisse/220426_0_bbgc_aug_valid_fulldataset.pkl')

with open(str(abspath), 'wb') as handle:
  pickle.dump(validation_bbgcaug, handle, protocol = pickle.HIGHEST_PROTOCOL)

#Training
abspath = os.path.abspath('0_Ergebnisse/220426_0_bbgc_aug_train_fulldataset.pkl')

with open(str(abspath), 'wb') as handle:
  pickle.dump(training_bbgcaug, handle, protocol = pickle.HIGHEST_PROTOCOL)



#Testing
abspath = os.path.abspath('0_Ergebnisse/220426_0_bbgc_aug_test_fulldataset.pkl')

with open(str(abspath), 'wb') as handle:
  pickle.dump(testing_bbgcaug, handle, protocol = pickle.HIGHEST_PROTOCOL)





# Save as excel (same three dataframes, in addition to the pickles)

validation_bbgcaug.to_excel('0_Ergebnisse/220426_0_bbgc_aug_valid_fulldataset.xlsx')

training_bbgcaug.to_excel('0_Ergebnisse/220426_0_bbgc_aug_train_fulldataset.xlsx')

testing_bbgcaug.to_excel('0_Ergebnisse/220426_0_bbgc_aug_test_fulldataset.xlsx')