In [None]:
# Run this notebook in Google Colab: training can be too demanding for a standard GPU.

# Install the Hugging Face libraries into the notebook runtime (IPython shell escapes).
! pip install transformers
! pip install datasets

import os
# Set before torch is imported further down. Presumably enabled so CUDA errors are
# reported at the failing call while debugging — TODO confirm it is still needed.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
from google.colab import drive
# Mount Google Drive; the CSV data and the saved model are addressed under /content/gdrive.
drive.mount("/content/gdrive")

import random

import datasets
import numpy as np
import pandas as pd
import torch
from datasets import load_dataset, load_metric
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

# --- Experiment configuration ---
# CSV files on the mounted Google Drive.
# NOTE(review): the validation path points at the same file as the training path, so
# evaluation accuracy is measured on the training data — confirm this is intended.
train_path = "/content/gdrive/My Drive/data/train_from0.csv"
test_path = "/content/gdrive/My Drive/data/train_from0.csv"

# Column whose text is passed to the tokenizer.
sentence1_key = "Narrative"

# Pretrained checkpoint to fine-tune; alternatives tried: "distilbert-base-uncased".
model_checkpoint = "bert-base-german-cased"

# Per-device batch size (training and evaluation) and number of target labels.
batch_size = 2
num_labels = 6

# --- Data, metric and tokenizer ---
dataset = load_dataset(
    'csv',
    data_files=dict(train=train_path, validation=test_path),
)
metric = load_metric("accuracy")
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

# For the GPT-2 checkpoint, reuse the end-of-sequence token as the padding token.
if model_checkpoint == "gpt2":
    tokenizer.pad_token = tokenizer.eos_token

def preprocess_function(examples):
    """Tokenize the text column of a batch of examples.

    Reads the column named by ``sentence1_key`` from ``examples`` and returns the
    tokenizer output, with truncation and padding enabled.
    """
    texts = examples[sentence1_key]
    encoded = tokenizer(texts, padding=True, truncation=True)
    return encoded

# Metric name passed as metric_for_best_model in the training arguments.
metric_name = "accuracy"

# Tokenize every split of the dataset, one batch of examples at a time.
encoded_dataset = dataset.map(preprocess_function, batched=True)

# Sequence-classification model loaded from the checkpoint, configured for num_labels labels.
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=num_labels,
)

# Training hyperparameters; "final" is the output directory for checkpoints.
args = TrainingArguments(
    "final",
    evaluation_strategy="epoch",
    # load_best_model_at_end requires the save strategy to match the evaluation
    # strategy, so a checkpoint is written after every epoch as well.
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    # After training, reload the checkpoint with the best value of metric_name.
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
)

def compute_metrics(eval_pred):
    """Compute the evaluation metric for one evaluation pass.

    ``eval_pred`` unpacks into the prediction scores and the reference labels. The
    predicted class is the index of the largest score along axis 1; the result of
    ``metric.compute`` is returned unchanged.
    """
    scores, references = eval_pred
    predicted = np.argmax(scores, axis=1)
    return metric.compute(references=references, predictions=predicted)

# NOTE(review): presumably set to keep the Trainer from treating the model as
# model-parallel — confirm it is still required.
model.is_parallelizable = False

# Wire the model, training arguments, tokenized splits and metric callback together.
trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Fine-tune, then run a final evaluation on the validation split.
trainer.train()
evaluation = trainer.evaluate()

# Save the fine-tuned model to Google Drive so it can be reused outside this runtime.
trainer.save_model("/content/gdrive/My Drive/data/final_model")

# The saved model folder is later used in forward_app.ipynb to automatically classify and forward the emails.