<a href="https://colab.research.google.com/github/Mohamed-ux-beep/Named-entity-recognition/blob/main/NER.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**importing the needed libraries**

In [1]:
import pandas as pd
from itertools import islice
import numpy as np
import torch
from torch import cuda
from torch.utils.data import DataLoader, Dataset
from transformers import RobertaTokenizerFast, RobertaForTokenClassification
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from transformers import pipeline
import random
import re
import csv

**Reproducability**

In [None]:
def seed_everything(seed):
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

In [None]:
seed_everything(42)

**Data Reading section**

In [None]:
 # training data
df = pd.read_csv('/content/train.tsv', header=None, on_bad_lines='skip')
df.head()

Unnamed: 0,0
0,#\tn-tv.de vom 26.02.2005\t[2005-02-26]\t
1,1\tSchartau\tB-PER\tO
2,2\tsagte\tO\tO
3,3\tdem\tO\tO
4,"4\t""\tO\tO"


In [None]:
df_val = pd.read_csv('/content/val.tsv', header=None, on_bad_lines='skip')
df_val.head()

Unnamed: 0,0
0,#\thttp://www.heise.de/tp/r4/artikel/32/32883/...
1,1\tGleich\tO\tO
2,2\tdarauf\tO\tO
3,3\tentwirft\tO\tO
4,4\ter\tO\tO


In [6]:
df_test = pd.read_csv('/content/test_x.tsv', header=None, on_bad_lines='skip')
df_test.head()

Unnamed: 0,0
0,1\tFür
1,2\tseine
2,3\t40.
3,4\tBlutspende
4,5\twurde


In [None]:
def process_df(df, test):
  if not test:
    df[['index', 'Token', '1st-Tag', '2nd-Tag']] = df[0].str.split('\t', expand=True)
    del df[0], df['2nd-Tag']
    df = df[df['index']!='#']
    df.reset_index(inplace=True)
    del df['level_0']
  if test:
    df[['index', 'Token']] = df[0].str.split('\t', expand=True)
    del df[0]
  return df

In [None]:
df = process_df(df, False)
df_val = process_df(df_val, False)
df_test = process_df(df_test, True)

In [None]:
def process_labels(x):
  if x not in ['B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'O']:
    x = 'O'
  return x

In [None]:
df['1st-Tag'] = df['1st-Tag'].apply(process_labels)

In [None]:
df_val['1st-Tag'] = df['1st-Tag'].apply(process_labels)

**Labels preparation**

In [None]:
B_label = df['1st-Tag'].unique().tolist()

In [None]:
output_labels = list(set(B_label))

In [None]:
len(output_labels)

7

In [None]:
label2id = {k: v for v, k in enumerate(output_labels)}
id2label = {v: k for v, k in enumerate(output_labels)}

**Until hier**

In [None]:
# Correction of false labeled data
df['index'][128716] = '1'
df['index'][310049] = '1'
df['index'][364038] = '1'

**getting the sentences and their corresponding labels**

In [None]:
def get_boundaries(df):
  boundaries = []
  for indx, row in islice(df.iterrows(), 0, None):
    if indx == len(df)-1:
      boundaries.append(indx+1)
    for col in df.columns:
      if col == 'index':
        if df[col][indx] == '1':
          boundaries.append(indx)
  return boundaries

In [None]:
boundaries_tr = get_boundaries(df)
boundaries_val = get_boundaries(df_val)
boundaries_tst = get_boundaries(df_test)

In [None]:
len(boundaries_tr), len(boundaries_val), len(boundaries_tst)

(23984, 2200, 3063)

In [None]:
def get_sentences(boundaries, df, test):
  sentences = []
  labels = []
  for i in range(len(boundaries)):
    try:
      fromm = boundaries[i]
      to = boundaries[i+1]
    except:
      continue
    ndf = df[fromm:to]
    if not test:
      tags = list(ndf['1st-Tag'])
      labels.append(tags)
    sent = ' '.join(ndf['Token'])
    sentences.append(sent)
  return sentences, labels

In [None]:
sentences_tr, labels_tr = get_sentences(boundaries_tr, df, False)
sentences_val, labels_val = get_sentences(boundaries_val, df_val, False)
sentences_tst = get_sentences(boundaries_tst, df_test, True)

In [None]:
len(sentences_tr), len(sentences_val), len(sentences_tst[0])

(23983, 2199, 3062)

**Training dataframe**

In [None]:
# data frame with two columns the sentence and labels
tr_df = pd.DataFrame(columns=['sentence', 'labels'], index=[i for i in range(len(sentences_tr))])

In [None]:
# assigning the sentences and labels
tr_df['sentence'] = sentences_tr
tr_df['labels'] = labels_tr

In [None]:
tr_df['sent_len'] = tr_df['sentence'].apply(lambda x: len(x.split()))

In [None]:
tr_df['sent_len'].max()

54

In [None]:
tr_df.head()

Unnamed: 0,sentence,labels,sent_len
0,"Schartau sagte dem "" Tagesspiegel "" vom Freita...","[B-PER, O, O, O, B-ORG, O, O, O, B-PER, O, O, ...",23
1,Firmengründer Wolf Peter Bree arbeitete Anfang...,"[O, B-PER, I-PER, I-PER, O, O, O, O, O, O, O, ...",21
2,Ob sie dabei nach dem Runden Tisch am 23. Apri...,"[O, O, O, O, O, O, O, O, O, O, O, B-LOC, O, O,...",23
3,Bayern München ist wieder alleiniger Top- Favo...,"[B-ORG, I-ORG, O, O, O, O, O, O, O, O, O, O, O...",14
4,Dabei hätte der tapfere Schlussmann allen Grun...,"[O, O, O, O, O, O, O, O, O, O, O, O, O]",13


In [None]:
!pip install conllu

Collecting conllu
  Downloading conllu-4.5.3-py2.py3-none-any.whl (16 kB)
Installing collected packages: conllu
Successfully installed conllu-4.5.3


In [None]:
import conllu

# Path to your TSV file
tsv_file_path = '/content/test_x.tsv'

# Read the TSV file
with open(tsv_file_path, 'r', encoding='utf-8') as file:
    tsv_content = file.read()

# Parse TSV content using conllu
sentences = conllu.parse(tsv_content)

# Now 'sentences' contains a list of sentences, each represented as a list of words
tot = []
for sentence in sentences:
    words = [word['form'] for word in sentence]
    tot.append(" ".join(words))

In [None]:
print(sentences_tr[0])
print(sentences_val[0])
print(sentences_tst[0][0])

Schartau sagte dem " Tagesspiegel " vom Freitag Fischer sei " in einer Weise aufgetreten die alles andere als überzeugend war " .
Gleich darauf entwirft er seine Selbstdarstellung " Ecce homo " in enger Auseinandersetzung mit diesem Bild Jesu .
Für seine 40. Blutspende wurde WILHELM BECKER aus Großenhausen vom DRK ausgezeichnet .


**Validation dataframe**

In [None]:
val_df = pd.DataFrame(columns=['sentence', 'labels'], index=[i for i in range(len(sentences_val))])

In [None]:
val_df['sentence'] = sentences_val
val_df['labels'] = labels_val

In [None]:
val_df['sent_len'] = val_df['sentence'].apply(lambda x: len(x.split()))

In [None]:
val_df['sent_len'].max()

47

In [None]:
val_df.head()

Unnamed: 0,sentence,labels,sent_len
0,Gleich darauf entwirft er seine Selbstdarstell...,"[B-PER, O, O, O, B-ORG, O, O, O, B-PER, O, O, ...",18
1,1980 kam der Crown als Versuch von Toyota sich...,"[O, O, O, O, O, O, B-PER, I-PER, I-PER, O, O, ...",19
2,– 4:26 # Sometime Ago/La Fiesta – 23:18 Alle S...,"[O, O, O, O, B-LOC, O, O, O, O, O, O, O, O, O,...",23
3,Bis 2013 steigen die Mittel aus dem EU-Budget ...,"[O, O, O, O, O, O, O, B-ORG, I-ORG, O, O, O, O...",14
4,Daraus entwickelte sich im Rokoko die Sitte de...,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",21


**Test dataframe**

In [None]:
tst_df = pd.DataFrame(columns=['sentence'], index=[i for i in range(len(sentences_tst[0]))])

In [None]:
tst_df['sentence'] = sentences_tst[0]

In [None]:
tst_df['sent_len'] = tst_df['sentence'].apply(lambda x: len(x.split()))

In [None]:
tst_df['sent_len'].max()

182

In [None]:
tst_df.head()

Unnamed: 0,sentence,sent_len
0,Für seine 40. Blutspende wurde WILHELM BECKER ...,13
1,Die Schonfrist für Frieder Schömezler ist abge...,8
2,Das habe ich auf der Schlußrunde geändert .,8
3,Im Gegenteil :,3
4,Mal sehen inwieweit die Neuzugänge das kompens...,36


In [None]:
del tr_df['sent_len']
del val_df['sent_len']
del tst_df['sent_len']

**Defining the data set**


*   we can neglect the column of 2nd-Tag
*   we should only fine tune the model with the sentences and labels



In [None]:
class  dataset(Dataset):
  def __init__(self, dataframe, tokenizer, max_len):
    self.len = len(dataframe)
    self.dataframe = dataframe
    self.tokenizer = tokenizer
    self.max_len = 256

  def __getitem__(self, index):
    # step 1: get the sentence and word labels
    sentence = self.dataframe['sentence'][index]
    word_label = self.dataframe['labels'][index]

    # step 2: use tokenizer to tokenize the sentence including
    encoding = self.tokenizer(sentence.split(), is_split_into_words=True, return_offsets_mapping=True, padding='max_length',truncation=True,max_length=self.max_len)

    # step 3: create the token labels; only for the first Word piece for each tokenized word
    labels = [label2id [label] for label in word_label]

    # create an empty array of -100 with the max_len
    encoded_labels = np.ones(len(encoding['offset_mapping']), dtype=int) * -100

    # set only labels its first mapping == 0 and the second != 0
    i = 0
    for idx, mapping in enumerate(encoding['offset_mapping']):
      if mapping[0] == 0 and mapping[1] != 0:
        try:
          encoded_labels[idx] = labels[i]
          i+=1
        except IndexError:
          continue

    # step 4: Turn everything to pytorch tensors
    item = {key:torch.as_tensor(val) for key, val in encoding.items()}
    item['labels'] = torch.as_tensor(encoded_labels)
    return item

  def __len__(self):
    return self.len

**getting the pretrained model and tokenizer**

In [None]:
model = 'roberta-base'
tokenizer = RobertaTokenizerFast.from_pretrained(model, use_fast=True, num_labels=len(output_labels), add_prefix_space=True)
model = RobertaForTokenClassification.from_pretrained(model, num_labels=len(output_labels))

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


**Configuration**

In [None]:
config = {'max_length': 512,
         'train_batch_size':8,
         'valid_batch_size':8,
         'epochs':5,
         'learning_rate': 2.5e-5,
         'max_grad_norm':10,
         'device': 'cuda' if cuda.is_available() else 'cpu'}

**Training and validation dataset**

In [None]:
train_set = dataset(tr_df, tokenizer, config['max_length'])
valid_set = dataset(val_df, tokenizer, config['max_length'])

**Training and validation parameters**




In [None]:
# train_params
train_params = {
                'batch_size':config['train_batch_size'],
                'shuffle':True,
                'num_workers':1,
                'pin_memory':True
                }

# valid_params
valid_params = {
                'batch_size':config['valid_batch_size'],
                'shuffle':True,
                'num_workers':1,
                'pin_memory':True
                }

**Training data & Validation data loaders**

In [None]:
train_loader = DataLoader(train_set, **train_params)
valid_loader = DataLoader(valid_set, **valid_params)

**Defining the optimizer**

In [None]:
optimizer = torch.optim.Adam(params=model.parameters(), lr=config['learning_rate'])

In [None]:
device = config['device']
model.to(device)

**Training and validation step**

In [None]:
def train(epoch,model, train_loader):
  tr_loss, tr_accuracy = 0, 0
  nb_tr_examples, nb_tr_steps = 0, 0
  tr_preds, tr_labels = [], []

  # iterating over batches in training loader
  for idx, batch in enumerate(train_loader):
    ids = batch['input_ids'].to(device, dtype=torch.long)
    mask = batch['attention_mask'].to(device, dtype=torch.long)
    labels = batch['labels'].to(device, dtype=torch.long) # tensor with shape (N, 1) where n is the number of samples in the batch

    loss, tr_logits = model(input_ids=ids, attention_mask=mask, labels=labels, return_dict=False)
    tr_loss += loss.item()

    nb_tr_steps += 1
    nb_tr_examples += labels.size(0) # number of samples in batch N

    if idx % 100 == 0:
      loss_step = tr_loss/nb_tr_steps
      print(f"Training loss per 100 training steps: {loss_step}")

    # computing training accuracy
    flattend_targets = labels.view(-1) # flatten the tensor
    active_logits = tr_logits.view(-1, model.num_labels)
    flattend_predictions = torch.argmax(active_logits, axis=1) # the position of the highest probability

    # only compute accuracy at active logits
    active_accuracy = labels.view(-1) != -100

    labels = torch.masked_select(flattend_targets, active_accuracy) # only the labels with no -100
    predictions = torch.masked_select(flattend_predictions, active_accuracy)

    tr_labels.extend(labels)
    tr_preds.extend(predictions)

    tmp_tr_accuracy = accuracy_score(labels.cpu().numpy(), predictions.cpu().numpy())
    tr_accuracy += tmp_tr_accuracy

    # gradient clipping in order to fix the exploding gradient problem.
    # if g > c then c. g/||g|| c: hyperparameter, g is the gradient and ||g|| is the norm of the gradient
    torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=config['max_grad_norm'])

    # Backward pass
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

  epoch_loss = tr_loss/ nb_tr_steps
  tr_accuracy = tr_accuracy/ nb_tr_steps
  print(f'Training Loss epoch: {epoch_loss}, Training Accuracy epoch: {tr_accuracy}')

In [None]:
def valid(epoch, model, val_loader):
  # put the model on evaluation mode
  model.eval()

  val_loss, val_accuracy = 0, 0
  nb_val_examples, nb_val_steps = 0, 0
  val_preds, val_labels = [], []

  with torch.no_grad():
    for idx, batch in enumerate(val_loader):

      ids = batch['input_ids'].to(device, dtype=torch.long)
      mask = batch['attention_mask'].to(device, dtype=torch.long)
      labels = batch['labels'].to(device, dtype=torch.long)

      loss, val_logits = model(input_ids=ids, attention_mask=mask, labels=labels, return_dict=False)
      val_loss += loss.item()

      nb_val_steps += 1
      nb_val_examples += labels.size(0)

      if idx % 100 == 0:
        loss_step = val_loss/nb_val_steps
        print(f'Validation loss per 100 validation steps: {loss_step}')

      # comput validation accuracy
      flattend_targets = labels.view(-1)
      active_logits = val_logits.view(-1, model.num_labels)
      flattend_predictions = torch.argmax(active_logits, axis=1)

      # only comput accuracy at active logits
      active_accuracy = labels.view(-1) != -100

      labels = torch.masked_select(flattend_targets, active_accuracy)
      predictions = torch.masked_select(flattend_predictions, active_accuracy)

      val_labels.extend(labels)
      val_preds.extend(predictions)

      tmp_val_accuracy = accuracy_score(labels.cpu().numpy(), predictions.cpu().numpy())
      val_accuracy += tmp_val_accuracy

    labels = [id2label[id.item()] for id in val_labels]
    predictions = [id2label[id.item()] for id in val_preds]

    val_loss = val_loss/nb_val_steps
    val_accuracy = val_accuracy/nb_val_steps
    print(f'Validation loss epoch: {val_loss}, Validation Accuracy epoch: {val_accuracy}')
    return labels, predictions

In [None]:
def start(epochs, model,tr_loader, val_loader):
  for epoch in range(epochs):
    print(f'Training epoch {epoch+1}\n')
    train(epoch, model,tr_loader)
    labels, preds = valid(epoch, model, val_loader)
    print('========================================================================================')
  return labels, preds

In [None]:
lab, preds = start(3, model, train_loader, valid_loader)

Training epoch 1

Training loss per 100 training steps: 1.6793177127838135
Training loss per 100 training steps: 0.2543856484059355
Training loss per 100 training steps: 0.20552253226902503
Training loss per 100 training steps: 0.18272603262816106
Training loss per 100 training steps: 0.16896432249730178
Training loss per 100 training steps: 0.1600367054148036
Training loss per 100 training steps: 0.150398121137922
Training loss per 100 training steps: 0.1448768305847323
Training loss per 100 training steps: 0.141044151700089
Training loss per 100 training steps: 0.13705008890899467
Training loss per 100 training steps: 0.1339687628529177
Training loss per 100 training steps: 0.13273296788458347
Training loss per 100 training steps: 0.13152766534763943
Training loss per 100 training steps: 0.1292453552624905
Training loss per 100 training steps: 0.127099494534883
Training loss per 100 training steps: 0.12454071596418716
Training loss per 100 training steps: 0.12229977515546475
Training

In [None]:
!sudo apt-get install git-lfs

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git-lfs is already the newest version (3.0.2-1ubuntu0.2).
0 upgraded, 0 newly installed, 0 to remove and 24 not upgraded.


In [None]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
model_path = 'roberta-fine-tune-de-ner'
model.save_pretrained(model_path)

In [None]:
from huggingface_hub import HfApi

api = HfApi()
model_repo_name = "MAbokahf/roberta-fine-tune-de-ner"  # Format of Input  <Profile Name > / <Model Repo Name>

#Create Repo in Hugging Face
api.create_repo(repo_id=model_repo_name)

In [None]:

#Upload Model folder from Local to HuggingFace
api.upload_folder(
    folder_path=model_path,
    repo_id=model_repo_name
)

# Publish Model Tokenizer on Hugging Face
tokenizer.push_to_hub(model_repo_name)

model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/MAbokahf/roberta-fine-tune-de-ner/commit/2d37d4cab2bcb0e621d69718f7e0fa45755023d9', commit_message='Upload tokenizer', commit_description='', oid='2d37d4cab2bcb0e621d69718f7e0fa45755023d9', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

model_name = "MAbokahf/roberta-fine-tune-de-ner"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/1.25k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.73k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]

In [None]:
len(lab), len(preds)

(39906, 39906)

In [None]:
from sklearn.metrics import classification_report

In [None]:
print(classification_report(lab, preds))

              precision    recall  f1-score   support

       B-LOC       0.02      0.02      0.02       765
       B-ORG       0.01      0.01      0.01       486
       B-PER       0.02      0.02      0.02       704
       I-LOC       0.01      0.01      0.01       123
       I-ORG       0.01      0.01      0.01       317
       I-PER       0.01      0.01      0.01       406
           O       0.93      0.93      0.93     37105

    accuracy                           0.86     39906
   macro avg       0.14      0.14      0.14     39906
weighted avg       0.87      0.86      0.86     39906



**Inference for the test data**

In [None]:
# make the model on evaluation mode
model.eval()
model.to(device)

In [None]:
def inference(sentence):
  inputs = tokenizer(sentence.split(),
                     is_split_into_words=True,
                     return_offsets_mapping=True,
                     padding='max_length',
                     truncation=True,
                     max_length=config['max_length'],
                     return_tensors="pt")

  # move to GPU
  ids = inputs['input_ids'].to(device)
  mask = inputs['attention_mask'].to(device)

  # forward pass
  outputs = model(ids, attention_mask=mask, return_dict=False)
  logits = outputs[0]

  active_logits = logits.view(-1, model.num_labels)
  flattend_predictions = torch.argmax(active_logits, axis=1)

  tokens = tokenizer.convert_ids_to_tokens(ids.squeeze().tolist()) #convert_ids_to_tokens(ids.squeeze().tolist()) # decode(ids) instead of convert_ids_to_tokens
  token_predictions = [id2label[i] for i in flattend_predictions.cpu().numpy()]
  wp_preds = list(zip(tokens, token_predictions))
  #print(tokens)



  prediction = []
  out_str = []
  off_list = inputs['offset_mapping'].squeeze().tolist()
  #print(off_list)
  exclude = ['¼', '¶', 'Ļ', '¾', 'į', 'ŀ', 'ł', '»', '½', '¤', '¸', 'Ĩ', '°','²','¨', '¦', '³','¹', 'ħ', '´','Ł','ľ', '¡','ĥ']
  for idx, mapping in enumerate(off_list):
    if mapping[0] == 0 and mapping[1] != 0 and tokens[idx] not in exclude:
      prediction.append(wp_preds[idx][1])
      out_str.append(wp_preds[idx][0])
    else:
      if idx == 1:
        prediction.append(wp_preds[idx][1])
        out_str.append(wp_preds[idx][0])
      continue
  return prediction, out_str

In [None]:
def preprocess(text):
  text = text.replace('ü','ue')
  text = text.replace('ä','ae')
  text = text.replace('ö','oe')
  text = text.replace('„','xx')
  text = text.replace('Ä','Ae')
  text = text.replace('Ö','oe')
  text = text.replace('Ü','Ue')
  text = text.replace('“','xx')
  text = text.replace('⋅','xx')
  return text

In [None]:
tst_df['sentence'] = tst_df['sentence'].apply(preprocess)

In [None]:
y_preds = []
words = []
for i, t in enumerate(tst_df['sentence'].tolist()):
  output, output_str = inference(t)
  y_preds.append(output)
  words.append(output_str)

In [None]:
tst_df['truth_length'] = tst_df['sentence'].apply(lambda x: len(x.split()))
tst_df['preds'] = y_preds
tst_df['pred_length'] = tst_df['preds'].apply(lambda x: len(x))

In [None]:
tst_df[tst_df['pred_length'] != tst_df['truth_length']]

Unnamed: 0,sentence,truth_length,preds,pred_length


In [None]:
tst_df.head()

Unnamed: 0,sentence,truth_length,preds,pred_length
0,Fuer seine 40. Blutspende wurde WILHELM BECKER...,13,"[O, O, O, O, O, B-LOC, I-PER, O, B-LOC, O, B-O...",13
1,Die Schonfrist fuer Frieder Schoemezler ist ab...,8,"[O, O, O, O, O, O, O, O]",8
2,Das habe ich auf der Schlußrunde geaendert .,8,"[O, O, O, O, O, O, O, O]",8
3,Im Gegenteil :,3,"[O, O, O]",3
4,Mal sehen inwieweit die Neuzugaenge das kompen...,36,"[O, O, O, O, O, O, O, O, O, O, B-PER, O, O, O,...",36


In [None]:
tst_df['pred_length'].sum(), len(df_test)

(49063, 49063)

In [None]:
print(len(tst_df['preds'].tolist()))

3062


In [None]:
predictions = [item for sublist in tst_df['preds'].tolist() for item in sublist]

In [None]:
print(len(predictions))

49063


In [None]:
df_test['1st-Tag'] = predictions

In [None]:
df_test.head()

Unnamed: 0,index,Token,1st-Tag
0,1,Für,O
1,2,seine,O
2,3,40.,O
3,4,Blutspende,O
4,5,wurde,O


In [None]:
def write_df_to_tsv(df, filename):
    """
    Writes a DataFrame to a .tsv file.

    Args:
        df (pandas.DataFrame): The DataFrame to be written.
        filename (str): The filename of the output file.
    """
    with open(filename, 'w', encoding='utf-8') as f:
        writer = csv.writer(f, delimiter='\t')
        writer.writerow(['0']) #need to be modified
        for index, row in df.iterrows():
            writer.writerow([row['index'], row['Token'], row['1st-Tag']])


In [None]:
write_df_to_tsv(df_test, 'preds_1.tsv')

In [None]:
pd.read_csv('/content/preds_1.tsv', on_bad_lines='skip').head(10)

Unnamed: 0,0
0,1\tFür\tO
1,2\tseine\tO
2,3\t40.\tO
3,4\tBlutspende\tO
4,5\twurde\tO
5,6\tWILHELM\tB-LOC
6,7\tBECKER\tI-PER
7,8\taus\tO
8,9\tGroßenhausen\tB-LOC
9,10\tvom\tO
