<a href="https://colab.research.google.com/github/LeonGoergen/informationExtraction/blob/main/data_prep/DataPreparationAdministration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Dependencies

In [None]:
# Mount Google Drive at /content/drive. Other cells in this notebook read the
# dataset archive from, and write the JSON splits to, paths under this mount.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import random
from tqdm import tqdm
import json
import unicodedata
import os
from collections import Counter

# Data Preparation

In [None]:
# Extract the GerPS-NER archive from Drive into /content/data (the .conll files end up in /content/data/GerPS-NER).
!unzip -u "/content/drive/MyDrive/Masterarbeit/Public administration/GerPS-NER.zip" -d "/content/data"

Archive:  /content/drive/MyDrive/Masterarbeit/Public administration/GerPS-NER.zip
   creating: /content/data/GerPS-NER/
  inflating: /content/data/GerPS-NER/99109029001000_c.txtCURATION_USER.tsv_short.conll  
  inflating: /content/data/GerPS-NER/99109048261000.txtCURATION_USER.tsv_short.conll  
  inflating: /content/data/GerPS-NER/99109065058000_c.txtCURATION_USER.tsv_short.conll  
  inflating: /content/data/GerPS-NER/99110002001000_c.txtCURATION_USER.tsv_short.conll  
  inflating: /content/data/GerPS-NER/99110003001000_c.txtCURATION_USER.tsv_short.conll  
  inflating: /content/data/GerPS-NER/99110004007000_c.txtCURATION_USER.tsv_short.conll  
  inflating: /content/data/GerPS-NER/99110007169000_c.txtCURATION_USER.tsv_short.conll  
  inflating: /content/data/GerPS-NER/99110010022000_c.txtCURATION_USER.tsv_short.conll  
  inflating: /content/data/GerPS-NER/99110013061000_c.txtCURATION_USER.tsv_short.conll  
  inflating: /content/data/GerPS-NER/99110040248000_c.txtCURATION_USER.tsv_short.

In [None]:
def read_conll_file(filepath):
    """Parse a two-column CoNLL file into a list of sentences.

    Each non-blank line is expected to hold exactly two whitespace-separated
    fields: the token and its tag. Blank (or whitespace-only) lines separate
    sentences. Lines with any other number of fields are skipped on purpose
    so that a stray malformed line does not abort the whole file.

    Args:
        filepath: Path of the CoNLL file; it is read as UTF-8.

    Returns:
        A list of sentences, each a non-empty list of ``(word, tag)`` tuples,
        in file order.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    sentences = []
    sentence = []

    with open(filepath, 'r', encoding='utf-8') as file:
        # Stream the file line by line instead of loading it whole.
        for line in file:
            fields = line.split()

            if not fields:
                # Blank line: close the current sentence, if one is open.
                if sentence:
                    sentences.append(sentence)
                    sentence = []
                continue

            if len(fields) != 2:
                # Best-effort parsing: ignore lines that are not "token tag".
                # An explicit check replaces a bare ``except`` so unrelated
                # errors (e.g. KeyboardInterrupt) are no longer swallowed.
                continue

            word, tag = fields
            sentence.append((word, tag))

    # The last sentence has no trailing blank line when the file ends abruptly.
    if sentence:
        sentences.append(sentence)

    return sentences

In [None]:
def create_dataframe(filepath, sentence):
    """Load one CoNLL file as a token-level DataFrame with running sentence IDs.

    Args:
        filepath: Path of the CoNLL file, passed on to ``read_conll_file``.
        sentence: ID to assign to the first sentence of this file; each
            following sentence gets the next integer.

    Returns:
        A tuple ``(df, next_sentence)`` where ``df`` has the columns
        ``Word``, ``Tag`` and ``Sentence`` (one row per token) and
        ``next_sentence`` is the first ID not used by this file.
    """
    parsed = read_conll_file(filepath)

    # Number the sentences consecutively, starting at the ID handed in.
    records = [
        (word, tag, sentence_id)
        for sentence_id, tokens in enumerate(parsed, start=sentence)
        for word, tag in tokens
    ]

    token_frame = pd.DataFrame(records, columns=['Word', 'Tag', 'Sentence'])
    return token_frame, sentence + len(parsed)

directory = '/content/data/GerPS-NER'
count = 0
frames = []

# os.listdir returns entries in arbitrary order. Sorting makes the sentence IDs
# (and therefore the seeded train/test split built on them) identical across runs.
for filename in sorted(os.listdir(directory)):
    if filename.endswith('.conll'):
        file_path = os.path.join(directory, filename)
        # `count` carries the next free sentence ID from one file to the next.
        df, count = create_dataframe(file_path, count)
        frames.append(df)

# Concatenate once at the end; concatenating inside the loop re-copies all
# previously loaded rows for every file.
all_df = (
    pd.concat(frames, ignore_index=True)
    if frames
    else pd.DataFrame(columns=['Word', 'Tag', 'Sentence'])
)

all_df

Unnamed: 0,Word,Tag,Sentence
0,§,O,0
1,5,O,0
2,GGBefG,O,0
3,-,O,0
4,Einzelnorm,O,0
...,...,...,...
495297,und,O,21753
495298,20,O,21753
495299,gelten,O,21753
495300,entsprechend,O,21753


In [None]:
# Count tokens per entity type. The BIO prefix is removed only at the start of
# the tag (anchored pattern), so a "B-"/"I-" sequence inside a label name is
# left intact.
tag_counts = all_df['Tag'].str.replace(r'^[BI]-', '', regex=True).value_counts()
# Drop the "outside" class so only entity types remain.
tag_counts = tag_counts[tag_counts.index != 'O']
tag_counts

Unnamed: 0_level_0,count
Tag,Unnamed: 1_level_1
Bedingung,68304
Handlungsgrundlage,19026
Aktion,8216
Signalwort,6539
Dokument,4729
Frist,3198
Datenfeld,3105
Ergebnisempfänger,3007
Mitwirkender,2267
Hauptakteur,2126


In [None]:
# Spot-check a single sentence: its tags, its tokens, and the tokens joined as text.
sentence_rows = all_df.loc[all_df['Sentence'] == 20]
sentence_tags = sentence_rows['Tag'].tolist()
sentence_words = sentence_rows['Word'].tolist()
for shown in (sentence_tags, sentence_words, ' '.join(sentence_words)):
    print(shown)

['O', 'O', 'B-Signalwort', 'O', 'O', 'B-Hauptakteur', 'B-Aktion', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-Mitwirkender', 'O', 'B-Aktion', 'B-Signalwort', 'O']
['Die', 'Prüfung', 'muss', 'vor', 'einem', 'Prüfungsausschuss', 'abgelegt', 'werden', ',', 'der', 'bei', 'der', 'für', 'die', 'Finanzverwaltung', 'zuständigen', 'obersten', 'Landesbehörde', 'zu', 'bilden', 'ist', '.']
Die Prüfung muss vor einem Prüfungsausschuss abgelegt werden , der bei der für die Finanzverwaltung zuständigen obersten Landesbehörde zu bilden ist .


In [None]:
def create_sentence_dataframe(df):
  """Collapse a token-level frame into one row per sentence.

  Args:
      df: DataFrame with the columns ``Word``, ``Tag`` and ``Sentence``.

  Returns:
      A DataFrame with the columns ``tokens`` (list of words), ``ner_tags``
      (list of tags) and ``sentence`` (the sentence ID), one row per distinct
      ``Sentence`` value, ordered by that value.
  """
  per_sentence = df.groupby('Sentence', as_index=False).agg({'Word': list, 'Tag': list})

  # Rename to the output schema and fix the column order.
  column_names = {'Word': 'tokens', 'Tag': 'ner_tags', 'Sentence': 'sentence'}
  return per_sentence.rename(columns=column_names)[['tokens', 'ner_tags', 'sentence']]

# Collapse the token-level frame into one row per sentence and display the result.
sentence_df = create_sentence_dataframe(all_df)
sentence_df

Unnamed: 0,tokens,ner_tags,sentence
0,"[§, 84, WHG, -, Einzelnorm]","[O, O, O, O, O]",0
1,"[(, 1, ), Maßnahmenprogramme, und, Bewirtschaf...","[O, O, O, B-Dokument, O, B-Dokument, O, B-Bedi...",1
2,"[(, 2, ), Die, im, Maßnahmenprogramm, aufgefüh...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",2
3,"[Neue, oder, im, Rahmen, eines, aktualisierten...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",3
4,"[§, 4, ÖLG, -, Einzelnorm]","[O, O, O, O, O]",4
...,...,...,...
21749,"[Die, Register, führende, Stelle, unterrichtet...","[O, B-Dokument, O, B-Hauptakteur, B-Aktion, O,...",21749
21750,"[1221, /, 2009, über, die, Gründe, für, die, e...","[I-Handlungsgrundlage, I-Handlungsgrundlage, I...",21750
21751,"[(, 4, ), Die, Register, führenden, Stellen, u...","[O, O, O, O, B-Dokument, O, B-Hauptakteur, O, ...",21751
21752,"[(, 5, ), Die, Register, führende, Stelle, set...","[O, O, O, O, B-Dokument, O, B-Hauptakteur, B-A...",21752


In [None]:
# Hold out 15% of the sentences as the test set; random_state is fixed so the
# same input frame always yields the same split.
# NOTE(review): the split is made per sentence, so sentences from one source
# file can land in both train and test — confirm this is intended.
train, test = train_test_split(sentence_df, test_size=0.15, random_state=1)
len(train), len(test)

(18490, 3264)

In [None]:
def count_tags(tags):
    """Count labels in a tag sequence, ignoring the BIO prefix.

    Args:
        tags: Iterable of tag strings such as ``'B-Frist'``, ``'I-Frist'``
            or ``'O'``.

    Returns:
        A ``Counter`` mapping each label (with a leading ``'B-'``/``'I-'``
        removed) to the number of times it occurs in ``tags``.
    """
    return Counter(
        label[2:] if label.startswith(('B-', 'I-')) else label
        for label in tags
    )

# Label frequencies over every token of the training split, listed alphabetically.
total_counts = count_tags(
    tag
    for sentence_labels in train['ner_tags']
    for tag in sentence_labels
)
sorted(total_counts.items())

[('Aktion', 7121),
 ('Bedingung', 57130),
 ('Datenfeld', 2524),
 ('Dokument', 4017),
 ('Ergebnisempfänger', 2585),
 ('Frist', 2763),
 ('Handlungsgrundlage', 16497),
 ('Hauptakteur', 1833),
 ('Mitwirkender', 1988),
 ('O', 319731),
 ('Signalwort', 5822)]

In [None]:
# Serialise both splits as JSON arrays of records, then write each to Drive.
train_json, test_json = (
    split.to_json(orient='records') for split in (train, test)
)

for out_path, payload in (
    ('/content/drive/MyDrive/Masterarbeit/Public administration/train.json', train_json),
    ('/content/drive/MyDrive/Masterarbeit/Public administration/test.json', test_json),
):
  with open(out_path, 'w') as f:
    f.write(payload)