In [1]:
import pandas as pd

In [2]:
def get_columns(file_path):
    """Read the column names declared on the first line of a CoNLL-U Plus file.

    The first line is expected to look like
    ``# global.columns = FORM LEMMA UPOS ...``. The text between the first
    and the second ``=`` sign (or the end of the line) is split on
    whitespace to obtain the column names.

    Args:
        file_path: Path of the file whose header line should be parsed.

    Returns:
        list[str]: Column names in the order in which they are declared.

    Raises:
        IndexError: If the first line does not contain an ``=`` sign.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default encoding.
    with open(file_path, encoding="UTF-8") as f:
        column_line = f.readline()
    declared = column_line.split("=")[1]
    # str.split() without arguments splits on any whitespace run and drops
    # empty pieces, so trailing spaces or tabs never produce an empty name.
    return declared.split()


In [3]:
# Sanity check: show the column names declared in one training file.
get_columns("./data/train-devel-test/train/fiction/no-morph/chilcote_12.conllup")

['FORM', 'LEMMA', 'UPOS', 'XPOS', 'FEATS', 'CONLL:NER']

In [4]:
def load_conllup_file(file_path):
    """Load one CoNLL-U Plus file into a DataFrame with one row per token.

    The column names are taken from the header line of ``file_path`` itself
    (via ``get_columns``). Blank lines separate sentences; every other line
    after the header is split on whitespace and its leading fields are
    assigned to the declared columns in order.

    Args:
        file_path: Path of the ``.conllup`` file to read (decoded as UTF-8).

    Returns:
        pandas.DataFrame: Columns ``sentence_index`` and
        ``position_number_in_sentence`` (both zero-based) followed by the
        columns declared in the file header.

    Raises:
        IndexError: If a token line has fewer fields than declared columns.
    """
    # Read the header of the file being loaded, not of a fixed sample file,
    # so files declaring a different column set are parsed correctly.
    conllu_columns = get_columns(file_path)
    frame_columns = ["sentence_index", "position_number_in_sentence", *conllu_columns]

    # Collect plain dicts and build the frame once at the end; appending to
    # a DataFrame row by row copies the data on every insertion.
    records = []
    with open(file_path, encoding="UTF-8") as f:
        next(f, None)  # skip the first comment line (the column declaration)
        sentence_index = 0
        position_number_in_sentence = 0
        for line in f:
            # A blank (or whitespace-only) line marks a sentence boundary.
            if not line.strip():
                sentence_index += 1
                position_number_in_sentence = 0
                continue

            # NOTE(review): fields are split on any whitespace, so a field
            # containing a space would shift the columns - confirm the
            # corpus has no such tokens.
            splitted_line = line.split()
            row = {
                "sentence_index": sentence_index,
                "position_number_in_sentence": position_number_in_sentence,
            }
            for i, column in enumerate(conllu_columns):
                row[column] = splitted_line[i]
            records.append(row)
            position_number_in_sentence += 1

    return pd.DataFrame(records, columns=frame_columns)

In [5]:
# Load a single training file into a token-level DataFrame.
df = load_conllup_file("./data/train-devel-test/train/fiction/no-morph/chilcote_12.conllup")

In [29]:
# Preview the first five token rows.
df.head(5)

Unnamed: 0,sentence_index,position_number_in_sentence,FORM,LEMMA,UPOS,XPOS,FEATS,CONLL:NER
0,0,0,-,_,_,_,_,O
1,0,1,Korán,_,_,_,_,O
2,0,2,ebédelnek,_,_,_,_,O
3,0,3,",",_,_,_,_,O
4,0,4,úgy,_,_,_,_,O


In [7]:
# Frequency of each NER tag in the loaded file.
df["CONLL:NER"].value_counts()

O         3637
B-PER       50
B-LOC       21
I-LOC       14
B-MISC       1
Name: CONLL:NER, dtype: int64

In [8]:
import os

In [9]:
# Root folder holding one sub-directory per data split.
train_test_devel_data_path = os.path.join("data", "train-devel-test")
# os.listdir returns entries in arbitrary order; sort them so the splits are
# always visited in the same order regardless of platform or file system.
train_test_devel_data_dirs = [
    os.path.join(train_test_devel_data_path, data_dir)
    for data_dir in sorted(os.listdir(train_test_devel_data_path))
    if os.path.isdir(os.path.join(train_test_devel_data_path, data_dir))
]

In [10]:
# Map each split name (the basename of its directory) to the "no-morph"
# folders of every genre directory that contains one.
train_devel_test_conllup_file_dirs = {}
for d in train_test_devel_data_dirs:
    connlup_file_dirs = []
    for genre_name in os.listdir(d):
        genre_path = os.path.join(d, genre_name)
        # Keep only genre directories that have a "no-morph" entry.
        if os.path.isdir(genre_path) and "no-morph" in os.listdir(genre_path):
            connlup_file_dirs.append(os.path.join(genre_path, "no-morph"))
    train_devel_test_conllup_file_dirs[os.path.basename(d)] = connlup_file_dirs

In [19]:
import re
# Matches file names ending in the ".conllup" extension. The dot is escaped
# and the pattern is anchored at the end, so names such as "x.conllup.bak"
# or "notes_conllup.txt" are not accepted.
connlup_file_pattern = re.compile(r".*\.conllup$")


def get_connlup_files_in_dir(path_to_dir):
    """Return the names of the ``.conllup`` files directly inside a directory.

    Args:
        path_to_dir: Directory to scan (not recursive).

    Returns:
        list[str]: Bare file names (no directory part), sorted so that the
        order does not depend on the file system.
    """
    return sorted(f for f in os.listdir(path_to_dir) if connlup_file_pattern.match(f))

In [53]:
# Display the collected "no-morph" directories for each data split.
train_devel_test_conllup_file_dirs

{'devel': ['data\\train-devel-test\\devel\\fiction\\no-morph',
  'data\\train-devel-test\\devel\\legal\\no-morph',
  'data\\train-devel-test\\devel\\news\\no-morph',
  'data\\train-devel-test\\devel\\wikipedia\\no-morph'],
 'test': ['data\\train-devel-test\\test\\fiction\\no-morph',
  'data\\train-devel-test\\test\\legal\\no-morph',
  'data\\train-devel-test\\test\\news\\no-morph',
  'data\\train-devel-test\\test\\wikipedia\\no-morph'],
 'train': ['data\\train-devel-test\\train\\fiction\\no-morph',
  'data\\train-devel-test\\train\\legal\\no-morph',
  'data\\train-devel-test\\train\\news\\no-morph',
  'data\\train-devel-test\\train\\wikipedia\\no-morph']}

In [21]:
# List the .conllup files found in the first "devel" directory.
get_connlup_files_in_dir(train_devel_test_conllup_file_dirs["devel"][0])

['chilcote_16.conllup',
 'opensubtitles_1.conllup',
 'opensubtitles_10.conllup',
 'opensubtitles_11.conllup',
 'opensubtitles_13.conllup',
 'opensubtitles_14.conllup',
 'opensubtitles_15.conllup',
 'opensubtitles_16.conllup',
 'opensubtitles_17.conllup',
 'opensubtitles_18.conllup',
 'opensubtitles_19.conllup',
 'opensubtitles_2.conllup',
 'tanarur_1.conllup',
 'tokmag_3.conllup']

In [68]:
def load_all_conllup_files_in_dir(path_to_dir, train_test_devel, genre):
    """Load every ``.conllup`` file of a directory into one DataFrame.

    Each file is parsed with ``load_conllup_file`` and tagged with three
    extra columns before the per-file frames are stacked.

    Args:
        path_to_dir: Directory containing the ``.conllup`` files.
        train_test_devel: Value stored in the ``train_test_devel`` column.
        genre: Value stored in the ``genre`` column.

    Returns:
        pandas.DataFrame: Rows of all files, each keeping its per-file
        index, with the added columns ``train_test_devel``, ``genre`` and
        ``file_path``. An empty DataFrame if the directory has no
        ``.conllup`` files.
    """
    frames = []
    for file_name in get_connlup_files_in_dir(path_to_dir):
        conllup_file = os.path.join(path_to_dir, file_name)
        print(conllup_file)
        df = load_conllup_file(conllup_file)
        df["train_test_devel"] = train_test_devel
        df["genre"] = genre
        df["file_path"] = conllup_file
        frames.append(df)

    # pd.concat raises on an empty list, so keep returning an empty frame
    # when there is nothing to load.
    if not frames:
        return pd.DataFrame()
    # Concatenate once; concatenating inside the loop re-copies all rows
    # loaded so far on every iteration.
    return pd.concat(frames)


In [63]:
first_devel_dir = train_devel_test_conllup_file_dirs["devel"][0]
# The genre is the second-to-last path component of the directory.
first_devel_genre = first_devel_dir.split(os.path.sep)[-2]
data = load_all_conllup_files_in_dir(first_devel_dir, "devel", first_devel_genre)

data\train-devel-test\devel\fiction\no-morph\chilcote_16.conllup
data\train-devel-test\devel\fiction\no-morph\opensubtitles_1.conllup
data\train-devel-test\devel\fiction\no-morph\opensubtitles_10.conllup
data\train-devel-test\devel\fiction\no-morph\opensubtitles_11.conllup
data\train-devel-test\devel\fiction\no-morph\opensubtitles_13.conllup
data\train-devel-test\devel\fiction\no-morph\opensubtitles_14.conllup
data\train-devel-test\devel\fiction\no-morph\opensubtitles_15.conllup
data\train-devel-test\devel\fiction\no-morph\opensubtitles_16.conllup
data\train-devel-test\devel\fiction\no-morph\opensubtitles_17.conllup
data\train-devel-test\devel\fiction\no-morph\opensubtitles_18.conllup
data\train-devel-test\devel\fiction\no-morph\opensubtitles_19.conllup
data\train-devel-test\devel\fiction\no-morph\opensubtitles_2.conllup
data\train-devel-test\devel\fiction\no-morph\tanarur_1.conllup
data\train-devel-test\devel\fiction\no-morph\tokmag_3.conllup


In [69]:

# For every split and genre, count the token rows and the distinct FORM values.
word_counts = {}
for data_set, genre_dirs in train_devel_test_conllup_file_dirs.items():
    per_genre_counts = {}
    word_counts[data_set] = per_genre_counts
    for genre_dir in genre_dirs:
        print(genre_dir)
        # The genre is the second-to-last path component of the directory.
        genre = genre_dir.split(os.path.sep)[-2]
        df = load_all_conllup_files_in_dir(genre_dir, data_set, genre)
        per_genre_counts[genre] = {
            "word_count": len(df),
            "unique_words": len(df["FORM"].unique()),
        }
print(word_counts)


data\train-devel-test\devel\fiction\no-morph
data\train-devel-test\devel\fiction\no-morph\chilcote_16.conllup
data\train-devel-test\devel\fiction\no-morph\opensubtitles_1.conllup
data\train-devel-test\devel\fiction\no-morph\opensubtitles_10.conllup
data\train-devel-test\devel\fiction\no-morph\opensubtitles_11.conllup
data\train-devel-test\devel\fiction\no-morph\opensubtitles_13.conllup
data\train-devel-test\devel\fiction\no-morph\opensubtitles_14.conllup
data\train-devel-test\devel\fiction\no-morph\opensubtitles_15.conllup
data\train-devel-test\devel\fiction\no-morph\opensubtitles_16.conllup
data\train-devel-test\devel\fiction\no-morph\opensubtitles_17.conllup
data\train-devel-test\devel\fiction\no-morph\opensubtitles_18.conllup
data\train-devel-test\devel\fiction\no-morph\opensubtitles_19.conllup
data\train-devel-test\devel\fiction\no-morph\opensubtitles_2.conllup
data\train-devel-test\devel\fiction\no-morph\tanarur_1.conllup
data\train-devel-test\devel\fiction\no-morph\tokmag_3.conll

data\train-devel-test\train\fiction\no-morph\opensubtitles_73.conllup
data\train-devel-test\train\fiction\no-morph\opensubtitles_74.conllup
data\train-devel-test\train\fiction\no-morph\opensubtitles_75.conllup
data\train-devel-test\train\fiction\no-morph\opensubtitles_76.conllup
data\train-devel-test\train\fiction\no-morph\opensubtitles_77.conllup
data\train-devel-test\train\fiction\no-morph\opensubtitles_78.conllup
data\train-devel-test\train\fiction\no-morph\opensubtitles_79.conllup
data\train-devel-test\train\fiction\no-morph\opensubtitles_8.conllup
data\train-devel-test\train\fiction\no-morph\opensubtitles_80.conllup
data\train-devel-test\train\fiction\no-morph\opensubtitles_81.conllup
data\train-devel-test\train\fiction\no-morph\opensubtitles_82.conllup
data\train-devel-test\train\fiction\no-morph\opensubtitles_83.conllup
data\train-devel-test\train\fiction\no-morph\opensubtitles_84.conllup
data\train-devel-test\train\fiction\no-morph\opensubtitles_85.conllup
data\train-devel-test

data\train-devel-test\train\wikipedia\no-morph\huwiki_200_27.conllup
data\train-devel-test\train\wikipedia\no-morph\huwiki_200_28.conllup
data\train-devel-test\train\wikipedia\no-morph\huwiki_200_29.conllup
data\train-devel-test\train\wikipedia\no-morph\huwiki_200_3.conllup
data\train-devel-test\train\wikipedia\no-morph\huwiki_200_30.conllup
data\train-devel-test\train\wikipedia\no-morph\huwiki_200_31.conllup
data\train-devel-test\train\wikipedia\no-morph\huwiki_200_32.conllup
data\train-devel-test\train\wikipedia\no-morph\huwiki_200_33.conllup
data\train-devel-test\train\wikipedia\no-morph\huwiki_200_34.conllup
data\train-devel-test\train\wikipedia\no-morph\huwiki_200_35.conllup
data\train-devel-test\train\wikipedia\no-morph\huwiki_200_36.conllup
data\train-devel-test\train\wikipedia\no-morph\huwiki_200_37.conllup
data\train-devel-test\train\wikipedia\no-morph\huwiki_200_38.conllup
data\train-devel-test\train\wikipedia\no-morph\huwiki_200_39.conllup
data\train-devel-test\train\wikiped

In [67]:
import json
# Save the collected counts as JSON in the current working directory.
with open("word_count.json", "w") as out_file:
    out_file.write(json.dumps(word_counts))