In [7]:
import json
import pandas as pd
import os
from tqdm import tqdm

In [8]:
from sklearn.model_selection import train_test_split

df_list = []
corpus_files = os.listdir("../corpus/")
print(len(corpus_files))
train, test = train_test_split(
        corpus_files, train_size=0.8, shuffle=True, random_state=6948050
    )
print(len(train))
print(len(test))

for file in corpus_files:
    with open(f"../corpus/{file}", "r", encoding="utf8") as f:
        d = json.load(f)
        df = pd.DataFrame(d)
        df = df.drop(columns=["author", "webpath", "title"])
        df["filepath"] = df["filepath"].apply(lambda x: x.replace("/mnt/c/Users/Moritz Lahann/Desktop/STUDIUM/Module IAS/Master's Thesis/gutenberg-edition16/", ""))
        df_list.append(df)

df = pd.concat(df_list)
df.head()


5940
4752
1188


Unnamed: 0,genre,filepath,chapters
0,"Romane, Novellen und Erzählungen",heyse/plaudere/plaudere.html,"{'name': 'Faustrecht', 'idx': 0, 'paragraphs':..."
1,"Romane, Novellen und Erzählungen",heyse/plaudere/plaudere.html,"{'name': 'Das schwächere Geschlecht', 'idx': 1..."
2,"Romane, Novellen und Erzählungen",heyse/plaudere/plaudere.html,"{'name': 'Altruismus', 'idx': 2, 'paragraphs':..."
3,"Romane, Novellen und Erzählungen",heyse/plaudere/plaudere.html,"{'name': 'Don Juan', 'idx': 3, 'paragraphs': [..."
4,"Romane, Novellen und Erzählungen",heyse/plaudere/plaudere.html,"{'name': 'Erste Liebe', 'idx': 4, 'paragraphs'..."


In [9]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

nltk.download("punkt")

grouped_df = df.groupby(by=["filepath"])
chapters_df_list = []
books_df_list = []
for name, group in tqdm(grouped_df):
    paragraph_sum, sentence_sum, word_sum = 0
    for chapter in group["chapters"]:
        paragraph_count = len(chapter["paragraphs"])
        word_count = sum([len(word_tokenize(p, language="german")) for p in chapter["paragraphs"]])
        sentence_count = sum([len(sent_tokenize(p, language="german")) for p in chapter["paragraphs"]])
        book = name

        chapters_df_list.append(pd.DataFrame({"paragraph_count": [paragraph_count], "word_count": [word_count], "sentence_count": [sentence_count], "book": [book]}))
    # books_df_list.append(pd.DataFrame({"book": [book], "chapter_count": len(group["chapters"], "paragraph_count": sum([len(p) for p in chapter["paragraphs"] for chapter in group["chapters"]]))}))

stats_df = pd.concat(chapters_df_list)

print(stats_df)

[nltk_data] Downloading package punkt to /home/sylvarus/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
100%|██████████| 4752/4752 [25:50<00:00,  3.07it/s] 


    paragraph_count  word_count  sentence_count                          book
0                 3         129               5    abraham/fabeln/fabeln.html
0                 2         204               7    abraham/fabeln/fabeln.html
0                 2         217               7    abraham/fabeln/fabeln.html
0                 3         150               7    abraham/fabeln/fabeln.html
0                 5         311              10    abraham/fabeln/fabeln.html
..              ...         ...             ...                           ...
0               173       31749            1075  zweig/nove-erz/nove-erz.html
0               143       24769             783  zweig/nove-erz/nove-erz.html
0                55        9646             324  zweig/nove-erz/nove-erz.html
0               219       29252            1178  zweig/nove-erz/nove-erz.html
0                49        2342              99  zweig/nove-erz/nove-erz.html

[117248 rows x 4 columns]


In [10]:
for name, group in stats_df.groupby(by="book"):
    print(name)
    print(len(group))

abraham/fabeln/fabeln.html
5
abraham/fabparab/fabparab.html
55
abraham/misc/misc.html
19
abraham/narrnest/narrnest.html
12
achleitn/bergen/bergen.html
68
achleitn/bezirks1/bezirks1.html
5
achleitn/bezirks2/bezirks2.html
12
achleitn/dobratsc/dobratsc.html
10
achleitn/moor/moor.html
14
adlerrev/japan/japan.html
49
adlersfe/amoenen/amoenen.html
9
adlersfe/margarit/margarit.html
10
adlersfe/weissro1/weissro1.html
2
adolph/frueher/frueher.html
15
adolph/haus37/haus37.html
24
adolph/schacker/schacker.html
9
adolph/toechter/toechter.html
16
aesop/fabeln1/fabeln1.html
50
ahlborn/favorite/favorite.html
33
aho/blut/blut.html
17
aho/einsam/einsam.html
6
aho/geaecht/geaecht.html
9
aimard/mexnaec1/mexnaec1.html
10
aimard/mexnaec2/mexnaec2.html
9
aimard/mexnaec3/mexnaec3.html
10
aimard/mexnaec4/mexnaec4.html
10
aimard/trapper/trapper.html
35
aksakow/familien/familien.html
15
alarcon/venegas/venegas.html
25
albrechs/hoefgesp/hoefgesp.html
2
alejchem/anatewka/anatewka.html
9
alejchem/nahosten/nahosten

In [11]:
import numpy as np

print("no. books", len(stats_df["book"].unique()))
print("no. words", stats_df["word_count"].sum())
print("no. paragraphs", stats_df["paragraph_count"].sum())
print("no. sentences", stats_df["sentence_count"].sum())
print("no. chapters", len(stats_df))
print("avg paragraph count", stats_df["paragraph_count"].mean())
print("median paragraph count", stats_df["paragraph_count"].median())
print("avg word count", stats_df["word_count"].mean())
print("median word count", stats_df["word_count"].median())
print("avg sentence count", stats_df["sentence_count"].mean())
print("median sentence count", stats_df["sentence_count"].median())
print("avg chapter count", np.mean([len(b) for b in stats_df.groupby(by="book").groups.values()]))
print("median chapter count", np.median([len(b) for b in stats_df.groupby(by="book").groups.values()]))
print("no. chapter breaks", sum([len(b) - 1 for b in stats_df.groupby(by="book").groups.values()]))

no. books 4752
no. words 372185887
no. paragraphs 6262787
no. sentences 18541842
no. chapters 117248
avg paragraph count 53.41487274836245
median paragraph count 31.0
avg word count 3174.3474259688865
median word count 2135.0
avg sentence count 158.14207491812226
median sentence count 102.0
avg chapter count 24.673400673400675
median chapter count 17.0
no. chapter breaks 112496


In [15]:
stats_df.head()

Unnamed: 0,paragraph_count,word_count,sentence_count,book
0,3,129,5,abraham/fabeln/fabeln.html
0,2,204,7,abraham/fabeln/fabeln.html
0,2,217,7,abraham/fabeln/fabeln.html
0,3,150,7,abraham/fabeln/fabeln.html
0,5,311,10,abraham/fabeln/fabeln.html


In [14]:
chapter_counts = [len(b) for b in stats_df.groupby(by="book").groups.values()]
paragraph_counts = stats_df["paragraph_count"]
word_counts = stats_df["word_count"]
sentence_counts = stats_df["word_count"]
print(word_counts)

0      129
0      204
0      217
0      150
0      311
     ...  
0    31749
0    24769
0     9646
0    29252
0     2342
Name: word_count, Length: 117248, dtype: int64
