# Package handling

In [None]:
# Packages required to unpack the compressed corpus file

# ! pip install pyunpack
# ! pip install patool

#### External imports

In [None]:
import os
import shutil
from tqdm.auto import tqdm
from pathlib import Path
import operator as Operator
from pyunpack import Archive

#### Internal imports

In [None]:
import modules.os_utils as os_utils
import modules.corpus_utils as corpus_utils
import modules.posnoise as POSNoise

# Prepare corpus data

#### Define base directory in which all the corpus data/metadata is located

In [None]:
# Absolute path of the current working directory; all corpus folders are created beneath it.
base_dir = os.path.abspath(os.curdir)

#### Create a "raw corpus" directory and download original corpus file from the figshare portal

In [None]:
# Location of the original archive on figshare and the name it is stored under locally.
url = "https://figshare.com/ndownloader/files/7320866"
raw_corpus_filename = "Corpus_of_German_Language_Fiction.zip"

# Folder that receives the untouched download (a no-op if it already exists).
raw_corpus_directory = Path(base_dir) / "[Raw] Corpus"
raw_corpus_directory.mkdir(exist_ok=True)

# Fetch the archive into the raw corpus folder.
dest_filename = raw_corpus_directory / raw_corpus_filename
corpus_utils.download_file(url, dest_filename)

#### Unpack the original corpus file

In [None]:
# Unpack the downloaded archive into the raw corpus folder it was saved in.
corpus_archive = Archive(dest_filename)
corpus_archive.extractall(raw_corpus_directory)

#### Move all text files from the subdirectory "corpus-of-german-fiction-txt" up into the raw corpus directory

In [None]:
# Top-level folder the archive extracts to; the text files sit one level deeper.
corpus_subdir = "Corpus of German-Language Fiction"
source_dir = raw_corpus_directory / corpus_subdir / "corpus-of-german-fiction-txt"
text_filepaths = os_utils.list_filepaths(source_dir)

# Flatten the layout: relocate every listed file directly into the raw corpus folder,
# keeping its base name.
for extracted_path in tqdm(text_filepaths):
    shutil.move(extracted_path, raw_corpus_directory / Path(extracted_path).name)

#### Remove the directory of the extracted zip file with its leftovers

In [None]:
# Delete the extracted folder tree together with anything still left inside it.
trash_directory = raw_corpus_directory / corpus_subdir
shutil.rmtree(os.fspath(trash_directory))

# Create the authorship attribution (AA) corpus 

#### Create the aa corpus directory

In [None]:
# Working folder for the authorship attribution corpus (a no-op if it already exists).
aa_corpus_directory = Path(base_dir) / "[AA] Corpus"
aa_corpus_directory.mkdir(exist_ok=True)

#### Copy all text files from the raw corpus directory to the aa corpus directory

In [None]:
# NOTE(review): the downloaded .zip also lives in the raw corpus folder; unless
# os_utils.list_filepaths filters by extension it is copied as well -- confirm.
raw_text_filepaths = os_utils.list_filepaths(str(raw_corpus_directory))

# Duplicate each listed file into the AA corpus folder under the same base name,
# leaving the raw corpus untouched.
for source_path in tqdm(raw_text_filepaths):
    shutil.copy(source_path, aa_corpus_directory / Path(source_path).name)

#### Create the aa corpus (in-place)

In [None]:
# Restructure the copied files inside aa_corpus_directory in place. The resulting
# layout is defined by corpus_utils.create_aa_corpus -- presumably one sub-folder
# per author, since the following steps operate per sub-folder (TODO confirm).
corpus_utils.create_aa_corpus(aa_corpus_directory)

#### ...and keep $n$ files per author

In [None]:
# Number of texts per author: passed below as the number of files to keep in each
# sub-folder and as the threshold for removing authors with fewer files.
texts_per_author = 3

In [None]:
# Keep texts_per_author files in each sub-folder of the AA corpus; which files
# survive is decided inside os_utils.keep_n_files_in_each_subfolder.
os_utils.keep_n_files_in_each_subfolder(aa_corpus_directory, number_of_files_to_keep=texts_per_author)

#### Remove authors for which < $n$ documents are available 

In [None]:
# Operator.lt is the standard "less than" comparison: per the heading above, this
# deletes sub-folders holding fewer than texts_per_author ".txt" files
# (presumably the helper compares each sub-folder's file count against the threshold).
os_utils.delete_subdirs_with_operator_n_files(aa_corpus_directory, texts_per_author, Operator.lt, extension=".txt", verbose=False)

# Pre-process all text files

#### Construct the documents once the pre-processing is finished

In [None]:
# Gather the files of the AA corpus, including those inside its sub-folders.
filepaths = os_utils.list_filepaths(
    aa_corpus_directory,
    include_subdirs=True,
)

# Cap the length of each text at max_total_chars (here 7000 characters, roughly 7 kB).
corpus_utils.construct_documents(filepaths, max_total_chars=7000)

#### In case the pre-processing led to shorter texts, remove these according to a threshold

In [None]:
# Remove texts that fall below min_length after pre-processing (unit presumably
# characters, matching max_total_chars in the previous step -- TODO confirm).
corpus_utils.delete_files_according_to_length(aa_corpus_directory, min_length=1000, verbose=True)

#### Ensure at least 3 documents remain for each author; otherwise, remove the affected author

In [None]:
# Re-apply the per-author minimum now that the length filter may have removed texts:
# sub-folders left with fewer than texts_per_author files are deleted.
# The threshold reuses texts_per_author rather than repeating the literal 3, so that
# changing the parameter keeps this filter in step with the earlier one.
os_utils.delete_subdirs_with_operator_n_files(aa_corpus_directory, texts_per_author, Operator.lt)

#### Sort texts per author according to maximum time-span and reduce them to 2 documents 

In [None]:
# Reduce each author to two documents. Judging by the helper's name and the heading
# above, the pair with the largest time span is kept and the documents in between
# are deleted -- TODO confirm against corpus_utils.
corpus_utils.maximize_time_span_and_remove_inner_documents(aa_corpus_directory)

# Apply POSNoise

#### POSNoise: An Effective Countermeasure Against Topic Biases in Authorship Analysis <br><br>  https://arxiv.org/abs/2005.06605

In [None]:
text_filepaths = os_utils.list_filepaths(aa_corpus_directory, include_subdirs=True)

# Overwrite each corpus file with its POSNoise-transformed text
# (read and written as UTF-8, using the "de_core_news_lg" model).
for text_filepath in tqdm(text_filepaths):
    document = Path(text_filepath)
    original_text = document.read_text(encoding="utf8")
    document.write_text(
        POSNoise.posnoise(original_text, model="de_core_news_lg"),
        encoding="utf8",
    )

# Finished